hemingkx/CLUENER2020

Bert-BiLSTM-CRF: loss keeps increasing and eventually becomes nan

Candysad opened this issue · 2 comments

Lowering the learning rate only slightly slows down how quickly the loss reaches 0.

The model and the training procedure were both written by following this repo.

The model code is as follows:
```python
import torch.nn as nn
from torchcrf import CRF  # assuming the CRF implementation from the pytorch-crf package
from transformers import BertModel, BertPreTrainedModel

class BertBiLSTMCRF(BertPreTrainedModel):
    def __init__(self, config):
        super(BertBiLSTMCRF, self).__init__(config)
        self.num_labels = config.num_labels
        # learning_rate / weight_decay are expected to be set on the config
        self.lr = self.config.learning_rate
        self.wd = self.config.weight_decay

        self.bert = BertModel.from_pretrained("./model/bert-base-chinese")
        self.bert_dropout = nn.Dropout(self.config.lstm_layer_dropout)
        self.bilstm = nn.LSTM(input_size=768,   # BERT hidden size
                              hidden_size=384,  # 384 per direction -> 768 after concatenation
                              batch_first=True,
                              num_layers=2,
                              dropout=self.config.lstm_dropout_prob,
                              bidirectional=True)
        self.liner_classifier = nn.Linear(in_features=768, out_features=self.num_labels)
        self.crf = CRF(self.num_labels, batch_first=True)
        self.init_weights()

    def forward(self,  input_ids, attention_mask, labels=None):
        bert_last_state = self.bert(input_ids=input_ids,
                                    attention_mask=attention_mask).last_hidden_state
        bert_dropout = self.bert_dropout(bert_last_state)
        lstm_output, _ = self.bilstm(bert_dropout)
        liner_classify_output = self.liner_classifier(lstm_output)

        loss = None
        if labels is not None:
            # the CRF returns a log-likelihood, so negate it to get the loss
            loss = -self.crf(emissions=liner_classify_output, tags=labels,
                             mask=attention_mask.byte(), reduction="mean")
        output = self.crf.decode(liner_classify_output)
        return output, loss

    def get_grouped_parameters(self, full_finetuning=False):
        if full_finetuning:
            bert_parameters = list(self.bert.named_parameters())
            lstm_parameters = list(self.bilstm.named_parameters())
            liner_parameters = list(self.liner_classifier.named_parameters())
            crf_parameters = list(self.crf.named_parameters())
            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [
                {"params": [p for n, p in bert_parameters if not any(nd in n for nd in no_decay)],
                 "weight_decay": self.wd},
                {"params": [p for n, p in bert_parameters if any(nd in n for nd in no_decay)],
                 "weight_decay": 0.0},

                {"params": [p for n, p in lstm_parameters if not any(nd in n for nd in no_decay)],
                 "lr": self.lr, "weight_decay": self.wd},
                {"params": [p for n, p in lstm_parameters if any(nd in n for nd in no_decay)],
                 "lr": self.lr, "weight_decay": 0.0},

                {"params": [p for n, p in liner_parameters if not any(nd in n for nd in no_decay)],
                 "lr": self.lr, "weight_decay": self.wd},
                {"params": [p for n, p in liner_parameters if any(nd in n for nd in no_decay)],
                 "lr": self.lr, "weight_decay": 0.0},

                {'params': [p for n, p in crf_parameters],
                 'lr': self.lr}
            ]

        else:
            lstm_parameters = list(self.bilstm.named_parameters())
            liner_parameters = list(self.liner_classifier.named_parameters())
            crf_parameters = list(self.crf.named_parameters())
            no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
            optimizer_grouped_parameters = [
                {"params": [p for n, p in lstm_parameters if not any(nd in n for nd in no_decay)],
                 "lr": self.lr * 0.001, "weight_decay": self.wd},
                {"params": [p for n, p in lstm_parameters if any(nd in n for nd in no_decay)],
                 "lr": self.lr * 0.001, "weight_decay": 0.0},

                {"params": [p for n, p in liner_parameters],
                 "lr": self.lr * 0.001},

                {'params': [p for n, p in crf_parameters],
                 'lr': self.lr * 0.001}
            ]

        return optimizer_grouped_parameters

```

optimizer: AdamW from transformers
scheduler: get_linear_schedule_with_warmup from transformers
learning_rate: 3e-5
weight_decay: 0.01
clip_grad: 5
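
For reference, here is a minimal sketch of how that optimizer/scheduler setup is usually wired up. This is not the actual training script from the issue; `model`, `train_loader`, `num_train_steps`, and `num_warmup_steps=0` are placeholder assumptions.

```python
import torch
from transformers import AdamW, get_linear_schedule_with_warmup

# model is an instance of BertBiLSTMCRF; train_loader / num_train_steps
# are placeholders for the real training-script objects.
optimizer = AdamW(model.get_grouped_parameters(full_finetuning=True), lr=3e-5)
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=num_train_steps)

model.train()
for batch in train_loader:
    input_ids, attention_mask, labels = batch
    optimizer.zero_grad()
    _, loss = model(input_ids, attention_mask, labels=labels)
    loss.backward()
    # clip_grad = 5
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
    optimizer.step()
    scheduler.step()
```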

With batch_size = 1, the loss reliably becomes nan at around 100 steps.

So far, no change to the optimizer, the scheduler, or the parameters above has made the loss go down.

Solved it... turns out the model's parameter groups can't be returned from a function inside the model like that...
I added that function without really understanding how it works; lesson learned the hard way.
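
If I read the resolution correctly, the fix amounts to building the optimizer parameter groups directly in the training script instead of returning them from a method on the model. A rough sketch under that assumption (placeholder names `model`, `lr`, `wd`):

```python
from transformers import AdamW

# Placeholder names: model is the BertBiLSTMCRF instance, lr / wd are the
# learning rate and weight decay taken from the training config.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
bert_params = list(model.bert.named_parameters())
head_params = (list(model.bilstm.named_parameters())
               + list(model.liner_classifier.named_parameters())
               + list(model.crf.named_parameters()))

optimizer_grouped_parameters = [
    {"params": [p for n, p in bert_params if not any(nd in n for nd in no_decay)],
     "weight_decay": wd},
    {"params": [p for n, p in bert_params if any(nd in n for nd in no_decay)],
     "weight_decay": 0.0},
    {"params": [p for n, p in head_params], "lr": lr, "weight_decay": 0.0},
]
optimizer = AdamW(optimizer_grouped_parameters, lr=lr)
```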