loss does not decrease during training
ChiuHsin opened this issue · 1 comment
ChiuHsin commented
Hello, I tried to simplify your code for an NER task. I built the model as below (`create_transformer`, `load_google_bert`, `masked_classification_loss` and `pass_through_loss` are the helpers from this repo):
```
from keras.layers import Dense, Dropout, Input, Lambda, TimeDistributed
from keras.models import Model

def load_model(self):
    self.encoder = create_transformer(embedding_layer_norm=True,
                                      neg_inf=-10000.0,
                                      use_attn_mask=self.config.use_attn_mask,
                                      vocab_size=self.bert_config.vocab_size,
                                      accurate_gelu=True,
                                      layer_norm_epsilon=1e-12,
                                      max_len=self.config.max_len,
                                      use_one_embedding_dropout=True,
                                      d_hid=self.bert_config.intermediate_size,
                                      embedding_dim=self.bert_config.hidden_size,
                                      num_layers=self.bert_config.num_hidden_layers,
                                      num_heads=self.bert_config.num_attention_heads,
                                      residual_dropout=self.bert_config.hidden_dropout_prob,
                                      attention_dropout=self.bert_config.attention_probs_dropout_prob)
    self.encoder = load_google_bert(self.encoder, self.bert_config.vocab_size,
                                    self.config.bert_dir_path, self.config.max_len,
                                    self.config.verbose)
    # Token-level classification head on top of the encoder output.
    decoder = Dense(units=self.config.num_classes)
    logits = TimeDistributed(decoder)(
        Dropout(self.config.dropout)(self.encoder.outputs[0]))
    task_target = Input(batch_shape=(None, self.config.max_len), dtype='int32')
    task_mask = Input(batch_shape=(None, self.config.max_len), dtype='int32')
    task_loss = Lambda(lambda x: masked_classification_loss(x[0], x[1], x[2]))(
        [task_target, logits, task_mask])
    # Sharing layers between the training model and the prediction model.
    self.train_model = Model(inputs=self.encoder.inputs + [task_target, task_mask],
                             outputs=task_loss)
    self.model = Model(inputs=self.encoder.inputs, outputs=logits)

def compile(self, *args, **kwargs):
    return self.train_model.compile(*args, loss=pass_through_loss, **kwargs)
```
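For reference, here is roughly what I expect the two loss helpers to do; this is my own sketch of a masked sparse softmax cross-entropy, not the repo's exact implementation:

```
import tensorflow as tf

def masked_classification_loss(y_true, logits, mask):
    # Per-token sparse softmax cross-entropy over the logits.
    per_token = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=tf.cast(y_true, tf.int32), logits=logits)
    # Average only over the positions where the task mask is 1.
    mask = tf.cast(mask, tf.float32)
    return tf.reduce_sum(per_token * mask) / (tf.reduce_sum(mask) + 1e-9)

def pass_through_loss(_, y_pred):
    # train_model already outputs the loss tensor, so the Keras "loss"
    # just passes it through for the optimizer to minimize.
    return y_pred
```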
Then I train the model with:
```
import os
from keras.callbacks import EarlyStopping, ModelCheckpoint

model = XXXX(config)
model.compile(optimizer='adam')
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=10, verbose=1)
checkpoint = ModelCheckpoint(
    os.path.join(config.dir_output, 'best-weights.h5'),
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    save_weights_only=True
)
model.train_model.fit_generator(train_generator,
                                steps_per_epoch=steps_per_epoch,
                                validation_data=dev_generator,
                                validation_steps=dev_steps,
                                verbose=1,
                                callbacks=[earlystop, checkpoint],
                                shuffle=False,
                                epochs=100)
```
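For completeness, my generator yields batches shaped like this (a sketch with illustrative names, assuming the encoder's inputs are token ids, segment ids, position ids and the attention mask; since `pass_through_loss` ignores `y_true`, the target is just a dummy array):

```
import numpy as np

def make_batch(tokens, segment_ids, pos_ids, attn_mask, labels, label_mask):
    # train_model's inputs are encoder.inputs + [task_target, task_mask].
    inputs = [tokens, segment_ids, pos_ids, attn_mask, labels, label_mask]
    # Dummy target: pass_through_loss never reads it; its shape only has to
    # match the task_loss output so Keras accepts the batch.
    dummy_y = np.zeros((tokens.shape[0], 1), dtype='float32')
    return inputs, dummy_y
```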
In addition, I modified the function `load_google_bert` and commented out the line
`weights[w_id][vocab_size + TextEncoder.EOS_OFFSET] = saved[3 + TextEncoder.BERT_UNUSED_COUNT]`
because `TextEncoder.BERT_SPECIAL_COUNT` is 4 instead of 5, so the created model does not have that many weights.
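As a patch, my change looks like this (only the commented-out assignment is verbatim; the explanation is my reading of the special-token offsets):

```
# Inside load_google_bert: the embedding matrix is created with only
# vocab_size + TextEncoder.BERT_SPECIAL_COUNT (= 4) special rows, so this
# write would target a row that does not exist in the created model:
# weights[w_id][vocab_size + TextEncoder.EOS_OFFSET] = saved[3 + TextEncoder.BERT_UNUSED_COUNT]
```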