ELECTRA continual pre-training doesn't accept xlm-roberta type tokenizer.
aloka-fernando opened this issue
Describe the bug
I need to run ELECTRA continual pre-training by initializing both the generator and the discriminator with an xlm-roberta model. This works fine if I pass "google/electra-small-discriminator" as the tokenizer.
The moment I change the tokenizer to the xlm-roberta tokenizer (which should be the correct tokenizer for my scenario), I get the error below.
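For clarity, the only difference between the working run and the failing run is the tokenizer_name value in the training args (sketch of the relevant setting only, everything else stays as in the reproduction below):

# Works (ELECTRA's own tokenizer):
train_args["tokenizer_name"] = "google/electra-small-discriminator"
# Fails with the TypeError shown below:
train_args["tokenizer_name"] = "xlm-roberta-base"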
To Reproduce
from simpletransformers.language_modeling import LanguageModelingModel

# output_dir and max_length are defined earlier in the script
train_args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": False,
    "output_dir": output_dir,
    "num_train_epochs": 5,
    "save_eval_checkpoints": True,
    "save_model_every_epoch": False,
    "learning_rate": 5e-4,
    "warmup_steps": 1000,
    "train_batch_size": 32,
    "eval_batch_size": 32,
    "gradient_accumulation_steps": 1,
    "block_size": 100,  # text overflow parameter for chunking
    "max_seq_length": max_length,
    "logging_steps": 5000,
    "evaluate_during_training": True,
    "evaluate_during_training_steps": 5000,
    "evaluate_during_training_verbose": True,
    "use_cached_eval_features": True,
    "sliding_window": True,
    "strip_accents": False,
    "tokenizer_name": "xlm-roberta-base",
    "fp16": False,
    "early_stopping_consider_epochs": True,
    "early_stopping_patience": 5,
    "encoding": "utf-8",
}
model = LanguageModelingModel(
    model_name="electra",
    model_type="electra",
    args=train_args,
    generator_name="3_baselines/xlm-roberta-base",
    discriminator_name="3_baselines/xlm-roberta-base",
    use_cuda=False,
)
model.train_model(
    train_file="train.txt",
    eval_file="valid.txt",
)
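For what it is worth, the xlm-roberta tokenizer itself loads and encodes text fine with AutoTokenizer, so the problem seems specific to how it is consumed inside SimpleTransformers. A minimal standalone check (the sample sentence and max_length value are placeholders):

from transformers import AutoTokenizer

# Same tokenizer that "tokenizer_name" points to above
tok = AutoTokenizer.from_pretrained("xlm-roberta-base")

enc = tok("This is a sample sentence.", add_special_tokens=True, truncation=True, max_length=128)
print(enc["input_ids"])     # all integer IDs, no None values
print(tok.mask_token_id)    # XLM-R defines <mask>, so this should be a valid ID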
Error
Epoch 1 of 5: 0%| | 0/5 [00:00<?, ?it/s]
Traceback (most recent call last):
File "electra_cp_st.py", line 80, in <module>
eval_file=valid_file,
File "/userdirs/aloka/anaconda3/envs/env_pt_st/lib/python3.7/site-packages/simpletransformers/language_modeling/language_modeling_model.py", line 470, in train_model
**kwargs,
File "/userdirs/aloka/anaconda3/envs/env_pt_st/lib/python3.7/site-packages/simpletransformers/language_modeling/language_modeling_model.py", line 787, in train
for step, batch in enumerate(batch_iterator):
File "/userdirs/aloka/anaconda3/envs/env_pt_st/lib/python3.7/site-packages/tqdm/std.py", line 1195, in __iter__
for obj in iterable:
File "/userdirs/aloka/anaconda3/envs/env_pt_st/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 363, in __next__
data = self._next_data()
File "/userdirs/aloka/anaconda3/envs/env_pt_st/lib/python3.7/site-packages/torch/utils/data/dataloader.py", line 403, in _next_data
data = self._dataset_fetcher.fetch(index) # may raise StopIteration
File "/userdirs/aloka/anaconda3/envs/env_pt_st/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in fetch
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/userdirs/aloka/anaconda3/envs/env_pt_st/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py", line 44, in <listcomp>
data = [self.dataset[idx] for idx in possibly_batched_index]
File "/userdirs/aloka/anaconda3/envs/env_pt_st/lib/python3.7/site-packages/simpletransformers/language_modeling/language_modeling_utils.py", line 212, in __getitem__
return torch.tensor(self.examples[item], dtype=torch.long)
TypeError: an integer is required (got type NoneType)
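Judging from the traceback, some entries in self.examples end up as None before the tensor conversion in language_modeling_utils.py. A quick way to narrow this down (sketch only; I am assuming the loaded tokenizer is exposed as model.tokenizer, which I have not verified for this simpletransformers version):

# Encode one line of the training data with the tokenizer SimpleTransformers loaded
with open("train.txt", encoding="utf-8") as f:
    sample_line = f.readline().strip()

ids = model.tokenizer.encode(sample_line, add_special_tokens=True)
print(any(i is None for i in ids))       # True would explain the TypeError above
print(model.tokenizer.mask_token_id)     # None here would also break masking
print(model.tokenizer.pad_token_id)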
I have tried the following
- Passing the folder containing the xlm-roberta tokenizer in the train args: "tokenizer_name": "3_baselines/xlm-roberta-base"
- Passing pre-tokenized train and valid data (using the dataset class below). However, it seems SimpleTransformers does not accept this kind of input.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

class TokenizedSentencesDataset:
    def __init__(self, sentences, tokenizer, max_length, cache_tokenization=False):
        self.tokenizer = tokenizer
        self.sentences = sentences
        self.max_length = max_length
        self.cache_tokenization = cache_tokenization

    def __getitem__(self, item):
        # Tokenize on the fly unless caching is enabled
        if not self.cache_tokenization:
            return self.tokenizer(self.sentences[item], add_special_tokens=True, truncation=True, max_length=self.max_length, return_special_tokens_mask=True)
        # With caching: tokenize once, then store and return the encoding
        if isinstance(self.sentences[item], str):
            self.sentences[item] = self.tokenizer(self.sentences[item], add_special_tokens=True, truncation=True, max_length=self.max_length, return_special_tokens_mask=True)
        return self.sentences[item]

    def __len__(self):
        return len(self.sentences)

train_inputs = TokenizedSentencesDataset(train_sentences, tokenizer, max_length)
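A quick sanity check (sketch only) to confirm that each item of this dataset is a regular transformers encoding with integer input_ids:

item = train_inputs[0]
print(item.keys())          # e.g. input_ids, attention_mask, special_tokens_mask
print(item["input_ids"])    # integer token IDs only, no None values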
I would appreciate it if a workaround or a possible solution could be suggested.