ThilinaRajapakse/simpletransformers

TypeError: an integer is required (got type NoneType)

ijazul-haq opened this issue · 1 comments

I want to pretrain BERT on a custom corpus, but I got this error:

Source Code
from simpletransformers.language_modeling import LanguageModelingModel, LanguageModelingArgs
from sklearn.model_selection import train_test_split
import pandas as pd
import tokenizers

corpus_path = "/kaggle/input/line_by_line_corpus.txt"

print('Training Tokenizer...')
tokenizer = tokenizers.BertWordPieceTokenizer()
tokenizer.train(
files=[corpus_path],
vocab_size=50000,
min_frequency=2,
limit_alphabet=1000
)
tokenizer.save("/kaggle/working/tokenizer.json", pretty=True)

model_args = LanguageModelingArgs(
tokenizer_name='/kaggle/working/tokenizer.json',
overwrite_output_dir = True,
save_steps = 10_000,
)

model = LanguageModelingModel(
"bert",
None,
args=model_args,
train_files=corpus_path
)
model.train_model(corpus_path)

Environment

  • Kaggle with P100 GPU

Error!!

TypeError Traceback (most recent call last)
/tmp/ipykernel_23/3446545645.py in
17
18 print('\nTraining Started...')
---> 19 model.train_model(corpus_path)
20 print('\nTraining Finished...')

/opt/conda/lib/python3.7/site-packages/simpletransformers/language_modeling/language_modeling_model.py in train_model(self, train_file, output_dir, show_running_loss, args, eval_file, verbose, **kwargs)
468 eval_file=eval_file,
469 verbose=verbose,
--> 470 **kwargs,
471 )
472

/opt/conda/lib/python3.7/site-packages/simpletransformers/language_modeling/language_modeling_model.py in train(self, train_dataset, output_dir, show_running_loss, eval_file, verbose, **kwargs)
785 mininterval=0,
786 )
--> 787 for step, batch in enumerate(batch_iterator):
788 if steps_trained_in_current_epoch > 0:
789 steps_trained_in_current_epoch -= 1

/opt/conda/lib/python3.7/site-packages/tqdm/notebook.py in iter(self)
257 try:
258 it = super(tqdm_notebook, self).iter()
--> 259 for obj in it:
260 # return super(tqdm...) will not catch exception
261 yield obj

/opt/conda/lib/python3.7/site-packages/tqdm/std.py in iter(self)
1193
1194 try:
-> 1195 for obj in iterable:
1196 yield obj
1197 # Update and possibly print the progressbar.

/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py in next(self)
626 # TODO(https://github.com/pytorch/pytorch/issues/76750)
627 self._reset() # type: ignore[call-arg]
--> 628 data = self._next_data()
629 self._num_yielded += 1
630 if self._dataset_kind == _DatasetKind.Iterable and \

/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py in _next_data(self)
669 def _next_data(self):
670 index = self._next_index() # may raise StopIteration
--> 671 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
672 if self._pin_memory:
673 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)

/opt/conda/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
56 data = self.dataset.getitems(possibly_batched_index)
57 else:
---> 58 data = [self.dataset[idx] for idx in possibly_batched_index]
59 else:
60 data = self.dataset[possibly_batched_index]

/opt/conda/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py in (.0)
56 data = self.dataset.getitems(possibly_batched_index)
57 else:
---> 58 data = [self.dataset[idx] for idx in possibly_batched_index]
59 else:
60 data = self.dataset[possibly_batched_index]

/opt/conda/lib/python3.7/site-packages/simpletransformers/language_modeling/language_modeling_utils.py in getitem(self, item)
210
211 def getitem(self, item):
--> 212 return torch.tensor(self.examples[item], dtype=torch.long)
213
214

TypeError: an integer is required (got type NoneType)

stale commented

This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions.