Pickling error while retraining [BUG]
ChargedMonk opened this issue · 15 comments
Describe the bug
---------------------------------------------------------------------------
OSError Traceback (most recent call last)
Input In [4], in <module>
7 # The path to save our checkpoints
8 logging_path = "checkpoints"
---> 10 address_parser.retrain(training_container, 0.8, epochs=5, batch_size=2, num_workers=1, callbacks=[lr_scheduler], prediction_tags=tag_dictionary, logging_path=logging_path)
File c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\deepparse\parser\address_parser.py:517, in AddressParser.retrain(self, dataset_container, train_ratio, batch_size, epochs, num_workers, learning_rate, callbacks, seed, logging_path, disable_tensorboard, prediction_tags, seq2seq_params)
511 print(
512 "You are using a older version of Poutyne that does not support properly error management."
513 " Due to that, we cannot show retrain progress. To fix that, update Poutyne to "
514 "the newest version."
515 )
516 with_capturing_context = True
--> 517 train_res = self._retrain(
518 experiment=exp,
519 train_generator=train_generator,
520 valid_generator=valid_generator,
521 epochs=epochs,
522 seed=seed,
523 callbacks=callbacks,
524 disable_tensorboard=disable_tensorboard,
525 capturing_context=with_capturing_context,
526 )
527 except RuntimeError as error:
528 list_of_file_path = os.listdir(path=".")
File c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\deepparse\parser\address_parser.py:849, in AddressParser._retrain(self, experiment, train_generator, valid_generator, epochs, seed, callbacks, disable_tensorboard, capturing_context)
834 def _retrain(
835 self,
836 experiment: Experiment,
(...)
846 # If Poutyne 1.7 and before, we capture poutyne print since it print some exception.
847 # Otherwise, we use a null context manager.
848 with Capturing() if capturing_context else contextlib.nullcontext():
--> 849 train_res = experiment.train(
850 train_generator,
851 valid_generator=valid_generator,
852 epochs=epochs,
853 seed=seed,
854 callbacks=callbacks,
855 verbose=self.verbose,
856 disable_tensorboard=disable_tensorboard,
857 )
858 return train_res
File c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\poutyne\framework\experiment.py:519, in Experiment.train(self, train_generator, valid_generator, **kwargs)
471 def train(self, train_generator, valid_generator=None, **kwargs) -> List[Dict]:
472 """
473 Trains or finetunes the model on a dataset using a generator. If a previous training already occurred
474 and lasted a total of `n_previous` epochs, then the model's weights will be set to the last checkpoint and the
(...)
517 List of dict containing the history of each epoch.
518 """
--> 519 return self._train(self.model.fit_generator, train_generator, valid_generator, **kwargs)
File c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\poutyne\framework\experiment.py:668, in Experiment._train(self, training_func, callbacks, lr_schedulers, keep_only_last_best, save_every_epoch, disable_tensorboard, seed, *args, **kwargs)
665 expt_callbacks += callbacks
667 try:
--> 668 return training_func(*args, initial_epoch=initial_epoch, callbacks=expt_callbacks, **kwargs)
669 finally:
670 if self.logging:
File c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\poutyne\framework\model.py:542, in Model.fit_generator(self, train_generator, valid_generator, epochs, steps_per_epoch, validation_steps, batches_per_step, initial_epoch, verbose, progress_options, callbacks)
540 self._fit_generator_n_batches_per_step(epoch_iterator, callback_list, batches_per_step)
541 else:
--> 542 self._fit_generator_one_batch_per_step(epoch_iterator, callback_list)
544 return epoch_iterator.epoch_logs
File c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\poutyne\framework\model.py:613, in Model._fit_generator_one_batch_per_step(self, epoch_iterator, callback_list)
611 for train_step_iterator, valid_step_iterator in epoch_iterator:
612 with self._set_training_mode(True):
--> 613 for step, (x, y) in train_step_iterator:
614 step.loss, step.metrics, _ = self._fit_batch(x, y, callback=callback_list, step=step.number)
615 step.size = self.get_batch_size(x, y)
File c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\poutyne\framework\iterators.py:73, in StepIterator.__iter__(self)
71 def __iter__(self):
72 time_since_last_batch = timeit.default_timer()
---> 73 for step, data in _get_step_iterator(self.steps_per_epoch, self.generator):
74 self.on_batch_begin(step, {})
76 step_data = Step(step)
File c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\poutyne\framework\iterators.py:18, in cycle(iterable)
16 def cycle(iterable): # Equivalent to itertools cycle, without any extra memory requirement
17 while True:
---> 18 for x in iterable:
19 yield x
File c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\torch\utils\data\dataloader.py:359, in DataLoader.__iter__(self)
357 return self._iterator
358 else:
--> 359 return self._get_iterator()
File c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\torch\utils\data\dataloader.py:305, in DataLoader._get_iterator(self)
303 else:
304 self.check_worker_number_rationality()
--> 305 return _MultiProcessingDataLoaderIter(self)
File c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\torch\utils\data\dataloader.py:918, in _MultiProcessingDataLoaderIter.__init__(self, loader)
911 w.daemon = True
912 # NB: Process.start() actually take some time as it needs to
913 # start a process and pass the arguments over via a pipe.
914 # Therefore, we only add a worker to self._workers list after
915 # it started, so that we do not call .join() if program dies
916 # before it starts, and __del__ tries to join but will get:
917 # AssertionError: can only join a started process.
--> 918 w.start()
919 self._index_queues.append(index_queue)
920 self._workers.append(w)
File C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\multiprocessing\process.py:121, in BaseProcess.start(self)
118 assert not _current_process._config.get('daemon'), \
119 'daemonic processes are not allowed to have children'
120 _cleanup()
--> 121 self._popen = self._Popen(self)
122 self._sentinel = self._popen.sentinel
123 # Avoid a refcycle if the target function holds an indirect
124 # reference to the process object (see bpo-30775)
File C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\multiprocessing\context.py:224, in Process._Popen(process_obj)
222 @staticmethod
223 def _Popen(process_obj):
--> 224 return _default_context.get_context().Process._Popen(process_obj)
File C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\multiprocessing\context.py:327, in SpawnProcess._Popen(process_obj)
324 @staticmethod
325 def _Popen(process_obj):
326 from .popen_spawn_win32 import Popen
--> 327 return Popen(process_obj)
File C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\multiprocessing\popen_spawn_win32.py:93, in Popen.__init__(self, process_obj)
91 try:
92 reduction.dump(prep_data, to_child)
---> 93 reduction.dump(process_obj, to_child)
94 finally:
95 set_spawning_popen(None)
File C:\Program Files\WindowsApps\PythonSoftwareFoundation.Python.3.8_3.8.2800.0_x64__qbz5n2kfra8p0\lib\multiprocessing\reduction.py:60, in dump(obj, file, protocol)
58 def dump(obj, file, protocol=None):
59 '''Replacement for pickle.dump() using ForkingPickler.'''
---> 60 ForkingPickler(file, protocol).dump(obj)
OSError: [Errno 22] Invalid argument
To Reproduce
I'm trying to retrain with custom tags on my own data like this:

```python
import poutyne

# address_parser and training_container are created earlier in the notebook.
lr_scheduler = poutyne.StepLR(step_size=1, gamma=0.1)
tag_dictionary = {'STREET_NUMBER': 0, 'STREET_NAME': 1, 'UNSTRUCTURED_STREET_ADDRESS': 2, 'CITY': 3, 'COUNTRY_SUB_ENTITY': 4, 'COUNTRY': 5, 'POSTAL_CODE': 6, 'EOS': 7}
logging_path = "checkpoints"
address_parser.retrain(training_container, 0.8, epochs=5, batch_size=2, num_workers=1, callbacks=[lr_scheduler], prediction_tags=tag_dictionary, logging_path=logging_path)
```
Desktop (please complete the following information):
- OS: Windows 10
- Using CPU for training (as the dataset is small)
- Which model are you trying to retrain (fasttext or bpemb) (with attention or not)?
- Which version of Python?
- Which version of Poutyne?
@davebulaval, Fasttext without attention. Python 3.8.10, Poutyne 1.8
That is what I thought; we have a lot of difficulty with fasttext on Windows.
Can you test with BPEmb to see if the error persists?
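For reference, switching to BPEmb only requires re-instantiating the parser. A minimal sketch, reusing the variables from the snippet above (`model_type` and `device` are the standard `AddressParser` arguments):

```python
from deepparse.parser import AddressParser

# Re-create the parser with the BPEmb embeddings model instead of fasttext,
# training on CPU as in the original setup.
address_parser = AddressParser(model_type="bpemb", device="cpu")

# The retrain call itself stays the same.
address_parser.retrain(
    training_container,
    0.8,
    epochs=5,
    batch_size=2,
    num_workers=1,
    callbacks=[lr_scheduler],
    prediction_tags=tag_dictionary,
    logging_path=logging_path,
)
```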
@davebulaval, I tried it with BPEmb and ran into this error:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
Input In [6], in <module>
7 # The path to save our checkpoints
8 logging_path = "checkpoints"
---> 10 address_parser.retrain(training_container, 0.8, epochs=5, batch_size=2, num_workers=1, callbacks=[lr_scheduler], prediction_tags=tag_dictionary, logging_path=logging_path)
File c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\deepparse\parser\address_parser.py:517, in AddressParser.retrain(self, dataset_container, train_ratio, batch_size, epochs, num_workers, learning_rate, callbacks, seed, logging_path, disable_tensorboard, prediction_tags, seq2seq_params)
511 print(
512 "You are using a older version of Poutyne that does not support properly error management."
513 " Due to that, we cannot show retrain progress. To fix that, update Poutyne to "
514 "the newest version."
515 )
516 with_capturing_context = True
--> 517 train_res = self._retrain(
518 experiment=exp,
519 train_generator=train_generator,
520 valid_generator=valid_generator,
521 epochs=epochs,
522 seed=seed,
523 callbacks=callbacks,
524 disable_tensorboard=disable_tensorboard,
525 capturing_context=with_capturing_context,
526 )
527 except RuntimeError as error:
528 list_of_file_path = os.listdir(path=".")
File c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\deepparse\parser\address_parser.py:849, in AddressParser._retrain(self, experiment, train_generator, valid_generator, epochs, seed, callbacks, disable_tensorboard, capturing_context)
834 def _retrain(
835 self,
836 experiment: Experiment,
(...)
846 # If Poutyne 1.7 and before, we capture poutyne print since it print some exception.
847 # Otherwise, we use a null context manager.
848 with Capturing() if capturing_context else contextlib.nullcontext():
--> 849 train_res = experiment.train(
850 train_generator,
851 valid_generator=valid_generator,
852 epochs=epochs,
853 seed=seed,
854 callbacks=callbacks,
855 verbose=self.verbose,
856 disable_tensorboard=disable_tensorboard,
857 )
858 return train_res
File c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\poutyne\framework\experiment.py:519, in Experiment.train(self, train_generator, valid_generator, **kwargs)
471 def train(self, train_generator, valid_generator=None, **kwargs) -> List[Dict]:
472 """
473 Trains or finetunes the model on a dataset using a generator. If a previous training already occurred
474 and lasted a total of `n_previous` epochs, then the model's weights will be set to the last checkpoint and the
(...)
517 List of dict containing the history of each epoch.
518 """
--> 519 return self._train(self.model.fit_generator, train_generator, valid_generator, **kwargs)
File c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\poutyne\framework\experiment.py:668, in Experiment._train(self, training_func, callbacks, lr_schedulers, keep_only_last_best, save_every_epoch, disable_tensorboard, seed, *args, **kwargs)
665 expt_callbacks += callbacks
667 try:
--> 668 return training_func(*args, initial_epoch=initial_epoch, callbacks=expt_callbacks, **kwargs)
669 finally:
670 if self.logging:
File c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\poutyne\framework\model.py:542, in Model.fit_generator(self, train_generator, valid_generator, epochs, steps_per_epoch, validation_steps, batches_per_step, initial_epoch, verbose, progress_options, callbacks)
540 self._fit_generator_n_batches_per_step(epoch_iterator, callback_list, batches_per_step)
541 else:
--> 542 self._fit_generator_one_batch_per_step(epoch_iterator, callback_list)
544 return epoch_iterator.epoch_logs
File c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\poutyne\framework\model.py:613, in Model._fit_generator_one_batch_per_step(self, epoch_iterator, callback_list)
611 for train_step_iterator, valid_step_iterator in epoch_iterator:
612 with self._set_training_mode(True):
--> 613 for step, (x, y) in train_step_iterator:
614 step.loss, step.metrics, _ = self._fit_batch(x, y, callback=callback_list, step=step.number)
615 step.size = self.get_batch_size(x, y)
File c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\poutyne\framework\iterators.py:73, in StepIterator.__iter__(self)
71 def __iter__(self):
72 time_since_last_batch = timeit.default_timer()
---> 73 for step, data in _get_step_iterator(self.steps_per_epoch, self.generator):
74 self.on_batch_begin(step, {})
76 step_data = Step(step)
File c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\poutyne\framework\iterators.py:18, in cycle(iterable)
16 def cycle(iterable): # Equivalent to itertools cycle, without any extra memory requirement
17 while True:
---> 18 for x in iterable:
19 yield x
File c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\torch\utils\data\dataloader.py:521, in _BaseDataLoaderIter.__next__(self)
519 if self._sampler_iter is None:
520 self._reset()
--> 521 data = self._next_data()
522 self._num_yielded += 1
523 if self._dataset_kind == _DatasetKind.Iterable and \
524 self._IterableDataset_len_called is not None and \
525 self._num_yielded > self._IterableDataset_len_called:
File c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\torch\utils\data\dataloader.py:1203, in _MultiProcessingDataLoaderIter._next_data(self)
1201 else:
1202 del self._task_info[idx]
-> 1203 return self._process_data(data)
File c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\torch\utils\data\dataloader.py:1229, in _MultiProcessingDataLoaderIter._process_data(self, data)
1227 self._try_put_index()
1228 if isinstance(data, ExceptionWrapper):
-> 1229 data.reraise()
1230 return data
File c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\torch\_utils.py:434, in ExceptionWrapper.reraise(self)
430 except TypeError:
431 # If the exception takes multiple arguments, don't try to
432 # instantiate since we don't know how to
433 raise RuntimeError(msg) from None
--> 434 raise exception
KeyError: Caught KeyError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\torch\utils\data\_utils\worker.py", line 287, in _worker_loop
data = fetcher.fetch(index)
File "c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\torch\utils\data\_utils\fetch.py", line 52, in fetch
return self.collate_fn(data)
File "c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\deepparse\converter\data_transform.py", line 49, in teacher_forcing_transform
vectorize_batch_pairs = self.vectorizer(batch_pairs)
File "c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\deepparse\vectorizer\train_vectorizer.py", line 25, in __call__
self.embedding_vectorizer([address[0] for address in addresses])
File "c:\VB\AddressParsing\fresh\freshenv\lib\site-packages\deepparse\vectorizer\train_vectorizer.py", line 25, in <listcomp>
self.embedding_vectorizer([address[0] for address in addresses])
KeyError: 0
Yes, see your other issue (#112). To me, it looks like some data points are empty. I will try to investigate the Windows and fasttext error next week.
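For anyone hitting this, a quick sanity check over the raw training data can surface such points. A sketch, assuming the data is the pickled list of (address, tags) tuples that deepparse's PickleDatasetContainer consumes; the file name is hypothetical:

```python
import pickle

# Load the raw training data: a pickled list of (address, tags) tuples.
with open("train_data.p", "rb") as f:  # hypothetical file name
    data = pickle.load(f)

for i, (address, tags) in enumerate(data):
    if not address or not address.strip():
        print(f"Empty or whitespace-only address at index {i}")
    elif len(address.split()) != len(tags):
        print(f"Token/tag mismatch at index {i}: "
              f"{len(address.split())} tokens vs {len(tags)} tags")
```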
@davebulaval, you're right, there were some missing data points. Sorry to trouble you. I'll close both issues.
@ChargedMonk no problem. I'm working right now on a fix to improve the error handling and give users more insight into the problem.
@davebulaval, I've run into a couple of issues now:
- When retraining on fasttext, the checkpoint didn't get saved: there were a log.tsv and plots, but no .ckpt file.
- When retraining on BPEmb (both on Windows and Colab), the following error occurs:
ValueError: Caught ValueError in DataLoader worker process 1.
Original Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/worker.py", line 287, in _worker_loop
data = fetcher.fetch(index)
File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
return self.collate_fn(data)
File "/usr/local/lib/python3.7/dist-packages/deepparse/converter/data_transform.py", line 49, in teacher_forcing_transform
vectorize_batch_pairs = self.vectorizer(batch_pairs)
File "/usr/local/lib/python3.7/dist-packages/deepparse/vectorizer/train_vectorizer.py", line 27, in __call__
self.embedding_vectorizer([address[0] for address in addresses])
File "/usr/local/lib/python3.7/dist-packages/deepparse/vectorizer/bpemb_vectorizer.py", line 31, in __call__
batch = [self._vectorize_sequence(address) for address in addresses]
File "/usr/local/lib/python3.7/dist-packages/deepparse/vectorizer/bpemb_vectorizer.py", line 31, in <listcomp>
batch = [self._vectorize_sequence(address) for address in addresses]
File "/usr/local/lib/python3.7/dist-packages/deepparse/vectorizer/bpemb_vectorizer.py", line 52, in _vectorize_sequence
self._max_length = max(self._max_length, max(word_decomposition_lengths))
ValueError: max() arg is an empty sequence
Update: the first issue happens when the model early-stops in the middle of the first epoch.
The first issue is normal if not even one epoch completes: a checkpoint is only saved at the end of an epoch, so early stopping in the middle of the first one leaves no .ckpt file.
For the second issue, it again looks like empty addresses. Which language are you working with?
> For the second issue, it again looks like empty addresses. Which language are you working with?
The language is English, but this time there are no empty addresses: fasttext did train on it, while BPEmb ran into this issue (on both Windows and Linux).
Will take a look. Can you share your dataset and code to ease the debugging?
I'm not allowed to share the dataset, but I'll share some examples and the code with you.
So the dataset contains international addresses like these:
```python
Address = '7TH, FLOOR LADDER C, ZHINGPING ELECTRONICS FACTORY NO 264, QUANZHOU, FJ, CHINA, 02116'
Tags = ['STREET_NUMBER', 'STREET_NAME', 'STREET_NAME', 'STREET_NAME', 'UNSTRUCTURED_STREET_ADDRESS', 'UNSTRUCTURED_STREET_ADDRESS', 'UNSTRUCTURED_STREET_ADDRESS', 'UNSTRUCTURED_STREET_ADDRESS', 'UNSTRUCTURED_STREET_ADDRESS', 'CITY', 'COUNTRY_SUB_ENTITY', 'COUNTRY', 'POSTAL_CODE']
```
https://drive.google.com/drive/folders/1g1YMAcYJgG9yQKEgqArk4iSdqlaVU31r?usp=sharing
Looking again at the part where it breaks, I am 100% sure that it is an "empty" address. This part of the code sends the address to the bpemb package to construct a byte-pair embedding; then we build a list of the decomposition lengths.
My hypothesis is that you have at least one address that is just a `" "`. In that case, the `for word in address.split():` loop yields an empty iterator, and when we take the max of `word_decomposition_lengths`, an error is raised.
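The mechanism is easy to reproduce in isolation (a sketch of the failure mode, not the actual vectorizer code):

```python
# A whitespace-only address splits into zero tokens.
address = " "

word_decomposition_lengths = []
for word in address.split():  # yields nothing for a whitespace-only string
    # In the real vectorizer this would be the number of byte-pair
    # subwords BPEmb produces for the token; len(word) stands in here.
    word_decomposition_lengths.append(len(word))

# max() on an empty list raises the error from the traceback above:
# ValueError: max() arg is an empty sequence
max_length = max(word_decomposition_lengths)
```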
Yeah, maybe it was buggy on my part, but I don't know why fasttext trained on it while BPEmb didn't. Anyway, I recreated the dataset (carefully this time) and it seems to be working. Sorry for all the trouble. Great project, btw.
> Yeah, maybe it was buggy on my part, but I don't know why fasttext trained on it while BPEmb didn't. Anyway, I recreated the dataset (carefully this time) and it seems to be working. Sorry for all the trouble. Great project, btw.
It is normal, since fasttext and BPEmb don't parse word tokens the same way. BPEmb iterates over the tokens and uses byte-pair encoding, so an address made only of whitespace (`" "`) will raise an error when we loop over the tokens. fasttext, on the other hand, works because we don't loop the same way, and an embedding token exists for the whitespace character.
In the next version, this type of error will be handled better.
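In the meantime, a simple workaround is to filter whitespace-only entries out of the dataset before retraining (same list-of-tuples assumption and hypothetical file names as in the check above):

```python
import pickle

with open("train_data.p", "rb") as f:  # hypothetical file name
    data = pickle.load(f)

# Keep only data points whose address contains at least one real token.
cleaned = [(address, tags) for address, tags in data if address and address.strip()]

with open("train_data_clean.p", "wb") as f:
    pickle.dump(cleaned, f)
```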