Ch6 padding issue
JacobsJohan commented
I'm following the video series on O'Reilly and am getting an error when running the notebook from chapter six.
Code cell:
# Get initial metrics
trainer.evaluate()
Error:
***** Running Evaluation *****
Num examples = 2617
Batch size = 32
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
File ~/miniconda3/envs/hugging/lib/python3.9/site-packages/transformers/tokenization_utils_base.py:705, in BatchEncoding.convert_to_tensors(self, tensor_type, prepend_batch_axis)
704 if not is_tensor(value):
--> 705 tensor = as_tensor(value)
707 # Removing this for now in favor of controlling the shape with `prepend_batch_axis`
708 # # at-least2d
709 # if tensor.ndim > 2:
710 # tensor = tensor.squeeze(0)
711 # elif tensor.ndim < 2:
712 # tensor = tensor[None, :]
ValueError: too many dimensions 'str'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
Cell In[85], line 2
1 # Get initial metrics
----> 2 trainer.evaluate()
File ~/miniconda3/envs/hugging/lib/python3.9/site-packages/transformers/trainer.py:2151, in Trainer.evaluate(self, eval_dataset, ignore_keys, metric_key_prefix)
2148 start_time = time.time()
2150 eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop
-> 2151 output = eval_loop(
2152 eval_dataloader,
2153 description="Evaluation",
2154 # No point gathering the predictions if there are no metrics, otherwise we defer to
2155 # self.args.prediction_loss_only
2156 prediction_loss_only=True if self.compute_metrics is None else None,
2157 ignore_keys=ignore_keys,
2158 metric_key_prefix=metric_key_prefix,
2159 )
2161 total_batch_size = self.args.eval_batch_size * self.args.world_size
2162 output.metrics.update(
2163 speed_metrics(
2164 metric_key_prefix,
(...)
2168 )
2169 )
File ~/miniconda3/envs/hugging/lib/python3.9/site-packages/transformers/trainer.py:2313, in Trainer.evaluation_loop(self, dataloader, description, prediction_loss_only, ignore_keys, metric_key_prefix)
2311 observed_num_examples = 0
2312 # Main evaluation loop
-> 2313 for step, inputs in enumerate(dataloader):
2314 # Update the observed num examples
2315 observed_batch_size = find_batch_size(inputs)
2316 if observed_batch_size is not None:
File ~/miniconda3/envs/hugging/lib/python3.9/site-packages/torch/utils/data/dataloader.py:681, in _BaseDataLoaderIter.__next__(self)
678 if self._sampler_iter is None:
679 # TODO(https://github.com/pytorch/pytorch/issues/76750)
680 self._reset() # type: ignore[call-arg]
--> 681 data = self._next_data()
682 self._num_yielded += 1
683 if self._dataset_kind == _DatasetKind.Iterable and \
684 self._IterableDataset_len_called is not None and \
685 self._num_yielded > self._IterableDataset_len_called:
File ~/miniconda3/envs/hugging/lib/python3.9/site-packages/torch/utils/data/dataloader.py:721, in _SingleProcessDataLoaderIter._next_data(self)
719 def _next_data(self):
720 index = self._next_index() # may raise StopIteration
--> 721 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
722 if self._pin_memory:
723 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
File ~/miniconda3/envs/hugging/lib/python3.9/site-packages/torch/utils/data/_utils/fetch.py:52, in _MapDatasetFetcher.fetch(self, possibly_batched_index)
50 else:
51 data = self.dataset[possibly_batched_index]
---> 52 return self.collate_fn(data)
File ~/miniconda3/envs/hugging/lib/python3.9/site-packages/transformers/data/data_collator.py:246, in DataCollatorWithPadding.__call__(self, features)
245 def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]:
--> 246 batch = self.tokenizer.pad(
247 features,
248 padding=self.padding,
249 max_length=self.max_length,
250 pad_to_multiple_of=self.pad_to_multiple_of,
251 return_tensors=self.return_tensors,
252 )
253 if "label" in batch:
254 batch["labels"] = batch["label"]
File ~/miniconda3/envs/hugging/lib/python3.9/site-packages/transformers/tokenization_utils_base.py:2804, in PreTrainedTokenizerBase.pad(self, encoded_inputs, padding, max_length, pad_to_multiple_of, return_attention_mask, return_tensors, verbose)
2801 batch_outputs[key] = []
2802 batch_outputs[key].append(value)
-> 2804 return BatchEncoding(batch_outputs, tensor_type=return_tensors)
File ~/miniconda3/envs/hugging/lib/python3.9/site-packages/transformers/tokenization_utils_base.py:210, in BatchEncoding.__init__(self, data, encoding, tensor_type, prepend_batch_axis, n_sequences)
206 n_sequences = encoding[0].n_sequences
208 self._n_sequences = n_sequences
--> 210 self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)
File ~/miniconda3/envs/hugging/lib/python3.9/site-packages/transformers/tokenization_utils_base.py:721, in BatchEncoding.convert_to_tensors(self, tensor_type, prepend_batch_axis)
716 if key == "overflowing_tokens":
717 raise ValueError(
718 "Unable to create tensor returning overflowing tokens of different lengths. "
719 "Please see if a fast version of this tokenizer is available to have this feature available."
720 )
--> 721 raise ValueError(
722 "Unable to create tensor, you should probably activate truncation and/or padding "
723 "with 'padding=True' 'truncation=True' to have batched tensors with the same length."
724 )
726 return self
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length.
I've tried adding padding in the preprocessing function, but that didn't help either:
def preprocess_function(examples):
    return tokenizer(examples["utterance"], truncation=True, padding=True)
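In case it helps, I can reproduce what looks like the same error by calling the data collator directly on a couple of tokenized rows. This uses toy data and a placeholder checkpoint, not the chapter's setup:
from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")  # placeholder checkpoint
collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Toy stand-in for the snips data: a raw text column plus a label.
ds = Dataset.from_dict({
    "utterance": ["play a song", "what's the weather in boston"],
    "label": [0, 1],
})
ds = ds.map(lambda ex: tokenizer(ex["utterance"], truncation=True), batched=True)

# This is roughly what the eval dataloader does for each batch, and it raises
# the same "Unable to create tensor ..." ValueError for me:
collator([ds[0], ds[1]])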
Any clue how to solve this?
sinanuozdemir commented
Hmm,
- I'm able to run this code as-is from the notebook on GitHub. Have you modified things in any way?
- Try running
seq_clf_tokenized_snips = seq_clf_tokenized_snips.remove_columns(['utterance'])
before training to remove the one column that is a string. This may help; a quick sketch on toy data follows below.
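To illustrate on the same kind of toy setup (placeholder checkpoint and toy data, not the chapter's): tokenizer.pad() tries to turn every column it receives into a tensor, and a list of raw strings can't become one, which is exactly the ValueError in your traceback. Dropping the string column before batching avoids it:
from datasets import Dataset
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")  # placeholder checkpoint
collator = DataCollatorWithPadding(tokenizer=tokenizer)

ds = Dataset.from_dict({
    "utterance": ["play a song", "what's the weather in boston"],
    "label": [0, 1],
})
ds = ds.map(lambda ex: tokenizer(ex["utterance"], truncation=True), batched=True)

# Drop the raw string column so only tensorizable features remain.
ds = ds.remove_columns(["utterance"])

batch = collator([ds[0], ds[1]])  # pads input_ids / attention_mask to a common length
print({k: tuple(v.shape) for k, v in batch.items()})
# e.g. {'input_ids': (2, 9), 'attention_mask': (2, 9), 'labels': (2,)}
For what it's worth, the Trainer normally drops columns the model's forward() doesn't accept on its own (remove_unused_columns defaults to True when the dataset is a datasets.Dataset), which is probably why the notebook runs cleanly for me.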
If it's still happening, please let me know!
JacobsJohan commented
Huh, I didn't change anything, but I ran my notebook again from the start and the problem disappeared. I remember playing around with it a bit when I initially tested this, so perhaps running the cells in a different order caused the problem.
Regardless, thanks for your response. I will close this issue.