ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
youssefabdelm commented
Hi! Thanks for making this amazing package.
When I do:
import torch
from lm_scorer.models.auto import AutoLMScorer as LMScorer
# Load model to cpu or cuda
device = "cuda:0" if torch.cuda.is_available() else "cpu"
batch_size = 100
scorer = LMScorer.from_pretrained("distilgpt2", device=device, batch_size=batch_size)
And then:
scorer.sentence_score(["Sasgdkjlasdjglakjsdg", "Sentence 2"], log=True)
(or any batch of sentences with different lengths), I get this error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py in convert_to_tensors(self, tensor_type, prepend_batch_axis)
714 if not is_tensor(value):
--> 715 tensor = as_tensor(value)
716
ValueError: expected sequence of length 237 at dim 1 (got 232)
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
<ipython-input-15-08e953660864> in <module>
71 j+=1
72
---> 73 passages_relevance_scores = scorer.sentence_score(passages_relevance_text, reduce='hmean')
74 passages_trans_scores = scorer.sentence_score(passages_trans_text, reduce='hmean')
75 for i, passage in enumerate(passage_texts):
/usr/local/lib/python3.7/dist-packages/lm_scorer/models/abc/base.py in sentence_score(self, text, log, reduce)
31 return scores
32
---> 33 outputs = self._tokens_log_prob(sentences)
34 for output in outputs:
35 log_probs = output[0]
/usr/local/lib/python3.7/dist-packages/lm_scorer/models/abc/batch.py in _tokens_log_prob(self, text)
26 for i in range(0, len(text), self.batch_size):
27 batch = text[i : i + self.batch_size]
---> 28 outputs.extend(self._tokens_log_prob_for_batch(batch))
29 return outputs
30
/usr/local/lib/python3.7/dist-packages/lm_scorer/models/gpt2.py in _tokens_log_prob_for_batch(self, text)
45 text = list(map(self._add_special_tokens, text))
46 encoding: BatchEncoding = self.tokenizer.batch_encode_plus(
---> 47 text, return_tensors="pt",
48 )
49 with torch.no_grad():
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py in batch_encode_plus(self, batch_text_or_text_pairs, add_special_tokens, padding, truncation, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose, **kwargs)
2780 return_length=return_length,
2781 verbose=verbose,
-> 2782 **kwargs,
2783 )
2784
/usr/local/lib/python3.7/dist-packages/transformers/models/gpt2/tokenization_gpt2_fast.py in _batch_encode_plus(self, *args, **kwargs)
164 )
165
--> 166 return super()._batch_encode_plus(*args, **kwargs)
167
168 def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_fast.py in _batch_encode_plus(self, batch_text_or_text_pairs, add_special_tokens, padding_strategy, truncation_strategy, max_length, stride, is_split_into_words, pad_to_multiple_of, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, return_length, verbose)
475 for input_ids in sanitized_tokens["input_ids"]:
476 self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
--> 477 return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)
478
479 def _encode_plus(
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py in __init__(self, data, encoding, tensor_type, prepend_batch_axis, n_sequences)
208 self._n_sequences = n_sequences
209
--> 210 self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis)
211
212 @property
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_base.py in convert_to_tensors(self, tensor_type, prepend_batch_axis)
730 )
731 raise ValueError(
--> 732 "Unable to create tensor, you should probably activate truncation and/or padding with"
733 " 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your"
734 f" features (`{key}` in this case) have excessive nesting (inputs type `list` where type `int` is"
ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).
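From the traceback, the failure seems to come from lm_scorer calling tokenizer.batch_encode_plus(text, return_tensors="pt") without padding, so sentences that tokenize to different numbers of tokens can't be stacked into one tensor. Here's a minimal sketch at the tokenizer level that mirrors the failing call and shows that padding=True avoids it (assigning pad_token = eos_token is my own assumption, since GPT-2 ships without a pad token):
import torch
from transformers import GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("distilgpt2")
# GPT-2 has no pad token by default; reusing EOS for padding is a common workaround.
tokenizer.pad_token = tokenizer.eos_token

texts = ["Sasgdkjlasdjglakjsdg", "Sentence 2"]

# This mirrors what gpt2.py does and raises the same ValueError,
# because the two sentences tokenize to different lengths:
#   tokenizer.batch_encode_plus(texts, return_tensors="pt")

# With padding enabled, every row is padded to the longest sequence
# in the batch and tensor creation succeeds:
encoding = tokenizer.batch_encode_plus(texts, padding=True, return_tensors="pt")
print(encoding["input_ids"].shape)  # torch.Size([2, max_len_in_batch])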
Would love any help whatsoever!
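In the meantime, the workaround I've been using is to drop down to batch_size=1, so each batch holds a single sentence and no padding is ever needed. It's much slower, but it runs:
import torch
from lm_scorer.models.auto import AutoLMScorer as LMScorer

device = "cuda:0" if torch.cuda.is_available() else "cpu"
# With batch_size=1 every tokenizer call sees exactly one sentence,
# so the length mismatch never occurs.
scorer = LMScorer.from_pretrained("distilgpt2", device=device, batch_size=1)
scores = scorer.sentence_score(["Sasgdkjlasdjglakjsdg", "Sentence 2"], log=True)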