SentencepieceTokenizer not loading models
datapythonista opened this issue · 2 comments
I'm trying to use the SentencepieceTokenizer
, but I'm getting an error. I downloaded the model from huggingface: https://huggingface.co/xlm-roberta-base/resolve/main/sentencepiece.bpe.model
If I try the original sentencepiece implementation it works as expected:
>>> import sentencepiece
>>> VOCAB_FILE = 'sentencepiece.bpe.model'
>>> TEXT = 'my tailor is rich'
>>> tokenizer = sentencepiece.SentencePieceProcessor()
>>> tokenizer.Load(VOCAB_FILE)
>>> tokenizer.EncodeAsPieces(TEXT)
['▁my', '▁ta', 'ilor', '▁is', '▁rich']
But when I try to do the same with tensorflow_text
, I get the following error:
>>> from tensorflow_text import SentencepieceTokenizer
>>> tokenizer = SentencepieceTokenizer(model=VOCAB_FILE)
---------------------------------------------------------------------------
InternalError Traceback (most recent call last)
<ipython-input-15-05f4d1591f8d> in <module>
1 from tensorflow_text import SentencepieceTokenizer
2
----> 3 tokenizer = SentencepieceTokenizer(model=VOCAB_FILE)
4 tokenizer.tokenize(TEXT)
~/miniconda3/envs/pydata/lib/python3.8/site-packages/tensorflow_text/python/ops/sentencepiece_tokenizer.py in __init__(self, model, out_type, nbest_size, alpha, reverse, add_bos, add_eos, name)
98 self.add_bos = add_bos
99 self.add_eos = add_eos
--> 100 self._model_resource = _SentencepieceModelResource(model, name)
101
102 def tokenize(self, input, name=None): # pylint: disable=redefined-builtin
~/miniconda3/envs/pydata/lib/python3.8/site-packages/tensorflow_text/python/ops/sentencepiece_tokenizer.py in __init__(self, model, name)
47 self._model = model
48 self._name = name
---> 49 _ = self.resource_handle # Accessing this property creates the resource.
50
51 def _create_resource(self):
~/miniconda3/envs/pydata/lib/python3.8/site-packages/tensorflow/python/training/tracking/tracking.py in resource_handle(self)
242 if self._resource_handle is None:
243 with ops.device(self._resource_device):
--> 244 self._resource_handle = self._create_resource()
245 return self._resource_handle
246
~/miniconda3/envs/pydata/lib/python3.8/site-packages/tensorflow_text/python/ops/sentencepiece_tokenizer.py in _create_resource(self)
52 model, name = self._model, self._name
53 with ops.name_scope(name, "SentenceTokenizerInitializer", [model]):
---> 54 return gen_sentencepiece_tokenizer.sentencepiece_op(model=model)
55
56
<string> in sentencepiece_op(model, container, shared_name, use_node_name_sharing, name)
~/miniconda3/envs/pydata/lib/python3.8/site-packages/tensorflow/python/framework/ops.py in raise_from_not_ok_status(e, name)
6841 message = e.message + (" name: " + name if name is not None else "")
6842 # pylint: disable=protected-access
-> 6843 six.raise_from(core._status_to_exception(e.code, message), None)
6844 # pylint: enable=protected-access
6845
~/miniconda3/envs/pydata/lib/python3.8/site-packages/six.py in raise_from(value, from_value)
InternalError: external/com_google_sentencepiece/src/sentencepiece_processor.cc(73) [model_proto->ParseFromArray(serialized.data(), serialized.size())] [Op:SentencepieceOp]
Using the vocabulary from the test_data directory, I get the same error:
>>> from tensorflow_text import SentencepieceTokenizer
>>> tokenizer = SentencepieceTokenizer(model='test_oss_model.model')
---------------------------------------------------------------------------
InternalError Traceback (most recent call last)
<ipython-input-2-caf4e378ab07> in <module>
1 from tensorflow_text import SentencepieceTokenizer
2
----> 3 tokenizer = SentencepieceTokenizer(model='test_oss_model.model')
4 tokenizer.tokenize(TEXT)
~/miniconda3/envs/pydata/lib/python3.8/site-packages/tensorflow_text/python/ops/sentencepiece_tokenizer.py in __init__(self, model, out_type, nbest_size, alpha, reverse, add_bos, add_eos, name)
98 self.add_bos = add_bos
99 self.add_eos = add_eos
--> 100 self._model_resource = _SentencepieceModelResource(model, name)
101
102 def tokenize(self, input, name=None): # pylint: disable=redefined-builtin
~/miniconda3/envs/pydata/lib/python3.8/site-packages/tensorflow_text/python/ops/sentencepiece_tokenizer.py in __init__(self, model, name)
47 self._model = model
48 self._name = name
---> 49 _ = self.resource_handle # Accessing this property creates the resource.
50
51 def _create_resource(self):
~/miniconda3/envs/pydata/lib/python3.8/site-packages/tensorflow/python/training/tracking/tracking.py in resource_handle(self)
242 if self._resource_handle is None:
243 with ops.device(self._resource_device):
--> 244 self._resource_handle = self._create_resource()
245 return self._resource_handle
246
~/miniconda3/envs/pydata/lib/python3.8/site-packages/tensorflow_text/python/ops/sentencepiece_tokenizer.py in _create_resource(self)
52 model, name = self._model, self._name
53 with ops.name_scope(name, "SentenceTokenizerInitializer", [model]):
---> 54 return gen_sentencepiece_tokenizer.sentencepiece_op(model=model)
55
56
<string> in sentencepiece_op(model, container, shared_name, use_node_name_sharing, name)
~/miniconda3/envs/pydata/lib/python3.8/site-packages/tensorflow/python/framework/ops.py in raise_from_not_ok_status(e, name)
6841 message = e.message + (" name: " + name if name is not None else "")
6842 # pylint: disable=protected-access
-> 6843 six.raise_from(core._status_to_exception(e.code, message), None)
6844 # pylint: enable=protected-access
6845
~/miniconda3/envs/pydata/lib/python3.8/site-packages/six.py in raise_from(value, from_value)
InternalError: external/com_google_sentencepiece/src/sentencepiece_processor.cc(73) [model_proto->ParseFromArray(serialized.data(), serialized.size())] [Op:SentencepieceOp]
The versions I'm using are:
$ python --version
Python 3.8.6
$ pip freeze | grep tensorflow
tensorflow==2.3.1
tensorflow-estimator==2.3.0
tensorflow-text==2.3.0
The model
arg is the sentencepiece model as a serialized string [1], not a filepath to the model. You can use tf.io.read_file
[2] or tf.io.gfile.GFile
[3] to read the model into a serialized string.
[1] https://github.com/tensorflow/text/blob/master/docs/api_docs/python/text/SentencepieceTokenizer.md#args
[2] https://www.tensorflow.org/api_docs/python/tf/io/read_file
[3] https://www.tensorflow.org/api_docs/python/tf/io/gfile/GFile
Thanks for the information @thuang513, this is very useful. I think the documentation could probably be improved to make this clearer, but this fixes my problem, thanks!