Using MatchZoo to process Chinese text raises TypeError: expected string or bytes-like object, and I cannot locate where the problem is
Apollo2Mars opened this issue · 2 comments
Processing text_left with chain_transform of Tokenize => Lowercase => PuncRemoval: 3%|▎ | 62441/2345889 [00:06<03:44, 10189.40it/s]
TypeError Traceback (most recent call last)
<ipython-input> in <module>
14 preprocessor = mz.preprocessors.BasicPreprocessor()
15 # preprocessor = mz.preprocessors.DSSMPreprocessor()
---> 16 preprocessor.fit(train_raw, verbose=5) ## init preprocessor inner state.
17 train_processed = preprocessor.transform(train_raw, verbose=5)
18 test_processed = preprocessor.transform(test_raw, verbose=5)
~/anaconda3/envs/tf_base/lib/python3.6/site-packages/matchzoo/preprocessors/basic_preprocessor.py in fit(self, data_pack, verbose)
93 """
94 data_pack = data_pack.apply_on_text(chain_transform(self._units),
---> 95 verbose=verbose)
96 fitted_filter_unit = build_unit_from_data_pack(self._filter_unit,
97 data_pack,
~/anaconda3/envs/tf_base/lib/python3.6/site-packages/matchzoo/data_pack/data_pack.py in wrapper(self, inplace, *args, **kwargs)
247 target = self.copy()
248
--> 249 func(target, *args, **kwargs)
250
251 if not inplace:
~/anaconda3/envs/tf_base/lib/python3.6/site-packages/matchzoo/data_pack/data_pack.py in apply_on_text(self, func, mode, rename, verbose)
379 """
380 if mode == 'both':
--> 381 self._apply_on_text_both(func, rename, verbose=verbose)
382 elif mode == 'left':
383 self._apply_on_text_left(func, rename, verbose=verbose)
~/anaconda3/envs/tf_base/lib/python3.6/site-packages/matchzoo/data_pack/data_pack.py in _apply_on_text_both(self, func, rename, verbose)
406 def _apply_on_text_both(self, func, rename, verbose=1):
407 left_name, right_name = rename or ('text_left', 'text_right')
--> 408 self._apply_on_text_left(func, rename=left_name, verbose=verbose)
409 self._apply_on_text_right(func, rename=right_name, verbose=verbose)
410
~/anaconda3/envs/tf_base/lib/python3.6/site-packages/matchzoo/data_pack/data_pack.py in _apply_on_text_left(self, func, rename, verbose)
400 if verbose:
401 tqdm.pandas(desc="Processing " + name + " with " + func.name)
--> 402 self._left[name] = self._left['text_left'].progress_apply(func)
403 else:
404 self._left[name] = self._left['text_left'].apply(func)
~/anaconda3/envs/tf_base/lib/python3.6/site-packages/tqdm/std.py in inner(df, func, *args, **kwargs)
751 # Apply the provided function (in **kwargs)
752 # on the df using our wrapper (which provides bar updating)
--> 753 result = getattr(df, df_function)(wrapper, **kwargs)
754
755 # Close bar and return pandas calculation result
~/anaconda3/envs/tf_base/lib/python3.6/site-packages/pandas/core/series.py in apply(self, func, convert_dtype, args, **kwds)
4043 else:
4044 values = self.astype(object).values
-> 4045 mapped = lib.map_infer(values, f, convert=convert_dtype)
4046
4047 if len(mapped) and isinstance(mapped[0], Series):
pandas/_libs/lib.pyx in pandas._libs.lib.map_infer()
~/anaconda3/envs/tf_base/lib/python3.6/site-packages/tqdm/std.py in wrapper(*args, **kwargs)
747 # take a fast or slow code path; so stop when t.total==t.n
748 t.update(n=1 if not t.total or t.n < t.total else 0)
--> 749 return func(*args, **kwargs)
750
751 # Apply the provided function (in **kwargs)
~/anaconda3/envs/tf_base/lib/python3.6/site-packages/matchzoo/preprocessors/chain_transform.py in wrapper(arg)
17 """Wrapper function of transformations composition."""
18 for unit in units:
---> 19 arg = unit.transform(arg)
20 return arg
21
~/anaconda3/envs/tf_base/lib/python3.6/site-packages/matchzoo/preprocessors/units/tokenize.py in transform(self, input_)
15 :return tokens: tokenized tokens as a list.
16 """
---> 17 return nltk.word_tokenize(input_)
~/anaconda3/envs/tf_base/lib/python3.6/site-packages/nltk/tokenize/__init__.py in word_tokenize(text, language, preserve_line)
142 :type preserve_line: bool
143 """
--> 144 sentences = [text] if preserve_line else sent_tokenize(text, language)
145 return [
146 token for sent in sentences for token in _treebank_word_tokenizer.tokenize(sent)
~/anaconda3/envs/tf_base/lib/python3.6/site-packages/nltk/tokenize/__init__.py in sent_tokenize(text, language)
104 """
105 tokenizer = load('tokenizers/punkt/{0}.pickle'.format(language))
--> 106 return tokenizer.tokenize(text)
107
108
~/anaconda3/envs/tf_base/lib/python3.6/site-packages/nltk/tokenize/punkt.py in tokenize(self, text, realign_boundaries)
1275 Given a text, returns a list of the sentences in that text.
1276 """
-> 1277 return list(self.sentences_from_text(text, realign_boundaries))
1278
1279 def debug_decisions(self, text):
~/anaconda3/envs/tf_base/lib/python3.6/site-packages/nltk/tokenize/punkt.py in sentences_from_text(self, text, realign_boundaries)
1329 follows the period.
1330 """
-> 1331 return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
1332
1333 def _slices_from_text(self, text):
~/anaconda3/envs/tf_base/lib/python3.6/site-packages/nltk/tokenize/punkt.py in <listcomp>(.0)
1329 follows the period.
1330 """
-> 1331 return [text[s:e] for s, e in self.span_tokenize(text, realign_boundaries)]
1332
1333 def _slices_from_text(self, text):
~/anaconda3/envs/tf_base/lib/python3.6/site-packages/nltk/tokenize/punkt.py in span_tokenize(self, text, realign_boundaries)
1319 if realign_boundaries:
1320 slices = self._realign_boundaries(text, slices)
-> 1321 for sl in slices:
1322 yield (sl.start, sl.stop)
1323
~/anaconda3/envs/tf_base/lib/python3.6/site-packages/nltk/tokenize/punkt.py in _realign_boundaries(self, text, slices)
1360 """
1361 realign = 0
-> 1362 for sl1, sl2 in _pair_iter(slices):
1363 sl1 = slice(sl1.start + realign, sl1.stop)
1364 if not sl2:
~/anaconda3/envs/tf_base/lib/python3.6/site-packages/nltk/tokenize/punkt.py in _pair_iter(it)
316 it = iter(it)
317 try:
--> 318 prev = next(it)
319 except StopIteration:
320 return
~/anaconda3/envs/tf_base/lib/python3.6/site-packages/nltk/tokenize/punkt.py in _slices_from_text(self, text)
1333 def _slices_from_text(self, text):
1334 last_break = 0
-> 1335 for match in self._lang_vars.period_context_re().finditer(text):
1336 context = match.group() + match.group('after_tok')
1337 if self.text_contains_sentbreak(context):
TypeError: expected string or bytes-like object
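To locate which rows trigger the error, a quick type check on the raw data can help. This is a minimal sketch, assuming the DataPack exposes its left and right frames as train_raw.left / train_raw.right (as the traceback's self._left suggests); any cell that is not a plain string (a NaN float from an empty line, or a token list from jieba) will show up here.

# Sketch: find rows whose text fields are not plain strings.
# Assumes train_raw is a MatchZoo DataPack with .left / .right DataFrames.
bad_left = train_raw.left[~train_raw.left['text_left'].apply(lambda x: isinstance(x, str))]
bad_right = train_raw.right[~train_raw.right['text_right'].apply(lambda x: isinstance(x, str))]
print(bad_left)
print(bad_right)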
Apparently, it is not a bug. The text_left and text_right fields should be strings. If you have segmented them using jieba, then the text fields became lists.
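If the lists do come from jieba, one workaround is to join the tokens back into a whitespace-separated string before building the DataPack, so the preprocessor still receives plain strings. A minimal sketch, assuming the raw data is still a pandas DataFrame (here called df, a hypothetical name) with the text_left / text_right columns MatchZoo expects:

def to_plain_string(cell):
    # If jieba already turned the cell into a list of tokens, join them back;
    # otherwise keep the original value as a string.
    if isinstance(cell, list):
        return ' '.join(cell)
    return str(cell)

df['text_left'] = df['text_left'].apply(to_plain_string)
df['text_right'] = df['text_right'].apply(to_plain_string)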
It turned out that the data I processed later contained empty lines; after removing the empty lines it works fine.
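For completeness, a sketch of dropping such empty rows before packing, assuming the raw data is a pandas DataFrame named df and that mz.pack is used to build the DataPack from it (hedged: column names and the packing step follow the usual MatchZoo 2.x workflow):

import matchzoo as mz

# Drop rows whose text fields are missing or blank, then rebuild the DataPack.
df = df.dropna(subset=['text_left', 'text_right'])
df = df[df['text_left'].str.strip().astype(bool) & df['text_right'].str.strip().astype(bool)]
train_raw = mz.pack(df)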