Fault for Ch7 normalize_corpus

Question

Fault for Ch7 normalize_corpus

Closed this issue 7 years ago · 1 comments

xiaoguo1995 commented 7 years ago

When I try to normalize a small sample of reviews, e.g. test_reviews = reviews[35000:35005], it raises the error shown below:

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

How can I fix it? Thank you!

`---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
in ()
2 # normalize dataset
3 #norm_test_reviews = tn.normalize_corpus(test_reviews)
----> 4 normalize_corpus([test_reviews],text_lemmatization=False, stopword_removal=False,text_lower_case=False)

D:\Jupyter_Notebook\Deep_Learning\Sentimen Analysis\L1\text_normalizer.py in normalize_corpus(corpus, html_stripping, contraction_expansion, accented_char_removal, text_lower_case, text_lemmatization, special_char_removal, stopword_removal)
92
93 if html_stripping:
---> 94 doc = strip_html_tags(doc)
95
96 if accented_char_removal:

D:\Jupyter_Notebook\Deep_Learning\Sentimen Analysis\L1\text_normalizer.py in strip_html_tags(text)
26 # # Cleaning Text - strip HTML
27 def strip_html_tags(text):
---> 28 soup = BeautifulSoup(text, "html.parser")
29 stripped_text = soup.get_text()
30 return stripped_text

~\Anaconda3\envs\deeplearning\lib\site-packages\bs4\__init__.py in __init__(self, markup, features, builder, parse_only, from_encoding, exclude_encodings, **kwargs)
223 self.contains_replacement_characters) in (
224 self.builder.prepare_markup(
--> 225 markup, from_encoding, exclude_encodings=exclude_encodings)):
226 self.reset()
227 try:

~\Anaconda3\envs\deeplearning\lib\site-packages\bs4\builder\_htmlparser.py in prepare_markup(self, markup, user_specified_encoding, document_declared_encoding, exclude_encodings)
203 try_encodings = [user_specified_encoding, document_declared_encoding]
204 dammit = UnicodeDammit(markup, try_encodings, is_html=True,
--> 205 exclude_encodings=exclude_encodings)
206 yield (dammit.markup, dammit.original_encoding,
207 dammit.declared_html_encoding,

~\Anaconda3\envs\deeplearning\lib\site-packages\bs4\dammit.py in __init__(self, markup, override_encodings, smart_quotes_to, is_html, exclude_encodings)
350 self.log = logging.getLogger(__name__)
351 self.detector = EncodingDetector(
--> 352 markup, override_encodings, is_html, exclude_encodings)
353
354 # Short-circuit if the data is in Unicode to begin with.

~\Anaconda3\envs\deeplearning\lib\site-packages\bs4\dammit.py in __init__(self, markup, override_encodings, is_html, exclude_encodings)
226
227 # First order of business: strip a byte-order mark.
--> 228 self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
229
230 def _usable(self, encoding, tried):

~\Anaconda3\envs\deeplearning\lib\site-packages\bs4\dammit.py in strip_byte_order_mark(cls, data)
278 # Unicode data cannot have a byte-order mark.
279 return data, encoding
--> 280 if (len(data) >= 4) and (data[:2] == b'\xfe\xff')
281 and (data[2:4] != '\x00\x00'):
282 encoding = 'utf-16be'

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
`

Answer 1 · 2018-02-22T23:11:04.000Z

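Removing the extra square brackets around test_reviews fixes it. test_reviews is already a collection of documents, so wrapping it in another list makes normalize_corpus treat the whole array as a single document and pass it to BeautifulSoup, which expects a string. Pass the corpus directly, like this: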

norm_test_reviews = tn.normalize_corpus(test_reviews,text_lemmatization=False, stopword_removal=False,text_lower_case=False)