budou.py returns an error when the input text is recognized as 'zh'
yaboo-oyabu opened this issue · 2 comments
I got an HttpError when I input text that is recognized as 'zh'.
Budou must handle CJK texts...
For example:
result = parser.parse(u'再会', 'wordwrap')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "/usr/local/lib/python2.7/dist-packages/budou/budou.py", line 113, in parse
chunks = self._get_source_chunks(input_text)
File "/usr/local/lib/python2.7/dist-packages/budou/budou.py", line 178, in _get_source_chunks
tokens = self._get_annotations(input_text)
File "/usr/local/lib/python2.7/dist-packages/budou/budou.py", line 150, in _get_annotations
response = request.execute()
File "/usr/local/lib/python2.7/dist-packages/oauth2client/util.py", line 137, in positional_wrapper
return wrapped(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/googleapiclient/http.py", line 838, in execute
raise HttpError(resp, content, uri=self.uri)
googleapiclient.errors.HttpError: <HttpError 400 when requesting https://language.googleapis.com/v1beta1/documents:annotateText?alt=json returned "The language zh is not supported for syntax analysis.">
I found that this issue occurs when the Natural Language API recognizes a Japanese text as Chinese, and that it can be avoided by setting a language parameter. I wrote some code to pass a language parameter to the API. Please give me permission to open a pull request :)
Diff:
diff --git a/budou/budou.py b/budou/budou.py
index d9d3e53..df802d6 100644
--- a/budou/budou.py
+++ b/budou/budou.py
@@ -90,13 +90,15 @@ class Budou(object):
service = discovery.build('language', 'v1beta1', http=http)
return cls(service)
- def parse(self, source, classname=DEFAULT_CLASS_NAME, use_cache=True):
+ def parse(self, source, classname=DEFAULT_CLASS_NAME, use_cache=True,
+ language=''):
"""Parses input HTML code into word chunks and organized code.
Args:
source: HTML code to be processed (unicode).
classname: A class name of each word chunk in the HTML code (string).
user_cache: Whether to use cache (boolean).
+ language: A language used to parse text (string).
Returns:
A dictionary with the list of word chunks and organized HTML code.
@@ -110,7 +112,7 @@ class Budou(object):
source = self._preprocess(source)
dom = html.fragment_fromstring(source, create_parent='body')
input_text = dom.text_content()
- chunks = self._get_source_chunks(input_text)
+ chunks = self._get_source_chunks(input_text, language)
chunks = self._concatenate_punctuations(chunks)
chunks = self._concatenate_by_label(chunks, True)
chunks = self._concatenate_by_label(chunks, False)
@@ -132,7 +134,7 @@ class Budou(object):
return hashlib.md5(key_source.encode('utf8')).hexdigest()
- def _get_annotations(self, text, encoding='UTF32'):
+ def _get_annotations(self, text, language='', encoding='UTF32'):
"""Returns the list of annotations from the given text."""
body = {
'document': {
@@ -145,6 +147,9 @@ class Budou(object):
'encodingType': encoding,
}
+ if language:
+ body['document']['language'] = language
+
request = self.service.documents().annotateText(body=body)
response = request.execute()
return response.get('tokens', [])
@@ -163,18 +168,19 @@ class Budou(object):
source = re.sub(r'\s\s+', u' ', source)
return source
- def _get_source_chunks(self, input_text):
+ def _get_source_chunks(self, input_text, language=''):
"""Returns the words chunks.
Args:
input_text: An input text to annotate (unicode).
+ language: A language used to parse text (string).
Returns:
A list of word chunk objects (list).
"""
chunks = []
sentence_length = 0
- tokens = self._get_annotations(input_text)
+ tokens = self._get_annotations(input_text, language)
for token in tokens:
word = token['text']['content']
begin_offset = token['text']['beginOffset']
Result:
>>> import budou
>>> parser = budou.authenticate('xxxxx.json')
>>> parser.parse(u'再会')
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "build/bdist.macosx-10.6-x86_64/egg/budou/budou.py", line 114, in parse
File "build/bdist.macosx-10.6-x86_64/egg/budou/budou.py", line 183, in _get_source_chunks
File "build/bdist.macosx-10.6-x86_64/egg/budou/budou.py", line 154, in _get_annotations
File "/Users/yaboo/resources/anaconda/lib/python2.7/site-packages/oauth2client/util.py", line 137, in positional_wrapper
return wrapped(*args, **kwargs)
File "/Users/yaboo/resources/anaconda/lib/python2.7/site-packages/googleapiclient/http.py", line 838, in execute
raise HttpError(resp, content, uri=self.uri)
googleapiclient.errors.HttpError: <HttpError 400 when requesting https://language.googleapis.com/v1beta1/documents:annotateText?alt=json returned "The language zh is not supported for syntax analysis.">
>>> parser.parse(u'再会', language='ja')
{'chunks': [Chunk(word=u'\u518d\u4f1a', pos=u'NOUN', label=u'ROOT', forward=False)], 'html_code': u'<span class="ww">\u518d\u4f1a</span>'}
Thank you for your contribution. I merged your change to specify the language parameter.