google/budou

budou.py returns an error when the input text is recognized as 'zh'

yaboo-oyabu opened this issue · 2 comments

Got an HttpError when I input text that is recognized as 'zh'.
Budou must handle CJK texts...

For example:
result = parser.parse(u'再会', 'wordwrap')
Traceback (most recent call last):
File "", line 1, in
File "/usr/local/lib/python2.7/dist-packages/budou/budou.py", line 113, in parse
chunks = self._get_source_chunks(input_text)
File "/usr/local/lib/python2.7/dist-packages/budou/budou.py", line 178, in _get_source_chunks
tokens = self._get_annotations(input_text)
File "/usr/local/lib/python2.7/dist-packages/budou/budou.py", line 150, in _get_annotations
response = request.execute()
File "/usr/local/lib/python2.7/dist-packages/oauth2client/util.py", line 137, in positional_wrapper
return wrapped(_args, *_kwargs)
File "/usr/local/lib/python2.7/dist-packages/googleapiclient/http.py", line 838, in execute
raise HttpError(resp, content, uri=self.uri)
googleapiclient.errors.HttpError: <HttpError 400 when requesting https://language.googleapis.com/v1beta1/documents:annotateText?alt=json returned "The language zh is not supported for syntax analysis.">

I found that this issue is caused by the Natural Language API when it recognizes a Japanese text as Chinese, and it can be avoided by setting a language parameter. I wrote some code to pass a language parameter to the API. Please give me permission to open a pull request :)

Diff:

diff --git a/budou/budou.py b/budou/budou.py
index d9d3e53..df802d6 100644
--- a/budou/budou.py
+++ b/budou/budou.py
@@ -90,13 +90,15 @@ class Budou(object):
     service = discovery.build('language', 'v1beta1', http=http)
     return cls(service)

-  def parse(self, source, classname=DEFAULT_CLASS_NAME, use_cache=True):
+  def parse(self, source, classname=DEFAULT_CLASS_NAME, use_cache=True,
+            language=''):
     """Parses input HTML code into word chunks and organized code.

     Args:
       source: HTML code to be processed (unicode).
       classname: A class name of each word chunk in the HTML code (string).
       user_cache: Whether to use cache (boolean).
+      language: A language used to parse text (string).

     Returns:
       A dictionary with the list of word chunks and organized HTML code.
@@ -110,7 +112,7 @@ class Budou(object):
     source = self._preprocess(source)
     dom = html.fragment_fromstring(source, create_parent='body')
     input_text = dom.text_content()
-    chunks = self._get_source_chunks(input_text)
+    chunks = self._get_source_chunks(input_text, language)
     chunks = self._concatenate_punctuations(chunks)
     chunks = self._concatenate_by_label(chunks, True)
     chunks = self._concatenate_by_label(chunks, False)
@@ -132,7 +134,7 @@ class Budou(object):
     return hashlib.md5(key_source.encode('utf8')).hexdigest()


-  def _get_annotations(self, text, encoding='UTF32'):
+  def _get_annotations(self, text, language='', encoding='UTF32'):
     """Returns the list of annotations from the given text."""
     body = {
         'document': {
@@ -145,6 +147,9 @@ class Budou(object):
         'encodingType': encoding,
     }

+    if language:
+        body['document']['language'] = language
+
     request = self.service.documents().annotateText(body=body)
     response = request.execute()
     return response.get('tokens', [])
@@ -163,18 +168,19 @@ class Budou(object):
     source = re.sub(r'\s\s+', u' ', source)
     return source

-  def _get_source_chunks(self, input_text):
+  def _get_source_chunks(self, input_text, language=''):
     """Returns the words chunks.

     Args:
       input_text: An input text to annotate (unicode).
+      language: A language used to parse text (string).

     Returns:
       A list of word chunk objects (list).
     """
     chunks = []
     sentence_length = 0
-    tokens = self._get_annotations(input_text)
+    tokens = self._get_annotations(input_text, language)
     for token in tokens:
       word = token['text']['content']
       begin_offset = token['text']['beginOffset']

Result:

>>> import budou
>>> parser = budou.authenticate('xxxxx.json')
>>> parser.parse(u'再会')
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
  File "build/bdist.macosx-10.6-x86_64/egg/budou/budou.py", line 114, in parse
  File "build/bdist.macosx-10.6-x86_64/egg/budou/budou.py", line 183, in _get_source_chunks
  File "build/bdist.macosx-10.6-x86_64/egg/budou/budou.py", line 154, in _get_annotations
  File "/Users/yaboo/resources/anaconda/lib/python2.7/site-packages/oauth2client/util.py", line 137, in positional_wrapper
    return wrapped(*args, **kwargs)
  File "/Users/yaboo/resources/anaconda/lib/python2.7/site-packages/googleapiclient/http.py", line 838, in execute
    raise HttpError(resp, content, uri=self.uri)
googleapiclient.errors.HttpError: <HttpError 400 when requesting https://language.googleapis.com/v1beta1/documents:annotateText?alt=json returned "The language zh is not supported for syntax analysis.">
>>> parser.parse(u'再会', language='ja')
{'chunks': [Chunk(word=u'\u518d\u4f1a', pos=u'NOUN', label=u'ROOT', forward=False)], 'html_code': u'<span class="ww">\u518d\u4f1a</span>'}

Thank you for contribution. I merged your change to specify language parameter.