bookieio/breadability

exceptions with bad parsing

mitechie opened this issue · 3 comments

Traceback (most recent call last):
File "/usr/lib/python2.7/threading.py", line 551, in __bootstrap_inner
self.run()
File "/usr/lib/python2.7/threading.py", line 504, in run
self.__target(*self.__args, **self.__kwargs)
File "scripts/readability/existing.py", line 65, in fetch_content
read = ReadUrl.parse(url)
File "/home/rharding/src/bookie/bookie/lib/readable.py", line 171, in parse
if not document.readable:
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/readable.py", line 426, in readable
return tounicode(self._readable)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/readable.py", line 431, in _readable
if self.candidates:
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/readable.py", line 419, in candidates
doc = self.doc
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/readable.py", line 409, in doc
doc = self.orig.html
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/document.py", line 93, in html
return self._parse(self.orig_html)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/document.py", line 80, in _parse
doc = build_doc(html)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/document.py", line 54, in build_doc
page_unicode = page.decode(enc, 'replace')
TypeError: decode() argument 1 must be string, not None

File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/readable.py", line 431, in _readable
if self.candidates:
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/readable.py", line 419, in candidates
doc = self.doc
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/readable.py", line 409, in doc
doc = self.orig.html
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/document.py", line 93, in html
return self._parse(self.orig_html)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/document.py", line 80, in _parse
doc = build_doc(html)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/breadability/document.py", line 57, in build_doc
parser=utf8_parser)
File "/home/rharding/src/bookie/local/lib/python2.7/site-packages/lxml/html/__init__.py", line 532, in document_fromstring
value = etree.fromstring(html, parser, **kw)
File "lxml.etree.pyx", line 2743, in lxml.etree.fromstring (src/lxml/lxml.etree.c:52665)
File "parser.pxi", line 1573, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:79932)
File "parser.pxi", line 1452, in lxml.etree._parseDoc (src/lxml/lxml.etree.c:78774)
File "parser.pxi", line 960, in lxml.etree._BaseParser._parseDoc (src/lxml/lxml.etree.c:75389)
File "parser.pxi", line 564, in lxml.etree._ParserContext._handleParseResultDoc (src/lxml/lxml.etree.c:71739)
File "parser.pxi", line 645, in lxml.etree._handleParseResult (src/lxml/lxml.etree.c:72614)
File "parser.pxi", line 594, in lxml.etree._raiseParseError (src/lxml/lxml.etree.c:72087)
XMLSyntaxError: line 563: Tag figure invalid

[D 120826 11:32:46 existing:67] Q0 getting content for 4c3edf3a8229cd http://www.dafont.com/

Exception in thread Thread-1:
Traceback (most recent call last):
File "/usr/lib/python2.6/threading.py", line 532, in __bootstrap_inner
self.run()
File "/usr/lib/python2.6/threading.py", line 484, in run
self.__target(*self.__args, **self.__kwargs)
File "scripts/readability/existing.py", line 68, in fetch_content
read = ReadUrl.parse(url)
File "/home/bmark.us/0.5/bookie/lib/readable.py", line 176, in parse
if not document.readable:
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/readable.py", line 426, in readable
return tounicode(self._readable)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/readable.py", line 431, in _readable
if self.candidates:
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/readable.py", line 419, in candidates
doc = self.doc
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/readable.py", line 409, in doc
doc = self.orig.html
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/document.py", line 95, in html
return self._parse(self.orig_html)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/document.py", line 82, in _parse
doc = build_doc(html)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/document.py", line 59, in build_doc
parser=utf8_parser)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/lxml/html/__init__.py", line 534, in document_fromstring
value = etree.fromstring(html, parser, **kw)
File "lxml.etree.pyx", line 2743, in lxml.etree.fromstring (src/lxml/lxml.etree.c:52665)
File "parser.pxi", line 1573, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:79932)
File "parser.pxi", line 1452, in lxml.etree._parseDoc (src/lxml/lxml.etree.c:78774)
File "parser.pxi", line 960, in lxml.etree._BaseParser._parseDoc (src/lxml/lxml.etree.c:75389)
File "parser.pxi", line 564, in lxml.etree._ParserContext._handleParseResultDoc (src/lxml/lxml.etree.c:71739)
File "parser.pxi", line 645, in lxml.etree._handleParseResult (src/lxml/lxml.etree.c:72614)
File "parser.pxi", line 596, in lxml.etree._raiseParseError (src/lxml/lxml.etree.c:72123)
XMLSyntaxError: None

[D 120827 11:13:41 existing:67] Q0 getting content for 4c3edf3a8229cd http://www.dafont.com/
[D 120827 11:13:41 existing:67] Q1 getting content for 9feafedb1e468b http://www.redbullmusicacademy.com/
[D 120827 11:13:41 existing:67] Q2 getting content for 1ca8c1b6cb8e08 http://cameratoss.blogspot.com/
Exception in thread Thread-1:
Traceback (most recent call last):
File "/usr/lib/python2.6/threading.py", line 532, in __bootstrap_inner
self.run()
File "/usr/lib/python2.6/threading.py", line 484, in run
self.__target(*self.__args, **self.__kwargs)
File "scripts/readability/existing.py", line 68, in fetch_content
read = ReadUrl.parse(url)
File "/home/bmark.us/0.5/bookie/lib/readable.py", line 176, in parse
if not document.readable:
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/readable.py", line 426, in readable
return tounicode(self._readable)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/readable.py", line 431, in _readable
if self.candidates:
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/readable.py", line 419, in candidates
doc = self.doc
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/readable.py", line 409, in doc
doc = self.orig.html
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/utils.py", line 55, in __get__
value = self.fget(inst)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/document.py", line 95, in html
return self._parse(self.orig_html)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/document.py", line 82, in _parse
doc = build_doc(html)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/breadability/document.py", line 59, in build_doc
parser=utf8_parser)
File "/home/bmark.us/0.5/lib/python2.6/site-packages/lxml/html/__init__.py", line 534, in document_fromstring
value = etree.fromstring(html, parser, **kw)
File "lxml.etree.pyx", line 2743, in lxml.etree.fromstring (src/lxml/lxml.etree.c:52665)
File "parser.pxi", line 1573, in lxml.etree._parseMemoryDocument (src/lxml/lxml.etree.c:79932)
File "parser.pxi", line 1452, in lxml.etree._parseDoc (src/lxml/lxml.etree.c:78774)
File "parser.pxi", line 960, in lxml.etree._BaseParser._parseDoc (src/lxml/lxml.etree.c:75389)
File "parser.pxi", line 564, in lxml.etree._ParserContext._handleParseResultDoc (src/lxml/lxml.etree.c:71739)
File "parser.pxi", line 645, in lxml.etree._handleParseResult (src/lxml/lxml.etree.c:72614)
File "parser.pxi", line 594, in lxml.etree._raiseParseError (src/lxml/lxml.etree.c:72087)
XMLSyntaxError: line 1887: htmlParseEntityRef: expecting ';'