Bug: Infinite crawling recursion on some pages
Opened this issue · 1 comment
simonwjackson commented
from goose import Goose
url = "https://savannah.gnu.org/forum/forum.php?forum_id=8420"
g = Goose()
article = g.extract(url=url)
Output:
/usr/lib/python2.7/site-packages/bs4/__init__.py:166: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("html.parser"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.
To get rid of this warning, change this:
BeautifulSoup([your markup])
to this:
BeautifulSoup([your markup], "html.parser")
markup_type=markup_type))
The call hangs indefinitely. When I press Ctrl+C, I get the following traceback:
Traceback (most recent call last):
File "/tmp/test-extract.py", line 6, in <module>
article = g.extract(url=url)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 56, in extract
return self.crawl(cc)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
return self.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 66, in crawl
article = crawler.crawl(crawl_candiate)
File "/usr/lib/python2.7/site-packages/goose/crawler.py", line 117, in crawl
raw_html = self.get_html(crawl_candidate, parse_candidate)
File "/usr/lib/python2.7/site-packages/goose/crawler.py", line 215, in get_html
html = self.htmlfetcher.get_html(parsing_candidate.url)
File "/usr/lib/python2.7/site-packages/goose/network.py", line 53, in get_html
timeout=self.config.http_timeout)
File "/usr/lib/python2.7/urllib2.py", line 154, in urlopen
return opener.open(url, data, timeout)
File "/usr/lib/python2.7/urllib2.py", line 431, in open
response = self._open(req, data)
File "/usr/lib/python2.7/urllib2.py", line 449, in _open
'_open', req)
File "/usr/lib/python2.7/urllib2.py", line 409, in _call_chain
result = func(*args)
File "/usr/lib/python2.7/urllib2.py", line 1240, in https_open
context=self._context)
File "/usr/lib/python2.7/urllib2.py", line 1194, in do_open
h.request(req.get_method(), req.get_selector(), req.data, headers)
File "/usr/lib/python2.7/httplib.py", line 1053, in request
self._send_request(method, url, body, headers)
File "/usr/lib/python2.7/httplib.py", line 1093, in _send_request
self.endheaders(body)
File "/usr/lib/python2.7/httplib.py", line 1049, in endheaders
self._send_output(message_body)
File "/usr/lib/python2.7/httplib.py", line 893, in _send_output
self.send(msg)
File "/usr/lib/python2.7/httplib.py", line 855, in send
self.connect()
File "/usr/lib/python2.7/httplib.py", line 1274, in connect
server_hostname=server_hostname)
File "/usr/lib/python2.7/ssl.py", line 352, in wrap_socket
_context=self)
File "/usr/lib/python2.7/ssl.py", line 579, in __init__
self.do_handshake()
File "/usr/lib/python2.7/ssl.py", line 808, in do_handshake
self._sslobj.do_handshake()
KeyboardInterrupt