grangier/python-goose

Bug: Infinite crawling recursion on some pages

Opened this issue · 1 comment

from goose import Goose

url = "https://savannah.gnu.org/forum/forum.php?forum_id=8420"
g = Goose()
article = g.extract(url=url)

Output:

/usr/lib/python2.7/site-packages/bs4/__init__.py:166: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("html.parser"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.

To get rid of this warning, change this:

 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "html.parser")

  markup_type=markup_type))

This hangs indefinitely. Once I press Ctrl+C, I get the following traceback:

Traceback (most recent call last):
  File "/tmp/test-extract.py", line 6, in <module>
    article = g.extract(url=url)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 56, in extract
    return self.crawl(cc)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 69, in crawl
    return self.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/__init__.py", line 66, in crawl
    article = crawler.crawl(crawl_candiate)
  File "/usr/lib/python2.7/site-packages/goose/crawler.py", line 117, in crawl
    raw_html = self.get_html(crawl_candidate, parse_candidate)
  File "/usr/lib/python2.7/site-packages/goose/crawler.py", line 215, in get_html
    html = self.htmlfetcher.get_html(parsing_candidate.url)
  File "/usr/lib/python2.7/site-packages/goose/network.py", line 53, in get_html
    timeout=self.config.http_timeout)
  File "/usr/lib/python2.7/urllib2.py", line 154, in urlopen
    return opener.open(url, data, timeout)
  File "/usr/lib/python2.7/urllib2.py", line 431, in open
    response = self._open(req, data)
  File "/usr/lib/python2.7/urllib2.py", line 449, in _open
    '_open', req)
  File "/usr/lib/python2.7/urllib2.py", line 409, in _call_chain
    result = func(*args)
  File "/usr/lib/python2.7/urllib2.py", line 1240, in https_open
    context=self._context)
  File "/usr/lib/python2.7/urllib2.py", line 1194, in do_open
    h.request(req.get_method(), req.get_selector(), req.data, headers)
  File "/usr/lib/python2.7/httplib.py", line 1053, in request
    self._send_request(method, url, body, headers)
  File "/usr/lib/python2.7/httplib.py", line 1093, in _send_request
    self.endheaders(body)
  File "/usr/lib/python2.7/httplib.py", line 1049, in endheaders
    self._send_output(message_body)
  File "/usr/lib/python2.7/httplib.py", line 893, in _send_output
    self.send(msg)
  File "/usr/lib/python2.7/httplib.py", line 855, in send
    self.connect()
  File "/usr/lib/python2.7/httplib.py", line 1274, in connect
    server_hostname=server_hostname)
  File "/usr/lib/python2.7/ssl.py", line 352, in wrap_socket
    _context=self)
  File "/usr/lib/python2.7/ssl.py", line 579, in __init__
    self.do_handshake()
  File "/usr/lib/python2.7/ssl.py", line 808, in do_handshake
    self._sslobj.do_handshake()
KeyboardInterrupt

I expect #269 to fix this.