Crawling website fails on newer installations or scrapy updates
opensemanticsearch opened this issue · 2 comments
On newer installations (not sure if it is because of the upgrade to Debian 10, Python, or the scrapy crawler framework) crawling a website fails, because scrapy now returns headers in a different format.
It seems response.headers returned by scrapy (<class 'scrapy.http.headers.Headers'>) cannot be serialized automatically for the Celery / RabbitMQ message queue, since it now (or sometimes?) uses a different format — bytes keys and lists of bytes values:
Example of response.headers:
{b'Date': [b'Tue, 19 Nov 2019 11:01:54 GMT'], b'Server': [b'Apache/2.4.38 (Debian)'], b'Vary': [b'Accept-Encoding'], b'Content-Type': [b'text/html;charset=UTF-8']}
Error:
Traceback (most recent call last):
File "/usr/lib/python3/dist-packages/kombu/serialization.py", line 50, in _reraise_errors
yield
File "/usr/lib/python3/dist-packages/kombu/serialization.py", line 221, in dumps
payload = encoder(data)
File "/usr/lib/python3/dist-packages/kombu/utils/json.py", line 69, in dumps
**dict(default_kwargs, **kwargs))
File "/usr/lib/python3.7/json/__init__.py", line 238, in dumps
**kw).encode(obj)
File "/usr/lib/python3.7/json/encoder.py", line 199, in encode
chunks = self.iterencode(o, _one_shot=True)
File "/usr/lib/python3.7/json/encoder.py", line 257, in iterencode
return _iterencode(o, 0)
TypeError: keys must be str, int, float, bool or None, not bytes
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.7/dist-packages/scrapy/utils/defer.py", line 102, in iter_errback
yield next(it)
File "/usr/local/lib/python3.7/dist-packages/scrapy/core/spidermw.py", line 84, in evaluate_iterable
for r in iterable:
File "/usr/local/lib/python3.7/dist-packages/scrapy/spidermiddlewares/offsite.py", line 29, in process_spider_output
for x in result:
File "/usr/local/lib/python3.7/dist-packages/scrapy/core/spidermw.py", line 84, in evaluate_iterable
for r in iterable:
File "/usr/local/lib/python3.7/dist-packages/scrapy/spidermiddlewares/referer.py", line 339, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/usr/local/lib/python3.7/dist-packages/scrapy/core/spidermw.py", line 84, in evaluate_iterable
for r in iterable:
File "/usr/local/lib/python3.7/dist-packages/scrapy/spidermiddlewares/urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/local/lib/python3.7/dist-packages/scrapy/core/spidermw.py", line 84, in evaluate_iterable
for r in iterable:
File "/usr/local/lib/python3.7/dist-packages/scrapy/spidermiddlewares/depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "/usr/local/lib/python3.7/dist-packages/scrapy/spiders/crawl.py", line 106, in _parse_response
cb_res = callback(response, **cb_kwargs) or ()
File "/usr/bin/opensemanticsearch-index-web-crawl", line 40, in parse_item
'downloaded_headers': response.headers}, queue='tasks', priority=5)
File "/usr/lib/python3/dist-packages/celery/app/task.py", line 535, in apply_async
**options
File "/usr/lib/python3/dist-packages/celery/app/base.py", line 745, in send_task
amqp.send_task_message(P, name, message, **options)
File "/usr/lib/python3/dist-packages/celery/app/amqp.py", line 552, in send_task_message
**properties
File "/usr/lib/python3/dist-packages/kombu/messaging.py", line 169, in publish
compression, headers)
File "/usr/lib/python3/dist-packages/kombu/messaging.py", line 252, in _prepare
body) = dumps(body, serializer=serializer)
File "/usr/lib/python3/dist-packages/kombu/serialization.py", line 221, in dumps
payload = encoder(data)
File "/usr/lib/python3.7/contextlib.py", line 130, in __exit__
self.gen.throw(type, value, traceback)
File "/usr/lib/python3/dist-packages/kombu/serialization.py", line 54, in _reraise_errors
reraise(wrapper, wrapper(exc), sys.exc_info()[2])
File "/usr/lib/python3/dist-packages/vine/five.py", line 178, in reraise
raise value.with_traceback(tb)
File "/usr/lib/python3/dist-packages/kombu/serialization.py", line 50, in _reraise_errors
yield
File "/usr/lib/python3/dist-packages/kombu/serialization.py", line 221, in dumps
payload = encoder(data)
File "/usr/lib/python3/dist-packages/kombu/utils/json.py", line 69, in dumps
**dict(default_kwargs, **kwargs))
File "/usr/lib/python3.7/json/init.py", line 238, in dumps
**kw).encode(obj)
File "/usr/lib/python3.7/json/encoder.py", line 199, in encode
chunks = self.iterencode(o, _one_shot=True)
File "/usr/lib/python3.7/json/encoder.py", line 257, in iterencode
return _iterencode(o, 0)
kombu.exceptions.EncodeError: keys must be str, int, float, bool or None, not bytes