ChunkedEncodingError while scraping subreddit submissions
aryamansharma01 opened this issue · 10 comments
Hi Matt, I'm trying to scrape subreddit posts over a six-month period, with the limit set to None. After irregular intervals, however, the connection apparently gets broken. The code snippet and the error are below. I've tried restarting it multiple times, but the same issue comes up. Is there any way to ensure that all data is scraped in one go? Thanks
Code:
from pmaw import PushshiftAPI
import datetime as dt

api = PushshiftAPI()

before = int(dt.datetime(2020, 9, 1, 0, 0).timestamp())
after = int(dt.datetime(2020, 3, 1, 0, 0).timestamp())

subreddit = "teenagers"
posts = api.search_submissions(subreddit=subreddit, limit=None, before=before, after=after, mem_safe=True)
print(f'Retrieved {len(posts)} posts from Pushshift')
Error:
ValueError Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/urllib3/response.py in _update_chunk_length(self)
696 try:
--> 697 self.chunk_left = int(line, 16)
698 except ValueError:
ValueError: invalid literal for int() with base 16: b''
During handling of the above exception, another exception occurred:
InvalidChunkLength Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/urllib3/response.py in _error_catcher(self)
437 try:
--> 438 yield
439
/opt/conda/lib/python3.7/site-packages/urllib3/response.py in read_chunked(self, amt, decode_content)
763 while True:
--> 764 self._update_chunk_length()
765 if self.chunk_left == 0:
/opt/conda/lib/python3.7/site-packages/urllib3/response.py in _update_chunk_length(self)
700 self.close()
--> 701 raise InvalidChunkLength(self, line)
702
InvalidChunkLength: InvalidChunkLength(got length b'', 0 bytes read)
During handling of the above exception, another exception occurred:
ProtocolError Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/requests/models.py in generate()
757 try:
--> 758 for chunk in self.raw.stream(chunk_size, decode_content=True):
759 yield chunk
/opt/conda/lib/python3.7/site-packages/urllib3/response.py in stream(self, amt, decode_content)
571 if self.chunked and self.supports_chunked_reads():
--> 572 for line in self.read_chunked(amt, decode_content=decode_content):
573 yield line
/opt/conda/lib/python3.7/site-packages/urllib3/response.py in read_chunked(self, amt, decode_content)
792 if self._original_response:
--> 793 self._original_response.close()
794
/opt/conda/lib/python3.7/contextlib.py in __exit__(self, type, value, traceback)
129 try:
--> 130 self.gen.throw(type, value, traceback)
131 except StopIteration as exc:
/opt/conda/lib/python3.7/site-packages/urllib3/response.py in _error_catcher(self)
454 # This includes IncompleteRead.
--> 455 raise ProtocolError("Connection broken: %r" % e, e)
456
ProtocolError: ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))
During handling of the above exception, another exception occurred:
ChunkedEncodingError Traceback (most recent call last)
/tmp/ipykernel_19/3224342533.py in <module>
1 subreddit="teenagers"
----> 2 posts = api.search_submissions(subreddit=subreddit, limit=None, before=before, after=after, mem_safe=True, safe_exit=True)
3 print(f'Retrieved {len(posts)} posts from Pushshift')
/opt/conda/lib/python3.7/site-packages/pmaw/PushshiftAPI.py in search_submissions(self, **kwargs)
72 Response generator object
73 """
---> 74 return self._search(kind='submission', **kwargs)
/opt/conda/lib/python3.7/site-packages/pmaw/PushshiftAPIBase.py in _search(self, kind, max_ids_per_request, max_results_per_request, mem_safe, search_window, dataset, safe_exit, cache_dir, filter_fn, **kwargs)
266
267 if self.req.limit > 0 and len(self.req.req_list) > 0:
--> 268 self._multithread()
269
270 self.req.save_cache()
/opt/conda/lib/python3.7/site-packages/pmaw/PushshiftAPIBase.py in _multithread(self, check_total)
100 self._get, url_pay[0], url_pay[1]): url_pay for url_pay in reqs}
101
--> 102 self._futures_handler(futures, check_total)
103
104 # reset attempts if no failures
/opt/conda/lib/python3.7/site-packages/pmaw/PushshiftAPIBase.py in _futures_handler(self, futures, check_total)
131 self.num_req += int(not check_total)
132 try:
--> 133 data = future.result()
134 self.num_suc += int(not check_total)
135 url = url_pay[0]
/opt/conda/lib/python3.7/concurrent/futures/_base.py in result(self, timeout)
426 raise CancelledError()
427 elif self._state == FINISHED:
--> 428 return self.__get_result()
429
430 self._condition.wait(timeout)
/opt/conda/lib/python3.7/concurrent/futures/_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
/opt/conda/lib/python3.7/concurrent/futures/thread.py in run(self)
55
56 try:
---> 57 result = self.fn(*self.args, **self.kwargs)
58 except BaseException as exc:
59 self.future.set_exception(exc)
/opt/conda/lib/python3.7/site-packages/pmaw/PushshiftAPIBase.py in _get(self, url, payload)
50 def _get(self, url, payload={}):
51 self._impose_rate_limit()
---> 52 r = requests.get(url, params=payload)
53 status = r.status_code
54 reason = r.reason
/opt/conda/lib/python3.7/site-packages/requests/api.py in get(url, params, **kwargs)
73 """
74
---> 75 return request('get', url, params=params, **kwargs)
76
77
/opt/conda/lib/python3.7/site-packages/requests/api.py in request(method, url, **kwargs)
59 # cases, and look like a memory leak in others.
60 with sessions.Session() as session:
--> 61 return session.request(method=method, url=url, **kwargs)
62
63
/opt/conda/lib/python3.7/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
540 }
541 send_kwargs.update(settings)
--> 542 resp = self.send(prep, **send_kwargs)
543
544 return resp
/opt/conda/lib/python3.7/site-packages/requests/sessions.py in send(self, request, **kwargs)
695
696 if not stream:
--> 697 r.content
698
699 return r
/opt/conda/lib/python3.7/site-packages/requests/models.py in content(self)
834 self._content = None
835 else:
--> 836 self._content = b''.join(self.iter_content(CONTENT_CHUNK_SIZE)) or b''
837
838 self._content_consumed = True
/opt/conda/lib/python3.7/site-packages/requests/models.py in generate()
759 yield chunk
760 except ProtocolError as e:
--> 761 raise ChunkedEncodingError(e)
762 except DecodeError as e:
763 raise ContentDecodingError(e)
ChunkedEncodingError: ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))
@aryamansharma01 I haven't seen this error before; I'll take a look.
Is there any way to ensure that all data is scraped in one go?
Usually the query should be able to run to completion, but at first glance it looks like this error might be due to a network connection issue between the client and the host.
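If the drop is transient, one client-side workaround (a sketch, not a pmaw feature) is to catch the ChunkedEncodingError and retry the whole call; since mem_safe=True and safe_exit=True cache responses, a rerun with identical parameters should be able to reuse most of the cached work rather than refetching everything. The retry count and backoff below are arbitrary choices:
import time
import datetime as dt

from pmaw import PushshiftAPI
from requests.exceptions import ChunkedEncodingError

api = PushshiftAPI()
before = int(dt.datetime(2020, 9, 1).timestamp())
after = int(dt.datetime(2020, 3, 1).timestamp())

posts = None
for attempt in range(5):
    try:
        posts = api.search_submissions(
            subreddit="teenagers", limit=None,
            before=before, after=after,
            mem_safe=True, safe_exit=True,
        )
        break
    except ChunkedEncodingError as e:
        # Transient connection failure; back off and retry. Cached
        # responses should keep the rerun from refetching everything.
        print(f"Connection broken on attempt {attempt + 1}: {e}")
        time.sleep(30)

if posts is not None:
    print(f"Retrieved {len(posts)} posts from Pushshift")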
I have this issue too. I'm trying to scrape the subreddit "antiwork" for 2021. It works fine until October and then it throws this error. I think it may happen with large amounts of data. Does anyone have a solution?
I run into this issue as well, typically with larger amounts of data. Is there any chance the limit parameter overflows at some size?
Same issue for me. I downloaded several months of WallStreetBets data without any problem, but for the past few days the issue keeps recurring. Did you find a solution yet?
No. I think it's related to issues with the Pushshift server...
Does anyone have a solution? Can we have the problematic ones skipped? Currently mine just stuck halfway for a scarping.
@ReichYang When I was looking into this before, it appeared to be a connection issue between the client and server.
It would help to see any debug logs from when this error occurs, to try to pinpoint what is causing it.
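For anyone wanting to capture those logs: pmaw logs through Python's standard logging module, so its debug output can usually be surfaced like this (targeting a parent "pmaw" logger assumes pmaw names its loggers under that package, per the logging.getLogger(__name__) convention):
import logging

# Send all log output, including DEBUG, to the console
logging.basicConfig(level=logging.DEBUG)

# Or narrow it to pmaw's own loggers (assumes they're named under "pmaw")
logging.getLogger("pmaw").setLevel(logging.DEBUG)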
Thanks for the speedy response. I think mine is also a connection error, but I've tried several times and it just keeps throwing this error.
ChunkedEncodingError: ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))
I'm wondering if we could have a parameter that ensures the scrape won't stop because of this kind of issue, for example by skipping the requests that consistently hit connection errors and moving on to the next.
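As far as this thread shows, pmaw doesn't expose such a skip parameter, but a rough client-side approximation is to split the time range into smaller windows, retry each window a few times, and skip any window that keeps failing. A sketch (the subreddit, window size, and retry count here are illustrative):
import datetime as dt

from pmaw import PushshiftAPI
from requests.exceptions import ChunkedEncodingError

api = PushshiftAPI()
after = int(dt.datetime(2021, 1, 1).timestamp())
before = int(dt.datetime(2022, 1, 1).timestamp())
window = 7 * 24 * 3600  # one-week windows; arbitrary choice

all_posts, skipped = [], []
start = after
while start < before:
    end = min(start + window, before)
    for attempt in range(3):
        try:
            posts = api.search_submissions(
                subreddit="antiwork", limit=None,
                after=start, before=end, mem_safe=True,
            )
            all_posts.extend(posts)
            break
        except ChunkedEncodingError:
            pass  # transient connection failure; retry this window
    else:
        # All retries failed; record the window and move on
        skipped.append((start, end))
    start = end

print(f"Retrieved {len(all_posts)} posts; skipped {len(skipped)} windows")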
@mattpodolak Hey, I just encountered another one. Here is the error log.
ChunkedEncodingError: ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
~\AppData\Roaming\Python\Python37\site-packages\urllib3\response.py in _update_chunk_length(self)
696 try:
--> 697 self.chunk_left = int(line, 16)
698 except ValueError:
ValueError: invalid literal for int() with base 16: b''
During handling of the above exception, another exception occurred:
InvalidChunkLength Traceback (most recent call last)
~\AppData\Roaming\Python\Python37\site-packages\urllib3\response.py in _error_catcher(self)
437 try:
--> 438 yield
439
~\AppData\Roaming\Python\Python37\site-packages\urllib3\response.py in read_chunked(self, amt, decode_content)
763 while True:
--> 764 self._update_chunk_length()
765 if self.chunk_left == 0:
~\AppData\Roaming\Python\Python37\site-packages\urllib3\response.py in _update_chunk_length(self)
700 self.close()
--> 701 raise InvalidChunkLength(self, line)
702
InvalidChunkLength: InvalidChunkLength(got length b'', 0 bytes read)
During handling of the above exception, another exception occurred:
ProtocolError Traceback (most recent call last)
~\AppData\Roaming\Python\Python37\site-packages\requests\models.py in generate()
757 try:
--> 758 for chunk in self.raw.stream(chunk_size, decode_content=True):
759 yield chunk
~\AppData\Roaming\Python\Python37\site-packages\urllib3\response.py in stream(self, amt, decode_content)
571 if self.chunked and self.supports_chunked_reads():
--> 572 for line in self.read_chunked(amt, decode_content=decode_content):
573 yield line
~\AppData\Roaming\Python\Python37\site-packages\urllib3\response.py in read_chunked(self, amt, decode_content)
792 if self._original_response:
--> 793 self._original_response.close()
794
D:\Python37\lib\contextlib.py in __exit__(self, type, value, traceback)
129 try:
--> 130 self.gen.throw(type, value, traceback)
131 except StopIteration as exc:
~\AppData\Roaming\Python\Python37\site-packages\urllib3\response.py in _error_catcher(self)
454 # This includes IncompleteRead.
--> 455 raise ProtocolError("Connection broken: %r" % e, e)
456
ProtocolError: ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))
During handling of the above exception, another exception occurred:
ChunkedEncodingError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_72724/806806213.py in <module>
1 for sub in subs:
----> 2 posts = api.search_submissions(subreddit=sub, mem_safe=True, after=1559365200, before=1637207011, safe_exit=True)
3 print(f'{len(posts)} posts retrieved from Pushshift')
4 post_list = [post for post in posts]
5 pd.DataFrame(post_list).to_pickle(f"{sub}_COVID_submissions.pkl")
~\AppData\Roaming\Python\Python37\site-packages\pmaw\PushshiftAPI.py in search_submissions(self, **kwargs)
72 Response generator object
73 """
---> 74 return self._search(kind='submission', **kwargs)
~\AppData\Roaming\Python\Python37\site-packages\pmaw\PushshiftAPIBase.py in _search(self, kind, max_ids_per_request, max_results_per_request, mem_safe, search_window, dataset, safe_exit, cache_dir, filter_fn, **kwargs)
266
267 if self.req.limit > 0 and len(self.req.req_list) > 0:
--> 268 self._multithread()
269
270 self.req.save_cache()
~\AppData\Roaming\Python\Python37\site-packages\pmaw\PushshiftAPIBase.py in _multithread(self, check_total)
100 self._get, url_pay[0], url_pay[1]): url_pay for url_pay in reqs}
101
--> 102 self._futures_handler(futures, check_total)
103
104 # reset attempts if no failures
~\AppData\Roaming\Python\Python37\site-packages\pmaw\PushshiftAPIBase.py in _futures_handler(self, futures, check_total)
131 self.num_req += int(not check_total)
132 try:
--> 133 data = future.result()
134 self.num_suc += int(not check_total)
135 url = url_pay[0]
D:\Python37\lib\concurrent\futures\_base.py in result(self, timeout)
426 raise CancelledError()
427 elif self._state == FINISHED:
--> 428 return self.__get_result()
429
430 self._condition.wait(timeout)
D:\Python37\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
D:\Python37\lib\concurrent\futures\thread.py in run(self)
55
56 try:
---> 57 result = self.fn(*self.args, **self.kwargs)
58 except BaseException as exc:
59 self.future.set_exception(exc)
~\AppData\Roaming\Python\Python37\site-packages\pmaw\PushshiftAPIBase.py in _get(self, url, payload)
50 def _get(self, url, payload={}):
51 self._impose_rate_limit()
---> 52 r = requests.get(url, params=payload)
53 status = r.status_code
54 reason = r.reason
~\AppData\Roaming\Python\Python37\site-packages\requests\api.py in get(url, params, **kwargs)
73 """
74
---> 75 return request('get', url, params=params, **kwargs)
76
77
~\AppData\Roaming\Python\Python37\site-packages\requests\api.py in request(method, url, **kwargs)
59 # cases, and look like a memory leak in others.
60 with sessions.Session() as session:
---> 61 return session.request(method=method, url=url, **kwargs)
62
63
~\AppData\Roaming\Python\Python37\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
540 }
541 send_kwargs.update(settings)
--> 542 resp = self.send(prep, **send_kwargs)
543
544 return resp
~\AppData\Roaming\Python\Python37\site-packages\requests\sessions.py in send(self, request, **kwargs)
695
696 if not stream:
--> 697 r.content
698
699 return r
~\AppData\Roaming\Python\Python37\site-packages\requests\models.py in content(self)
834 self._content = None
835 else:
--> 836 self._content = b''.join(self.iter_content(CONTENT_CHUNK_SIZE)) or b''
837
838 self._content_consumed = True
~\AppData\Roaming\Python\Python37\site-packages\requests\models.py in generate()
759 yield chunk
760 except ProtocolError as e:
--> 761 raise ChunkedEncodingError(e)
762 except DecodeError as e:
763 raise ContentDecodingError(e)
ChunkedEncodingError: ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))
Hi guys, original poster here. I played around with the libraries a bit and found that when I installed different versions of urllib3 and requests, the error no longer appeared. It might also help to scrape smaller amounts of data at a time.
!pip install urllib3==1.23
!pip install requests==2.19.1
Hope this helps!
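If the downgrade doesn't seem to take effect, it's worth restarting the kernel and confirming which versions the interpreter actually loaded; a quick check:
# Confirm the pinned versions are the ones the current interpreter loaded
import requests
import urllib3

print(urllib3.__version__)   # expect 1.23
print(requests.__version__)  # expect 2.19.1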