mattpodolak/pmaw

ChunkedEncodingError while scraping subreddit submissions

aryamansharma01 opened this issue · 10 comments

Hi Matt, I'm trying to scrape subreddit posts over a six-month period, with the limit set to None. After irregular intervals, however, the connection apparently gets broken. The code snippet and the error are below. I tried restarting it multiple times, but the same issue comes up. Is there any way to ensure that all data is scraped in one go? Thanks

Code :

from pmaw import PushshiftAPI
api = PushshiftAPI()

import datetime as dt
before = int(dt.datetime(2020,9,1,0,0).timestamp())
after = int(dt.datetime(2020,3,1,0,0).timestamp())

subreddit="teenagers"
posts = api.search_submissions(subreddit=subreddit, limit=None, before=before, after=after,mem_safe=True)
print(f'Retrieved {len(posts)} posts from Pushshift')

Error :


ValueError Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/urllib3/response.py in _update_chunk_length(self)
696 try:
--> 697 self.chunk_left = int(line, 16)
698 except ValueError:

ValueError: invalid literal for int() with base 16: b''

During handling of the above exception, another exception occurred:

InvalidChunkLength Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/urllib3/response.py in _error_catcher(self)
437 try:
--> 438 yield
439

/opt/conda/lib/python3.7/site-packages/urllib3/response.py in read_chunked(self, amt, decode_content)
763 while True:
--> 764 self._update_chunk_length()
765 if self.chunk_left == 0:

/opt/conda/lib/python3.7/site-packages/urllib3/response.py in _update_chunk_length(self)
700 self.close()
--> 701 raise InvalidChunkLength(self, line)
702

InvalidChunkLength: InvalidChunkLength(got length b'', 0 bytes read)

During handling of the above exception, another exception occurred:

ProtocolError Traceback (most recent call last)
/opt/conda/lib/python3.7/site-packages/requests/models.py in generate()
757 try:
--> 758 for chunk in self.raw.stream(chunk_size, decode_content=True):
759 yield chunk

/opt/conda/lib/python3.7/site-packages/urllib3/response.py in stream(self, amt, decode_content)
571 if self.chunked and self.supports_chunked_reads():
--> 572 for line in self.read_chunked(amt, decode_content=decode_content):
573 yield line

/opt/conda/lib/python3.7/site-packages/urllib3/response.py in read_chunked(self, amt, decode_content)
792 if self._original_response:
--> 793 self._original_response.close()
794

/opt/conda/lib/python3.7/contextlib.py in __exit__(self, type, value, traceback)
129 try:
--> 130 self.gen.throw(type, value, traceback)
131 except StopIteration as exc:

/opt/conda/lib/python3.7/site-packages/urllib3/response.py in _error_catcher(self)
454 # This includes IncompleteRead.
--> 455 raise ProtocolError("Connection broken: %r" % e, e)
456

ProtocolError: ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))

During handling of the above exception, another exception occurred:

ChunkedEncodingError Traceback (most recent call last)
/tmp/ipykernel_19/3224342533.py in <module>
1 subreddit="teenagers"
----> 2 posts = api.search_submissions(subreddit=subreddit, limit=None, before=before, after=after,mem_safe=True,safe_exit=True)
3 print(f'Retrieved {len(posts)} posts from Pushshift')

/opt/conda/lib/python3.7/site-packages/pmaw/PushshiftAPI.py in search_submissions(self, **kwargs)
72 Response generator object
73 """
---> 74 return self._search(kind='submission', **kwargs)

/opt/conda/lib/python3.7/site-packages/pmaw/PushshiftAPIBase.py in _search(self, kind, max_ids_per_request, max_results_per_request, mem_safe, search_window, dataset, safe_exit, cache_dir, filter_fn, **kwargs)
266
267 if self.req.limit > 0 and len(self.req.req_list) > 0:
--> 268 self._multithread()
269
270 self.req.save_cache()

/opt/conda/lib/python3.7/site-packages/pmaw/PushshiftAPIBase.py in _multithread(self, check_total)
100 self._get, url_pay[0], url_pay[1]): url_pay for url_pay in reqs}
101
--> 102 self._futures_handler(futures, check_total)
103
104 # reset attempts if no failures

/opt/conda/lib/python3.7/site-packages/pmaw/PushshiftAPIBase.py in _futures_handler(self, futures, check_total)
131 self.num_req += int(not check_total)
132 try:
--> 133 data = future.result()
134 self.num_suc += int(not check_total)
135 url = url_pay[0]

/opt/conda/lib/python3.7/concurrent/futures/_base.py in result(self, timeout)
426 raise CancelledError()
427 elif self._state == FINISHED:
--> 428 return self.__get_result()
429
430 self._condition.wait(timeout)

/opt/conda/lib/python3.7/concurrent/futures/_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result

/opt/conda/lib/python3.7/concurrent/futures/thread.py in run(self)
55
56 try:
---> 57 result = self.fn(*self.args, **self.kwargs)
58 except BaseException as exc:
59 self.future.set_exception(exc)

/opt/conda/lib/python3.7/site-packages/pmaw/PushshiftAPIBase.py in _get(self, url, payload)
50 def _get(self, url, payload={}):
51 self._impose_rate_limit()
---> 52 r = requests.get(url, params=payload)
53 status = r.status_code
54 reason = r.reason

/opt/conda/lib/python3.7/site-packages/requests/api.py in get(url, params, **kwargs)
73 """
74
---> 75 return request('get', url, params=params, **kwargs)
76
77

/opt/conda/lib/python3.7/site-packages/requests/api.py in request(method, url, **kwargs)
59 # cases, and look like a memory leak in others.
60 with sessions.Session() as session:
---> 61 return session.request(method=method, url=url, **kwargs)
62
63

/opt/conda/lib/python3.7/site-packages/requests/sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
540 }
541 send_kwargs.update(settings)
--> 542 resp = self.send(prep, **send_kwargs)
543
544 return resp

/opt/conda/lib/python3.7/site-packages/requests/sessions.py in send(self, request, **kwargs)
695
696 if not stream:
--> 697 r.content
698
699 return r

/opt/conda/lib/python3.7/site-packages/requests/models.py in content(self)
834 self._content = None
835 else:
--> 836 self._content = b''.join(self.iter_content(CONTENT_CHUNK_SIZE)) or b''
837
838 self._content_consumed = True

/opt/conda/lib/python3.7/site-packages/requests/models.py in generate()
759 yield chunk
760 except ProtocolError as e:
--> 761 raise ChunkedEncodingError(e)
762 except DecodeError as e:
763 raise ContentDecodingError(e)

ChunkedEncodingError: ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))

@aryamansharma01 I haven't seen this error before, I'll take a look.

Is there any way to ensure that all data is scraped in one go?

Usually the query should be able to run to completion, but at first glance it looks like this error might be due to a network connection issue between the client and the host.

I have this issue too. I'm trying to scrape the subreddit "antiwork" for 2021. It works fine up to October and then gives me this error. I think it may happen with large amounts of data. Does anyone have a solution?

I run into this issue as well, notably with larger amounts of data. Is there a chance that there is some size-related spillover with the limit parameter?

I have the same issue. I had downloaded several months of Wallstreetbets data without any problem, but since a few days ago the issue keeps occurring. Did you already find a solution?

No. I think it's related to issues with the Pushshift server...

Does anyone have a solution? Can we have the problematic requests skipped? Currently mine just gets stuck halfway through a scrape.

@ReichYang when I was looking into this before, it appeared to be a connection issue between the client and server.

It would help to see any debug logs from when this error occurs to try to pinpoint what is causing this.

> @ReichYang when I was looking into this before, it appeared to be a connection issue between the client and server.
>
> It would help to see any debug logs from when this error occurs to try to pinpoint what is causing this.

Thanks for the speedy response. I think mine is also a connection error, but I tried several times and it just kept on throwing this error.

ChunkedEncodingError: ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))

I'm wondering if we can have a parameter that can ensure the scraping won't stop just because of this kind of issue, like skipping the ones that consistently receive connection issues and moving on to the next.

@mattpodolak Hey, I just encountered another one. And here is the error log.

ChunkedEncodingError: ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
~\AppData\Roaming\Python\Python37\site-packages\urllib3\response.py in _update_chunk_length(self)
    696         try:
--> 697             self.chunk_left = int(line, 16)
    698         except ValueError:

ValueError: invalid literal for int() with base 16: b''

During handling of the above exception, another exception occurred:

InvalidChunkLength                        Traceback (most recent call last)
~\AppData\Roaming\Python\Python37\site-packages\urllib3\response.py in _error_catcher(self)
    437             try:
--> 438                 yield
    439 

~\AppData\Roaming\Python\Python37\site-packages\urllib3\response.py in read_chunked(self, amt, decode_content)
    763             while True:
--> 764                 self._update_chunk_length()
    765                 if self.chunk_left == 0:

~\AppData\Roaming\Python\Python37\site-packages\urllib3\response.py in _update_chunk_length(self)
    700             self.close()
--> 701             raise InvalidChunkLength(self, line)
    702 

InvalidChunkLength: InvalidChunkLength(got length b'', 0 bytes read)

During handling of the above exception, another exception occurred:

ProtocolError                             Traceback (most recent call last)
~\AppData\Roaming\Python\Python37\site-packages\requests\models.py in generate()
    757                 try:
--> 758                     for chunk in self.raw.stream(chunk_size, decode_content=True):
    759                         yield chunk

~\AppData\Roaming\Python\Python37\site-packages\urllib3\response.py in stream(self, amt, decode_content)
    571         if self.chunked and self.supports_chunked_reads():
--> 572             for line in self.read_chunked(amt, decode_content=decode_content):
    573                 yield line

~\AppData\Roaming\Python\Python37\site-packages\urllib3\response.py in read_chunked(self, amt, decode_content)
    792             if self._original_response:
--> 793                 self._original_response.close()
    794 

D:\Python37\lib\contextlib.py in __exit__(self, type, value, traceback)
    129             try:
--> 130                 self.gen.throw(type, value, traceback)
    131             except StopIteration as exc:

~\AppData\Roaming\Python\Python37\site-packages\urllib3\response.py in _error_catcher(self)
    454                 # This includes IncompleteRead.
--> 455                 raise ProtocolError("Connection broken: %r" % e, e)
    456 

ProtocolError: ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))

During handling of the above exception, another exception occurred:

ChunkedEncodingError                      Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_72724/806806213.py in <module>
      1 for sub in subs:
----> 2     posts = api.search_submissions(subreddit=sub, mem_safe=True, after=1559365200, before=1637207011, safe_exit=True)
      3     print(f'{len(posts)} posts retrieved from Pushshift')
      4     post_list = [post for post in posts]
      5     pd.DataFrame(post_list).to_pickle(f"{sub}_COVID_submissions.pkl")

~\AppData\Roaming\Python\Python37\site-packages\pmaw\PushshiftAPI.py in search_submissions(self, **kwargs)
     72             Response generator object
     73         """
---> 74         return self._search(kind='submission', **kwargs)

~\AppData\Roaming\Python\Python37\site-packages\pmaw\PushshiftAPIBase.py in _search(self, kind, max_ids_per_request, max_results_per_request, mem_safe, search_window, dataset, safe_exit, cache_dir, filter_fn, **kwargs)
    266 
    267             if self.req.limit > 0 and len(self.req.req_list) > 0:
--> 268                 self._multithread()
    269 
    270         self.req.save_cache()

~\AppData\Roaming\Python\Python37\site-packages\pmaw\PushshiftAPIBase.py in _multithread(self, check_total)
    100                     self._get, url_pay[0], url_pay[1]): url_pay for url_pay in reqs}
    101 
--> 102                 self._futures_handler(futures, check_total)
    103 
    104                 # reset attempts if no failures

~\AppData\Roaming\Python\Python37\site-packages\pmaw\PushshiftAPIBase.py in _futures_handler(self, futures, check_total)
    131             self.num_req += int(not check_total)
    132             try:
--> 133                 data = future.result()
    134                 self.num_suc += int(not check_total)
    135                 url = url_pay[0]

D:\Python37\lib\concurrent\futures\_base.py in result(self, timeout)
    426                 raise CancelledError()
    427             elif self._state == FINISHED:
--> 428                 return self.__get_result()
    429 
    430             self._condition.wait(timeout)

D:\Python37\lib\concurrent\futures\_base.py in __get_result(self)
    382     def __get_result(self):
    383         if self._exception:
--> 384             raise self._exception
    385         else:
    386             return self._result

D:\Python37\lib\concurrent\futures\thread.py in run(self)
     55 
     56         try:
---> 57             result = self.fn(*self.args, **self.kwargs)
     58         except BaseException as exc:
     59             self.future.set_exception(exc)

~\AppData\Roaming\Python\Python37\site-packages\pmaw\PushshiftAPIBase.py in _get(self, url, payload)
     50     def _get(self, url, payload={}):
     51         self._impose_rate_limit()
---> 52         r = requests.get(url, params=payload)
     53         status = r.status_code
     54         reason = r.reason

~\AppData\Roaming\Python\Python37\site-packages\requests\api.py in get(url, params, **kwargs)
     73     """
     74 
---> 75     return request('get', url, params=params, **kwargs)
     76 
     77 

~\AppData\Roaming\Python\Python37\site-packages\requests\api.py in request(method, url, **kwargs)
     59     # cases, and look like a memory leak in others.
     60     with sessions.Session() as session:
---> 61         return session.request(method=method, url=url, **kwargs)
     62 
     63 

~\AppData\Roaming\Python\Python37\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
    540         }
    541         send_kwargs.update(settings)
--> 542         resp = self.send(prep, **send_kwargs)
    543 
    544         return resp

~\AppData\Roaming\Python\Python37\site-packages\requests\sessions.py in send(self, request, **kwargs)
    695 
    696         if not stream:
--> 697             r.content
    698 
    699         return r

~\AppData\Roaming\Python\Python37\site-packages\requests\models.py in content(self)
    834                 self._content = None
    835             else:
--> 836                 self._content = b''.join(self.iter_content(CONTENT_CHUNK_SIZE)) or b''
    837 
    838         self._content_consumed = True

~\AppData\Roaming\Python\Python37\site-packages\requests\models.py in generate()
    759                         yield chunk
    760                 except ProtocolError as e:
--> 761                     raise ChunkedEncodingError(e)
    762                 except DecodeError as e:
    763                     raise ContentDecodingError(e)

ChunkedEncodingError: ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))

Hi guys, original poster here. I played around with the libraries a bit and found that when I installed different versions of urllib3 and requests, the error no longer appeared. I also think it might help to scrape small amounts of data at a time.

!pip install urllib3==1.23
!pip install requests==2.19.1

Hope this helps!