lorien/grab

Spider fails on redirect with invalid (non-ASCII) byte

lorien opened this issue · 0 comments

Fails on Python 2, works on Python 3.

    def test_redirect_with_invalid_byte(self):
        # Scenario: the test server answers with a 301 whose Location header
        # carries a raw 0xa0 byte; the spider run below is expected to
        # complete without raising.
        target_url = self.server.get_url()
        encoded_target = target_url.encode('ascii')
        invalid_url = b'http://\xa0' + encoded_target

        def redirect_to_invalid_location(server):
            # Reply with a permanent redirect pointing at the malformed URL
            # and an empty body.
            server.set_status(301)
            server.add_header('Location', invalid_url)
            server.write('')
            server.finish()

        self.server.response['callback'] = redirect_to_invalid_location

        class TestSpider(Spider):
            def task_generator(self):
                # Only the valid URL is requested directly; the invalid one
                # is reached through the server-side redirect.
                # NOTE(review): earlier experiments requested invalid_url
                # directly, and a tripadvisor.com "ShowUrl" link
                # (...&odc=BusinessListingsUrl&d=4289178&url=1) -- presumably
                # the real-world trigger; confirm before relying on it.
                yield Task('page', target_url)

            def task_page(self, grab, task):
                # Nothing to check in the handler: the test only cares that
                # the run finishes.
                pass

        spider = build_spider(TestSpider)
        spider.run()

Log:

Traceback (most recent call last):
  File "/home/lorien/web/grab/grab/spider/base_service.py", line 32, in wrapper
    callback(*args, **kwargs)
  File "/home/lorien/web/grab/grab/spider/network_service/threaded.py", line 78, in worker_callback
    grab.request()
  File "/home/lorien/web/grab/grab/base.py", line 481, in request
    referer=None)
  File "/home/lorien/web/grab/grab/base.py", line 410, in prepare_request
    self.transport.process_config(self)
  File "/home/lorien/web/grab/grab/transport/urllib3.py", line 132, in process_config
    u'%s: %s' % (six.text_type(ex), grab.config['url']))
UnicodeDecodeError: 'ascii' codec can't decode byte 0xa0 in position 7: ordinal not in range(128)
E....
======================================================================
ERROR: test_redirect_with_invalid_byte (tests.spider_error.SpiderErrorTestCase)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/home/lorien/web/grab/tests/spider_error.py", line 80, in test_redirect_with_invalid_byte
    bot.run()
  File "/home/lorien/web/grab/grab/spider/base.py", line 693, in run
    raise exc_info[1]
UnicodeDecodeError: 'ascii' codec can't decode byte 0xa0 in position 7: ordinal not in range(128)

----------------------------------------------------------------------