Saving web page resources can fail
Closed this issue · 0 comments
pbellon commented
With the PrimaDaNoi feed, I've seen that storing an article's resources fails, which in turn makes the archiving fail too. Here is the trace:
MissingSchema("Invalid URL 'js/cookiecuttr/cookiecuttr.css?v=4': No schema supplied. Perhaps you meant http://js/cookiecuttr/cookiecuttr.css?v=4?",)
Traceback (most recent call last):
File "/home/trb/Dev/OffshoreJournalism/.venv/lib/python3.6/site-packages/celery/app/trace.py", line 374, in trace_task
R = retval = fun(*args, **kwargs)
File "/home/trb/Dev/OffshoreJournalism/.venv/lib/python3.6/site-packages/celery/app/trace.py", line 629, in __protected_call__
return self.run(*args, **kwargs)
File "/home/trb/Dev/OffshoreJournalism/crawler/crawler/scraping/tasks.py", line 79, in crawl_feeds
crawl_articles(qs=articles)
File "/home/trb/Dev/OffshoreJournalism/.venv/lib/python3.6/site-packages/celery/local.py", line 191, in __call__
return self._get_current_object()(*a, **kw)
File "/home/trb/Dev/OffshoreJournalism/.venv/lib/python3.6/site-packages/celery/app/trace.py", line 630, in __protected_call__
return orig(self, *args, **kwargs)
File "/home/trb/Dev/OffshoreJournalism/.venv/lib/python3.6/site-packages/celery/app/task.py", line 380, in __call__
return self.run(*args, **kwargs)
File "/home/trb/Dev/OffshoreJournalism/crawler/crawler/scraping/tasks.py", line 53, in crawl_articles
crawl_resources(utils.should_be_preserved(articles))
File "/home/trb/Dev/OffshoreJournalism/.venv/lib/python3.6/site-packages/celery/local.py", line 191, in __call__
return self._get_current_object()(*a, **kw)
File "/home/trb/Dev/OffshoreJournalism/.venv/lib/python3.6/site-packages/celery/app/trace.py", line 630, in __protected_call__
return orig(self, *args, **kwargs)
File "/home/trb/Dev/OffshoreJournalism/.venv/lib/python3.6/site-packages/celery/app/task.py", line 380, in __call__
return self.run(*args, **kwargs)
File "/home/trb/Dev/OffshoreJournalism/crawler/crawler/scraping/tasks.py", line 118, in crawl_resources
crawl_article_resources(article)
File "/home/trb/Dev/OffshoreJournalism/crawler/crawler/scraping/tasks.py", line 112, in crawl_article_resources
[ html_content, resources, css_resources ] = scrape_article_resources(article.url)
File "/home/trb/Dev/OffshoreJournalism/crawler/crawler/scraping/tasks.py", line 103, in scrape_article_resources
scraper = HTMLScraper(url)
File "/home/trb/Dev/OffshoreJournalism/crawler/crawler/scraping/scrapers/html.py", line 34, in __init__
self._crawl_resources()
File "/home/trb/Dev/OffshoreJournalism/crawler/crawler/scraping/scrapers/html.py", line 39, in _crawl_resources
self._resources[_] = list(map(crawl_resource, urls))
File "/home/trb/Dev/OffshoreJournalism/crawler/crawler/scraping/scrapers/html.py", line 15, in crawl_resource
'content': requests.get(url).content,
File "/home/trb/Dev/OffshoreJournalism/.venv/lib/python3.6/site-packages/requests/api.py", line 72, in get
return request('get', url, params=params, **kwargs)
File "/home/trb/Dev/OffshoreJournalism/.venv/lib/python3.6/site-packages/requests/api.py", line 58, in request
return session.request(method=method, url=url, **kwargs)
File "/home/trb/Dev/OffshoreJournalism/.venv/lib/python3.6/site-packages/requests/sessions.py", line 494, in request
prep = self.prepare_request(req)
File "/home/trb/Dev/OffshoreJournalism/.venv/lib/python3.6/site-packages/requests/sessions.py", line 437, in prepare_request
hooks=merge_hooks(request.hooks, self.hooks),
File "/home/trb/Dev/OffshoreJournalism/.venv/lib/python3.6/site-packages/requests/models.py", line 305, in prepare
self.prepare_url(url, params)
File "/home/trb/Dev/OffshoreJournalism/.venv/lib/python3.6/site-packages/requests/models.py", line 379, in prepare_url
raise MissingSchema(error)
requests.exceptions.MissingSchema: Invalid URL 'js/cookiecuttr/cookiecuttr.css?v=4': No schema supplied. Perhaps you meant http://js/cookiecuttr/cookiecuttr.css?v=4?