DuckDuckGo scraper fails
sander-van-damme opened this issue · 3 comments
Hi there!
I noticed an issue with the DuckDuckGo scraper. The response HTML doesn't contain any search results, only some Javascript, so the selectors don't match anything.
I created a temporary fix below (based on the Google scraper), which uses html.duckduckgo.com instead of duckduckgo.com.
Thanks!
output:
Searching Duckduckgo
Traceback (most recent call last):
File "C:\Users\default\scrape.py", line 10, in <module>
engine.search('test')
File "C:\Users\default\search_engines\engine.py", line 162, in search
response = self._get_page(request['url'], request['data'])
File "C:\Users\default\search_engines\engines\duckduckgo.py", line 44, in _get_page
response = self._http_client.get(page)
File "C:\Users\default\search_engines\http_client.py", line 21, in get
page = self._quote(page)
File "C:\Users\default\search_engines\http_client.py", line 41, in _quote
if utl.decode_bytes(utl.unquote_url(url)) == utl.decode_bytes(url):
File "C:\Users\default\search_engines\utils.py", line 15, in unquote_url
return decode_bytes(requests.utils.unquote(url))
File "C:\Users\default\AppData\Local\Programs\Python\Python310\lib\urllib\parse.py", line 655, in unquote
if '%' not in string:
fix:
from ..engine import SearchEngine
from ..config import PROXY, TIMEOUT, FAKE_USER_AGENT
from ..utils import unquote_url, quote_url
class Duckduckgo(SearchEngine):
    '''Searches duckduckgo.com via the JavaScript-free html.duckduckgo.com
    endpoint (the main domain returns only scripts, with no parseable
    results in the HTML).'''

    def __init__(self, proxy=PROXY, timeout=TIMEOUT):
        super(Duckduckgo, self).__init__(proxy, timeout)
        # JS-free endpoint that serves results as plain HTML.
        self._base_url = u'https://html.duckduckgo.com'
        self._current_page = 1  # 1-based page counter
        self.set_headers({'User-Agent':FAKE_USER_AGENT})

    def _selectors(self, element):
        '''Returns the appropriate CSS selector.

        :param element: one of 'url', 'title', 'text', 'links', 'next'
        :raises KeyError: for an unknown element name
        '''
        selectors = {
            'url': 'a.result__a',
            'title': 'a.result__a',
            'text': 'a.result__snippet',
            'links': 'div#links div.result',
            # Pagination is a <form> whose submit button is labelled "Next"
            # (capitalized); there is no anchor with an href to follow.
            'next': {'form': 'form', 'submit': 'input[value="Next"]'}
        }
        return selectors[element]

    def _first_page(self):
        '''Returns the initial page URL and query (no POST data).'''
        url = u'{}/html/?q={}'.format(self._base_url, quote_url(self._query, ''))
        return {'url':url, 'data':None}

    def _next_page(self, tags):
        '''Returns the next page URL and post data (if any).

        The HTML endpoint paginates via a form, so the next-page URL is
        rebuilt from the form's action plus its named input fields.
        '''
        self._current_page += 1
        selector = self._selectors('next')
        # Keep only forms that actually contain the "Next" submit button.
        forms = [
            form for form in tags.select(selector['form'])
            if form.select(selector['submit'])
        ]
        url = None
        if forms:
            url = self._base_url + forms[0]['action'] + '?'
            for field in forms[0].select('input[name]'):
                url += u'{}={}&'.format(field['name'], field.get('value', ''))
        return {'url':url, 'data':None}

    def _get_url(self, tag, item='href'):
        '''Returns the URL of a search results item.

        Result anchors point at DDG's redirect service
        (//duckduckgo.com/l/?uddg=<encoded target>&rut=...); unwrap them
        to the real destination before unquoting.
        '''
        selector = self._selectors('url')
        url = self._get_tag_item(tag.select_one(selector), item)
        if url.startswith(u'//duckduckgo.com/l/?uddg='):
            url = url.replace(u'//duckduckgo.com/l/?uddg=', u'').split(u'&rut')[0]
        return unquote_url(url)
Thanks for bringing this to my attention, and for taking the time to solve it. Your contribution is deeply appreciated. I hope you don't mind me using your code until I find a solution for the main domain.
Hi, I see you reopened this issue. Probably because the scraper doesn't go to the next page. I encountered this too a couple of days ago and created a quick & dirty fix.
from ..engine import SearchEngine
from ..config import PROXY, TIMEOUT, FAKE_USER_AGENT
from ..utils import unquote_url, quote_url
from urllib.parse import unquote
class Duckduckgo(SearchEngine):
    '''Searches duckduckgo.com'''

    def __init__(self, proxy=PROXY, timeout=TIMEOUT):
        super(Duckduckgo, self).__init__(proxy, timeout)
        self._base_url = u'https://html.duckduckgo.com'
        self._delay = (2, 6)
        self._current_page = 1
        self.set_headers({'User-Agent':FAKE_USER_AGENT})

    def _selectors(self, element):
        '''Returns the appropriate CSS selector.'''
        return {
            'url': 'a.result__a',
            'title': 'a.result__a',
            'text': 'a.result__snippet',
            'links': 'div#links div.result',
            'next': {'form':'form', 'submit': 'input[value="Next"]'}
        }[element]

    def _first_page(self):
        '''Returns the initial page and query.'''
        url = u'{}/html/?q={}'.format(self._base_url, quote_url(self._query, ''))
        return {'url':url, 'data':None}

    def _next_page(self, tags):
        '''Returns the next page URL and post data (if any)'''
        css = self._selectors('next')
        # Find the first form that contains the "Next" submit button.
        next_form = None
        for candidate in tags.select(css['form']):
            if candidate.select(css['submit']):
                next_form = candidate
                break
        if next_form is None:
            return {'url': None, 'data': None}
        # Rebuild the next-page URL from the form action and its inputs.
        params = ''.join(
            f'{field["name"]}={field.get("value", "") }&'
            for field in next_form.select('input[name]')
        )
        return {'url': self._base_url + next_form['action'] + '?' + params,
                'data': None}

    def _get_url(self, tag, item='href'):
        '''Returns the URL of search results item.'''
        link = self._get_tag_item(tag.select_one(self._selectors('url')), item)
        redirect_prefix = u'//duckduckgo.com/l/?uddg='
        if link.startswith(redirect_prefix):
            link = link.replace(redirect_prefix, u'').split(u'&rut')[0]
        return unquote(link)
Actually, I reopened it by accident! Please don't waste any more of your time on this, you've already done more than enough. I'll take care of any remaining issues when I have some free time