def search never returns if too few results
Closed this issue · 4 comments
Cool library btw.
Small issue when few results returned by google:
In the function def search() if there are too few results, the function never returns.
For example, passing num_results=10 but Google returns less than 10 results, the function will run forever in the loop "while start < num_results:"
A simple way to fix it is using a counter that loops on each sleep of the function.
#init count at the top of the function
count = 0
#check the count inside the "while start < num_results:" loop
if count < num_results:
print("Too few results")
break
#loop the count above the "sleep(sleep_interval)" code
count += 1.
Can you submit a PR for this?
also having this issue
This code managed to fix it for me. Change the __init__.py
file into this code:
"""googlesearch is a Python library for searching Google, easily."""
from time import sleep
from bs4 import BeautifulSoup
from requests import get
import requests
from .user_agents import get_useragent
def _req(term, results, lang, start, proxies, timeout):
    """Issue one GET request to the Google search endpoint.

    Args:
        term: Query string sent as the ``q`` parameter.
        results: Number of results still wanted; sent as ``num`` (plus 2).
        lang: Value sent as the ``hl`` parameter.
        start: Result offset sent as the ``start`` parameter.
        proxies: Proxy mapping handed to ``requests.get`` (may be ``None``).
        timeout: Timeout handed to ``requests.get``.

    Returns:
        The response object returned by ``requests.get``.

    Raises:
        Whatever ``resp.raise_for_status()`` raises for an error status.
    """
    request_headers = {
        "User-Agent": get_useragent()
    }
    query = {
        "q": term,
        "num": results + 2,  # Prevents multiple requests
        "hl": lang,
        "start": start,
    }
    resp = get(
        url="https://www.google.com/search",
        headers=request_headers,
        params=query,
        proxies=proxies,
        timeout=timeout,
    )
    # Surface HTTP error statuses to the caller instead of parsing an error page.
    resp.raise_for_status()
    return resp
class SearchResult:
    """Container for a single search hit: its URL, title and description."""

    def __init__(self, url, title, description):
        """Store the three fields unchanged on the instance."""
        self.url, self.title, self.description = url, title, description

    def __repr__(self):
        """Return a one-line summary listing all three fields."""
        summary = f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
        return summary
def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5):
    """Search the Google search engine, lazily yielding up to ``num_results`` hits.

    This is a generator: pages are requested only as results are consumed.

    Args:
        term: Query string. It is passed on unescaped; the HTTP layer
            URL-encodes query parameters itself.
        num_results: Maximum number of results to yield. Values <= 0 yield
            nothing and perform no request.
        lang: Language code forwarded to ``_req``.
        proxy: Optional proxy URL used for the requests, or ``None``.
        advanced: If true, yield ``SearchResult`` objects; otherwise yield
            the result URL strings.
        sleep_interval: Seconds to ``sleep`` between consecutive page requests.
        timeout: Timeout forwarded to ``_req``.

    Yields:
        ``SearchResult`` instances when ``advanced`` is true, else URL strings.

    Raises:
        Propagates any exception raised by ``_req``.
    """
    # requests picks a proxy by the scheme of the *target* URL, and the target
    # is always https://www.google.com. Register the proxy for both schemes so
    # an "http://..." proxy URL is actually used instead of silently ignored.
    proxies = {"http": proxy, "https": proxy} if proxy else None

    start = 0
    fetched_results = 0  # Total results yielded so far

    while fetched_results < num_results:
        # Send request. The raw term is passed: pre-replacing spaces with "+"
        # would be encoded again (as %2B) and sent to Google as literal pluses.
        resp = _req(term, num_results - fetched_results, lang, start, proxies, timeout)

        # Parse
        soup = BeautifulSoup(resp.text, "html.parser")
        result_block = soup.find_all("div", attrs={"class": "g"})
        new_results = 0  # Results yielded from this page only

        for result in result_block:
            # Find link, title, description
            link = result.find("a", href=True)
            title = result.find("h3")
            description_box = result.find("div", {"style": "-webkit-line-clamp:2"})

            # Skip blocks that are missing any of the three parts.
            if not (link and title and description_box):
                continue

            description = description_box.text
            fetched_results += 1
            new_results += 1
            if advanced:
                yield SearchResult(link["href"], title.text, description)
            else:
                yield link["href"]

            if fetched_results >= num_results:
                break  # Stop if we have fetched the desired number of results

        if new_results == 0:
            # A page with no usable results means there is nothing more to
            # fetch; stop here rather than looping forever.
            print(f"Only {fetched_results} results found for query requiring {num_results} results. Moving on to the next query.")
            break

        if fetched_results >= num_results:
            break  # Done: do not sleep when no further request will be made

        start += 10  # Prepare for the next set of results
        sleep(sleep_interval)
I will make a PR of this code.
This has been merged, so this issue is resolved.