For some reason it does not seem to use the proxy to make the request, even after declaring the param correctly.
As said above, I have tried printing/logging the proxy the library fetches, and it prints fine; it just doesn't get used while making the request. What could the reason be?
As you can see below, the request goes out from my local IP, even though the proxy pulled by the library is printed successfully.
https://i.imgur.com/aoqpvLf.png
Snippet:
from fp.fp import FreeProxy  # free-proxy package
import requests

headers = {"User-Agent": "Mozilla/5.0"}  # headers were defined elsewhere in my script; minimal stand-in

proxyObject = FreeProxy(country_id=['US', 'IN'], rand=True)
proxy = {"http": proxyObject.get()}
print(proxy)

url = 'https://superkicks-in.translate.goog/wp-json/wc/store/products?stock_status=instock&_x_tr_sl=auto&_x_tr_tl=en&_x_tr_hl=en-GB&_x_tr_pto=nui'
s = requests.Session()
response = s.get(url=url, headers=headers, proxies=proxy, timeout=15)
print(response)
print(response.text)
You set your proxy for "http", but your URL is https.
Is there a way I can get only https proxies from the library?
What happens if you change the URL to http? Otherwise use

proxy = {"https": proxyObject.get()}

since most of the proxies support https.
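To make the failure mode concrete: requests picks the proxy entry whose key matches the URL scheme, so an "http"-only mapping is silently ignored for an https:// URL and the request goes out directly from your own IP. A minimal sketch using select_proxy, the internal helper requests itself uses for this lookup (the proxy address below is a placeholder):

from requests.utils import select_proxy

proxies = {"http": "http://1.2.3.4:8080"}  # placeholder proxy address

# requests picks the entry whose key matches the URL scheme
print(select_proxy("http://example.com/", proxies))   # http://1.2.3.4:8080
print(select_proxy("https://example.com/", proxies))  # None -> direct connection

# cover both schemes so https requests are proxied too
proxies["https"] = proxies["http"]
print(select_proxy("https://example.com/", proxies))  # http://1.2.3.4:8080

That is why the screenshot shows the local IP: for an https URL, nothing was proxied.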
I just read through the code and found the following: it scrapes the proxy list here, then applies the filters and tests each proxy to see whether it is working. If a proxy is working, it is returned to you.
Maybe the Google proxies aren't working at all. See here
But I have rewritten the code, so maybe it works for your use case.
Add this code at the beginning of your code. You may need to install some packages (requests, pandas, lxml).
import random
import sys
import time
import lxml.html as lh
import requests
import pandas as pd
import collections
import warnings
from typing import Union
def check_valid_ip(string: str) -> bool:
    valid = False
    if "." in string:
        elements_array = string.strip().split(".")
        if len(elements_array) == 4:
            for i in elements_array:
                if i.isnumeric() and 0 <= int(i) <= 255:
                    valid = True
                else:
                    valid = False
                    break
    return valid
dict_str = {
    'yes': True,
    '': False,
    'no': False,
}

def convert_str_to_bool(string: str) -> bool:
    return dict_str.get(string)

list_columns_to_clean = ['google', 'https']

def clean_dataframe(df: pd.DataFrame):
    for column in list_columns_to_clean:
        if column in df.columns:
            df[column] = df[column].apply(convert_str_to_bool)
    return df
class FreeProxy:
    def __init__(self, country: Union[str, collections.abc.Iterable] = None,
                 country_code: Union[str, collections.abc.Iterable] = None,
                 timeout: float = 0.5, anonym: Union[str, bool] = None, rand: bool = True,
                 https: Union[bool, str] = True,
                 prefered_country: Union[str, collections.abc.Iterable] = None,
                 prefered_country_code: Union[str, collections.abc.Iterable] = None,
                 refresh_after: int = 900, google: bool = False):
        self.country = country
        self.country_code = country_code
        self.prefered_country = prefered_country
        self.prefered_country_code = prefered_country_code
        self.timeout = timeout
        self.anonym = anonym
        self.https = https
        self.random = rand
        self.proxies = None
        self.fetched_at = None
        self.refresh_after = refresh_after
        self.filtered_df = None
        self.google = google

    def series_to_proxy(self, series):
        # Build a requests-style proxies dict keyed by the scheme we filter for,
        # and return it together with the proxy's IP (the dataframe index)
        http_str = 'https' if self.https else 'http'
        proxy_set = {
            http_str: f"{series.name}:{series['port']}"
        }
        return proxy_set, series.name
    def get_proxy_list(self):
        try:
            page = requests.get('https://www.sslproxies.org')
            doc = lh.fromstring(page.content)
            tr_elements = doc.xpath('//*[@id="list"]//tr')
            list_proxies = []
            for tr_element in tr_elements:
                # Skip header/footer rows: the first cell must be a valid IPv4 address
                if check_valid_ip(tr_element[0].text_content()):
                    dict_tmp = {}
                    for counter, attribute in enumerate(
                            ['ip_address', 'port', 'country_code', 'country', 'anonymity', 'google', 'https',
                             'last_checked']):
                        dict_tmp[attribute] = tr_element[counter].text_content()
                    list_proxies.append(dict_tmp)
            self.proxies = pd.DataFrame(list_proxies)
            self.proxies = self.proxies.set_index('ip_address')
            self.proxies = clean_dataframe(self.proxies)
            self.fetched_at = time.time()
        except requests.exceptions.RequestException as e:
            print(e)
            sys.exit(1)
    @staticmethod
    def get_filter_str(country: Union[str, collections.abc.Iterable],
                       country_code: Union[str, collections.abc.Iterable],
                       anonymity: Union[str, collections.abc.Iterable], https: bool, google: bool) -> str:
        # locals() captures just the parameters; their names match the dataframe columns
        args = locals()
        filter_str = ""
        for arg in args:
            if arg == "https" and args[arg] == 'any':
                continue
            if args[arg] is not None:
                if isinstance(args[arg], bool):
                    filter_str += f"(self.proxies['{arg}'] == {args[arg]}) & "
                elif isinstance(args[arg], str):
                    filter_str += f"(self.proxies['{arg}'] == '{args[arg]}') & "
                elif isinstance(args[arg], collections.abc.Iterable) and not isinstance(args[arg], (bytes, str)):
                    filter_str += f"(self.proxies['{arg}'].isin({args[arg]})) & "
        if filter_str == "":
            return ""
        filter_str = filter_str.rstrip(" &")  # drop the trailing separator
        return f"self.proxies[{filter_str}]"
    def find_working_proxy(self) -> dict:
        if self.filtered_df.empty:
            return None
        if self.random:
            for i in range(len(self.filtered_df)):
                random_proxy = self.filtered_df.loc[random.choice(list(self.filtered_df.index))]
                random_proxy, ipaddress = self.series_to_proxy(random_proxy)
                proxy = self.check_if_proxy_is_working(random_proxy)
                if proxy:
                    return proxy
                else:
                    self.filtered_df.drop(ipaddress, inplace=True)
        else:
            for proxy in self.filtered_df.index:
                proxy_inner = self.filtered_df.loc[proxy]
                proxy_inner, ipaddress = self.series_to_proxy(proxy_inner)
                proxy_inner = self.check_if_proxy_is_working(proxy_inner)
                if proxy_inner:
                    return proxy_inner
    def check_if_proxy_is_working(self, proxy):
        http_str = 'https' if self.https else 'http'
        try:
            with requests.get(f'{http_str}://www.google.com', proxies=proxy, timeout=self.timeout, stream=True) as r:
                # Confirm the request really went through the proxy by comparing
                # the peer address of the underlying socket with the proxy's IP
                if r.raw.connection.sock:
                    if r.raw.connection.sock.getpeername()[0] == proxy[http_str].split(':')[0]:
                        return proxy
        except Exception:
            return False
    def get(self):
        # Refresh the proxy list if we have none yet or it has gone stale
        if self.proxies is None:
            self.get_proxy_list()
        elif time.time() - self.fetched_at >= self.refresh_after:
            self.get_proxy_list()
        if self.prefered_country is not None or self.prefered_country_code is not None:
            filter_str = self.get_filter_str(self.prefered_country, self.prefered_country_code,
                                             self.anonym, self.https, self.google)
        else:
            filter_str = self.get_filter_str(self.country, self.country_code,
                                             self.anonym, self.https, self.google)
        if filter_str != "":
            exec(f"self.filtered_df = {filter_str}.copy()")
        else:
            self.filtered_df = self.proxies
        working_proxy = self.find_working_proxy()
        if working_proxy is not None:
            return working_proxy
        warnings.warn("No working proxy found for the preferred country. Falling back to the "
                      "country filter; if country is None, all countries will be checked.")
        if self.prefered_country is not None or self.prefered_country_code is not None:
            filter_str = self.get_filter_str(self.country, self.country_code,
                                             self.anonym, self.https, self.google)
            if filter_str != "":
                exec(f"self.filtered_df = {filter_str}.copy()")
            else:
                self.filtered_df = self.proxies
            working_proxy = self.find_working_proxy()
            if working_proxy is not None:
                return working_proxy
Then run the following:
proxy = FreeProxy(prefered_country_code=['US','IN'],https=True, rand=True).get()
url = 'http://superkicks-in.translate.goog/wp-json/wc/store/products?stock_status=instock&_x_tr_sl=auto&_x_tr_tl=en&_x_tr_hl=en-GB&_x_tr_pto=nui'
s = requests.Session()
response = s.get(url=url, proxies=proxy, timeout=15)
print(response)
print(response.text)
This results in a 400 error page:
<Response [400]>
Your URL is incorrect
If I run this:
url = 'https://www.google.com/search?q=free+proxies&oq=free+proxies&aqs=chrome..69i57j69i60l2.2937j0j9&sourceid=chrome&ie=UTF-8'
s = requests.Session()
response = s.get(url=url, proxies=proxy, timeout=15)
print(response)
print(response.text)
It says that Google has detected unusual traffic and wants you to solve a CAPTCHA (output below). So after all, I think it is simply not possible to scrape Google with free/open proxies:
<Response [429]>
Our systems have detected unusual traffic from your computer network. This page checks to see if it's really you sending the requests, and not a robot. Why did this happen?
This traffic may have been sent by malicious software, a browser plug-in, or a script that sends automated requests. If you share your network connection, ask your administrator for help — a different computer using the same IP address may be responsible. Learn more
Sometimes you may be asked to solve the CAPTCHA if you are using advanced terms that robots are known to use, or sending requests very quickly.
IP address: 95.111.225.137
Time: 2021-11-28T20:39:03Z
URL: https://www.google.com/search?q=free+proxies&oq=free+proxies&aqs=chrome..69i57j69i60l2.2937j0j9&sourceid=chrome&ie=UTF-8
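For anyone landing here with the original problem: a quick way to verify whether a proxy is actually being used is to fetch an IP echo service with and without the proxies argument and compare the reported addresses. A minimal sketch, assuming the free-proxy package (fp.fp) and using httpbin.org/ip as one convenient echo endpoint:

import requests
from fp.fp import FreeProxy

proxy_url = FreeProxy(rand=True).get()
proxies = {"http": proxy_url, "https": proxy_url}  # cover both schemes

direct = requests.get("https://httpbin.org/ip", timeout=15).json()["origin"]
proxied = requests.get("https://httpbin.org/ip", proxies=proxies, timeout=15).json()["origin"]

print(direct, proxied)  # if both addresses match, the proxy is NOT being used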