因为代理失效导致的重试还是使用失效的那个IP
ubadly opened this issue · 5 comments
ubadly commented
需知
升级feapder,保证feapder是最新版,若BUG仍然存在,则详细描述问题
pip install --upgrade feapder
问题
对网站进行爬取的时候,代理池一次获取20个代理。网站针对IP有访问频率的限制,所以我们的策略是在exception_request里面去掉正在使用的代理。我们以为重试的时候会使用新的代理IP,但在实际使用的时候发现重试用的代理IP还是失败时使用的那个代理,这样的话失败重试就没有意义了。所以希望能够增加一个自定义配置,用来决定在请求失败的时候重新从代理池里面拿IP。
截图
代码
RuixiangS commented
上一个简单的代码看看
ubadly commented
上一个简单的代码看看
这里我把代理api改掉了 其他的没有改动 原封不动
class S1688(feapder.AirSpider):
    """Spider that searches 1688.com for each directory entry and scrapes offer details.

    ``start_requests`` issues one search request per entry returned by
    ``load_dire_list()``; ``parse`` extracts the offer list embedded in the
    search page and follows each matching offer; ``parse_detail`` builds the
    final ``s1688`` item from the detail page's embedded JSON.
    """

    __custom_setting__ = {
        # "USE_SESSION": True,
        "SPIDER_THREAD_COUNT": 12,
        "PROXY_ENABLE": True,
        # "SPIDER_SLEEP_TIME": [2, 5],
        # "LOG_LEVEL": "INFO",
        "PROXY_EXTRACT_API": "http://v2.api.juliangip.com/dynamic/getips",
    }

    @staticmethod
    def _discard_proxy(request):
        """Flag the proxy attached to *request* as bad and detach it from the request.

        Tagging alone leaves the same proxy stored in ``requests_kwargs``, so a
        retry of this request object would go out through the failed proxy
        again. Removing the entry lets the retry pick a different one.
        NOTE(review): assumes feapder only fetches a proxy from the pool when
        ``requests_kwargs`` has no "proxies" entry -- confirm against the
        installed feapder version.
        """
        proxies = request.requests_kwargs.pop("proxies", None)
        if proxies:
            request.proxies_pool.tag_proxy(proxies, -1)

    def download_midware(self, request: Request):
        """Attach a randomised ``cna`` cookie and a fixed user-agent to *request*.

        Returns:
            The same request object with its headers replaced.
        """
        # 24 characters drawn (with replacement) from a fixed alphabet.
        cna = ''.join(random.choices('DwftHIHbiXICAQHA8429Gdvc', k=24))
        request.headers = {
            "Cookie": f"cna={cna};",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36 Edg/113.0.1774.42"
        }
        return request

    def exception_request(self, request: Request, response, e):
        """Drop the proxy used by a failed request so the retry does not reuse it."""
        self._discard_proxy(request)

    def start_requests(self):
        """Yield one 1688 search request per entry from ``load_dire_list()``."""
        for dire in load_dire_list():
            keywords = f"{dire['dire_name']} {dire['dire_location']} 中药材"
            # The keyword is percent-encoded from its GBK bytes.
            url = (
                "https://s.1688.com/selloffer/offer_search.htm"
                f"?keywords={parse.quote(keywords, encoding='gbk')}&spm="
            )
            yield feapder.Request(url=url, dire=dire, verify=False)

    def parse(self, request: Request, response: Response):
        """Extract offers from a search page and yield a detail request for each match.

        Only offers whose title contains the searched ``dire_name`` are
        followed. Pages without the embedded offer data yield nothing.

        Raises:
            Exception: if extracting or decoding the embedded data fails; the
                current proxy is discarded first and the original error is
                chained as the cause.
        """
        dire = request.dire
        try:
            offer_data_match = re.search(
                r"window.data.offerresultData = successDataCheck\((.*)\)",
                response.text,
            )
            if not offer_data_match:
                return
            data = json.loads(offer_data_match.group(1))['data']
            for drug in data.get("offerList", []):
                drug_id = drug['id']
                title = drug['information']['subject']
                if dire['dire_name'] not in title:
                    continue
                # Merge the directory fields into the offer record before passing it on.
                drug.update(dire)
                yield feapder.Request(
                    url=f"https://detail.1688.com/offer/{drug_id}.html",
                    callback=self.parse_detail,
                    dire=dire,
                    dire_item=drug,
                )
        except Exception as e:
            self._discard_proxy(request)
            # Chain the cause so the real failure stays visible in the traceback.
            raise Exception(f"链接:{request.url}被触发风控,无法正常获取数据,尝试重试!") from e

    def parse_detail(self, request: Request, response: Response):
        """Build and yield the ``s1688`` item from an offer detail page.

        Yields nothing when the page has no ``window.__INIT_DATA`` payload or
        no attribute module inside the detail tab container.
        """
        dire = request.dire
        offer = request.dire_item

        init_data_match = re.search(r"window.__INIT_DATA=(.*)", response.text)
        if not init_data_match:
            return
        data = json.loads(init_data_match.group(1))
        global_data = data['globalData']

        item = Item()
        item.item_name = "s1688"
        item.table_name = item.item_name
        item.update({
            "tempModel": global_data['tempModel'],
            "skuInfoMap": global_data['skuModel']['skuInfoMap'],
            "skuModel": global_data['skuModel'],
            "orderParam": global_data['orderParamModel']['orderParam'],
        })

        # Locate the attribute module nested inside the detail tab container.
        tab_container = next(
            (m for m in data['modules'] if m['name'] == '@ali/tdmod-od-pc-layout-detail-tab-container'),
            None,
        )
        children = (tab_container.get('children') or []) if tab_container is not None else []
        attr_module = next(
            (c for c in children if c['name'] == '@ali/tdmod-od-pc-attribute-new'),
            None,
        )
        if attr_module is None:
            return

        # The module's payload is stored under its uuid in the top-level "data" map.
        item.update({"attrList": data['data'][attr_module['uuid']]['data']})
        item['search_key'] = dire['dire_name']
        item['dire_spec'] = dire['dire_spec']
        item['ID'] = dire['ID']
        item.update(offer)
        log.info(f"s1688 {item['ID']}-{item['tempModel']['offerTitle']}-{item['search_key']}-{item['dire_spec']}")
        yield item
fanatic-studio commented
能留个QQ?或者其他联系方式?
ubadly commented
能留个QQ?或者其他联系方式?
1577134779
Boris-code commented
代理模块 是打算废掉重写的,现在用起来比较麻烦。你可以先自己写个代理池,等我这边封装好了你再用我这个