Python爬虫:抓取百度搜索的前两页结果标题(JSON)
JasonWu73 opened this issue · 0 comments
JasonWu73 commented
"""
百度搜索关键字时, 返回的前两页内容.
print("\u0020")
print("�".encode('utf-8'))
"""
import mechanicalsoup
import json
def extract_print(result_soup):
    """Print numbered title/URL pairs parsed from Baidu result tags.

    Baidu result entries carry a ``data-tools`` attribute holding a JSON
    payload like ``{"title": "...", "url": "..."}``.  Entries whose payload
    is not valid JSON are printed raw so nothing is silently dropped.

    Args:
        result_soup: Parsed document (BeautifulSoup) for one results page.
    """
    title_tags = result_soup.select("[data-tools]")
    if not title_tags:
        # No recognizable result tags -- dump the whole page for debugging.
        print(result_soup)
        return
    for index, item in enumerate(title_tags, start=1):
        try:
            data = json.loads(item["data-tools"])
            print(f"{index}. {data['title']} -> {data['url']}")
        except json.JSONDecodeError:
            # Malformed payload: fall back to the raw attribute value.
            print(f"{index}. {item['data-tools']}")
def scrape(url="https://www.baidu.com/", kw="吴仙杰"):
    """Search Baidu for *kw* and print the first two pages of results.

    Args:
        url: Baidu landing page that hosts the search form.
        kw: Keyword submitted through the search form.
    """
    # Headless browser session with a desktop Chrome user agent so Baidu
    # serves the regular (non-mobile) markup.
    browser = mechanicalsoup.StatefulBrowser(
        user_agent=(
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/86.0.4240.75 Safari/537.36"
        )
    )
    try:
        browser.open(url)
        # Fill in and submit the search form (its action is "/s").
        browser.select_form('form[action="/s"]')
        browser["wd"] = kw
        first_page = browser.submit_selected()
        # Show the first page of results.
        print("第一页: ")
        extract_print(first_page.soup)
        # Pagination links; the first one points at page two.
        page_num_links = first_page.soup.select('.page-inner a')
        if page_num_links:
            second_page = browser.follow_link(page_num_links[0])
            print("\n第二页: ")
            extract_print(second_page.soup)
    finally:
        # Always release the session, even if a request or parse step raises
        # (the original leaked the browser on any exception).
        browser.close()
# Script entry point: run a search for a fixed demo keyword.
if __name__ == '__main__':
    scrape(kw="吴仙杰 github")