JasonWu73/Blog

Python爬虫:抓取百度搜索的前两页结果标题(JSON)

JasonWu73 opened this issue · 0 comments

"""
百度搜索关键字时, 返回的前两页内容.

print("\u0020")
print("�".encode('utf-8'))
"""

import mechanicalsoup
import json


# 提供目标信息
def extract_print(result_soup):
  title_tags = result_soup.select("[data-tools]")
  if len(title_tags) > 0:
    for index, item in enumerate(title_tags, start=1):
      try:
        data = json.loads(item["data-tools"])
        print(f"{index}. {data['title']} -> {data['url']}")
      except json.decoder.JSONDecodeError:
        print(f"{index}. {item['data-tools']}")
  else:
    print(result_soup)


# 抓取网页内容
def scrape(url="https://www.baidu.com/", kw="吴仙杰"):
  # 打开无头浏览器, 连接百度
  browser = mechanicalsoup.StatefulBrowser(user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36")
  browser.open(url)

  # 填写搜索表单
  browser.select_form('form[action="/s"]')
  browser["wd"] = kw
  first_page = browser.submit_selected()

  # 展示第一页结果
  print("第一页: ")
  extract_print(first_page.soup)

  # 获取分页链接
  page_num_links = first_page.soup.select('.page-inner a')

  # 展示二页信息
  if len(page_num_links) > 0:
    page_2_link = page_num_links[0]
    second_page = browser.follow_link(page_2_link)
    print("\n第二页: ")
    extract_print(second_page.soup)

  # 关闭无头浏览器
  browser.close()


# Script entry point: run the scrape with a more specific keyword.
if __name__ == '__main__':
  scrape(kw="吴仙杰 github")