使用Python抓取古诗词
v5tech opened this issue · 0 comments
v5tech commented
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import time
import mysql.connector
# Insert a poem record.
def insert_poems(title, author):
    """Insert one (title, author) row into the `poems` table.

    Opens a fresh MySQL connection per call (matching the original design);
    try/finally guarantees cursor and connection are released even when the
    INSERT raises.

    :param title:  poem title text
    :param author: poem author text
    """
    cnx = mysql.connector.connect(user='root', password='root',
                                  host='192.168.99.142',
                                  database='sys')
    try:
        cursor = cnx.cursor()
        try:
            # Renamed local (original shadowed the function name itself).
            # Parameterized query: the driver escapes values, so scraped
            # text cannot inject SQL.
            sql = "INSERT INTO `poems` (`title`,`author`) VALUES (%s,%s)"
            cursor.execute(sql, (title, author))
            # Commit while the cursor/connection are still open
            # (original committed after cursor.close()).
            cnx.commit()
        finally:
            cursor.close()
    finally:
        cnx.close()
def fetchMingju():
    """Scrape the 114 'famous quote' (mingju) list pages from gushiwen.org.

    For every quote block on each page, extract the quote text (first <a>)
    and its source/author (second <a>, with the '____' separator stripped),
    store the pair via insert_poems(), and echo it to stdout.

    NOTE(review): unlike the other fetchers this one has no time.sleep()
    throttle between pages — presumably intentional, but confirm before
    running against the live site.
    """
    # Invariant request headers hoisted out of the loop.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'}
    for page in range(1, 115):  # pages 1..114 inclusive, as in the original
        url = 'http://so.gushiwen.org/mingju/Default.aspx?p=' + str(page)
        res = requests.get(url, headers=headers)
        content = BeautifulSoup(res.content, 'lxml')
        for item in content.select('div[class="sons"]'):
            title = item.select('a')[0].text
            author = item.select('a')[1].text.replace('____', '')
            insert_poems(title, author)
            # Single-string print works identically on Python 2 and 3
            # (original used py2-only `print title,author`).
            print('%s %s' % (title, author))
# Fetch poems.
def fetchPoems():
    """Walk 200 poem list pages on gushiwen.org, follow each poem's detail
    link, and print the poem title from the detail page's <h1>.

    Sleeps 2 seconds between list pages to throttle requests.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'}
    for page in range(1, 201):  # pages 1..200 inclusive
        list_url = 'http://so.gushiwen.org/type.aspx?p=' + str(page)
        res = requests.get(list_url, headers=headers)
        listing = BeautifulSoup(res.content, 'lxml')
        for item in listing.select('div[class="sons"]'):
            # Separate name for the detail URL — the original rebound `url`,
            # shadowing the list-page URL inside the loop.
            detail_url = 'http://so.gushiwen.org' + item.select('p > a')[0].attrs['href']
            detail_res = requests.get(detail_url, headers=headers)
            detail = BeautifulSoup(detail_res.content, 'lxml')
            # (Original also read the list-page title into an unused local;
            # dropped here.)
            print(detail.select('div[class="shileft"] h1')[0].text)
        time.sleep(2)  # polite delay between list pages
# Fetch authors.
def fetchAuthors():
    """Walk 200 author list pages on gushiwen.org and print each author
    name (first <p> of every 'sonsauthor' block).

    Sleeps 2 seconds between pages to throttle requests.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'}
    for page in range(1, 201):  # pages 1..200 inclusive
        url = 'http://so.gushiwen.org/authors/Default.aspx?p=' + str(page)
        res = requests.get(url, headers=headers)
        content = BeautifulSoup(res.content, 'lxml')
        for item in content.select('div[class="sonsauthor"]'):
            # print(...) is valid on both Python 2 and 3 for a single arg
            # (original used py2-only `print author`).
            print(item.select('p')[0].text)
        time.sleep(2)  # polite delay between pages
def insert_stackoverflow(vote, answer, view, title):
    """Insert one scraped question row into the `stackoverflow` table.

    Opens a fresh MySQL connection per call (matching the original design);
    try/finally guarantees cursor and connection are released even when the
    INSERT raises.

    :param vote:   vote count (string as scraped)
    :param answer: answer count (string as scraped)
    :param view:   view count with commas/suffix already stripped
    :param title:  question title text
    """
    cnx = mysql.connector.connect(user='root', password='root',
                                  host='192.168.99.142',
                                  database='sys')
    try:
        cursor = cnx.cursor()
        try:
            # Renamed local (original shadowed the function name itself);
            # parameterized to keep scraped text out of the SQL string.
            sql = ("INSERT INTO `stackoverflow` "
                   "(`vote`,`answer`,`view`,`title`) VALUES (%s,%s,%s,%s)")
            cursor.execute(sql, (vote, answer, view, title))
            # Commit before closing (original committed after cursor.close()).
            cnx.commit()
        finally:
            cursor.close()
    finally:
        cnx.close()
def fetchstackoverflow():
    """Scrape 200 pages of top-voted Java questions from stackoverflow.com.

    For each question summary: extract vote count, answer count, view count
    (commas and ' views' suffix stripped from the title attribute), and the
    question title; print them and store via insert_stackoverflow().
    Sleeps 5 seconds between pages to throttle requests.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'}
    for page in range(1, 201):  # pages 1..200 inclusive
        url = ('http://stackoverflow.com/questions/tagged/java?page='
               + str(page) + '&sort=votes&pagesize=10')
        res = requests.get(url, headers=headers)
        content = BeautifulSoup(res.content, 'lxml')
        for item in content.select('div[class="question-summary"]'):
            vote = item.select('div strong')[0].text
            answer = item.select('div strong')[1].text
            view = item.select('div.views')[0]['title'].replace(',', '').replace(' views', '')
            title = item.select('a[class="question-hyperlink"]')[0].text
            # Single-string prints work identically on Python 2 and 3
            # (original used py2-only `print a,b,...` statements).
            print('%s %s %s %s' % (vote, answer, view, title))
            insert_stackoverflow(vote, answer, view, title)
        print('-------------------->%d' % page)
        time.sleep(5)  # polite delay between pages
# Entry point: guard against running the scrapers as a side effect of a
# mere import (original called them unconditionally at module level).
if __name__ == '__main__':
    fetchstackoverflow()
    fetchMingju()