/MyScrapy

Primary Language: Python

MyScrapy

1. Item

设置要爬取的内容

#### items.py
from scrapy.item import Item, Field
class QianchengItem(Item):
    """Item holding one job posting scraped from the 51job search results.

    All fields are plain Scrapy ``Field`` objects populated by the spider's
    ``parse_page`` callback.
    """
    # Job title
    job_name = Field()
    # Link to the job detail page
    job_url = Field()
    # Company name
    job_enterprise = Field()
    # Work location
    job_place = Field()
    # Salary (the spider stores 'Null' when the listing shows none)
    salary = Field()
    # Posting date
    date = Field()

2. Spider

需要加载的包:

import scrapy
from qiancheng.items import QianchengItem # 在item=QianchengItem()会用到

设置必要的变量:

name = "qiancheng" #爬虫名称
DOWNLOAD_DELAY = 2 # 延长时间

如果设置了start_urls会自动遍历所有urls并调用parse函数。

start_requests函数

def start_requests(self):
    """Yield one request per search-result page (pages 1..10).

    Builds the 51job search URL for the keyword "数据挖掘" page by page and
    schedules each with ``parse_page`` as the callback.

    NOTE: the original used a Python 2 ``print`` statement and mixed
    tab/space indentation, both of which are syntax errors on Python 3
    (required by current Scrapy); fixed here.
    """
    page_num = 10  # number of result pages to crawl
    for page in range(page_num):
        # curr_page is 1-based in the 51job URL scheme.
        url = 'http://search.51job.com/jobsearch/search_result.php?fromJs=1&jobarea=000000%2C00&district=000000&funtype=0000&keyword=数据挖掘&keywordtype=2&curr_page=' + str(page + 1) + '&lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&list_type=0&fromType=14&dibiaoid=0&confirmdate=9'
        print('第%d页:%s' % (page + 1, url))
        yield scrapy.Request(url=url, callback=self.parse_page)

parse_page函数,在start_requests的yield请求中作为回调被调用

def parse_page(self, response):
    """Parse one 51job search-result page and yield a QianchengItem per job.

    Each result row is a ``div.el``; rows without a job link (table headers,
    filter rows) are skipped.

    NOTE: the original used Python 2 ``print`` statements (syntax errors on
    Python 3) and queried the salary selector twice; both fixed here.
    """
    for body in response.css('div.el'):
        url = body.css('p.t1 span a::attr(href)').extract_first()
        if url is None:
            # Not a job row (e.g. the header row) — nothing to extract.
            continue
        item = QianchengItem()
        item['job_name'] = body.css('p.t1 span a::attr(title)').extract_first()
        item['job_url'] = url
        item['job_enterprise'] = body.css('span.t2 a::attr(title)').extract_first()
        item['job_place'] = body.css('span.t3::text').extract_first()
        # Some listings omit the salary column; keep the original 'Null'
        # placeholder for those.
        item['salary'] = body.css('span.t4::text').extract_first(default='Null')
        item['date'] = body.css('span.t5::text').extract_first()
        print(item)  # debug output, kept from the original
        yield item