add cssParser
allphfa opened this issue · 0 comments
allphfa commented
Parser.py
class BaseParser:
    """Common state shared by URL parsers.

    Only the constructor is visible in this excerpt; how the URL containers
    below are consumed is defined in code that is not shown here.
    """

    def __init__(self, rule, item=None, attr='href'):
        """Store the extraction rule and create empty URL bookkeeping containers.

        Args:
            rule: Rule used by subclasses to locate links (``cssParser`` passes
                it to PyQuery as a CSS selector).
            item: Optional item class associated with this parser; stored
                as-is. Defaults to ``None``.
            attr: Name of the element attribute that ``cssParser`` reads from
                each matched element. Defaults to ``'href'``.
        """
        self.rule = rule
        self.item = item
        # Attribute name read from matched elements (see cssParser.abstract_urls).
        self.attr = attr
        # URL bookkeeping. Every instance gets its own fresh containers.
        # NOTE(review): Queue is imported elsewhere in the file, presumably
        # asyncio.Queue -- confirm against the module's imports.
        self.parsing_urls = []
        self.pre_parse_urls = Queue()
        self.filter_urls = set()
        self.done_urls = []
... (remaining BaseParser code omitted)
class cssParser(BaseParser):
    """Parser that collects URLs from HTML by means of a CSS selector.

    ``self.rule`` is the selector handed to PyQuery and ``self.attr`` names
    the attribute read from every matched element.
    """

    def abstract_urls(self, html):
        """Return the ``self.attr`` value of each element matching ``self.rule``.

        Args:
            html: Markup passed to ``pq`` for parsing.

        Returns:
            A list of attribute values, in the order PyQuery yields the
            matched elements. Elements that lack the attribute are skipped,
            so the list never contains ``None``.
        """
        values = (pq(node).attr(self.attr) for node in pq(html)(self.rule))
        # PyQuery's attr() gives None when the attribute is missing; dropping
        # those keeps non-URL placeholders out of the result.
        return [value for value in values if value is not None]
Example usage:
from gain import Css, Item, Parser, Spider, cssParser
from pyquery import PyQuery as pq
class Post(Item):
    """Item holding the fields scraped from one video page.

    ``videoTitle`` has its own selector. The other five fields all use the
    selector ``.intro > li:nth-child(1) > p`` and differ only in which matched
    paragraph (index 0 to 4) their ``process_func`` reads.
    """

    videoTitle = Css('div.ui-cnt ul.intro li h2 a.title')

    # Paragraphs 0 and 1: join the text of the nested <a> elements with
    # spaces. Anchors whose ``.text`` is None are skipped because str.join
    # raises TypeError on non-string items.
    videoType = Css(
        '.intro > li:nth-child(1) > p',
        process_func=lambda pqObj: ' '.join(
            a.text for a in pq(pqObj[0])('a') if a.text is not None
        ),
    )
    videoAuthor = Css(
        '.intro > li:nth-child(1) > p',
        process_func=lambda pqObj: ' '.join(
            a.text for a in pq(pqObj[1])('a') if a.text is not None
        ),
    )

    # Paragraphs 2 to 4: take the whole text of the paragraph.
    videoNotes = Css(
        '.intro > li:nth-child(1) > p',
        process_func=lambda pqObj: pq(pqObj[2]).text(),
    )
    videoLang = Css(
        '.intro > li:nth-child(1) > p',
        process_func=lambda pqObj: pq(pqObj[3]).text(),
    )
    videoRegion = Css(
        '.intro > li:nth-child(1) > p',
        process_func=lambda pqObj: pq(pqObj[4]).text(),
    )

    # NOTE (from the original author): the title value is a list -- this
    # cannot be confirmed from the excerpt.
    async def save(self):
        """Print every scraped field, but only if all six are present.

        Nothing is printed when any of the fields is missing on the instance.
        """
        required = (
            'videoTitle', 'videoType', 'videoAuthor',
            'videoNotes', 'videoLang', 'videoRegion',
        )
        if not all(hasattr(self, name) for name in required):
            return

        # Labels: 片名 = title, 类型 = type, 主演 = starring.
        print('片名:%s' % self.videoTitle)
        print('类型:%s' % self.videoType)
        print('主演:%s' % self.videoAuthor)
        print('%s' % self.videoNotes)
        print('%s' % self.videoLang)
        print('%s' % self.videoRegion)
        print('-------')
class MySpider(Spider):
    """Spider settings for the crawl that begins at ``start_url``.

    Three ``cssParser`` rules are registered, each reading the ``href``
    attribute; only the last rule is given the ``Post`` item class.
    """

    concurrency = 5
    encoding = 'gbk'
    headers = {'User-Agent': 'Google Spider'}
    start_url = r'http://www.xinxin46.com/L/lilunpian.html'
    parsers = [
        # Anchors inside .ui-pages whose href starts with /L/lilunpian.
        cssParser('.ui-pages a[href^="/L/lilunpian"]', attr='href'),
        # Anchors in .primary-list headings whose href starts with /V/.
        cssParser('.primary-list li h5 a[href^="/V/"]', attr='href'),
        # Anchors inside .play-list whose href starts with /player/.
        cssParser('.play-list a[href^="/player/"]', Post, attr='href'),
    ]
if __name__ == '__main__':
    # Start the crawl only when this file is executed as a script, so that
    # importing the module does not trigger network activity.
    MySpider.run()