Bug: duplicate links are crawled repeatedly
allphfa opened this issue · 0 comments
allphfa commented
With concurrency set to 50, the spider fetches a large number of duplicate links (many kinds of repeats).
If you don't believe it, try it yourself:
from gain import Css, Item, Parser, Spider, cssParser,Xpath
from pyquery import PyQuery as pq
import re
import requests
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
from sqlalchemy import Column, Integer, String
engine = create_engine('sqlite:////home/dde/test.db', echo=False)
Base = declarative_base()
class videoInfo(Base):
__tablename__ = 'users'
id = Column(Integer, primary_key=True)
videoTitle = Column(String)
videoType = Column(String)
videoAuthor = Column(String)
videoNotes = Column(String)
videoLang = Column(String)
videoRegion = Column(String)
videoPlayPage = Column(String)
videoPlayLink = Column(String)
Session = sessionmaker(bind=engine)
session = Session()
Base.metadata.create_all(engine)
class getVideoInfo(Item):
def filterPlayLink(link):
url = 'http://www.xinxin46.com%s' % link[0]
content = requests.get(url).text
playUrl = eval(re.findall(r'\[\[.*?\]\]\]', content)[0])[0][1]
result = str()
for x in playUrl:
line,playUrl,player = x.split('$')
result += 'player----{}----{}----{}\n'.format(player,line,playUrl)
# result = re.findall(r'/player/.*?/', content)[0][1:-1]+'$$$$'+ result
return result
videoTitle = Css('div.ui-cnt ul.intro li h2 a.title')
videoType = Css('.intro > li:nth-child(1) p', process_func=lambda pqObj: ' '.join([pq(x).text() for x in pq(pqObj[0])('a')]) if len(pq(pqObj[0])('a'))>0 else pq(pqObj).text())
videoAuthor = Css('.intro > li:nth-child(1) p',process_func=lambda pqObj:' '.join([pq(x).text() for x in pq(pqObj[1])('a')]) if len(pq(pqObj[1])('a'))>0 else pq(pqObj).text())
videoNotes = Css('.intro > li:nth-child(1) p',process_func=lambda pqObj:pq(pqObj[2]).text())
videoLang = Css('.intro > li:nth-child(1) p',process_func=lambda pqObj:pq(pqObj[3]).text())
videoRegion = Css('.intro > li:nth-child(1) p',process_func=lambda pqObj:pq(pqObj[4]).text())
videoPlayPage = Css('.play-list li a[href^="/player/"]',process_func=lambda pqObj:'\n'.join(['link----'+pq(x).text()+'----' +pq(x).attr('href') for x in pqObj]))
videoPlayLink = Xpath('/html/body/div[3]/div/div[1]/div[1]/script[1]/@src',process_func=filterPlayLink)
async def save(self):
if hasattr(self,'videoTitle')\
and hasattr(self,'videoType')\
and hasattr(self,'videoAuthor')\
and hasattr(self,'videoNotes')\
and hasattr(self,'videoLang')\
and hasattr(self,'videoRegion')\
and hasattr(self,'videoPlayPage')\
and hasattr(self,'videoPlayLink'):
"""
if self.videoPlayLink.find('qvod') >-1:
return
print('片名:%s' % self.videoTitle)
print('类型:%s' % self.videoType)
print('主演:%s' % self.videoAuthor)
print('%s' % self.videoNotes)
print('%s' % self.videoLang)
print('%s' % self.videoRegion)
print('%s' % self.videoPlayPage)
print('%s' % self.videoPlayLink)
print('-------')
"""
global session
addInfo = videoInfo(videoTitle=self.videoTitle,videoType=self.videoType,videoAuthor=self.videoAuthor,videoNotes=self.videoNotes,videoLang=self.videoLang,videoRegion=self.videoRegion,videoPlayPage=self.videoPlayPage,videoPlayLink=self.videoPlayLink)
session.add(addInfo)
session.commit()
class MySpider(Spider):
concurrency = 50
encoding = 'gbk'
headers = {'User-Agent': 'Google Spider'}
start_url = r'http://www.xinxin46.com/L/lilunpian.html'
parsers = [cssParser('.ui-pages a[href^="/L/lilunpian"]',attr='href'),
cssParser('.primary-list li h5 a[href^="/V/"]',attr='href'),
cssParser('.play-list a[href^="/player/"]',getVideoInfo,attr='href'),
]
MySpider.run()
session.close()
'''
import requests
a= requests.get('http://www.xinxin46.com/player/baishilingyincangjurudepusuOLshimingantizhidenvhaiFSET680/index-0-0.html').text
print(pq(a)('script[src^="/playdata/"]'))
'''