Repeated bug

Question

Repeated bug

allphfa opened this issue 7 years ago · 0 comments

With concurrency set to 50, the crawler produces an extremely large number of duplicated links.

If you don't believe it, try it yourself with the script below:

from gain import Css, Item, Parser, Spider, cssParser,Xpath
from pyquery import PyQuery as pq
import re
import requests

from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker
from sqlalchemy import create_engine
from sqlalchemy import Column, Integer, String


# Declarative base class that the ORM model in this script inherits from.
Base = declarative_base()

# SQLite engine for the absolute path /home/dde/test.db; echo=False keeps
# SQLAlchemy's statement logging switched off.
engine = create_engine('sqlite:////home/dde/test.db', echo=False)

class videoInfo(Base):
    """ORM model for one scraped video record.

    Rows live in the table named ``users``; apart from the integer primary
    key, every field is a plain string column.
    """
    # NOTE(review): the table is called 'users' although the columns describe
    # videos — presumably a leftover name; confirm before renaming, since
    # existing databases would be affected.
    __tablename__ = 'users'
    # Integer primary key.
    id = Column(Integer, primary_key=True)
    # The columns below share their names with the fields that
    # getVideoInfo.save() passes in when it builds a row.
    videoTitle = Column(String)
    videoType = Column(String)
    videoAuthor = Column(String)
    videoNotes = Column(String)
    videoLang = Column(String)
    videoRegion = Column(String)
    videoPlayPage = Column(String)
    videoPlayLink = Column(String)

# Emit CREATE TABLE for every table declared on Base.
Base.metadata.create_all(bind=engine)

# A single module-wide session, bound to the engine and used whenever an
# item is saved.
Session = sessionmaker(bind=engine)
session = Session()



class getVideoInfo(Item):
    """Item describing one video page and how to persist it.

    Each selector attribute extracts one field of the page; ``save`` writes
    the extracted fields to the database as a ``videoInfo`` row.
    """

    def filterPlayLink(link):
        """Download the play-data script and format the play links it lists.

        ``link`` is the value produced by the ``videoPlayLink`` selector; only
        its first element (the ``src`` path of the script) is used.

        Returns a string with one ``player----{player}----{line}----{url}``
        line per entry, each terminated by a newline (empty string when the
        script lists no entries).

        Raises:
            IndexError: ``link`` is empty or the script holds no ``[[...]]]``
                array.
            ValueError / SyntaxError: the array is not a plain literal, or an
                entry is not made of exactly three ``$``-separated parts.
            requests.RequestException: the download failed or timed out.
        """
        # Local import keeps this block independent of the file-level imports.
        import ast

        url = 'http://www.xinxin46.com%s' % link[0]
        # requests waits forever by default; bound the wait so that a single
        # stalled response cannot hang the whole crawl.
        content = requests.get(url, timeout=30).text
        # SECURITY: this text comes from a remote server. The original passed
        # it to eval(), which would execute arbitrary code embedded in the
        # page. literal_eval() accepts the same list/string literals but
        # refuses anything executable.
        playData = ast.literal_eval(re.findall(r'\[\[.*?\]\]\]', content)[0])
        formatted = []
        for entry in playData[0][1]:
            # Each entry has the form "line$url$player".
            line, playUrl, player = entry.split('$')
            formatted.append(
                'player----{}----{}----{}\n'.format(player, line, playUrl))
        return ''.join(formatted)

    def _anchorTexts(index):
        """Build a process_func joining the link texts of paragraph ``index``."""
        def process(pqObj):
            anchors = pq(pqObj[index])('a')
            if len(anchors) > 0:
                return ' '.join(pq(a).text() for a in anchors)
            # NOTE(review): the fallback returns the text of *all* matched
            # paragraphs rather than only paragraph ``index``; behaviour kept
            # as found — confirm whether pq(pqObj[index]) was intended.
            return pq(pqObj).text()
        return process

    def _paragraphText(index):
        """Build a process_func returning the text of paragraph ``index``."""
        def process(pqObj):
            return pq(pqObj[index]).text()
        return process

    def _playPageLinks(pqObj):
        """Format every matched anchor as ``link----{text}----{href}``."""
        return '\n'.join(
            'link----' + pq(a).text() + '----' + pq(a).attr('href')
            for a in pqObj)

    videoTitle = Css('div.ui-cnt ul.intro li h2 a.title')
    videoType = Css('.intro > li:nth-child(1)  p', process_func=_anchorTexts(0))
    videoAuthor = Css('.intro > li:nth-child(1)  p', process_func=_anchorTexts(1))
    videoNotes = Css('.intro > li:nth-child(1)  p', process_func=_paragraphText(2))
    videoLang = Css('.intro > li:nth-child(1)  p', process_func=_paragraphText(3))
    videoRegion = Css('.intro > li:nth-child(1) p', process_func=_paragraphText(4))
    videoPlayPage = Css('.play-list li a[href^="/player/"]',
                        process_func=_playPageLinks)

    videoPlayLink = Xpath('/html/body/div[3]/div/div[1]/div[1]/script[1]/@src',
                          process_func=filterPlayLink)

    async def save(self):
        """Store the item as a ``videoInfo`` row once every field is present.

        Does nothing when any of the field attributes is missing. If the
        commit fails, the shared session is rolled back (otherwise every
        later save on it would fail too) and the error is re-raised.
        """
        fields = (
            'videoTitle',
            'videoType',
            'videoAuthor',
            'videoNotes',
            'videoLang',
            'videoRegion',
            'videoPlayPage',
            'videoPlayLink',
        )
        if not all(hasattr(self, name) for name in fields):
            return

        row = videoInfo(**{name: getattr(self, name) for name in fields})
        # The module-level session is only read here, so no ``global``
        # declaration is needed.
        session.add(row)
        try:
            session.commit()
        except Exception:
            session.rollback()
            raise




class MySpider(Spider):
    """Crawler configuration for www.xinxin46.com.

    NOTE(review): the three parsers presumably form a chain — pagination
    links, then detail-page links, then player pages handed to
    getVideoInfo. The exact semantics are defined by the gain library;
    confirm there.
    """

    start_url = r'http://www.xinxin46.com/L/lilunpian.html'
    headers = {'User-Agent': 'Google Spider'}
    encoding = 'gbk'
    concurrency = 50
    parsers = [
        cssParser('.ui-pages a[href^="/L/lilunpian"]', attr='href'),
        cssParser('.primary-list li h5 a[href^="/V/"]', attr='href'),
        cssParser('.play-list a[href^="/player/"]', getVideoInfo, attr='href'),
    ]


# Run the crawl and always release the shared database session afterwards,
# including when the crawl aborts with an exception.
try:
    MySpider.run()
finally:
    session.close()

# Inert scratch snippet (a bare string, never executed) showing how the
# play-data <script> tag of a single player page can be inspected.
'''
import requests

a= requests.get('http://www.xinxin46.com/player/baishilingyincangjurudepusuOLshimingantizhidenvhaiFSET680/index-0-0.html').text
print(pq(a)('script[src^="/playdata/"]'))
'''