kabum-scrapy

A web scraper for KaBuM built with Scrapy


Using Scrapy to extract data from KaBuM

Tutorial

$ rethinkdb    # start the RethinkDB server (keep it running in a separate terminal)
$ git clone https://github.com/leuthier/kabum-scrapy
$ cd kabum-scrapy
$ pip install -r requirements.txt
$ scrapy crawl kb -o computers.json
$ scrapy runspider login.py
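
Once the crawl has run, the items end up both in computers.json and in RethinkDB. A minimal sketch to inspect the stored records, assuming the defaults from settings.py (database kabum, table computers, local server on port 28015):

import rethinkdb as r

conn = r.connect('localhost', 28015)
# print a few of the stored products
for doc in r.db('kabum').table('computers').limit(5).run(conn):
    print(doc['name'], doc['price'])
conn.close()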


Objectives

  • Using XPath to search for links
  • Persisting the data (RethinkDB)
  • Submitting forms (a login sketch follows this list)
  • Manipulating querystrings
  • Handling pagination
  • Using logs to flag events during the scraping
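
login.py is not reproduced below; a minimal sketch of what the form-submission spider could look like, assuming a hypothetical login URL and form field names (email, senha) that need to be adjusted to the real page:

# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import FormRequest


class LoginSpider(scrapy.Spider):
    name = "login"
    start_urls = ['https://www.kabum.com.br/login']  # assumed login URL

    def parse(self, response):
        # fill in and submit the login form found on the page;
        # the field names below are placeholders
        return FormRequest.from_response(
            response,
            formdata={'email': 'user@example.com', 'senha': 'secret'},
            callback=self.after_login
        )

    def after_login(self, response):
        # assumed marker of a logged-in page
        if "Minha Conta" in response.text:
            self.logger.info("Login succeeded")
        else:
            self.logger.error("Login failed")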

The spider (registered as "kb" in the kabum.spiders module) walks the product listing:

# -*- coding: utf-8 -*-
import scrapy
import logging
from kabum.items import KabumItem

class KbSpider(scrapy.Spider):
    name = "kb"
    allowed_domains = ["kabum.com.br"]

    def start_requests(self):
        urls = [
            'http://www.kabum.com.br/computadores/computadores/'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        computers = response.xpath('//div[@class="listagem-box"]')

        ## the product <div>s on the listing page start at index 3; the absolute
        ## XPaths below use this counter to address each product in turn
        start = 3
        for computer in computers:
            item = KabumItem()

            item['name'] = computer.xpath(
                'normalize-space(//*[@id="BlocoConteudo"]/div[2]/div/div['+str(start)+']/div[2]/span[1]//text())').extract()[0]

            ## some products do not show the "De R$ xx,xx por..." prefix, so check
            ## which <div> actually holds the price
            price_query = computer.xpath(
                'normalize-space(//*[@id="BlocoConteudo"]/div[2]/div/div['+str(start)+']/div[3]/div[2]//text())').extract()[0]
            if "EM" in price_query:
                item['price'] = computer.xpath(
                    'normalize-space(//*[@id="BlocoConteudo"]/div[2]/div/div['+str(start)+']/div[3]/div[1]//text())').extract()[0]
            else:
                item['price'] = price_query
                
            ## the cash price sometimes sits one <div> earlier; a value containing
            ## "%" means the query hit a discount badge instead of the price
            price_cash_query = computer.xpath(
                    'normalize-space(//*[@id="BlocoConteudo"]/div[2]/div/div['+str(start)+']/div[3]/div[4]//text())').extract()[0]
            if "%" in price_cash_query:
                item['price_cash'] = computer.xpath(
                    'normalize-space(//*[@id="BlocoConteudo"]/div[2]/div/div['+str(start)+']/div[3]/div[3]//text())').extract()[0]
            else:
                item['price_cash'] = price_cash_query
 
            item['url'] = computer.xpath(
              'normalize-space(//*[@id="BlocoConteudo"]/div[2]/div/div['+str(start)+']/div[2]/span//@href)').extract()[0]
            item['data_id'] = computer.xpath(
              'normalize-space(//*[@id="BlocoConteudo"]/div[2]/div/div['+str(start)+']/div[2]/span//@data-id)').extract()[0]

            ## check whether the product can be purchased: the buy-button image
            ## src contains "_off" when it is unavailable
            purchase_status = computer.xpath(
                'normalize-space(//*[@id="BlocoConteudo"]/div[2]/div/div['+str(start)+']/div[4]/div/a/img/@src)').extract()[0]
            if "_off" in purchase_status:
                item['status'] = "Indisponivel"
            else:
                item['status'] = "Disponivel"

            item['url_photo'] = computer.xpath(
                '//*[@id="BlocoConteudo"]/div[2]/div/div['+str(start)+']/div[1]/a/img/@src').extract()[0]

            stars = computer.xpath(
                '//*[@id="BlocoConteudo"]/div[2]/div/div['+str(start)+']/div[2]/div/ul/li[3]/div/@class').extract()[0]
            ## the last character of the rating class is the number of stars;
            ## when it is a letter the product has no rating yet
            if stars[-1] == "e":
                item['stars'] = 0
            else:
                item['stars'] = stars[-1]

            start += 1
            yield item
        
        ## pagination: the next pages are requested by appending hard-coded
        ## querystrings for pages 2 and 3 of the listing
        next_pages = []
        #next_pages = response.xpath('//*[@id="BlocoConteudo"]/div[2]/div/div[2]/form/table/tbody/tr/td[6]/span/a/@href').extract()
        next_pages.append("?string=&dep=04&sec=34&cat=&sub=&pagina=2&ordem=5&limite=30")
        next_pages.append("?string=&dep=04&sec=34&cat=&sub=&pagina=3&ordem=5&limite=30")
        for i in next_pages:
            next_url = "http://www.kabum.com.br/computadores/computadores/" + i
            logging.info('NEXT_URL: ' + next_url)
            yield scrapy.Request(url=next_url, callback=self.parse)
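
The pagination above hard-codes the querystrings for pages 2 and 3. A sketch of building the same querystrings with Python 3's urllib.parse instead (the parameter names come from the URLs above; stopping at page 3 is still an assumption):

from urllib.parse import urlencode

BASE_URL = "http://www.kabum.com.br/computadores/computadores/"
PARAMS = {'string': '', 'dep': '04', 'sec': '34', 'cat': '', 'sub': '',
          'ordem': '5', 'limite': '30'}

def page_urls(last_page=3):
    # yield the listing URL for pages 2..last_page
    for page in range(2, last_page + 1):
        query = dict(PARAMS, pagina=str(page))
        yield BASE_URL + "?" + urlencode(query)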

The item pipeline (kabum.pipelines.RethinkdbPipeline) persists every scraped item in RethinkDB:

# -*- coding: utf-8 -*-

import rethinkdb as r

class RethinkdbPipeline(object):

    conn = None
    rethinkdb_settings = {}
    
    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings

        ## get rethinkdb settings from settings.py
        rethinkdb_settings = settings.get('RETHINKDB', {})

        return cls(rethinkdb_settings)

    def __init__(self, rethinkdb_settings):
        self.rethinkdb_settings = rethinkdb_settings

    def open_spider(self, spider):
        if self.rethinkdb_settings:
            
            self.table_name = self.rethinkdb_settings['table_name']
            self.db_name = self.rethinkdb_settings['db']
            self.conn = r.connect('localhost', 28015)
            table_list = r.db(self.db_name).table_list().run(self.conn)
            
            ## create the table if it does not already exist
            if self.table_name not in table_list:
                r.db(self.db_name).table_create(self.table_name).run(self.conn)

    def close_spider(self, spider):
        if self.conn:
            self.conn.close()

    def process_item(self, item, spider):
        if self.conn:
            r.db(self.db_name).table(self.table_name).insert(item).run(self.conn)
        return item 
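
open_spider creates the table when it is missing, but the kabum database itself is assumed to already exist. A one-off sketch to create it before the first crawl, assuming a local server on the default port:

import rethinkdb as r

conn = r.connect('localhost', 28015)
# create the database once if it is not there yet
if 'kabum' not in r.db_list().run(conn):
    r.db_create('kabum').run(conn)
conn.close()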

The relevant parts of settings.py: the RETHINKDB block feeds the pipeline above and ITEM_PIPELINES enables it.

# -*- coding: utf-8 -*-

# Scrapy settings for kabum project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html


BOT_NAME = 'kabum'

SPIDER_MODULES = ['kabum.spiders']
NEWSPIDER_MODULE = 'kabum.spiders'

RETHINKDB = {
    'table_name': 'computers', 'db': 'kabum'
}

ITEM_PIPELINES = {
    'kabum.pipelines.RethinkdbPipeline': 1
}


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'kabum (+http://www.yourdomain.com)'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'kabum.middlewares.KabumSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'kabum.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'kabum.pipelines.KabumPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

The item definition (kabum.items.KabumItem) lists the fields stored for each product:

# -*- coding: utf-8 -*-
import scrapy

# KabumItem groups the attributes extracted for each product and saved to the database
class KabumItem(scrapy.Item):
    name = scrapy.Field()
    price = scrapy.Field()
    price_cash = scrapy.Field()
    url = scrapy.Field()
    data_id = scrapy.Field()
    status = scrapy.Field()
    url_photo = scrapy.Field()
    stars = scrapy.Field()

Time spent

  • Studying: 8h
  • Implementing: 10h

Programs used

  • Python
  • Scrapy
  • RethinkDB
