kabum-scrapy: A Python repository from leuthier

Utilizando Scrapy para extrair dados da KaBuM

Site: http://www.kabum.com.br/

Tutorial

$ rethinkdb
$ git clone https://github.com/leuthier/kabum-scrapy
$ pip install -r requirements.txt
$ cd kabum-scrapy
$ scrapy crawl kb -o computers.json
$ scrapy runspider login.py

Índice

Objetivos

Utilização de xpath nas buscas por links
Persistência das informações (RethinkDB)
Submissão de formulários
Manipulação de querystrings
Tratamento de paginação
Utilização de logs para sinalizar ocorrências durante o scraping

Arquivo: /kabum/spiders/kb.py

# -*- coding: utf-8 -*-
import scrapy
import logging
from kabum.items import KabumItem
from kabum.login import LoginSpider
from scrapy.item import Item, Field
from scrapy.http import FormRequest
from scrapy.utils.response import open_in_browser

class KbSpider(scrapy.Spider):
    name = "kb"

    def start_requests(self):
        logar = LoginSpider()
        allowed_domains = ["kabum.com.br"]
        urls = [
            'http://www.kabum.com.br/computadores/computadores/'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        computers = response.xpath('//div[@class="listagem-box"]')
            
        ## the computer's list index start at 3
        start = 3
        for computer in computers:
            item = KabumItem()
            
            item['name'] = computer.xpath(
                'normalize-space(//*[@id="BlocoConteudo"]/div[2]/div/div['+str(start)+']/div[2]/span[1]//text())').extract()[0]

            ## Sometimes computer's prices doesn't have "De R$ xx,xx por..." at beginning, we check below
            price_query = computer.xpath(
                'normalize-space(//*[@id="BlocoConteudo"]/div[2]/div/div['+str(start)+']/div[3]/div[2]//text())').extract()[0]
            if "EM" in price_query:
                item['price'] = computer.xpath(
                    'normalize-space(//*[@id="BlocoConteudo"]/div[2]/div/div['+str(start)+']/div[3]/div[1]//text())').extract()[0]
            else:
                item['price'] = price_query
                
            price_cash_query = computer.xpath(
                    'normalize-space(//*[@id="BlocoConteudo"]/div[2]/div/div['+str(start)+']/div[3]/div[4]//text())').extract()[0]
            if "%" in price_cash_query:
                item['price_cash'] = computer.xpath(
                    'normalize-space(//*[@id="BlocoConteudo"]/div[2]/div/div['+str(start)+']/div[3]/div[3]//text())').extract()[0]
            else:
                item['price_cash'] = price_cash_query
 
            item['url'] = computer.xpath(
              'normalize-space(//*[@id="BlocoConteudo"]/div[2]/div/div['+str(start)+']/div[2]/span//@href)').extract()[0]
            item['data_id'] = computer.xpath(
              'normalize-space(//*[@id="BlocoConteudo"]/div[2]/div/div['+str(start)+']/div[2]/span//@data-id)').extract()[0]

            ## Verifies thats computer is available to purchase            
            purchase_status = computer.xpath(
                'normalize-space(//*[@id="BlocoConteudo"]/div[2]/div/div['+str(start)+']/div[4]/div/a/img/@src)').extract()[0]
            if "_off" in purchase_status:
                item['status'] = "Indisponivel"
            else:
                item['status'] = "Disponivel"

            item['url_photo'] = computer.xpath(
                '//*[@id="BlocoConteudo"]/div[2]/div/div['+str(start)+']/div[1]/a/img/@src').extract()[0]

            stars = computer.xpath(
                '//*[@id="BlocoConteudo"]/div[2]/div/div['+str(start)+']/div[2]/div/ul/li[3]/div/@class').extract()[0]
            if stars[len(stars)-1] == "e":
                item['stars'] = 0
            else:
                item['stars'] = stars[len(stars)-1]

            start += 1
            yield item
        
        next_pages=[]
        #next_pages = response.xpath('//*[@id="BlocoConteudo"]/div[2]/div/div[2]/form/table/tbody/tr/td[6]/span/a/@href').extract()
        next_pages.append("?string=&dep=04&sec=34&cat=&sub=&pagina=2&ordem=5&limite=30")
        next_pages.append("?string=&dep=04&sec=34&cat=&sub=&pagina=3&ordem=5&limite=30")
        for i in next_pages:
            next_url = "http://www.kabum.com.br/computadores/computadores/"+i
            #logging.info('NEXT_URL: '+next_url)
            yield scrapy.Request(url=next_url, callback=self.parse)

Arquivo: /kabum/pipelines.py

# -*- coding: utf-8 -*-

import rethinkdb as r

class RethinkdbPipeline(object):

    conn = None
    rethinkdb_settings = {}
    
    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings

        ## get rethinkdb settings from settings.py
        rethinkdb_settings = settings.get('RETHINKDB', {})

        return cls(rethinkdb_settings)

    def __init__(self, rethinkdb_settings):
        self.rethinkdb_settings = rethinkdb_settings

    def open_spider(self, spider):
        if self.rethinkdb_settings:
            
            self.table_name = self.rethinkdb_settings['table_name']
            self.db_name = self.rethinkdb_settings['db']
            self.conn = r.connect('localhost', 28015)
            table_list = r.db(self.db_name).table_list().run(self.conn)
            
            ## verify if table already exists and creates a table in database              
            if self.table_name not in table_list:
                r.db(self.db_name).table_create(self.table_name).run(self.conn)

    def close_spider(self, spider):
        if self.conn:
            self.conn.close()

    def process_item(self, item, spider):
        if self.conn:
            r.db(self.db_name).table(self.table_name).insert(item).run(self.conn)
        return item

Arquivo: /kabum/settings.py

# -*- coding: utf-8 -*-

# Scrapy settings for kabum project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html


BOT_NAME = 'kabum'

SPIDER_MODULES = ['kabum.spiders']
NEWSPIDER_MODULE = 'kabum.spiders'

RETHINKDB = {
    'table_name': 'computers', 'db': 'kabum'
}

ITEM_PIPELINES = {
    'kabum.pipelines.RethinkdbPipeline': 1
}


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'kabum (+http://www.yourdomain.com)'

# Obey robots.txt rules
# ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 0.5
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'kabum.middlewares.KabumSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'kabum.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'kabum.pipelines.KabumPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

Arquivo: /kabum/items.py

# -*- coding: utf-8 -*-
import scrapy

#create the KabumItem class with item's attributes to save in database
class KabumItem(scrapy.Item):
    name = scrapy.Field()
    price = scrapy.Field()
    price_cash = scrapy.Field()
    url = scrapy.Field()
    data_id = scrapy.Field()
    status = scrapy.Field()
    url_photo = scrapy.Field()
    stars = scrapy.Field()

Tempo gasto

Estudando: 8h
Implementando: 10h

leuthier/kabum-scrapy

Utilizando Scrapy para extrair dados da KaBuM

Site: http://www.kabum.com.br/

Tutorial

Índice

Objetivos

Arquivo: /kabum/spiders/kb.py

Arquivo: /kabum/pipelines.py

Arquivo: /kabum/settings.py

Arquivo: /kabum/items.py

Tempo gasto

Programas utilizados

Referências