lorien/grab

Spider does not process initial_urls

lorien opened this issue · 0 comments

The spider below does nothing when it is run: the URLs listed in `initial_urls` are never processed.

# coding: utf-8
import urllib
import csv
import logging

from grab.spider import Spider, Task

class ExampleSpider(Spider):
    """Example spider: find an image for every topic on the Habrahabr home page.

    The handlers form a chain of tasks::

        initial -> habrapost -> image_search -> image

    For every downloaded image one CSV row (topic URL, topic title,
    image path) is written to ``result.txt``.
    """

    # List of initial tasks
    # For each URL in this list the Task object will be created
    initial_urls = ['http://habrahabr.ru/']

    def prepare(self):
        """Open the results file and reset the image counter.

        The method `prepare` is called one time before the
        spider has started working.
        """
        # Keep a reference to the underlying file object (not only to the
        # csv writer wrapping it) so that `shutdown` can close it and
        # flush all buffered rows to disk.
        self.result_fh = open('result.txt', 'w')
        self.result_file = csv.writer(self.result_fh)

        # This counter will be used to enumerate found images
        # to simplify image file naming
        self.result_counter = 0

    def shutdown(self):
        """Close the results file opened in `prepare`.

        `shutdown` is the Spider hook for final actions after the
        spider has finished its work.
        """
        # `prepare` may have failed before the file was opened,
        # so do not assume the attribute exists.
        result_fh = getattr(self, 'result_fh', None)
        if result_fh is not None:
            result_fh.close()
        super(ExampleSpider, self).shutdown()

    def task_initial(self, grab, task):
        """Handle the home page: spawn a `habrapost` task per topic link.

        This is the handler for the task named `initial`, i.e.
        for tasks that have been created from the
        `self.initial_urls` list.
        """
        print('Habrahabr home page')

        # As you see, inside a handler you can work with Grab
        # in the usual way, i.e. just as if you had done the network
        # request manually
        for elem in grab.doc.select('//h1[@class="title"]'
                                    '/a[@class="post_title"]'):
            # For each title link create new Task
            # with name "habrapost"
            # Pay attention, that we create new tasks
            # with yield call. Also you can use `add_task` method:
            # self.add_task(Task('habrapost', url=...))
            yield Task('habrapost', url=elem.attr('href'))

    def task_habrapost(self, grab, task):
        """Handle a topic page: spawn an image search for its title.

        This handler receives results of tasks we
        created for each topic title found on home page.
        """
        print('Habrahabr topic: %s' % task.url)

        # First, save URL and title into dictionary
        post = {
            'url': task.url,
            'title': grab.xpath_text('//h1/span[@class="post_title"]'),
        }

        # Next, create new network request to search engine to find
        # the image related to the title.
        # We pass info about the found publication in the arguments to
        # the Task object. That allows us to pass information to next
        # handler that will be called for found image.
        # NOTE: `urllib.quote_plus` is the Python 2 location of this
        # function; on Python 3 it lives in `urllib.parse`.
        query = urllib.quote_plus(post['title'].encode('utf-8'))
        search_url = 'http://images.yandex.ru/yandsearch'\
                     '?text=%s&rpt=image' % query
        yield Task('image_search', url=search_url, post=post)

    def task_image_search(self, grab, task):
        """Handle the image search results: download the first image.

        In this handler we have received the result of the image search.
        That is not an image! It is just a list of found images.
        """
        print('Images search result for %s' % task.post['title'])

        # Now, we take URL of first image and spawn new network
        # request to download the image.
        # Also we pass the info about the publication, we need it to be
        # available in the next handler.
        image_url = grab.xpath_text('//div[@class="b-image"]/a/img/@src')
        yield Task('image', url=image_url, post=task.post)

    def task_image(self, grab, task):
        """Save the downloaded image and record it in the results file.

        This is the last handler in our spider: we have received
        the content of the image and need to save it.
        """
        print('Image downloaded for %s' % task.post['title'])

        # NOTE(review): presumably the `images/` directory has to exist
        # before the spider is started - confirm against `response.save`.
        path = 'images/%s.jpg' % self.result_counter
        grab.response.save(path)
        self.result_file.writerow([
            task.post['url'].encode('utf-8'),
            task.post['title'].encode('utf-8'),
            path
        ])
        # Increment image counter
        self.result_counter += 1


if __name__ == '__main__':
    # Show DEBUG-level log messages while the spider is running.
    logging.basicConfig(level=logging.DEBUG)
    # Create the spider with two concurrent network streams and run it.
    ExampleSpider(thread_number=2).run()