Spider does not process initial_urls
lorien opened this issue · 0 comments
lorien commented
The spider below does nothing when run: the URLs listed in `initial_urls` are never processed.
# coding: utf-8
import urllib
import csv
import logging
from grab.spider import Spider, Task
class ExampleSpider(Spider):
    """Example spider that pairs Habrahabr home-page posts with an image.

    Task chain:

    * ``initial``      -- the home page; yields one ``habrapost`` task
      per post title link found.
    * ``habrapost``    -- a post page; yields an ``image_search`` task
      for the post title.
    * ``image_search`` -- the search result page; yields an ``image``
      task for the first image found.
    * ``image``        -- the image itself; it is saved to disk and a row
      ``[post URL, post title, image path]`` is written to ``result.txt``.
    """

    # List of initial tasks.
    # For each URL in this list the Task object will be created
    initial_urls = ['http://habrahabr.ru/']

    def prepare(self):
        """Open the results file and reset the image counter.

        The method `prepare` is called one time before the spider
        has started working.
        """
        # Keep a reference to the underlying file object (not only to the
        # csv writer wrapped around it) so that it can be closed explicitly
        # in `shutdown`; otherwise the handle leaks and buffered rows may
        # never reach the disk.
        self._result_fh = open('result.txt', 'w')
        self.result_file = csv.writer(self._result_fh)
        # This counter will be used to enumerate found images
        # to simplify image file naming
        self.result_counter = 0

    def shutdown(self):
        """Close the results file opened in `prepare`.

        `shutdown` is the Grab Spider hook that runs once after the spider
        has finished working (the counterpart of `prepare`).
        Safe to call even if `prepare` never ran or the file is already
        closed.
        """
        result_fh = getattr(self, '_result_fh', None)
        if result_fh is not None:
            result_fh.close()
            self._result_fh = None

    def task_initial(self, grab, task):
        """Handle the home page and yield a `habrapost` task per post link.

        This is the handler for the task named `initial`, i.e. for tasks
        that have been created from the `self.initial_urls` list.
        """
        print('Habrahabr home page')
        # As you see, inside the handler you can work with Grab
        # in the usual way, i.e. just as if you had done the network
        # request manually
        for elem in grab.doc.select('//h1[@class="title"]'
                                    '/a[@class="post_title"]'):
            # For each title link create a new Task
            # with the name "habrapost".
            # Pay attention that we create new tasks
            # with a yield call. Also you can use the `add_task` method:
            # self.add_task(Task('habrapost', url=...))
            yield Task('habrapost', url=elem.attr('href'))

    def task_habrapost(self, grab, task):
        """Handle one post page and yield an `image_search` task for it.

        This handler receives the results of the tasks created for each
        post title link found on the home page.
        """
        print('Habrahabr topic: %s' % task.url)
        # First, save URL and title into a dictionary
        post = {
            'url': task.url,
            'title': grab.xpath_text('//h1/span[@class="post_title"]'),
        }
        # Next, create a new network request to the search engine to find
        # an image related to the title.
        # We pass info about the found publication in the arguments to
        # the Task object. That allows us to pass information to the next
        # handler that will be called for the found image.
        # NOTE: `urllib.quote_plus` is the Python 2 location of this
        # function (Python 3 moved it to `urllib.parse.quote_plus`).
        query = urllib.quote_plus(post['title'].encode('utf-8'))
        search_url = 'http://images.yandex.ru/yandsearch'\
                     '?text=%s&rpt=image' % query
        yield Task('image_search', url=search_url, post=post)

    def task_image_search(self, grab, task):
        """Handle the image search result and yield an `image` task.

        The response is not an image: it is just a list of found images.
        The URL of the first image is taken and a new network request is
        spawned to download it.
        """
        print('Images search result for %s' % task.post['title'])
        # Also we pass on the info about the publication; we need it to be
        # available in the next handler.
        image_url = grab.xpath_text('//div[@class="b-image"]/a/img/@src')
        yield Task('image', url=image_url, post=task.post)

    def task_image(self, grab, task):
        """Save the downloaded image and record it in the results file.

        This is the last handler in the spider: the response holds the
        content of the image.
        """
        print('Image downloaded for %s' % task.post['title'])
        path = 'images/%s.jpg' % self.result_counter
        grab.response.save(path)
        self.result_file.writerow([
            task.post['url'].encode('utf-8'),
            task.post['title'].encode('utf-8'),
            path,
        ])
        # Increment image counter
        self.result_counter += 1
if __name__ == '__main__':
    # Show DEBUG-level log records while the spider is working
    logging.basicConfig(level=logging.DEBUG)
    # Build the spider with two concurrent network streams and run it
    ExampleSpider(thread_number=2).run()