Help with porting a spider used with scrapy-splash
educatron opened this issue · 10 comments
Hello there!
I'm new with Scrapy and Python.
This is the spider I used with scrapy-splash:
import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy_splash import SplashRequest, SplashJsonResponse, SplashTextResponse
from scrapy.http import HtmlResponse

class SplashItem(scrapy.Item):
    h1 = scrapy.Field()
    h2 = scrapy.Field()
    h3 = scrapy.Field()

class SplashSpider(CrawlSpider):
    name = 'splash'
    allowed_domains = ['www.domain.org']
    start_urls = ['https://www.domain.org/category/post']

    rules = (
        Rule(LinkExtractor(allow='^https://www.domain.org/category/'),
             callback='parse_item', process_request='use_splash',
             process_links='filter_links', follow=True),
    )

    def filter_links(self, links):
        for link in links:
            if '?' in link.url:
                continue
            else:
                yield link

    def start_requests(self):
        for url in self.start_urls:
            yield SplashRequest(url, args={'wait': 15, 'html': 1, 'script': 1}, meta={'real_url': url})

    def use_splash(self, request):
        request.meta['splash'] = {
            'endpoint': 'render.html',
            'args': {'wait': 15, 'html': 1, 'script': 1}
        }
        return request

    def _requests_to_follow(self, response):
        if not isinstance(
                response,
                (HtmlResponse, SplashJsonResponse, SplashTextResponse)):
            return
        seen = set()
        for n, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response)
                     if lnk not in seen]
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                seen.add(link)
                r = self._build_request(n, link)
                yield rule.process_request(r)

    def parse_item(self, response):
        item = SplashItem()
        item['h1'] = response.xpath('normalize-space(/html/body/h1/text())').get()
        item['h2'] = response.xpath('normalize-space(/html/body/h2/text())').get()
        item['h3'] = response.xpath('normalize-space(/html/body/h3/text())').get()
        yield item
I don't know how to adapt the process_request and _requests_to_follow methods.
Do you recommend using await asyncio.sleep(1) in the parse function to prevent being rate limited?
Thanks a lot!!!
I don't really have experience working with scrapy-splash, but looking at the documentation for render.html I think the following should work:
from scrapy_pyppeteer.page import PageCoroutine

class SplashSpider(CrawlSpider):

    def use_splash(self, request, response):
        request.meta.update({
            "pyppeteer": True,
            "pyppeteer_page_coroutines": [
                PageCoroutine("waitFor", 15000),  # milliseconds
            ],
        })
        return request
The Splash docs for the wait argument say it is the "Time (in seconds) to wait for updates after page is loaded", so I think the Page.waitFor coroutine should work for that. I see that html and script do not apply to render.html, so I ignored them.
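If the 15-second wait is only there to let JavaScript render the content, waiting for a specific element is usually faster and more reliable than a fixed delay. A minimal sketch, assuming the pages render the h1 that parse_item scrapes and that PageCoroutine forwards to the corresponding pyppeteer page method, as with waitFor above:

def use_splash(self, request, response):
    request.meta.update({
        "pyppeteer": True,
        "pyppeteer_page_coroutines": [
            # wait until the element exists instead of sleeping 15 seconds
            PageCoroutine("waitForSelector", "h1"),
        ],
    })
    return request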
Regarding _requests_to_follow, I see the only change is to support alternative response classes, so I think that should keep working fine.
Thanks for using this project!
Thank you for your reply!
It works great, but start_requests and a custom process_request callback don't seem to work together: the first request is not processed through pyppeteer. It looks like a custom _requests_to_follow method is needed to make them work together.
Amazing project!
For start_requests you could do:
def start_requests(self):
    for url in self.start_urls:
        yield scrapy.Request(
            url=url,
            meta={
                "pyppeteer": True,
                "pyppeteer_page_coroutines": [
                    PageCoroutine("waitFor", 15000),
                ],
            },
        )
I used:
def start_requests(self):
    for url in self.start_urls:
        yield Request(url=url, meta={"pyppeteer": True}, callback=self.parse_item)

def pyppeteer_request(self, request, response):
    request.meta.update({"pyppeteer": True})
    return request
When I use start_requests, it only crawls the links set in start_urls.
I tested again this spider with scrapy-splash and it works as expected. The first request is processed with Splash and the crawler follows the rest of the links according to the rules.
When I use start_requests, it only crawls the links set in start_urls.
That is because you're handling the start response(s) with the defined callback (parse_item), producing its results directly, while the default implementation does additional processing to look for rule matches. You could use CrawlSpider.parse_start_url if you needed to do your own processing with the initial response, while also allowing the CrawlSpider to follow its rules.
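For example, a minimal sketch that reuses the parse_item callback from the spider above for the start response, while leaving the rule-based link following to the CrawlSpider:

def parse_start_url(self, response):
    # Called for the responses of start_urls; whatever it returns is yielded
    # in addition to the requests generated by the rules.
    return self.parse_item(response)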
I tested again this spider with scrapy-splash and it works as expected. The first request is processed with Splash and the crawler follows the rest of the links according to the rules.
I'm not sure I follow, could you elaborate further?
Thank you for your help! I'm going to try what you suggested.
The issue is better explained here: https://stackoverflow.com/questions/45886068/scrapy-crawlspider-splash-how-to-follow-links-through-linkextractor
I enabled HTTP caching to check the response bodies and tried 4 combinations, always with the rules and process_request callback below; each result is listed just before the code that was added for that combination:
rules = (
    Rule(LinkExtractor(allow="^https://www.domain.org/category/"),
         callback="parse_item", process_request="pyppeteer_request",
         process_links="filter_links", follow=True),
)

def pyppeteer_request(self, request, response):
    request.meta.update({"pyppeteer": True})
    return request
- The response bodies of the links set in start_urls come back encoded. Links set in start_urls don't go through parse_item.
def parse_start_url(self, response):
    yield Request(url=response.url, meta={"pyppeteer": True})
- The response bodies of the links set in start_urls come back encoded. All links go through parse_item, but the links set in start_urls aren't processed through pyppeteer: I tried to scrape a node element created with JavaScript and the result was "".
def parse_start_url(self, response):
    yield Request(url=response.url, meta={"pyppeteer": True}, callback=self.parse_item)
- All links are processed with pyppeteer but the links set in start_urls don't go through parse_item.
def start_requests(self):
    for url in self.start_urls:
        yield Request(url=url, meta={"pyppeteer": True})
- Only the links set in start_urls are processed: through pyppeteer and parse_item. The rest are not followed.
def start_requests(self):
    for url in self.start_urls:
        yield Request(url=url, meta={"pyppeteer": True}, callback=self.parse_item)
Thank you for your help! I'm going to try what you suggested.
The issue is better explained here: https://stackoverflow.com/questions/45886068/scrapy-crawlspider-splash-how-to-follow-links-through-linkextractor
Someone mentioned there: "In the github issues, some users just redefine the _request_to_follow method in their class." (A sketch of that approach is included after the snippet below for reference.)
This did it:
def start_requests(self):
    for url in self.start_urls:
        yield Request(url=url, meta={"pyppeteer": True})

def parse_start_url(self, response):
    return self.parse_item(response)
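For reference, the _requests_to_follow override mentioned in that Stack Overflow thread would look roughly like this with scrapy-pyppeteer. It is only a sketch based on the spider at the top of this issue (it assumes Scrapy >= 2.0, where the rule's process_request receives both the request and the response), and it isn't needed with the solution above, since process_request already tags the extracted requests:

def _requests_to_follow(self, response):
    # Same flow as CrawlSpider._requests_to_follow, with the pyppeteer meta
    # key added to every request extracted by the rules.
    # Requires: from scrapy.http import HtmlResponse
    if not isinstance(response, HtmlResponse):
        return
    seen = set()
    for n, rule in enumerate(self._rules):
        links = [lnk for lnk in rule.link_extractor.extract_links(response)
                 if lnk not in seen]
        if links and rule.process_links:
            links = rule.process_links(links)
        for link in links:
            seen.add(link)
            r = self._build_request(n, link)
            r.meta.update({"pyppeteer": True})
            yield rule.process_request(r, response)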
Thanks a lot!!!!!!