Disable loading images
Lusitaniae opened this issue · 2 comments
Lusitaniae commented
What is the current behavior?
node scrape.js
(node:21673) UnhandledPromiseRejectionWarning: Error: Protocol error: Connection closed. Most likely the page has been closed.
at assert (/home/q/node_modules/headless-chrome-crawler/node_modules/puppeteer/lib/helper.js:251:11)
at Page.close (/home/q/node_modules/headless-chrome-crawler/node_modules/puppeteer/lib/Page.js:883:5)
at Crawler.close (/home/q/node_modules/headless-chrome-crawler/lib/crawler.js:80:22)
at Crawler.<anonymous> (/home/q/node_modules/headless-chrome-crawler/lib/helper.js:177:23)
at HCCrawler._request (/home/q/node_modules/headless-chrome-crawler/lib/hccrawler.js:349:21)
at processTicksAndRejections (internal/process/task_queues.js:97:5)
at async HCCrawler._startRequest (/home/q/node_modules/headless-chrome-crawler/lib/hccrawler.js:305:19)
at async Promise.all (index 0)
at async PriorityQueue._pull (/home/q/node_modules/headless-chrome-crawler/lib/priority-queue.js:94:5)
(node:21673) UnhandledPromiseRejectionWarning: Unhandled promise rejection. This error originated either by throwing inside of an async function without a catch block, or by rejecting a promise which was not handled with .catch(). To terminate the node process on unhandled promise rejection, use the CLI flag `--unhandled-rejections=strict` (see https://nodejs.org/api/cli.html#cli_unhandled_rejections_mode). (rejection id: 2)
(node:21673) [DEP0018] DeprecationWarning: Unhandled promise rejections are deprecated. In the future, promise rejections that are not handled will terminate the Node.js process with a non-zero exit code.
If the current behavior is a bug, please provide the steps to reproduce
const HCCrawler = require('headless-chrome-crawler');

async function customCrawl(page, crawl) {
  await page.setRequestInterception(true);
  page.on('request', (request) => {
    console.log(request);
    if (request.resourceType() === 'image') request.abort();
    else request.continue();
  });
}

(async () => {
  const crawler = await HCCrawler.launch({
    customCrawl,
    headless: false,
    // Function to be evaluated in browsers
    evaluatePage: () => ({
      title: $('title').text(),
    }),
    // Function to be called with evaluated results from browsers
    onSuccess: (result) => {
      console.log(result.result);
    },
  });
  // Queue multiple requests
  // await crawler.queue(urls);
  // Queue a single request
  await crawler.queue(url);
  await crawler.onIdle(); // Resolved when no queue is left
  await crawler.close(); // Close the crawler
})();
What is the expected behavior?
Using the code snippet above, I expect to be able to crawl a website without loading images.
This request-interception logic comes from Puppeteer; a standalone Puppeteer version is sketched below.
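For reference, here is a minimal sketch of the same image-blocking interception in plain Puppeteer (the target URL is a placeholder):

const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  // With interception enabled, every request must be explicitly
  // aborted or continued, otherwise navigation stalls.
  await page.setRequestInterception(true);
  page.on('request', (request) => {
    if (request.resourceType() === 'image') request.abort();
    else request.continue();
  });
  await page.goto('https://example.com'); // placeholder URL
  await browser.close();
})();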
What is the motivation / use case for changing the behavior?
Reduce bandwidth usage from crawling
Please tell us about your environment:
- Version: 1.8.0
- Platform / OS version: Ubuntu 18
- Node.js version: 12.16.3
kulikalov commented
Hi @Lusitaniae!
You forgot to add return await crawl() in your customCrawl function:
async function customCrawl(page, crawl) {
  await page.setRequestInterception(true);
  page.on('request', (request) => {
    console.log(request);
    if (request.resourceType() === 'image') request.abort();
    else request.continue();
  });
  return await crawl();
}
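Without that return, customCrawl resolves before the default crawl logic ever runs, so the crawler gets no result to pass to onSuccess and, presumably, tears down the page in a state that triggers the "Protocol error: Connection closed" you saw at crawler.close().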
Did that help? If so, can we close the issue?
kulikalov commented
Closing due to inactivity. Presumably resolved.