Download website to a local directory (including all css, images, js, etc.)
npm install website-scraper
var scraper = require('website-scraper');

// Both options shown here are required: `urls` lists the pages to load and
// `directory` is the path where the loaded files are saved.
var options = {
  urls: ['http://nodejs.org/'],
  directory: '/path/to/save/',
};

// The two calls below are alternative calling styles; pick one of them.

// with callback: `error` is an Error object on failure and null on success,
// `result` is null on failure and an array of saved pages on success
scraper.scrape(options, function (error, result) {
  /* some code here */
});

// or with promise: the fulfilled value is passed to the handler as `result`
scraper.scrape(options).then(function (result) {
  /* some code here */
}).catch(function (err) {
  // End the chain with a rejection handler so a failed scrape is reported
  // instead of surfacing as an unhandled promise rejection.
  console.log(err);
});
Makes requests to `urls` and saves all files found with `sources` to `directory`.
options - object containing the following options:
- urls: array of urls to load and filenames for them (required, see example below)
- directory: path to save loaded files (required)
- defaultFilename: filename for index page (optional, default: 'index.html')
- sources: array of objects to load, specifies selectors and attribute values to select files for loading (optional, see default value in lib/config/defaults.js)
- subdirectories: array of objects, specifies subdirectories for file extensions. If null, all files will be saved to directory (optional, see example below)
- request: object, custom options for request (optional, see example below)
callback - callback function (optional), receives the following parameters:
- error: if error - Error object, if success - null
- result: if error - null, if success - array of objects containing:
  - url: url of loaded page
  - filename: filename where page was saved (relative to directory)
Let's scrape some pages from http://nodejs.org/ with images, css, js files and save them to /path/to/save/.
Imagine we want to load:
- Home page to index.html
- About page to about.html
- Blog to blog.html
and separate files into directories:
- img for .jpg, .png, .svg (full path /path/to/save/img)
- js for .js (full path /path/to/save/js)
- css for .css (full path /path/to/save/css)
var scraper = require('website-scraper');

// Pages to download: a plain string uses the default filename, an object
// names the file explicitly.
var pages = [
  'http://nodejs.org/', // Will be saved with default filename 'index.html'
  {url: 'http://nodejs.org/about', filename: 'about.html'},
  {url: 'http://blog.nodejs.org/', filename: 'blog.html'}
];

// Maps file extensions to the subdirectory they are saved in.
var foldersByExtension = [
  {directory: 'img', extensions: ['.jpg', '.png', '.svg']},
  {directory: 'js', extensions: ['.js']},
  {directory: 'css', extensions: ['.css']}
];

// Selector/attribute pairs that identify the files to load from each page.
var assetSources = [
  {selector: 'img', attr: 'src'},
  {selector: 'link[rel="stylesheet"]', attr: 'href'},
  {selector: 'script', attr: 'src'}
];

// Custom request options: send a mobile browser User-Agent header.
var requestOptions = {
  headers: {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 4 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19'
  }
};

var options = {
  urls: pages,
  directory: '/path/to/save',
  subdirectories: foldersByExtension,
  sources: assetSources,
  request: requestOptions
};

// Print the fulfilled value of the scrape.
function onScraped(result) {
  console.log(result);
}

// Print whatever the scrape was rejected with.
function onFailed(err) {
  console.log(err);
}

scraper.scrape(options).then(onScraped).catch(onFailed);