Web scraping, crawling and DOM tree manipulation for Node.js. Uses htmlparser2 for HTML parsing and robots-txt for robots.txt
checking.
$ npm install domp
var domp = require('domp');
domp(url, function(dom) {
console.log(...dom.map(node => node.name));
// html head meta title script ...
});
You can scrape an Array of urls by either
- providing a callback:
domp(urls, function(dom) {
// called twice
})
- looping through an iterator
for (var page of domp(urls))
page.then(function (dom) {
// resolved
}, function (error) {
// rejected
});
/**
 * Builds the fulfilment handler for one scraped page.
 *
 * The returned handler logs the page title, picks one absolute link at
 * random, logs it and hands it to `next` so it gets scraped in turn.
 * Pages without a <title> or without any absolute link are tolerated:
 * the missing part is skipped instead of throwing.
 *
 * @param {function(string)} next - called with the href to scrape next
 * @returns {function(Object)} handler receiving the parsed DOM of a page
 */
function resolve(next) {
  return function (dom) {
    // find() hands back an iterator; take its first result (undefined if no <title>)
    var title = dom.find('title').next().value,
      // keep only nodes whose href starts with "http" (absolute http/https links)
      links = [...dom.filter(node => node.href && node.href.indexOf('http') === 0)];

    if (title)
      console.log(title.text);

    // nothing to follow on a page without absolute links
    if (links.length === 0)
      return;

    // get random link
    var link = links[Math.floor(Math.random() * links.length)];
    console.log(link.href);

    // submit link(s) to be scraped next
    next(link.href);
  };
}
domp.crawl('https://en.wikipedia.org', function(requests, next) {
for (var request of requests)
request.then(resolve(next));
});
Standard traversal using for ... of:
for (var node of dom)
console.log(node);
Sibling (children with the same parent) traversal using for ... of:
for (var sibling of node.siblings)
console.log(sibling);
Tag name traversal using for ... of and find(name):
for (var node of dom.find('p'))
console.log(node);
DOM nodes (see node.js) implement mapping similar to what we're used to from Array.prototype.map, but instead of returning an Array it returns an Iterable. The Iterable can either be unpacked into an Array using the spread operator (...) or be used as a normal iterator.
var names = dom.map(node => node.name);
names = [...names];
// names = ['html', 'head', 'meta', 'title', ...]
for (var name of names)
console.log(name);
// html
// head
// ...
Filtering works pretty much the same way (it also returns an Iterable):
// get all 'p' tags
var paragraphs = dom.filter(node => node.name === 'p');
// traverse
for (var p of paragraphs)
console.log(p);
There's also the shorthand find(name), which can be used to find nodes by tag name in the tree:
// for ... of walks the nodes produced by find(); for ... in would enumerate property keys instead
for (var node of dom.find('p'))
  console.log(node);