/domp

Web scraping, crawling and DOM tree manipulation for Node.js.

Primary language: JavaScript. License: MIT.

domp

Web scraping, crawling and DOM tree manipulation for Node.js. Uses htmlparser2 for HTML parsing and robots-txt for robots.txt checking.

$ npm install domp
var domp = require('domp');

Usage

domp(url, function(dom) {
  console.log(...dom.map(node => node.name));
  // html head meta title script ...
});

You can scrape an Array of URLs by either:

  1. providing a callback:
domp(urls, function(dom) {
  // called twice
})
  2. looping through an iterator:
for (var page of domp(urls))
  page.then(function (dom) {
    // resolved
  }, function (error) {
    // rejected
  });
/**
 * Builds the handler for one scraped page.
 *
 * The returned function logs the text of the first `title` node and the href
 * of one randomly chosen link whose href starts with 'http', then passes that
 * href to `next`.
 *
 * @param {Function} next - called with the link(s) to be scraped next
 * @returns {Function} handler that takes the DOM tree of a scraped page
 */
function resolve(next) {
  return function (dom) {
    var title = dom.find('title').next().value,
        links = [...dom.filter(node => node.href && node.href.indexOf('http') === 0)];

    // an exhausted iterator yields an undefined value, i.e. no title node
    if (title)
      console.log(title.text);

    // dead end: no 'http' link on this page, so there is nothing to submit
    if (links.length === 0)
      return;

    // get random link
    var link = links[Math.floor(Math.random() * links.length)];

    console.log(link.href);

    // submit link(s) to be scraped next
    next(link.href);
  };
}

// Start crawling from a seed URL. The callback receives the pending requests
// (each one a promise) together with `next`, which resolve() uses to submit
// further links.
domp.crawl('https://en.wikipedia.org', function(requests, next) {
  for (var request of requests)
    request.then(resolve(next), function (error) {
      // report a failed request instead of leaving the rejection unhandled
      console.error(error);
    });
});

DOM Tree traversal

Standard traversal using for ... of:

for (var node of dom)
  console.log(node);

Sibling (children with same parent) traversal using for ... of:

for (var sibling of node.siblings)
  console.log(sibling);

Tag name traversal using for ... of and find(name):

for (var node of dom.find('p'))
  console.log(node);

DOM Manipulation

DOM nodes (see node.js) implement a map method similar to Array.prototype.map, except that it returns an Iterable instead of an Array. The Iterable can either be unpacked into an Array using the spread operator (...) or be used as a normal iterator.

var names = dom.map(node => node.name);

names = [...names];
// names = ['html', 'head', 'meta', 'title', ...]

for (var name of names)
  console.log(name);
// html
// head
// ...

Filtering works pretty much the same (returns Iterable):

// get all 'p' tags
var paragraphs = dom.filter(node => node.name === 'p');

// traverse
for (var p of paragraphs)
  console.log(p);

There's also the shorthand find(name), which can be used to find tags by name in the tree:

// visit every 'p' node yielded by find
for (var node of dom.find('p'))
  console.log(node);