fb55/domutils

Removing an element removes all the elements

Pajamaman opened this issue · 2 comments

This was kind of hard to reproduce but I'll try my best to explain. I have this HTML:

var html = '<table>\n<tr>\n<td>\nHi\n</td>\n</tr>\n</table>';

I want to parse it using htmlparser2, handle it with DomHandler, and do some manipulation with DomUtils. Here's the boilerplate stuff:

var htmlparser = require('htmlparser2');

var handler = new htmlparser.DomHandler(function (err, dom) {
    if (err) {
        throw err;
    }

    console.log(render(dom));
});

var parser = new htmlparser.Parser(handler);
parser.write(html);
parser.done();

render takes a list of nodes and returns their string representation. It is called recursively when an element contains children (see bottom, under renderTag):

function render(nodes) {
    var html = '';

    nodes.forEach(function (node) {
        if (node.type == 'text') {
            html += renderText(node);
            return;
        }

        if (node.type == 'comment') {
            html += renderComment(node);
            return;
        }

        html += renderTag(node);
    });

    return html;
}

function renderText(node) {
    return node.data;
}

function renderComment(node) {
    return '<!--' + node.data + '-->';
}

function renderTag(node) {
    var openTag = '<' + node.name + '>',
        closeTag = '</' + node.name + '>';

    if (!node.children.length) {
        return openTag + closeTag;
    }

    return openTag + render(node.children) + closeTag;
}

So far, this works as expected. But a problem occurs when I add call DomUtils.removeElement inside of render:

function render(nodes) {
    var html = '';

    //debugging
    console.log(nodes);

    nodes = nodes.filter(function (node, index) {
        if (node.type == 'text' && !node.data.trim()) {
            htmlparser.DomUtils.removeElement(node);
            return false
        }

        return true;
    });

    //debugging
    console.log(nodes);

    nodes.forEach(function (node) {
        ....

The first time render is called, nodes contains only one element: the table tag. The filtering has no effect, so the console logs the same data twice.

The second time render is called, nodes contains three elements:

[ { data: '\n',
    type: 'text',
    next: 
     { type: 'tag',
       name: 'tr',
       attribs: {},
       children: [Object],
       next: [Object],
       prev: [Circular],
       parent: [Object] },
    prev: null,
    parent: 
     { type: 'tag',
       name: 'table',
       attribs: {},
       children: [Circular],
       next: null,
       prev: null,
       parent: null } },
  { type: 'tag',
    name: 'tr',
    attribs: {},
    children: [ [Object], [Object], [Object] ],
    next: 
     { data: '\n',
       type: 'text',
       next: null,
       prev: [Circular],
       parent: [Object] },
    prev: 
     { data: '\n',
       type: 'text',
       next: [Circular],
       prev: null,
       parent: [Object] },
    parent: 
     { type: 'tag',
       name: 'table',
       attribs: {},
       children: [Circular],
       next: null,
       prev: null,
       parent: null } },
  { data: '\n',
    type: 'text',
    next: null,
    prev: 
     { type: 'tag',
       name: 'tr',
       attribs: {},
       children: [Object],
       next: [Circular],
       prev: [Object],
       parent: [Object] },
    parent: 
     { type: 'tag',
       name: 'table',
       attribs: {},
       children: [Circular],
       next: null,
       prev: null,
       parent: null } } ]

As you can see, the first element is an "empty" string. I want to remove this node from the dom completely. But doing so also removes the other elements in the array, including the tr tag. The console logs an empty array after the filtering.

I don't know if this is a bug with the DomUtils library, or if this is some kind of limitation with JavaScript, or if I'm missing something. But it seems like I should be able to do this.

Ok, I figured out what's going on. DomUtils.removeElement actually modifies the nodes array. This can be proved:

nodes.forEach(function (node) {
    if (node.type == 'text' && !node.data.trim()) {
        console.log(nodes);
        htmlparser.DomUtils.removeElement(node);
        console.log(nodes);
    }
});

So, if the first element of the array is an "empty" string, the second element is skipped. This is not what I thought would happen from looking at the removeElement method, and I'm still not entirely sure why it behaves this way. Something to do with JavaScript variables in memory, I guess.

fb55 commented

Yup, that appears to be the same instance of the array.