libxmljs/libxmljs

segmentation fault with libxmljs

SaltwaterC opened this issue · 17 comments

I was checking for XXE, even though it shouldn't be possible with libxml2 2.9 used by OS X 10.9.2. Used node.js v0.10.26 and libxmljs v0.9.0.

The code used to trigger the segfault:

// xxe.js
var parser = require('libxml-to-js');
var xml = require('fs').readFileSync('./xxe.xml').toString()
parser(xml, function (err, res) {
    if (err) {
        console.error(err)
        process.exit(1)
    }
    console.log(res)
})
<!-- xxe.xml -->
<?xml version="1.0" encoding="ISO-8859-1"?>
 <!DOCTYPE foo [
   <!ELEMENT foo ANY >
   <!ENTITY xxe SYSTEM "file:///etc/passwd" >]><foo>&xxe;</foo>
node xxe.js
[1]    28749 segmentation fault  node xxe.js

libxml-to-js is just a wrapper for libxmljs that passes back a JavaScript object following the XML doc structure.

Please provide a failing test case using libxmljs and not third party libraries.

// xxe2.js

var libxmljs = require('libxmljs');
var xml = require('fs').readFileSync('./xxe.xml').toString();
var xmlDoc = libxmljs.parseXmlString(xml);

var libxml2js = function (obj, recurse, namespaces) {
    var i, j, k, atlen, chlen, chtlen, val, old, recValue;

    if (namespaces === undefined) {
        namespaces = {};
    }

    if ( ! recurse) { // dealing with the root element
        obj = obj.root();
        if (obj.namespace()) {
            namespaces.xmlns = obj.namespace().href();
        }
    }

    var jsobj = {}, children = obj.childNodes(), attributes = obj.attrs();

    if (attributes.length > 0) {
        jsobj['@'] = {};

        for (i = 0, atlen = attributes.length; i < atlen; i++) {
            jsobj['@'][attributes[i].name()] = attributes[i].value();
        }
    }

    for (i = 0, chlen = children.length; i < chlen; i++) {
        // <"text" kludge>
        if (children[i].name() === 'text' && children[i].type() === 'text') {
            jsobj['#'] = children[i].text().trim();

            if (jsobj['#'].match(/^\s*$/)) {
                delete(jsobj['#']);
            }

            for (j = 0, chtlen = children[i].childNodes().length; j < chtlen; j++) {
                if (children[i].child(j).name() === 'text') {
                    var text = {}, textattrs = children[i].child(j).attrs();

                    text['#'] = children[i].child(j).text();

                    if (textattrs.length > 0) {
                        text['@'] = {};
                    }

                    for (k = 0, atlen = textattrs.length; k < atlen; i++) {
                        text['@'][textattrs[k].name()] = textattrs[k].value();
                    }

                    jsobj.text = text;
                    break; // only allow one "<text></text>" element for now
                }
            }
        } else if (children[i].type() === 'cdata') {
            val = children[i].toString().trim();
            val = val.replace(/^<\!\[CDATA\[/, '').replace(/\]\]\>$/, '');
            jsobj['#']=val;
        } else {
            // </"text" kludge>
            var ns = '';
            var namespace = children[i].namespace();

            if (namespace && namespace.prefix() !== null) {
                ns = namespace.prefix() + ':';
                namespaces[namespace.prefix()] = namespace.href();
            }
            var key = ns + children[i].name();

            if (typeof jsobj[key] === 'undefined') {
                if (children[i].childNodes().length === 1 && children[i].attrs().length === 0 && (children[i].childNodes()[0].type() === 'text' || children[i].childNodes()[0].type() === 'cdata')) {
                    val = children[i].childNodes()[0].toString().trim();

                    if (children[i].childNodes()[0].type() === 'cdata') {
                        val = val.replace(/^<\!\[CDATA\[/, '').replace(/\]\]\>$/, '');
                    }

                    jsobj[key] = val;
                } else {
                    if (children[i].name() !== undefined) {
                        recValue = libxml2js(children[i], true, namespaces);
                        jsobj[key] = recValue.jsobj;
                        merge(namespaces, recValue.namespaces);
                    }
                }
            } else {
                if (typeof jsobj[key] === 'string') {
                    old = jsobj[key];
                    jsobj[key] = [];
                    jsobj[key].push({'#': old});
                } else if (typeof jsobj[key] === 'object' && jsobj[key] instanceof Array === false) {
                    old = jsobj[key];
                    jsobj[key] = [];
                    jsobj[key].push(old);
                }

                recValue = libxml2js(children[i], true, namespaces);
                jsobj[key].push(recValue.jsobj);
                merge(namespaces, recValue.namespaces);
            }
        }
    }

    if ( ! recurse) {
        if (namespaces && ! lodash.isEmpty(namespaces)) {
            if ( ! jsobj['@']) {
                jsobj['@'] = {};
            }

            jsobj['@'].xmlns = namespaces;
        }

        return jsobj;
    }

    return {
        jsobj: jsobj,
        namespaces: namespaces
    };
};

libxml2js(xmlDoc);
node xxe2.js
[1]    14725 segmentation fault  node xxe2.js

Same XML file as before.

Can you not narrow down the failure to some simple js code? Or identify which call might be of issue?

This is a stripped down version that doesn't make any sense by itself, but it does reproduce the issue. I also commented on the call that triggers the segfault. Refer to the original code in case of misunderstandings:

// xxe3.js

var libxmljs = require('libxmljs');
var xml = require('fs').readFileSync('./xxe.xml').toString();
var xmlDoc = libxmljs.parseXmlString(xml);

var calls = 0;

var libxml2js = function (obj, recurse) {
    calls++;
    console.log('call to libxml2js #%d', calls);

    var i, chlen;

    if ( ! recurse) { // dealing with the root element
        obj = obj.root();
    }

    var jsobj = {}, children = obj.childNodes();
    var attributes = obj.attrs(); // this is the problematic call on the third run

    for (i = 0, chlen = children.length; i < chlen; i++) {
        console.log('iteration #%d', i);
        libxml2js(children[i], true);
    }

    if ( ! recurse) {
        return jsobj;
    }

    return {
        jsobj: jsobj
    };
};

libxml2js(xmlDoc);
node xxe3.js                                                      ⏎
call to libxml2js #1
iteration #0
call to libxml2js #2
iteration #0
call to libxml2js #3
[1]    14999 segmentation fault  node xxe3.js

When I run your failing example I get:

libxmljs/lib/document.js:120
    return bindings.fromXml(string, options || {});
                    ^
Error: Entity 'xxe' not defined

My environment looks like this:

uname -srm
Darwin 13.1.0 x86_64

node -v
v0.10.26 # binary build for OS X, installed with nvm

grep -Ri "LIBXML_DOTTED_VERSION" /usr/include/libxml2/libxml/xmlversion.h
/usr/include/libxml2/libxml/xmlversion.h: * LIBXML_DOTTED_VERSION:
/usr/include/libxml2/libxml/xmlversion.h:#define LIBXML_DOTTED_VERSION "2.9.0"

cat node_modules/libxmljs/package.json | grep '\"version'
  "version": "0.9.0",

I too have the same versions of everything (and node is also installed via nvm).

System libxml2 version doesn't matter because libxmljs uses its own builtin version (which is also 2.9.0 anyway).

Closing as I am unable to reproduce using the provided example. Will happily re-open and re-visit if someone else who can reproduce and provide a fix.

Uninstalled everything: node.js v0.10.26 from .nvm, the cached libxmljs from .npm, the installed libxmljs for testing purposes. Reinstalled all of them. The process still crashes with a segfault.

Here's what happens under lldb:

(lldb) target create node
Current executable set to 'node' (x86_64).
(lldb) process launch -- xxe.js
Process 35960 launched: '/Users/SaltwaterC/.nvm/v0.10.26/bin/node' (x86_64)
call to libxml2js #1
iteration #0
call to libxml2js #2
iteration #0
call to libxml2js #3
Process 35960 stopped
* thread #1: tid = 0xd130c, 0x00000001038832a6 xmljs.node`libxmljs::XmlAttribute::New(attr=0x0000000200000000) + 10 at xml_attribute.cc:33, queue = 'com.apple.main-thread', stop reason = EXC_BAD_ACCESS (code=1, address=0x200000008)
    frame #0: 0x00000001038832a6 xmljs.node`libxmljs::XmlAttribute::New(attr=0x0000000200000000) + 10 at xml_attribute.cc:33
   30   v8::Handle<v8::Object>
   31   XmlAttribute::New(xmlAttr* attr)
   32   {
-> 33       assert(attr->type == XML_ATTRIBUTE_NODE);
   34
   35       if (attr->_private) {
   36           return static_cast<XmlNode*>(attr->_private)->handle_;
(lldb)

Created a gist with the libxmljs that was used for triggering the issue: https://gist.github.com/SaltwaterC/10863017

And some environment info that was missing from my previous posts:

cc -v
Apple LLVM version 5.1 (clang-503.0.40) (based on LLVM 3.4svn)
Target: x86_64-apple-darwin13.1.0
Thread model: posix

Are you sure you have no other js code running or setup? I keep getting "Entity 'xxe' is not defined" even with your build. Can you make a gist of the xml file? Perhaps there was an error in copying it to/from the issue report.

It's the comment that breaks it for you. I don't have in my original XML file. I just put it there to indicate which is the file name of the XML doc.

I think libxmljs should be more specific about parsing errors though. The equivalent in PHP (which also uses libxml2 as parser) returns:

Entity: line 2: parser error : XML declaration allowed only at the start of the document

I updated the gist with the original files from my machine to steer clear from any further misunderstandings. I also made sure there's only one libxmljs build on my machine and it still fails the same way. Sorry for wasting your time with the silly communication of the issue itself.

The problem is here https://github.com/polotek/libxmljs/blob/master/src/xml_node.cc#L178

It is because we are not properly handling the different node types libxml2 could be iterating over to create the correctly typed wrapper for them. I am not sure (cause I don't really know anything about xml) but I would venture that entity declaration nodes don't have attributes? or maybe that libxml2 doesn't zero the memory for them.

One way around this (and maybe the correct approach) could be to detect the node type in the attrs call and not do anything for certain types which we know to cause failure.

I am gonna add that I hate xml, just so that is also a part of this issue thread.

We also get a segmentation fault in nodejs v0.10.31 .
Everything is ok in nodejs v0.10.29 though.

Fixed in #359

@teleological I think we can close this if you think it's been fixed by now. It's been more than a year and the memory management situation has evolved quite a bit in that time.

@rc0x03 The problem here is not due to memory management, but due to treating non-element nodes as elements, and applying inapplicable operations to the wrapped nodes. I will clean up #359, which will address this issue.

Is there any chance this can get fixed? I use libxmljs via the libxml-to-js package and the segmentation faults from this issue are causing us some problems. Thanks!