microsoft/napajs

Marshall/Unmarshall with BSON (JS-BSON)

vsdigitall opened this issue · 2 comments

As written in introduction:

During store.set, values marshalled into JSON and stored in process heap, so all threads can access it, and unmarshalled while users retrieve them via store.get.

So every set/get operation calls JSON.stringify/JSON.parse which is quite slow. Why not use BSON internally to marshall/unmarshall? Example is here:

https://github.com/kyriosli/node-shared-cache/blob/master/src/bson.cc

BSON requires less memory and is faster when used inside a C++ module. I believe you can expect over 10x performance improvement compared to JSON.stringify/JSON.parse.

Also, node-shared-cache has a locking mechanism which allows safe reading/writing of object values. It would be a nice cross-thread resource-sharing feature if we could just send object references to threads with the ability to read/write and immediately see changes in other threads.

I would like to give you some feedback on cross-process object sharing approach in 'node-shared-cache' module. I hope you will find it useful.

I ran a test with 20 reader workers and 1 writer worker using 'node-shared-cache' on a 20-core Intel(R) Xeon(R) CPU E5-2640 v4 @ 2.40GHz, all accessing the same shared object. One process sets data on the shared object; the others read data from the same object at a total rate of 2000 rq/sec.

After 24 hrs of execution I received no memory leaks and no exceptions:
image

Napa.js sounds promising, and if we can get there a fast cross-thread (and cross-process) object sharing with transparent read/write ops then people can write more complex programs faster and with better reliability and performance.

Test source code:

// Module dependencies and test-wide configuration.
const cp = require('child_process');           // used to spawn worker processes
const maxSocketWorkers = 20;                   // number of reader workers
const CACHE_NAME = "NODE_CACHE";               // shared-memory cache identifier
const binding = require('node-shared-cache');  // native shared-cache bindings

// Registry of live worker child processes, keyed by worker id.
global.workers = {};

/**
 * Resolve after the given delay.
 *
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>} Promise fulfilled once the timeout fires.
 */
function sleep (ms) {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

/**
 * Generate a random lowercase-hex string.
 *
 * @param {number} l - Desired length in hex characters. Even values yield
 *   exactly `l` characters; odd values round up to the next even length
 *   (same as the original `i < l / 2` loop bound).
 * @returns {string} Random hex string, two characters per random byte.
 */
function randomBytes (l) {
  let r = '';
  const byteCount = l / 2; // two hex chars per byte; don't mutate the parameter

  for (let i = 0; i < byteCount; i++) {
    // Multiply by 256 (not 255) so the full byte range 0..255 is reachable;
    // the original `* 255` could never produce 0xff, biasing the output.
    const j = Math.floor(Math.random() * 256).toString(16);

    r += j.padStart(2, '0'); // zero-pad single-digit hex values
  }

  return r;
}

// Entry dispatch: the same script runs as the host (spawns children) or as a
// worker; role is selected via argv[2] and env.type.
if (process.argv[2] !== 'worker') { // host process: spawn 1 writer worker, then 20 reader workers
  // Spawn a child worker running this same file with the given env
  // (env.id identifies the worker, env.type selects the server/client role).
  function restart (env) {
    let worker = cp.spawn(process.execPath, [__filename, 'worker'], {stdio: 'inherit', env});
    
    worker.env = env;
    global.workers[env.id] = worker;
    
    // Deregister on exit and log; despite the name, nothing respawns here.
    worker.on('exit', function (code, signal) {
      delete global.workers[env.id];
      
      console.log('worker ' + worker.env.id + ' died with code: ' + code + ', signal: ' + signal + '\n left workers: ' + Object.keys(global.workers).length);
    });
  }
  
  restart({ // the single writer process that creates and populates the shared object
    id: 0,
    type: 'server'
  });
  
  setTimeout(function () { // give the writer 10 s to set up the cache, then start the readers
    for (let i = 0; i < maxSocketWorkers; i++)
      restart({
        id: i + 1,
        type: 'client'
      });
  }, 10000);
  
  console.log("Started: " + Date.now());
}
else {
  let obj; // handle to the shared-memory cache (node-shared-cache instance)
  
  if (process.env.type == 'server') {
    try { // release any shmem instance left over from a previous run
      binding.release(CACHE_NAME);
    } catch (e) {} // best-effort cleanup; ignore "not found"
    
    try { // create new shmem instance: presumably 2 GiB capacity with SIZE_2K blocks — confirm against node-shared-cache docs
      obj = new binding.Cache(CACHE_NAME, 2 * 1024 * 1024 * 1024, binding.SIZE_2K);
    } catch (e) {} // NOTE(review): swallowing this leaves obj undefined and crashes below — confirm intent
    
    binding.clear(obj); // clear instance data
    
    for (let i = 0; i < maxSocketWorkers; i++) { // seed 20 properties: 64-char random keys, 100 KB random values
      obj[randomBytes(64)] = randomBytes(102400);
    }
    
    let d = Object.keys(binding.dump(obj)), // dump object keys to 'd'
        l = d.length;
    
    console.log("keys prepared: ", d);
    
    // Writer loop: every 10 ms, read one random property and overwrite 10
    // random properties with a fresh complex object.
    async function run () {
      for (;;) {
        await sleep(10);
        
        let p = obj[d[Math.floor(Math.random() * l)]]; // Get random property
        
        for (let i = 0; i < 10; i++) // set random property value 10 times
          obj[d[Math.floor(Math.random() * l)]] = { // a javascript complex object
            string: randomBytes(102400),
            number: Date.now(),
            array: [Math.random(),Math.random(),Math.random()],
            object: { o: Math.random() }
          }
        
        p = undefined; // allow faster gc
      }
    }
    
    run(); // NOTE(review): floating promise — a rejection inside run() is unhandled
  }
  else if (process.env.type == 'client') {
    console.log("Worker # " + process.env.id + " started");
    
    try { // attach to the shmem instance created by the server worker
      obj = new binding.Cache(CACHE_NAME, 2 * 1024 * 1024 * 1024, binding.SIZE_2K);
    } catch (e) {} // NOTE(review): swallowed — obj stays undefined if attach fails
    
    let d = Object.keys(binding.dump(obj)), // snapshot of keys present at startup
        l = d.length;
    
    // Reader loop: one random property read every 10 ms
    // (~100 reads/sec per worker, ~2000/sec across 20 workers).
    async function run () {
      for (;;) {
        await sleep(10); // get 100 random property values per second per worker (total 2000 random property values per sec)
        
        let p = obj[d[Math.floor(Math.random() * l)]]; // get random property value
        
        p = undefined; // allow faster gc
      }
    }
    
    run(); // NOTE(review): floating promise, same as the writer side
  }
}

Hi @vsdigitall thanks for the feedback. Currently Napa.js is using JSON.stringify/JSON.parse in object transportation because of the following reasons:

  • It's simple and straightforward.
  • It has a well-defined behavior ( because it's a part of the standard )
  • Readability
  • There is some custom behavior for transportable objects' marshall/unmarshall. That code is written in TypeScript and can be easily called by JSON.stringify/JSON.parse.

However, those are reasons why Napa.js uses JSON, not reasons why JSON is good. I believe it would be a benefit to eventually use a natively implemented binary marshalling (like BSON). This may be discussed and planned some time in the future.