Issue when writing EOL in stream mode
Opened this issue · 5 comments
I have an issue when writing a file in stream mode: the EOL
does not seem to be written when I append a new line.
This is the sync version, which works as expected:
// Synchronous version: read trainFile in 1 MiB chunks, normalise every
// non-empty line and write the result, one line per '\n', to tmpFilePath.
var StringDecoder = require('string_decoder').StringDecoder;

// Lower-cases one line, prefixes it with '__label__' and spaces out or
// strips punctuation.
function normalizeLine(text) {
    return text.toLowerCase()
        .replace(/^/gm, '__label__')
        .replace(/'/g, " ' ")
        .replace(/"/g, '')
        .replace(/\./g, ' \. ')
        .replace(/,/g, ' \, ')
        .replace(/\(/g, ' ( ')
        .replace(/\)/g, ' ) ')
        .replace(/!/g, ' ! ')
        // NOTE(review): '?' is replaced with ' ! ', not ' ? ' - confirm this is intended.
        .replace(/\?/g, ' ! ')
        .replace(/;/g, ' ')
        .replace(/:/g, ' ');
}

var i = fs.openSync(self._options.trainFile, 'r');
var o;
try {
    o = fs.openSync(tmpFilePath, 'w');
    var buf = Buffer.alloc(1024 * 1024), len, prev = '';
    // The decoder holds back an incomplete multi-byte UTF-8 sequence at the
    // end of a chunk and emits it together with the next chunk.
    var decoder = new StringDecoder('utf8');
    while ((len = fs.readSync(i, buf, 0, buf.length)) > 0) {
        var a = (prev + decoder.write(buf.slice(0, len))).split('\n');
        // The last piece may be an unfinished line: always carry it over,
        // regardless of how many bytes this particular read returned.
        prev = a.pop();
        var out = '';
        a.forEach(function(text) {
            if (!text) return; // skip empty lines
            out += normalizeLine(text) + '\n';
        });
        if (out) fs.writeSync(o, out, null, 'utf-8');
    }
    // Flush the final line when the file does not end with '\n'.
    prev += decoder.end();
    if (prev) fs.writeSync(o, normalizeLine(prev) + '\n', null, 'utf-8');
} finally {
    // Release both descriptors even if reading or writing threw.
    try {
        if (o !== undefined) fs.closeSync(o);
    } finally {
        fs.closeSync(i);
    }
}
And this is the stream version, using byline:
// Stream version: byline emits the input one line at a time; each line is
// normalised and written to tmpFilePath followed by the platform EOL.
var os= require("os");
var Transform = require('stream').Transform
var writeStream = fs.createWriteStream(tmpFilePath, {flags: 'w', encoding: 'utf-8'});
var stream = byline(fs.createReadStream(self._options.trainFile, { flags: 'r', encoding: 'utf8'}));

// Resolve only once every buffered write has been flushed; resolving on the
// reader's 'end' would hand out a file that may still be incomplete.
// NOTE(review): no 'error' handlers are attached because the enclosing
// promise's reject callback is not visible in this snippet.
writeStream.on('finish', function() {
    resolve({
        trainFile: tmpFilePath
    });
});

// Input exhausted: close the output so that 'finish' fires.
stream.on('end', function() {
    writeStream.end();
});

stream.on('data', function(text) { // called once per input line
    text=text.toLowerCase()
        .replace(/^/gm, '__label__')
        .replace(/'/g, " ' ")
        .replace(/"/g, '')
        .replace(/\./g, ' \. ')
        .replace(/,/g, ' \, ')
        .replace(/\(/g, ' ( ')
        .replace(/\)/g, ' ) ')
        .replace(/!/g, ' ! ')
        // NOTE(review): '?' is replaced with ' ! ', not ' ? ' - confirm this is intended.
        .replace(/\?/g, ' ! ')
        .replace(/;/g, ' ')
        .replace(/:/g, ' ');
    var flushed = writeStream.write(text + os.EOL);
    if (!flushed) {
        // The writer's buffer is full: stop reading until it drains so the
        // pending output does not grow without bound.
        stream.pause();
        writeStream.once('drain', function() {
            stream.resume();
        });
    }
});
I don't see what this has to do with byline. You're saying that:
var writeStream = fs.createWriteStream(tmpFilePath, {flags: 'w', encoding: 'utf-8'});
followed by:
writeStream.write(text + os.EOL);
Results in a file without EOL characters. Byline plays no role in that.
Also you have an inconsistency in your code for the encodings 'utf8' vs 'utf-8'.
@jahewson sorry, but doesn't byline
read the input line by line in:
var stream = byline(fs.createReadStream(self._options.trainFile, { flags: 'r', encoding: 'utf8'}));
So I would expect to be able to write each line out like this:
// Per-line handler: normalise the line emitted by the byline stream and
// append it to the output followed by '\n'.
stream.on('data', function(text) {
    // Ordered [pattern, replacement] pairs, applied to the lower-cased line.
    var substitutions = [
        [/^/gm, '__label__'],
        [/'/g, " ' "],
        [/"/g, ''],
        [/\./g, ' \. '],
        [/,/g, ' \, '],
        [/\(/g, ' ( '],
        [/\)/g, ' ) '],
        [/!/g, ' ! '],
        [/\?/g, ' ! '],
        [/;/g, ' '],
        [/:/g, ' ']
    ];
    text = substitutions.reduce(function(acc, pair) {
        return acc.replace(pair[0], pair[1]);
    }, text.toLowerCase());
    writeStream.write(text + '\n');
});
but this is not writing a new line...
Ok, so does this:
// ...
// Diagnostic: echo every chunk the byline stream emits to the console.
stream.on('data', function(line) {
    console.log(line);
});
Print only one line to the console?
@jahewson the lines are printed to the console one by one, but when I call writeStream.write(text + '\n');
the output file ends up as one single line, without any \n
appended.