From a7eafb75184c63ed0b9a9c53256439908d814514 Mon Sep 17 00:00:00 2001 From: jjung Date: Tue, 25 Nov 2014 11:09:49 -0600 Subject: [PATCH 01/12] Rework of some of the underlying functionality. Adding options.split property to help speed up tokenization for huge files where there is distinct breaks between tokens (e.g. CVS file). --- lib/Tokenizer.js | 142 ++++++++++++++++++++--------------------- package.json | 3 - test/test-perf.js | 4 +- test/test-tokenizer.js | 1 - 4 files changed, 71 insertions(+), 79 deletions(-) diff --git a/lib/Tokenizer.js b/lib/Tokenizer.js index 01dc3d2..971efca 100644 --- a/lib/Tokenizer.js +++ b/lib/Tokenizer.js @@ -1,25 +1,27 @@ -var EventEmitter = require('events').EventEmitter; -var util = require('util'); -var assert = require('assert'); -var Transform = require('stream').Transform; -var disect = require('disect'); +var EventEmitter = require('events').EventEmitter, + util = require('util'), + assert = require('assert'), + Transform = require('stream').Transform; function noop(){} function Tokenizer (check_token_cb, options) { - if(!(this instanceof Tokenizer)) { - return new Tokenizer(check_token_cb); - } + if(!(this instanceof Tokenizer)) { + return new Tokenizer(check_token_cb, options); + } + + this.options = options || {}; - Transform.call(this, options); - this._readableState.objectMode = true; - this._buffered = ""; // we buffer untokenized data between writes - this._regexes = []; // should contain objects - // with regex[RegExp] and type[String] - this._ignored = {}; // a hash of ignored token types - // these will be parsed but not emitted - this._checkToken = check_token_cb || noop; + Transform.call(this, options); + this._readableState.objectMode = true; + this._buffered = ""; // we buffer untokenized data between writes + this._regexes = []; // should contain objects + // with regex[RegExp] and type[String] + this._ignored = {}; // a hash of ignored token types + // these will be parsed but not emitted + this._checkToken = check_token_cb || noop; } + util.inherits(Tokenizer, Transform); Tokenizer.prototype._transform = function _transform(chunk, encoding, callback) { @@ -40,8 +42,10 @@ Tokenizer.prototype._transform = function _transform(chunk, encoding, callback) }; Tokenizer.prototype._getMatchingRule = function _getMatchingRule(str) { + //console.log('Try: \'' + str + '\'' ) + for (var i = 0; i < this._regexes.length; ++i) { - if(this._regexes[i].regex.test(str)) { + if(this._regexes[i].regex.test(str)) { return this._regexes[i]; } } @@ -49,39 +53,37 @@ Tokenizer.prototype._getMatchingRule = function _getMatchingRule(str) { }; Tokenizer.prototype._tokenize = function _tokenize(data, nobuffer) { - var regexes = this._regexes; - // in case we buffered data on previous writes - data = this._buffered + data; - this._buffered = ''; - if(!data.length) { - return; + // in case we buffered data on previous writes + data = this._buffered + data; + this._buffered = ''; + + var self = this, + regexes = this._regexes, + rule = undefined, + curStr = undefined, + subdata = this.options.split ? 
data.split(this.options.split) : [data]; + + subdata.forEach(function (sub) { + if (!sub.length) return; + + for (var i = sub.length; i > 0; i--) + { + curStr = sub.substring(0, i); + if (rule = self._getMatchingRule(curStr)) break; } - var self = this; - var maxIndex = disect(0, data.length, function (index) { - var buf = data.substring(0, index + 1); - return self._getMatchingRule(buf) === null; - }); - - if(maxIndex === 0) { - // no match found - throw new SyntaxError('could not tokenize ' + JSON.stringify(data)); + if (!rule) { + throw new SyntaxError('could not tokenize ' + JSON.stringify(sub)); } - else if (maxIndex === data.length && !nobuffer) { + else if (i === sub.length && !nobuffer) { // the whole string is matching - this._buffered = data; + self._buffered = sub; return; } - else { - // some substring is matching - var str = data.substring(0, maxIndex); - var rule = this._getMatchingRule(str); - if(!rule) { - throw new Error('wut ?'); - } - this._gotToken(str, rule); - this._tokenize(data.substring(maxIndex), nobuffer); - } + + self._gotToken(curStr, rule); + self._tokenize(sub.substring(i), nobuffer); + }); }; Tokenizer.prototype._flush = function _flush(callback) { @@ -109,45 +111,41 @@ Token.prototype.valueOf = function valueOf() { }; Tokenizer.prototype._gotToken = function _gotToken(str, rule) { - // notify the token checker - var type = this._checkToken(str, rule) || rule.type; - if(this._ignored[type]) return; - var token = new Token(str, type); + // notify the token checker + var type = rule.type || this._checkToken(str, rule); + if(this._ignored[type]) return; + var token = new Token(str, type); - this.push(token); + this.push(token); - this.emit('token', token, type); + this.emit('token', token, type); }; Tokenizer.prototype.addRule = function addRule(regex, type) { - // this is useful for built-in rules - if(!type) { - if(Array.isArray(regex)) { - return this.addRule(regex[0], regex[1]); - } - else if(regex) { - return this.addRule(Tokenizer[regex]); - } - else { - throw new Error('No parameters specified'); - } + // this is useful for built-in rules + if(!type) { + if(Array.isArray(regex)) { + return this.addRule(regex[0], regex[1]); + } + else if(regex) { + return this.addRule(Tokenizer[regex]); } - assert.ok((regex instanceof RegExp) || (typeof regex === 'function')); - assert.equal(typeof type, 'string'); - this._regexes.push({regex:regex,type:type}); + else { + throw new Error('No parameters specified'); + } + } + assert.ok((regex instanceof RegExp) || (typeof regex === 'function')); + assert.equal(typeof type, 'string'); + this._regexes.push({regex:regex,type:type}); }; /** * set some tokens to be ignored. 
these won't be emitted */ Tokenizer.prototype.ignore = function ignore(ignored) { - if(Array.isArray(ignored)) { - for (var i = 0; i < ignored.length; ++i) { - this.ignore(ignored[i]); - } - return; - } - this._ignored[ignored] = true; + if(ignored instanceof Array) + return ignored.forEach(this.ignore.bind(this)); + this._ignored[ignored] = true; }; module.exports = Tokenizer; @@ -155,4 +153,4 @@ module.exports = Tokenizer; // built-in rules Tokenizer.whitespace = [/^(\s)+$/, 'whitespace']; Tokenizer.word = [/^\w+$/, 'word']; -Tokenizer.number = [/^\d+(\.\d+)?$/, 'number']; +Tokenizer.number = [/^\d+(\.\d+)?$/, 'number']; \ No newline at end of file diff --git a/package.json b/package.json index ea984c2..d3458cf 100644 --- a/package.json +++ b/package.json @@ -20,8 +20,5 @@ }, "devDependencies": { "nodeunit": "~0.8.1" - }, - "dependencies": { - "disect": "~1.1.0" } } diff --git a/test/test-perf.js b/test/test-perf.js index 219cf4e..17dd52e 100644 --- a/test/test-perf.js +++ b/test/test-perf.js @@ -42,14 +42,12 @@ Function.prototype.timed = function (timeout) { } } - - exports['test big file of small integers'] = function (test) { var numbers = [0]; for (var i = 0; i < 100000; ++i) { numbers.push(Math.floor(Math.random() * 10000)); }; - var t = tokenizer(); + var t = tokenizer(undefined, {split: ','}); t.addRule('number'); t.addRule(/^\d+\.$/, 'maybe-float'); t.addRule('whitespace'); diff --git a/test/test-tokenizer.js b/test/test-tokenizer.js index 5e3e3c3..1f5d836 100644 --- a/test/test-tokenizer.js +++ b/test/test-tokenizer.js @@ -16,7 +16,6 @@ Function.prototype.withDomain = function(withStack) { } } - exports['test empty'] = function(test) { var t = tokenizer(); t.on('data', test.fail.bind(test, "No data should be emitted")); From d914a76ac4b2fc13ffbe7cf2688c0c808b7fa3d8 Mon Sep 17 00:00:00 2001 From: jjung Date: Tue, 25 Nov 2014 11:16:34 -0600 Subject: [PATCH 02/12] Updating nodeunit test for special case scenario that was failing for me. --- test/test-tokenizer.js | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/test/test-tokenizer.js b/test/test-tokenizer.js index 1f5d836..d708a15 100644 --- a/test/test-tokenizer.js +++ b/test/test-tokenizer.js @@ -173,3 +173,26 @@ exports['words in two chunks'] = function(test) { t.write('Hell'); t.end('o World'); }.withDomain(); + +exports['verify regex priority order and that longest matches first'] = function(test) { + //Test case built for a tokenizer I was building that was supposed to parse SLIM template code but was not working. 
+ var t = tokenizer(undefined, {split: /^\r?\n+$/}); + t.addRule(/^([a-zA-Z0-9\-_]+\s*=\s*)(["'])(\\\2|[^"']+)*?\2$/, 'tKeyValue'); // name='value' + t.addRule(/^[a-zA-Z0-9\-_]+$/, 'tIdentifier'); // name + t.addRule(/^[#][a-zA-Z0-9\-_]+$/, 'tIdName'); // #name + t.addRule(/^\.[a-zA-Z0-9\-_]+$/, 'tClassName'); // .name + t.addRule('whitespace'); + t.ignore('whitespace'); + + var expectations = ['tIdentifier', 'tIdName', 'tClassName', 'tKeyValue', 'tKeyValue']; + + t.on('data', function(token) { + var e = expectations.shift(); + + test.equal(e, token.type); + }); + + t.on('end', test.done.bind(test)); + t.write('tag#id.class var1 = \'value1\' var2 = \'value2\''); + t.end(); +}.withDomain(); \ No newline at end of file From 9d2e2b9af748c7e22cb1beb9a0c44baccb4de439 Mon Sep 17 00:00:00 2001 From: jjung Date: Tue, 25 Nov 2014 11:21:01 -0600 Subject: [PATCH 03/12] Updating the README.md --- README.md | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e7972af..41536f4 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,11 @@ [![Build Status](https://travis-ci.org/Floby/node-tokenizer.png)](https://travis-ci.org/Floby/node-tokenizer) # Synopsis -A wide purpose tokenizer for JavaScript. The interface follows more or less -the WriteStream from [node.js](http://nodejs.org). +A wide purpose tokenizer for JavaScript. The interface follows more or less the WriteStream from [node.js](http://nodejs.org). -node-tokenizer is published on npm so you can install it with `npm install tokenizer` +# Installation + + npm i tokenizer ## How to @@ -26,6 +27,24 @@ var t = new Tokenizer(mycallback); t.addRule(/^my regex$/, 'type'); ``` +* add split + +By default, tokenizer attempts to find the longest match in the input stream. This can be a large performance hit for big files. If you are certain that your tokens will never cross a certain type of string boundary (like ',' or \n) you can specify +to split your input by that before tokenization which could improve performance dramatically. + +``` javascript +// Break CSV into subportions and tokenize each subportion separately but in order of original input +t = new Tokenizer(undefined, { + split: ',' +}); +``` +``` javascript +// Break file up by lines and tokenize each line separately. +t = new Tokenizer(undefined, { + split: /\r?\n/ +}); +``` + * write or pump to it ``` javascript From ecf0fe3c51a4a51761506f2219916e8a06c0d58b Mon Sep 17 00:00:00 2001 From: jjung Date: Wed, 3 Dec 2014 11:34:50 -0600 Subject: [PATCH 04/12] Adding stepSize option. Also updating Readme. --- README.md | 56 +++++++++++++++++++++++++++++++---------------- lib/Tokenizer.js | 33 ++++++++++++++++++---------- package.json | 6 +++-- test/test-perf.js | 6 +++-- 4 files changed, 66 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 41536f4..bf1eeda 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ [![Build Status](https://travis-ci.org/Floby/node-tokenizer.png)](https://travis-ci.org/Floby/node-tokenizer) # Synopsis -A wide purpose tokenizer for JavaScript. The interface follows more or less the WriteStream from [node.js](http://nodejs.org). +A wide purpose tokenizer for JavaScript that tokenizes based on rules established using Regular Expressions. The interface conforms to the WriteStream from [node.js](http://nodejs.org). # Installation @@ -9,25 +9,34 @@ A wide purpose tokenizer for JavaScript. 
The interface follows more or less the ## How to -* require the Tokenizer constructor +**Requiring** ``` javascript var Tokenizer = require('tokenizer'); ``` -* construct one (we'll see what the callback is used for) +**Construction** ``` javascript -var t = new Tokenizer(mycallback); +var t = new Tokenizer(mycallback, options); ``` -* add rules +**Setting Options** + +Options is an object passed to the constructor function and can contain the following properties (defaults shown inline): + + { + stepSize: 0, // For large streams, the maximum size that will be tokenized at a time. This must be larger than the largest expected token. + split: undefined // See explanation in 'Splitting into Smaller Pieces' + } + +**Adding Rules** ``` javascript t.addRule(/^my regex$/, 'type'); ``` -* add split +**Splitting into Smaller Pieces** By default, tokenizer attempts to find the longest match in the input stream. This can be a large performance hit for big files. If you are certain that your tokens will never cross a certain type of string boundary (like ',' or \n) you can specify to split your input by that before tokenization which could improve performance dramatically. @@ -38,6 +47,7 @@ t = new Tokenizer(undefined, { split: ',' }); ``` + ``` javascript // Break file up by lines and tokenize each line separately. t = new Tokenizer(undefined, { @@ -45,7 +55,7 @@ t = new Tokenizer(undefined, { }); ``` -* write or pump to it +**Writing/Piping** ``` javascript t.write(data); @@ -53,18 +63,18 @@ t.write(data); stream.pipe(t); ``` -* listen for new tokens +**Listen for tokens** ``` javascript t.on('token', function(token, type) { // do something useful // type is the type of the token (specified with addRule) // token is the actual matching string -}) +}); // alternatively you can use the tokenizer as a readable stream. ``` -* look out for the end +**Listening for completion** ``` javascript t.on('end', callback); @@ -82,24 +92,32 @@ and match, an object like this } ``` -Have a look in the example folder +##Examples + +Take a look a the [examples](https://github.com/Floby/node-tokenizer/tree/master/examples) folder. ## Rules -rules are regular expressions associated with a type name. + +Rules are regular expressions associated with a type name. + The tokenizer tries to find the longest string matching one or more rules. When several rules match the same string, priority is given to the rule -which was added first. (this may change) +which was added first. -Please note that your regular expressions should use ^ and $ in order +Note: normally your regular expressions should use ^ and $ in order to test the whole string. If these are not used, you rule will match _every_ string that contains what you specified, this could be the whole file! 
## To do -* a lot of optimisation -* being able to share rules across several tokenizers - (although this can be achieved through inheritance) -* probably more hooks -* more checking + +* Continued optimisation +* Rule sharing across several tokenizers (although this can be achieved through inheritance) +* Need more hooks +* Increase test coverage + +## Testing + +Testing is provided via the ## License diff --git a/lib/Tokenizer.js b/lib/Tokenizer.js index 971efca..91f8d73 100644 --- a/lib/Tokenizer.js +++ b/lib/Tokenizer.js @@ -1,7 +1,7 @@ var EventEmitter = require('events').EventEmitter, - util = require('util'), - assert = require('assert'), - Transform = require('stream').Transform; + util = require('util'), + assert = require('assert'), + Transform = require('stream').Transform; function noop(){} @@ -11,14 +11,14 @@ function Tokenizer (check_token_cb, options) { } this.options = options || {}; + this.options.stepSize = this.options.hasOwnProperty('stepSize') ? this.options.stepSize : 0; Transform.call(this, options); + this._readableState.objectMode = true; - this._buffered = ""; // we buffer untokenized data between writes - this._regexes = []; // should contain objects - // with regex[RegExp] and type[String] - this._ignored = {}; // a hash of ignored token types - // these will be parsed but not emitted + this._buffered = ''; // we buffer untokenized data between writes + this._regexes = []; // should contain objects with regex[RegExp] and type[String] + this._ignored = {}; // a hash of ignored token types these will be parsed but not emitted this._checkToken = check_token_cb || noop; } @@ -27,13 +27,22 @@ util.inherits(Tokenizer, Transform); Tokenizer.prototype._transform = function _transform(chunk, encoding, callback) { chunk = chunk.toString(); var self = this; + process.nextTick(function () { try { - var index = 0, step = 64; - while(index < chunk.length) { - self._tokenize(chunk.substr(index, step)); - index += step; + var index = 0, + step = self.options.stepSize; + + if (self.options.stepSize > 0) + { + while(index < chunk.length) { + self._tokenize(chunk.substr(index, step)); + index += step; + } } + else + self._tokenize(chunk); + callback(); } catch(e) { callback(e); diff --git a/package.json b/package.json index d3458cf..a47cfe0 100644 --- a/package.json +++ b/package.json @@ -1,13 +1,15 @@ { "name": "tokenizer", - "description": "A wide purpose tokenizer for node.js which looks like a stream", - "version": "1.1.2", + "description": "A wide purpose tokenizer for node.js which extends the built-in 'stream' module.", + "version": "1.2.0", "homepage": "http://github.com/floby/node-tokenizer", "repository": { "type": "git", "url": "git://github.com/Floby/node-tokenizer.git" }, "author": "Florent Jaby ", + "contributors": [], + "main": "lib/Tokenizer.js", "scripts": { "test": "nodeunit test/test-tokenizer.js" diff --git a/test/test-perf.js b/test/test-perf.js index 17dd52e..0351af0 100644 --- a/test/test-perf.js +++ b/test/test-perf.js @@ -1,10 +1,11 @@ -var tokenizer = require('../'); -var domain = require('domain'); +var tokenizer = require('../'), + domain = require('domain'); Function.prototype.withDomain = function(withStack) { var fn = this; return function(test) { var d = domain.create(); + d.on('error', function(e) { test.fail('test failed with ' + e.message); if(withStack) { @@ -12,6 +13,7 @@ Function.prototype.withDomain = function(withStack) { } test.done(); }); + d.run(fn.bind(this, test)); } } From 52d2ea52a2f77baefa633e10ed255e7febef7d76 Mon Sep 17 00:00:00 
2001 From: jjung Date: Wed, 3 Dec 2014 13:12:41 -0600 Subject: [PATCH 05/12] Adding dispatch of 'split' whenever the options.split is specified and the split token is encountered. --- lib/Tokenizer.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/Tokenizer.js b/lib/Tokenizer.js index 91f8d73..4eca28e 100644 --- a/lib/Tokenizer.js +++ b/lib/Tokenizer.js @@ -74,7 +74,9 @@ Tokenizer.prototype._tokenize = function _tokenize(data, nobuffer) { subdata.forEach(function (sub) { if (!sub.length) return; - + + self.emit('split'); + for (var i = sub.length; i > 0; i--) { curStr = sub.substring(0, i); From 1948102958ca251e1963715fd466fcacfb593089 Mon Sep 17 00:00:00 2001 From: jjung Date: Wed, 3 Dec 2014 15:22:52 -0600 Subject: [PATCH 06/12] Updates to performance and increasing split capabilities. --- README.md | 7 ++--- lib/Tokenizer.js | 78 ++++++++++++++++++++++++++++------------------- test/test-perf.js | 4 +-- 3 files changed, 52 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index bf1eeda..37fe9c1 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ Options is an object passed to the constructor function and can contain the foll { stepSize: 0, // For large streams, the maximum size that will be tokenized at a time. This must be larger than the largest expected token. - split: undefined // See explanation in 'Splitting into Smaller Pieces' + split: undefined // A regular expression. See explanation in 'Splitting into Smaller Pieces' } **Adding Rules** @@ -38,13 +38,12 @@ t.addRule(/^my regex$/, 'type'); **Splitting into Smaller Pieces** -By default, tokenizer attempts to find the longest match in the input stream. This can be a large performance hit for big files. If you are certain that your tokens will never cross a certain type of string boundary (like ',' or \n) you can specify -to split your input by that before tokenization which could improve performance dramatically. +By default, tokenizer attempts to find the longest match in the input stream. This can be a large performance hit for big files. If you are certain that your tokens will never cross a certain type of regular expression boundary (like /\n/) you can specify to split your input by that before tokenization which could improve performance dramatically. 
``` javascript // Break CSV into subportions and tokenize each subportion separately but in order of original input t = new Tokenizer(undefined, { - split: ',' + split: /\,/ }); ``` diff --git a/lib/Tokenizer.js b/lib/Tokenizer.js index 4eca28e..04ea380 100644 --- a/lib/Tokenizer.js +++ b/lib/Tokenizer.js @@ -40,61 +40,77 @@ Tokenizer.prototype._transform = function _transform(chunk, encoding, callback) index += step; } } - else - self._tokenize(chunk); + else self._tokenize(chunk); callback(); } catch(e) { - callback(e); + callback(e, chunk); } }) }; Tokenizer.prototype._getMatchingRule = function _getMatchingRule(str) { - //console.log('Try: \'' + str + '\'' ) - - for (var i = 0; i < this._regexes.length; ++i) { - if(this._regexes[i].regex.test(str)) { - return this._regexes[i]; - } - } + for (var i = 0; i < this._regexes.length; i++) + if(str.search(this._regexes[i].regex) == 0) + return this._regexes[i]; + return null; }; +Tokenizer.prototype._firstMatchLength = function(str, regex) { + for (var i = 1; i < str.length; i++) + if (regex.test(str.substr(0, i))) + return i; + return -1; +} + Tokenizer.prototype._tokenize = function _tokenize(data, nobuffer) { + // in case we buffered data on previous writes data = this._buffered + data; this._buffered = ''; - var self = this, - regexes = this._regexes, - rule = undefined, - curStr = undefined, - subdata = this.options.split ? data.split(this.options.split) : [data]; - - subdata.forEach(function (sub) { - if (!sub.length) return; + var rule = undefined, + ix = this.options.split ? data.search(this.options.split) : -1, + str; - self.emit('split'); + if (this.options.split && ix != -1) { + str = data.substr(0, ix); + data = data.substr(ix); - for (var i = sub.length; i > 0; i--) + var len = this._firstMatchLength(data, this.options.split); + this.emit('split', data.substr(0, len)); + + data = data.substr(len); + } + else { + str = data; + data = undefined; + } + + if (str.length) { + for (var i = str.length; i > 0; i--) { - curStr = sub.substring(0, i); - if (rule = self._getMatchingRule(curStr)) break; + rule = this._getMatchingRule(str.substr(0, i)); + if (rule) break; } - if (!rule) { - throw new SyntaxError('could not tokenize ' + JSON.stringify(sub)); - } - else if (i === sub.length && !nobuffer) { - // the whole string is matching - self._buffered = sub; + if (!rule) throw new SyntaxError('No rules found to match any part of \'' + str.toString() + '\''); + else if (i == str.length && !nobuffer) { + // the whole string is matching, so we add to buffered and wait for more data becasue we might be able to match more + // TODO: check if end of stream... if so we don't want to add to buffered! 
+ this._buffered = str; return; } - self._gotToken(curStr, rule); - self._tokenize(sub.substring(i), nobuffer); - }); + data = str.substr(i) + (data || ''); + str = str.substr(0, i); + + this._gotToken(str, rule); + } + + if (data && data.length) + this._tokenize(data, nobuffer); }; Tokenizer.prototype._flush = function _flush(callback) { diff --git a/test/test-perf.js b/test/test-perf.js index 0351af0..ca4b07a 100644 --- a/test/test-perf.js +++ b/test/test-perf.js @@ -46,10 +46,10 @@ Function.prototype.timed = function (timeout) { exports['test big file of small integers'] = function (test) { var numbers = [0]; - for (var i = 0; i < 100000; ++i) { + for (var i = 0; i < 1000000; ++i) { numbers.push(Math.floor(Math.random() * 10000)); }; - var t = tokenizer(undefined, {split: ','}); + var t = tokenizer(undefined, {split: /\,/}); t.addRule('number'); t.addRule(/^\d+\.$/, 'maybe-float'); t.addRule('whitespace'); From d6e53deb1c9842a24589baa20db288698d472aaa Mon Sep 17 00:00:00 2001 From: jjung Date: Wed, 3 Dec 2014 15:23:38 -0600 Subject: [PATCH 07/12] Increasing size of test perf integers. --- test/test-perf.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test-perf.js b/test/test-perf.js index ca4b07a..919bf63 100644 --- a/test/test-perf.js +++ b/test/test-perf.js @@ -47,7 +47,7 @@ Function.prototype.timed = function (timeout) { exports['test big file of small integers'] = function (test) { var numbers = [0]; for (var i = 0; i < 1000000; ++i) { - numbers.push(Math.floor(Math.random() * 10000)); + numbers.push(Math.floor(Math.random() * 100000)); }; var t = tokenizer(undefined, {split: /\,/}); t.addRule('number'); From 39a564761292f47c429aa86663b10aebf3076358 Mon Sep 17 00:00:00 2001 From: jjung Date: Wed, 3 Dec 2014 16:08:11 -0600 Subject: [PATCH 08/12] Various modifications to get unit tests working again. Attempting to get the 'end' event to fire and it just won't. --- lib/Tokenizer.js | 56 ++++++++++++++++++++++------------------------- test/test-perf.js | 3 +-- 2 files changed, 27 insertions(+), 32 deletions(-) diff --git a/lib/Tokenizer.js b/lib/Tokenizer.js index 04ea380..dda0ae5 100644 --- a/lib/Tokenizer.js +++ b/lib/Tokenizer.js @@ -35,7 +35,7 @@ Tokenizer.prototype._transform = function _transform(chunk, encoding, callback) if (self.options.stepSize > 0) { - while(index < chunk.length) { + while (index < chunk.length) { self._tokenize(chunk.substr(index, step)); index += step; } @@ -64,41 +64,39 @@ Tokenizer.prototype._firstMatchLength = function(str, regex) { return -1; } -Tokenizer.prototype._tokenize = function _tokenize(data, nobuffer) { - - // in case we buffered data on previous writes +Tokenizer.prototype._tokenize = function _tokenize(data, endofstream) { + // Did we buffered data on previous writes? data = this._buffered + data; this._buffered = ''; - var rule = undefined, - ix = this.options.split ? data.search(this.options.split) : -1, - str; + while (data && data.length) + { + var rule = undefined, + str = undefined, + ix = -1; - if (this.options.split && ix != -1) { - str = data.substr(0, ix); - data = data.substr(ix); + if (this.options.split) { + while ((ix = data.search(this.options.split)) == 0) + { + var len = this._firstMatchLength(data, this.options.split); + this.emit('split', data.substr(0, len)); - var len = this._firstMatchLength(data, this.options.split); - this.emit('split', data.substr(0, len)); + data = data.substr(len); + } - data = data.substr(len); - } - else { - str = data; - data = undefined; - } + str = ix != -1 ? 
data.substr(0, ix) : data; + data = ix != -1 ? data.substr(ix) : undefined; + } + else { + str = data; + data = undefined; + } - if (str.length) { for (var i = str.length; i > 0; i--) - { - rule = this._getMatchingRule(str.substr(0, i)); - if (rule) break; - } + if (rule = this._getMatchingRule(str.substr(0, i))) break; if (!rule) throw new SyntaxError('No rules found to match any part of \'' + str.toString() + '\''); - else if (i == str.length && !nobuffer) { - // the whole string is matching, so we add to buffered and wait for more data becasue we might be able to match more - // TODO: check if end of stream... if so we don't want to add to buffered! + else if (i == str.length && !endofstream && (!data || !data.length)) { this._buffered = str; return; } @@ -107,14 +105,12 @@ Tokenizer.prototype._tokenize = function _tokenize(data, nobuffer) { str = str.substr(0, i); this._gotToken(str, rule); - } - - if (data && data.length) - this._tokenize(data, nobuffer); + } // while }; Tokenizer.prototype._flush = function _flush(callback) { var self = this; + process.nextTick(function () { try { self._tokenize('', true); diff --git a/test/test-perf.js b/test/test-perf.js index 919bf63..0f1bb61 100644 --- a/test/test-perf.js +++ b/test/test-perf.js @@ -46,7 +46,7 @@ Function.prototype.timed = function (timeout) { exports['test big file of small integers'] = function (test) { var numbers = [0]; - for (var i = 0; i < 1000000; ++i) { + for (var i = 0; i < 100000; ++i) { numbers.push(Math.floor(Math.random() * 100000)); }; var t = tokenizer(undefined, {split: /\,/}); @@ -55,7 +55,6 @@ exports['test big file of small integers'] = function (test) { t.addRule('whitespace'); t.addRule(/^,$/, 'comma'); t.ignore('whitespace'); - t.ignore('comma'); t.on('data', function(token) { }); t.on('end', test.done.bind(test)); From cc3afacd7436da5be6769141f58911acbf1dfd37 Mon Sep 17 00:00:00 2001 From: jjung Date: Wed, 3 Dec 2014 16:12:37 -0600 Subject: [PATCH 09/12] Removing the process next tick from the flush method as it was causing issues when trying to listen to when tokenizing was done. --- lib/Tokenizer.js | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/lib/Tokenizer.js b/lib/Tokenizer.js index dda0ae5..9f658aa 100644 --- a/lib/Tokenizer.js +++ b/lib/Tokenizer.js @@ -109,16 +109,8 @@ Tokenizer.prototype._tokenize = function _tokenize(data, endofstream) { }; Tokenizer.prototype._flush = function _flush(callback) { - var self = this; - - process.nextTick(function () { - try { - self._tokenize('', true); - callback(); - } catch(e) { - callback(e); - } - }); + this._tokenize('', true); + callback(); }; var Token = function String (content, type) { From fdd40977626670951ce7248256a53e15b48e3684 Mon Sep 17 00:00:00 2001 From: jjung Date: Thu, 4 Dec 2014 11:18:11 -0600 Subject: [PATCH 10/12] Minor fix to make sure to add back in an end of line character when the split occurs so that there is some indicator the split is there. --- lib/Tokenizer.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Tokenizer.js b/lib/Tokenizer.js index 9f658aa..9aa9943 100644 --- a/lib/Tokenizer.js +++ b/lib/Tokenizer.js @@ -84,7 +84,7 @@ Tokenizer.prototype._tokenize = function _tokenize(data, endofstream) { data = data.substr(len); } - str = ix != -1 ? data.substr(0, ix) : data; + str = ix != -1 ? data.substr(0, ix) + '\n' : data; data = ix != -1 ? 
data.substr(ix) : undefined; } else { @@ -158,7 +158,7 @@ Tokenizer.prototype.addRule = function addRule(regex, type) { * set some tokens to be ignored. these won't be emitted */ Tokenizer.prototype.ignore = function ignore(ignored) { - if(ignored instanceof Array) + if (ignored instanceof Array) return ignored.forEach(this.ignore.bind(this)); this._ignored[ignored] = true; }; From 91555e7d30f7414f4e205b863cb939720eab54a5 Mon Sep 17 00:00:00 2001 From: jjung Date: Thu, 4 Dec 2014 12:36:46 -0600 Subject: [PATCH 11/12] Greatly improving the speed at which we are checking tokens. --- lib/Tokenizer.js | 63 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 17 deletions(-) diff --git a/lib/Tokenizer.js b/lib/Tokenizer.js index 9aa9943..fd09547 100644 --- a/lib/Tokenizer.js +++ b/lib/Tokenizer.js @@ -49,12 +49,35 @@ Tokenizer.prototype._transform = function _transform(chunk, encoding, callback) }) }; -Tokenizer.prototype._getMatchingRule = function _getMatchingRule(str) { +Tokenizer.prototype._getLongestMatch = function _getMatchingRule(str) { + var bestMatch = undefined, + longestMatchLen = 0; + + // Find the longest match that matches at the beginning of the string. for (var i = 0; i < this._regexes.length; i++) - if(str.search(this._regexes[i].regex) == 0) - return this._regexes[i]; + { + var match = undefined, + matches = str.match(this._regexes[i].regex); - return null; + if (matches && matches.length) + { + if ((match = matches[0]).length > longestMatchLen) + { + longestMatchLen = match.length; + bestMatch = { + rule: this._regexes[i], + match: match, + length: match.length, + matchesAll: longestMatchLen == str.length + }; + + if (longestMatchLen == str.length) + break; + } + } + } + + return bestMatch; }; Tokenizer.prototype._firstMatchLength = function(str, regex) { @@ -68,12 +91,14 @@ Tokenizer.prototype._tokenize = function _tokenize(data, endofstream) { // Did we buffered data on previous writes? data = this._buffered + data; this._buffered = ''; - + while (data && data.length) { - var rule = undefined, + + var match = undefined, str = undefined, - ix = -1; + ix = -1, + removeEOL = false; if (this.options.split) { while ((ix = data.search(this.options.split)) == 0) @@ -84,6 +109,8 @@ Tokenizer.prototype._tokenize = function _tokenize(data, endofstream) { data = data.substr(len); } + if (ix != -1) + removeEOL = true; str = ix != -1 ? data.substr(0, ix) + '\n' : data; data = ix != -1 ? 
data.substr(ix) : undefined; } @@ -92,19 +119,21 @@ Tokenizer.prototype._tokenize = function _tokenize(data, endofstream) { data = undefined; } - for (var i = str.length; i > 0; i--) - if (rule = this._getMatchingRule(str.substr(0, i))) break; + match = this._getLongestMatch(str); - if (!rule) throw new SyntaxError('No rules found to match any part of \'' + str.toString() + '\''); - else if (i == str.length && !endofstream && (!data || !data.length)) { + if (!match) throw new SyntaxError('No rules found to match any part of \'' + str.toString() + '\''); + else if (match.matchesAll && !endofstream && (!data || !data.length)) { this._buffered = str; return; } - data = str.substr(i) + (data || ''); - str = str.substr(0, i); + if (removeEOL) + str = str.substr(0, str.length - 1); + + data = str.substr(match.length) + (data || ''); + str = str.substr(0, match.length); - this._gotToken(str, rule); + this._gotToken(str, match.rule); } // while }; @@ -166,6 +195,6 @@ Tokenizer.prototype.ignore = function ignore(ignored) { module.exports = Tokenizer; // built-in rules -Tokenizer.whitespace = [/^(\s)+$/, 'whitespace']; -Tokenizer.word = [/^\w+$/, 'word']; -Tokenizer.number = [/^\d+(\.\d+)?$/, 'number']; \ No newline at end of file +Tokenizer.whitespace = [/^(\s)+/, 'whitespace']; +Tokenizer.word = [/^\w+/, 'word']; +Tokenizer.number = [/^\d+(\.\d+)?/, 'number']; \ No newline at end of file From 71207ac03b2701eb82780caaa9ab64a308c937fa Mon Sep 17 00:00:00 2001 From: jjung Date: Thu, 11 Dec 2014 11:55:54 -0600 Subject: [PATCH 12/12] Fixing issues to get working with slim-to-jade --- lib/Tokenizer.js | 65 ++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/lib/Tokenizer.js b/lib/Tokenizer.js index fd09547..1846045 100644 --- a/lib/Tokenizer.js +++ b/lib/Tokenizer.js @@ -1,3 +1,5 @@ +// Using a fork of npm tokenizer by JFloby. (c) JFloby with modifications by me. 
+ var EventEmitter = require('events').EventEmitter, util = require('util'), assert = require('assert'), @@ -5,7 +7,7 @@ var EventEmitter = require('events').EventEmitter, function noop(){} -function Tokenizer (check_token_cb, options) { +function Tokenizer (check_token_cb, options, error_cb) { if(!(this instanceof Tokenizer)) { return new Tokenizer(check_token_cb, options); } @@ -20,6 +22,7 @@ function Tokenizer (check_token_cb, options) { this._regexes = []; // should contain objects with regex[RegExp] and type[String] this._ignored = {}; // a hash of ignored token types these will be parsed but not emitted this._checkToken = check_token_cb || noop; + this._error = error_cb; } util.inherits(Tokenizer, Transform); @@ -29,33 +32,32 @@ Tokenizer.prototype._transform = function _transform(chunk, encoding, callback) var self = this; process.nextTick(function () { - try { - var index = 0, - step = self.options.stepSize; + var index = 0, + step = self.options.stepSize; - if (self.options.stepSize > 0) - { - while (index < chunk.length) { - self._tokenize(chunk.substr(index, step)); - index += step; - } + if (self.options.stepSize > 0) + { + while (index < chunk.length) { + self._tokenize(chunk.substr(index, step)); + index += step; } - else self._tokenize(chunk); - - callback(); - } catch(e) { - callback(e, chunk); } + else self._tokenize(chunk); + + callback(undefined, chunk); }) }; Tokenizer.prototype._getLongestMatch = function _getMatchingRule(str) { var bestMatch = undefined, - longestMatchLen = 0; + longestMatchLen = 0; // Find the longest match that matches at the beginning of the string. for (var i = 0; i < this._regexes.length; i++) { + if (this._regexes[i].filter && !this._regexes[i].filter(str)) + continue; + var match = undefined, matches = str.match(this._regexes[i].regex); @@ -94,7 +96,6 @@ Tokenizer.prototype._tokenize = function _tokenize(data, endofstream) { while (data && data.length) { - var match = undefined, str = undefined, ix = -1, @@ -104,9 +105,14 @@ Tokenizer.prototype._tokenize = function _tokenize(data, endofstream) { while ((ix = data.search(this.options.split)) == 0) { var len = this._firstMatchLength(data, this.options.split); - this.emit('split', data.substr(0, len)); - data = data.substr(len); + if (len != -1) + { + this.emit('split', data.substr(0, len)); + + data = data.substr(len); + } + else return; } if (ix != -1) @@ -121,7 +127,14 @@ Tokenizer.prototype._tokenize = function _tokenize(data, endofstream) { match = this._getLongestMatch(str); - if (!match) throw new SyntaxError('No rules found to match any part of \'' + str.toString() + '\''); + if (!match) { + var err = new SyntaxError('No rules found to match any part of \'' + str.toString() + '\''); + + if (this._error) + this._error(err); + else + throw err; + } else if (match.matchesAll && !endofstream && (!data || !data.length)) { this._buffered = str; return; @@ -165,14 +178,14 @@ Tokenizer.prototype._gotToken = function _gotToken(str, rule) { this.emit('token', token, type); }; -Tokenizer.prototype.addRule = function addRule(regex, type) { +Tokenizer.prototype.addRule = function addRule(regex, type, filter) { // this is useful for built-in rules if(!type) { if(Array.isArray(regex)) { - return this.addRule(regex[0], regex[1]); + return this.addRule(regex[0], regex[1], filter); } else if(regex) { - return this.addRule(Tokenizer[regex]); + return this.addRule(Tokenizer[regex], filter); } else { throw new Error('No parameters specified'); @@ -180,7 +193,11 @@ Tokenizer.prototype.addRule = function 
addRule(regex, type) { } assert.ok((regex instanceof RegExp) || (typeof regex === 'function')); assert.equal(typeof type, 'string'); - this._regexes.push({regex:regex,type:type}); + this._regexes.push({ + regex:regex, + type:type, + filter: filter + }); }; /**
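Taken together, the patches above rework construction (`options.split`, `options.stepSize`, and an optional error callback), matching (`_getLongestMatch` with per-rule filters), and event dispatch (the `'split'` event). The sketch below is illustrative only and is not part of any commit in this series; the require path and the comma-separated sample input are assumptions made for the example.

``` javascript
// Illustrative sketch only; not part of the patch series. Assumes the patched
// lib/Tokenizer.js from the commits above; the require path and the sample
// input are invented for the example.
var Tokenizer = require('./lib/Tokenizer');

var t = new Tokenizer(undefined, {
  split: /\,/,   // tokenize each comma-separated piece on its own (PATCH 01/06)
  stepSize: 0    // 0 tokenizes whole chunks; > 0 caps characters handled per pass (PATCH 04)
}, function (err) {
  // optional error callback (PATCH 12): called instead of throwing when no rule matches
  console.error(err.message);
});

t.addRule('number');      // built-in /^\d+(\.\d+)?/ after PATCH 11
t.addRule('word');        // built-in /^\w+/
t.addRule('whitespace');  // built-in /^(\s)+/
t.ignore('whitespace');

t.on('token', function (token, type) {
  console.log(type + ' -> ' + token); // per the README above, the token carries the matched text
});
t.on('split', function (sep) {
  // emitted once per matched split boundary; sep is the separator text (PATCH 05/12)
});
t.on('data', function () {});                      // drain the readable side so 'end' fires
t.on('end', function () { console.log('done'); });

t.write('abc,123,4.5');
t.end();
```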