From a7eafb75184c63ed0b9a9c53256439908d814514 Mon Sep 17 00:00:00 2001 From: jjung Date: Tue, 25 Nov 2014 11:09:49 -0600 Subject: [PATCH 01/12] Rework of some of the underlying functionality. Adding options.split property to help speed up tokenization for huge files where there is distinct breaks between tokens (e.g. CVS file). --- lib/Tokenizer.js | 142 ++++++++++++++++++++--------------------- package.json | 3 - test/test-perf.js | 4 +- test/test-tokenizer.js | 1 - 4 files changed, 71 insertions(+), 79 deletions(-) diff --git a/lib/Tokenizer.js b/lib/Tokenizer.js index 01dc3d2..971efca 100644 --- a/lib/Tokenizer.js +++ b/lib/Tokenizer.js @@ -1,25 +1,27 @@ -var EventEmitter = require('events').EventEmitter; -var util = require('util'); -var assert = require('assert'); -var Transform = require('stream').Transform; -var disect = require('disect'); +var EventEmitter = require('events').EventEmitter, + util = require('util'), + assert = require('assert'), + Transform = require('stream').Transform; function noop(){} function Tokenizer (check_token_cb, options) { - if(!(this instanceof Tokenizer)) { - return new Tokenizer(check_token_cb); - } + if(!(this instanceof Tokenizer)) { + return new Tokenizer(check_token_cb, options); + } + + this.options = options || {}; - Transform.call(this, options); - this._readableState.objectMode = true; - this._buffered = ""; // we buffer untokenized data between writes - this._regexes = []; // should contain objects - // with regex[RegExp] and type[String] - this._ignored = {}; // a hash of ignored token types - // these will be parsed but not emitted - this._checkToken = check_token_cb || noop; + Transform.call(this, options); + this._readableState.objectMode = true; + this._buffered = ""; // we buffer untokenized data between writes + this._regexes = []; // should contain objects + // with regex[RegExp] and type[String] + this._ignored = {}; // a hash of ignored token types + // these will be parsed but not emitted + this._checkToken = check_token_cb || noop; } + util.inherits(Tokenizer, Transform); Tokenizer.prototype._transform = function _transform(chunk, encoding, callback) { @@ -40,8 +42,10 @@ Tokenizer.prototype._transform = function _transform(chunk, encoding, callback) }; Tokenizer.prototype._getMatchingRule = function _getMatchingRule(str) { + //console.log('Try: \'' + str + '\'' ) + for (var i = 0; i < this._regexes.length; ++i) { - if(this._regexes[i].regex.test(str)) { + if(this._regexes[i].regex.test(str)) { return this._regexes[i]; } } @@ -49,39 +53,37 @@ Tokenizer.prototype._getMatchingRule = function _getMatchingRule(str) { }; Tokenizer.prototype._tokenize = function _tokenize(data, nobuffer) { - var regexes = this._regexes; - // in case we buffered data on previous writes - data = this._buffered + data; - this._buffered = ''; - if(!data.length) { - return; + // in case we buffered data on previous writes + data = this._buffered + data; + this._buffered = ''; + + var self = this, + regexes = this._regexes, + rule = undefined, + curStr = undefined, + subdata = this.options.split ? 
data.split(this.options.split) : [data]; + + subdata.forEach(function (sub) { + if (!sub.length) return; + + for (var i = sub.length; i > 0; i--) + { + curStr = sub.substring(0, i); + if (rule = self._getMatchingRule(curStr)) break; } - var self = this; - var maxIndex = disect(0, data.length, function (index) { - var buf = data.substring(0, index + 1); - return self._getMatchingRule(buf) === null; - }); - - if(maxIndex === 0) { - // no match found - throw new SyntaxError('could not tokenize ' + JSON.stringify(data)); + if (!rule) { + throw new SyntaxError('could not tokenize ' + JSON.stringify(sub)); } - else if (maxIndex === data.length && !nobuffer) { + else if (i === sub.length && !nobuffer) { // the whole string is matching - this._buffered = data; + self._buffered = sub; return; } - else { - // some substring is matching - var str = data.substring(0, maxIndex); - var rule = this._getMatchingRule(str); - if(!rule) { - throw new Error('wut ?'); - } - this._gotToken(str, rule); - this._tokenize(data.substring(maxIndex), nobuffer); - } + + self._gotToken(curStr, rule); + self._tokenize(sub.substring(i), nobuffer); + }); }; Tokenizer.prototype._flush = function _flush(callback) { @@ -109,45 +111,41 @@ Token.prototype.valueOf = function valueOf() { }; Tokenizer.prototype._gotToken = function _gotToken(str, rule) { - // notify the token checker - var type = this._checkToken(str, rule) || rule.type; - if(this._ignored[type]) return; - var token = new Token(str, type); + // notify the token checker + var type = rule.type || this._checkToken(str, rule); + if(this._ignored[type]) return; + var token = new Token(str, type); - this.push(token); + this.push(token); - this.emit('token', token, type); + this.emit('token', token, type); }; Tokenizer.prototype.addRule = function addRule(regex, type) { - // this is useful for built-in rules - if(!type) { - if(Array.isArray(regex)) { - return this.addRule(regex[0], regex[1]); - } - else if(regex) { - return this.addRule(Tokenizer[regex]); - } - else { - throw new Error('No parameters specified'); - } + // this is useful for built-in rules + if(!type) { + if(Array.isArray(regex)) { + return this.addRule(regex[0], regex[1]); + } + else if(regex) { + return this.addRule(Tokenizer[regex]); } - assert.ok((regex instanceof RegExp) || (typeof regex === 'function')); - assert.equal(typeof type, 'string'); - this._regexes.push({regex:regex,type:type}); + else { + throw new Error('No parameters specified'); + } + } + assert.ok((regex instanceof RegExp) || (typeof regex === 'function')); + assert.equal(typeof type, 'string'); + this._regexes.push({regex:regex,type:type}); }; /** * set some tokens to be ignored. 
these won't be emitted */ Tokenizer.prototype.ignore = function ignore(ignored) { - if(Array.isArray(ignored)) { - for (var i = 0; i < ignored.length; ++i) { - this.ignore(ignored[i]); - } - return; - } - this._ignored[ignored] = true; + if(ignored instanceof Array) + return ignored.forEach(this.ignore.bind(this)); + this._ignored[ignored] = true; }; module.exports = Tokenizer; @@ -155,4 +153,4 @@ module.exports = Tokenizer; // built-in rules Tokenizer.whitespace = [/^(\s)+$/, 'whitespace']; Tokenizer.word = [/^\w+$/, 'word']; -Tokenizer.number = [/^\d+(\.\d+)?$/, 'number']; +Tokenizer.number = [/^\d+(\.\d+)?$/, 'number']; \ No newline at end of file diff --git a/package.json b/package.json index ea984c2..d3458cf 100644 --- a/package.json +++ b/package.json @@ -20,8 +20,5 @@ }, "devDependencies": { "nodeunit": "~0.8.1" - }, - "dependencies": { - "disect": "~1.1.0" } } diff --git a/test/test-perf.js b/test/test-perf.js index 219cf4e..17dd52e 100644 --- a/test/test-perf.js +++ b/test/test-perf.js @@ -42,14 +42,12 @@ Function.prototype.timed = function (timeout) { } } - - exports['test big file of small integers'] = function (test) { var numbers = [0]; for (var i = 0; i < 100000; ++i) { numbers.push(Math.floor(Math.random() * 10000)); }; - var t = tokenizer(); + var t = tokenizer(undefined, {split: ','}); t.addRule('number'); t.addRule(/^\d+\.$/, 'maybe-float'); t.addRule('whitespace'); diff --git a/test/test-tokenizer.js b/test/test-tokenizer.js index 5e3e3c3..1f5d836 100644 --- a/test/test-tokenizer.js +++ b/test/test-tokenizer.js @@ -16,7 +16,6 @@ Function.prototype.withDomain = function(withStack) { } } - exports['test empty'] = function(test) { var t = tokenizer(); t.on('data', test.fail.bind(test, "No data should be emitted")); From d914a76ac4b2fc13ffbe7cf2688c0c808b7fa3d8 Mon Sep 17 00:00:00 2001 From: jjung Date: Tue, 25 Nov 2014 11:16:34 -0600 Subject: [PATCH 02/12] Updating nodeunit test for special case scenario that was failing for me. --- test/test-tokenizer.js | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/test/test-tokenizer.js b/test/test-tokenizer.js index 1f5d836..d708a15 100644 --- a/test/test-tokenizer.js +++ b/test/test-tokenizer.js @@ -173,3 +173,26 @@ exports['words in two chunks'] = function(test) { t.write('Hell'); t.end('o World'); }.withDomain(); + +exports['verify regex priority order and that longest matches first'] = function(test) { + //Test case built for a tokenizer I was building that was supposed to parse SLIM template code but was not working. 
+ var t = tokenizer(undefined, {split: /^\r?\n+$/}); + t.addRule(/^([a-zA-Z0-9\-_]+\s*=\s*)(["'])(\\\2|[^"']+)*?\2$/, 'tKeyValue'); // name='value' + t.addRule(/^[a-zA-Z0-9\-_]+$/, 'tIdentifier'); // name + t.addRule(/^[#][a-zA-Z0-9\-_]+$/, 'tIdName'); // #name + t.addRule(/^\.[a-zA-Z0-9\-_]+$/, 'tClassName'); // .name + t.addRule('whitespace'); + t.ignore('whitespace'); + + var expectations = ['tIdentifier', 'tIdName', 'tClassName', 'tKeyValue', 'tKeyValue']; + + t.on('data', function(token) { + var e = expectations.shift(); + + test.equal(e, token.type); + }); + + t.on('end', test.done.bind(test)); + t.write('tag#id.class var1 = \'value1\' var2 = \'value2\''); + t.end(); +}.withDomain(); \ No newline at end of file From 9d2e2b9af748c7e22cb1beb9a0c44baccb4de439 Mon Sep 17 00:00:00 2001 From: jjung Date: Tue, 25 Nov 2014 11:21:01 -0600 Subject: [PATCH 03/12] Updating the README.md --- README.md | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index e7972af..41536f4 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,11 @@ [![Build Status](https://travis-ci.org/Floby/node-tokenizer.png)](https://travis-ci.org/Floby/node-tokenizer) # Synopsis -A wide purpose tokenizer for JavaScript. The interface follows more or less -the WriteStream from [node.js](http://nodejs.org). +A wide purpose tokenizer for JavaScript. The interface follows more or less the WriteStream from [node.js](http://nodejs.org). -node-tokenizer is published on npm so you can install it with `npm install tokenizer` +# Installation + + npm i tokenizer ## How to @@ -26,6 +27,24 @@ var t = new Tokenizer(mycallback); t.addRule(/^my regex$/, 'type'); ``` +* add split + +By default, tokenizer attempts to find the longest match in the input stream. This can be a large performance hit for big files. If you are certain that your tokens will never cross a certain type of string boundary (like ',' or \n) you can specify +to split your input by that before tokenization which could improve performance dramatically. + +``` javascript +// Break CSV into subportions and tokenize each subportion separately but in order of original input +t = new Tokenizer(undefined, { + split: ',' +}); +``` +``` javascript +// Break file up by lines and tokenize each line separately. +t = new Tokenizer(undefined, { + split: /\r?\n/ +}); +``` + * write or pump to it ``` javascript From ecf0fe3c51a4a51761506f2219916e8a06c0d58b Mon Sep 17 00:00:00 2001 From: jjung Date: Wed, 3 Dec 2014 11:34:50 -0600 Subject: [PATCH 04/12] Adding stepSize option. Also updating Readme. --- README.md | 56 +++++++++++++++++++++++++++++++---------------- lib/Tokenizer.js | 33 ++++++++++++++++++---------- package.json | 6 +++-- test/test-perf.js | 6 +++-- 4 files changed, 66 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 41536f4..bf1eeda 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ [![Build Status](https://travis-ci.org/Floby/node-tokenizer.png)](https://travis-ci.org/Floby/node-tokenizer) # Synopsis -A wide purpose tokenizer for JavaScript. The interface follows more or less the WriteStream from [node.js](http://nodejs.org). +A wide purpose tokenizer for JavaScript that tokenizes based on rules established using Regular Expressions. The interface conforms to the WriteStream from [node.js](http://nodejs.org). # Installation @@ -9,25 +9,34 @@ A wide purpose tokenizer for JavaScript. 
The interface follows more or less the ## How to -* require the Tokenizer constructor +**Requiring** ``` javascript var Tokenizer = require('tokenizer'); ``` -* construct one (we'll see what the callback is used for) +**Construction** ``` javascript -var t = new Tokenizer(mycallback); +var t = new Tokenizer(mycallback, options); ``` -* add rules +**Setting Options** + +Options is an object passed to the constructor function and can contain the following properties (defaults shown inline): + + { + stepSize: 0, // For large streams, the maximum size that will be tokenized at a time. This must be larger than the largest expected token. + split: undefined // See explanation in 'Splitting into Smaller Pieces' + } + +**Adding Rules** ``` javascript t.addRule(/^my regex$/, 'type'); ``` -* add split +**Splitting into Smaller Pieces** By default, tokenizer attempts to find the longest match in the input stream. This can be a large performance hit for big files. If you are certain that your tokens will never cross a certain type of string boundary (like ',' or \n) you can specify to split your input by that before tokenization which could improve performance dramatically. @@ -38,6 +47,7 @@ t = new Tokenizer(undefined, { split: ',' }); ``` + ``` javascript // Break file up by lines and tokenize each line separately. t = new Tokenizer(undefined, { @@ -45,7 +55,7 @@ t = new Tokenizer(undefined, { }); ``` -* write or pump to it +**Writing/Piping** ``` javascript t.write(data); @@ -53,18 +63,18 @@ t.write(data); stream.pipe(t); ``` -* listen for new tokens +**Listen for tokens** ``` javascript t.on('token', function(token, type) { // do something useful // type is the type of the token (specified with addRule) // token is the actual matching string -}) +}); // alternatively you can use the tokenizer as a readable stream. ``` -* look out for the end +**Listening for completion** ``` javascript t.on('end', callback); @@ -82,24 +92,32 @@ and match, an object like this } ``` -Have a look in the example folder +##Examples + +Take a look a the [examples](https://github.com/Floby/node-tokenizer/tree/master/examples) folder. ## Rules -rules are regular expressions associated with a type name. + +Rules are regular expressions associated with a type name. + The tokenizer tries to find the longest string matching one or more rules. When several rules match the same string, priority is given to the rule -which was added first. (this may change) +which was added first. -Please note that your regular expressions should use ^ and $ in order +Note: normally your regular expressions should use ^ and $ in order to test the whole string. If these are not used, you rule will match _every_ string that contains what you specified, this could be the whole file! 
## To do -* a lot of optimisation -* being able to share rules across several tokenizers - (although this can be achieved through inheritance) -* probably more hooks -* more checking + +* Continued optimisation +* Rule sharing across several tokenizers (although this can be achieved through inheritance) +* Need more hooks +* Increase test coverage + +## Testing + +Testing is provided via the ## License diff --git a/lib/Tokenizer.js b/lib/Tokenizer.js index 971efca..91f8d73 100644 --- a/lib/Tokenizer.js +++ b/lib/Tokenizer.js @@ -1,7 +1,7 @@ var EventEmitter = require('events').EventEmitter, - util = require('util'), - assert = require('assert'), - Transform = require('stream').Transform; + util = require('util'), + assert = require('assert'), + Transform = require('stream').Transform; function noop(){} @@ -11,14 +11,14 @@ function Tokenizer (check_token_cb, options) { } this.options = options || {}; + this.options.stepSize = this.options.hasOwnProperty('stepSize') ? this.options.stepSize : 0; Transform.call(this, options); + this._readableState.objectMode = true; - this._buffered = ""; // we buffer untokenized data between writes - this._regexes = []; // should contain objects - // with regex[RegExp] and type[String] - this._ignored = {}; // a hash of ignored token types - // these will be parsed but not emitted + this._buffered = ''; // we buffer untokenized data between writes + this._regexes = []; // should contain objects with regex[RegExp] and type[String] + this._ignored = {}; // a hash of ignored token types these will be parsed but not emitted this._checkToken = check_token_cb || noop; } @@ -27,13 +27,22 @@ util.inherits(Tokenizer, Transform); Tokenizer.prototype._transform = function _transform(chunk, encoding, callback) { chunk = chunk.toString(); var self = this; + process.nextTick(function () { try { - var index = 0, step = 64; - while(index < chunk.length) { - self._tokenize(chunk.substr(index, step)); - index += step; + var index = 0, + step = self.options.stepSize; + + if (self.options.stepSize > 0) + { + while(index < chunk.length) { + self._tokenize(chunk.substr(index, step)); + index += step; + } } + else + self._tokenize(chunk); + callback(); } catch(e) { callback(e); diff --git a/package.json b/package.json index d3458cf..a47cfe0 100644 --- a/package.json +++ b/package.json @@ -1,13 +1,15 @@ { "name": "tokenizer", - "description": "A wide purpose tokenizer for node.js which looks like a stream", - "version": "1.1.2", + "description": "A wide purpose tokenizer for node.js which extends the built-in 'stream' module.", + "version": "1.2.0", "homepage": "http://github.com/floby/node-tokenizer", "repository": { "type": "git", "url": "git://github.com/Floby/node-tokenizer.git" }, "author": "Florent Jaby ", + "contributors": [], + "main": "lib/Tokenizer.js", "scripts": { "test": "nodeunit test/test-tokenizer.js" diff --git a/test/test-perf.js b/test/test-perf.js index 17dd52e..0351af0 100644 --- a/test/test-perf.js +++ b/test/test-perf.js @@ -1,10 +1,11 @@ -var tokenizer = require('../'); -var domain = require('domain'); +var tokenizer = require('../'), + domain = require('domain'); Function.prototype.withDomain = function(withStack) { var fn = this; return function(test) { var d = domain.create(); + d.on('error', function(e) { test.fail('test failed with ' + e.message); if(withStack) { @@ -12,6 +13,7 @@ Function.prototype.withDomain = function(withStack) { } test.done(); }); + d.run(fn.bind(this, test)); } } From 52d2ea52a2f77baefa633e10ed255e7febef7d76 Mon Sep 17 00:00:00 
2001 From: jjung Date: Wed, 3 Dec 2014 13:12:41 -0600 Subject: [PATCH 05/12] Adding dispatch of 'split' whenever the options.split is specified and the split token is encountered. --- lib/Tokenizer.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/Tokenizer.js b/lib/Tokenizer.js index 91f8d73..4eca28e 100644 --- a/lib/Tokenizer.js +++ b/lib/Tokenizer.js @@ -74,7 +74,9 @@ Tokenizer.prototype._tokenize = function _tokenize(data, nobuffer) { subdata.forEach(function (sub) { if (!sub.length) return; - + + self.emit('split'); + for (var i = sub.length; i > 0; i--) { curStr = sub.substring(0, i); From 1948102958ca251e1963715fd466fcacfb593089 Mon Sep 17 00:00:00 2001 From: jjung Date: Wed, 3 Dec 2014 15:22:52 -0600 Subject: [PATCH 06/12] Updates to performance and increasing split capabilities. --- README.md | 7 ++--- lib/Tokenizer.js | 78 ++++++++++++++++++++++++++++------------------- test/test-perf.js | 4 +-- 3 files changed, 52 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index bf1eeda..37fe9c1 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ Options is an object passed to the constructor function and can contain the foll { stepSize: 0, // For large streams, the maximum size that will be tokenized at a time. This must be larger than the largest expected token. - split: undefined // See explanation in 'Splitting into Smaller Pieces' + split: undefined // A regular expression. See explanation in 'Splitting into Smaller Pieces' } **Adding Rules** @@ -38,13 +38,12 @@ t.addRule(/^my regex$/, 'type'); **Splitting into Smaller Pieces** -By default, tokenizer attempts to find the longest match in the input stream. This can be a large performance hit for big files. If you are certain that your tokens will never cross a certain type of string boundary (like ',' or \n) you can specify -to split your input by that before tokenization which could improve performance dramatically. +By default, tokenizer attempts to find the longest match in the input stream. This can be a large performance hit for big files. If you are certain that your tokens will never cross a certain type of regular expression boundary (like /\n/) you can specify to split your input by that before tokenization which could improve performance dramatically. 
``` javascript // Break CSV into subportions and tokenize each subportion separately but in order of original input t = new Tokenizer(undefined, { - split: ',' + split: /\,/ }); ``` diff --git a/lib/Tokenizer.js b/lib/Tokenizer.js index 4eca28e..04ea380 100644 --- a/lib/Tokenizer.js +++ b/lib/Tokenizer.js @@ -40,61 +40,77 @@ Tokenizer.prototype._transform = function _transform(chunk, encoding, callback) index += step; } } - else - self._tokenize(chunk); + else self._tokenize(chunk); callback(); } catch(e) { - callback(e); + callback(e, chunk); } }) }; Tokenizer.prototype._getMatchingRule = function _getMatchingRule(str) { - //console.log('Try: \'' + str + '\'' ) - - for (var i = 0; i < this._regexes.length; ++i) { - if(this._regexes[i].regex.test(str)) { - return this._regexes[i]; - } - } + for (var i = 0; i < this._regexes.length; i++) + if(str.search(this._regexes[i].regex) == 0) + return this._regexes[i]; + return null; }; +Tokenizer.prototype._firstMatchLength = function(str, regex) { + for (var i = 1; i < str.length; i++) + if (regex.test(str.substr(0, i))) + return i; + return -1; +} + Tokenizer.prototype._tokenize = function _tokenize(data, nobuffer) { + // in case we buffered data on previous writes data = this._buffered + data; this._buffered = ''; - var self = this, - regexes = this._regexes, - rule = undefined, - curStr = undefined, - subdata = this.options.split ? data.split(this.options.split) : [data]; - - subdata.forEach(function (sub) { - if (!sub.length) return; + var rule = undefined, + ix = this.options.split ? data.search(this.options.split) : -1, + str; - self.emit('split'); + if (this.options.split && ix != -1) { + str = data.substr(0, ix); + data = data.substr(ix); - for (var i = sub.length; i > 0; i--) + var len = this._firstMatchLength(data, this.options.split); + this.emit('split', data.substr(0, len)); + + data = data.substr(len); + } + else { + str = data; + data = undefined; + } + + if (str.length) { + for (var i = str.length; i > 0; i--) { - curStr = sub.substring(0, i); - if (rule = self._getMatchingRule(curStr)) break; + rule = this._getMatchingRule(str.substr(0, i)); + if (rule) break; } - if (!rule) { - throw new SyntaxError('could not tokenize ' + JSON.stringify(sub)); - } - else if (i === sub.length && !nobuffer) { - // the whole string is matching - self._buffered = sub; + if (!rule) throw new SyntaxError('No rules found to match any part of \'' + str.toString() + '\''); + else if (i == str.length && !nobuffer) { + // the whole string is matching, so we add to buffered and wait for more data becasue we might be able to match more + // TODO: check if end of stream... if so we don't want to add to buffered! 
+ this._buffered = str; return; } - self._gotToken(curStr, rule); - self._tokenize(sub.substring(i), nobuffer); - }); + data = str.substr(i) + (data || ''); + str = str.substr(0, i); + + this._gotToken(str, rule); + } + + if (data && data.length) + this._tokenize(data, nobuffer); }; Tokenizer.prototype._flush = function _flush(callback) { diff --git a/test/test-perf.js b/test/test-perf.js index 0351af0..ca4b07a 100644 --- a/test/test-perf.js +++ b/test/test-perf.js @@ -46,10 +46,10 @@ Function.prototype.timed = function (timeout) { exports['test big file of small integers'] = function (test) { var numbers = [0]; - for (var i = 0; i < 100000; ++i) { + for (var i = 0; i < 1000000; ++i) { numbers.push(Math.floor(Math.random() * 10000)); }; - var t = tokenizer(undefined, {split: ','}); + var t = tokenizer(undefined, {split: /\,/}); t.addRule('number'); t.addRule(/^\d+\.$/, 'maybe-float'); t.addRule('whitespace'); From d6e53deb1c9842a24589baa20db288698d472aaa Mon Sep 17 00:00:00 2001 From: jjung Date: Wed, 3 Dec 2014 15:23:38 -0600 Subject: [PATCH 07/12] Increasing size of test perf integers. --- test/test-perf.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test-perf.js b/test/test-perf.js index ca4b07a..919bf63 100644 --- a/test/test-perf.js +++ b/test/test-perf.js @@ -47,7 +47,7 @@ Function.prototype.timed = function (timeout) { exports['test big file of small integers'] = function (test) { var numbers = [0]; for (var i = 0; i < 1000000; ++i) { - numbers.push(Math.floor(Math.random() * 10000)); + numbers.push(Math.floor(Math.random() * 100000)); }; var t = tokenizer(undefined, {split: /\,/}); t.addRule('number'); From 39a564761292f47c429aa86663b10aebf3076358 Mon Sep 17 00:00:00 2001 From: jjung Date: Wed, 3 Dec 2014 16:08:11 -0600 Subject: [PATCH 08/12] Various modifications to get unit tests working again. Attempting to get the 'end' event to fire and it just won't. --- lib/Tokenizer.js | 56 ++++++++++++++++++++++------------------------- test/test-perf.js | 3 +-- 2 files changed, 27 insertions(+), 32 deletions(-) diff --git a/lib/Tokenizer.js b/lib/Tokenizer.js index 04ea380..dda0ae5 100644 --- a/lib/Tokenizer.js +++ b/lib/Tokenizer.js @@ -35,7 +35,7 @@ Tokenizer.prototype._transform = function _transform(chunk, encoding, callback) if (self.options.stepSize > 0) { - while(index < chunk.length) { + while (index < chunk.length) { self._tokenize(chunk.substr(index, step)); index += step; } @@ -64,41 +64,39 @@ Tokenizer.prototype._firstMatchLength = function(str, regex) { return -1; } -Tokenizer.prototype._tokenize = function _tokenize(data, nobuffer) { - - // in case we buffered data on previous writes +Tokenizer.prototype._tokenize = function _tokenize(data, endofstream) { + // Did we buffered data on previous writes? data = this._buffered + data; this._buffered = ''; - var rule = undefined, - ix = this.options.split ? data.search(this.options.split) : -1, - str; + while (data && data.length) + { + var rule = undefined, + str = undefined, + ix = -1; - if (this.options.split && ix != -1) { - str = data.substr(0, ix); - data = data.substr(ix); + if (this.options.split) { + while ((ix = data.search(this.options.split)) == 0) + { + var len = this._firstMatchLength(data, this.options.split); + this.emit('split', data.substr(0, len)); - var len = this._firstMatchLength(data, this.options.split); - this.emit('split', data.substr(0, len)); + data = data.substr(len); + } - data = data.substr(len); - } - else { - str = data; - data = undefined; - } + str = ix != -1 ? 
data.substr(0, ix) : data; + data = ix != -1 ? data.substr(ix) : undefined; + } + else { + str = data; + data = undefined; + } - if (str.length) { for (var i = str.length; i > 0; i--) - { - rule = this._getMatchingRule(str.substr(0, i)); - if (rule) break; - } + if (rule = this._getMatchingRule(str.substr(0, i))) break; if (!rule) throw new SyntaxError('No rules found to match any part of \'' + str.toString() + '\''); - else if (i == str.length && !nobuffer) { - // the whole string is matching, so we add to buffered and wait for more data becasue we might be able to match more - // TODO: check if end of stream... if so we don't want to add to buffered! + else if (i == str.length && !endofstream && (!data || !data.length)) { this._buffered = str; return; } @@ -107,14 +105,12 @@ Tokenizer.prototype._tokenize = function _tokenize(data, nobuffer) { str = str.substr(0, i); this._gotToken(str, rule); - } - - if (data && data.length) - this._tokenize(data, nobuffer); + } // while }; Tokenizer.prototype._flush = function _flush(callback) { var self = this; + process.nextTick(function () { try { self._tokenize('', true); diff --git a/test/test-perf.js b/test/test-perf.js index 919bf63..0f1bb61 100644 --- a/test/test-perf.js +++ b/test/test-perf.js @@ -46,7 +46,7 @@ Function.prototype.timed = function (timeout) { exports['test big file of small integers'] = function (test) { var numbers = [0]; - for (var i = 0; i < 1000000; ++i) { + for (var i = 0; i < 100000; ++i) { numbers.push(Math.floor(Math.random() * 100000)); }; var t = tokenizer(undefined, {split: /\,/}); @@ -55,7 +55,6 @@ exports['test big file of small integers'] = function (test) { t.addRule('whitespace'); t.addRule(/^,$/, 'comma'); t.ignore('whitespace'); - t.ignore('comma'); t.on('data', function(token) { }); t.on('end', test.done.bind(test)); From cc3afacd7436da5be6769141f58911acbf1dfd37 Mon Sep 17 00:00:00 2001 From: jjung Date: Wed, 3 Dec 2014 16:12:37 -0600 Subject: [PATCH 09/12] Removing the process next tick from the flush method as it was causing issues when trying to listen to when tokenizing was done. --- lib/Tokenizer.js | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/lib/Tokenizer.js b/lib/Tokenizer.js index dda0ae5..9f658aa 100644 --- a/lib/Tokenizer.js +++ b/lib/Tokenizer.js @@ -109,16 +109,8 @@ Tokenizer.prototype._tokenize = function _tokenize(data, endofstream) { }; Tokenizer.prototype._flush = function _flush(callback) { - var self = this; - - process.nextTick(function () { - try { - self._tokenize('', true); - callback(); - } catch(e) { - callback(e); - } - }); + this._tokenize('', true); + callback(); }; var Token = function String (content, type) { From fdd40977626670951ce7248256a53e15b48e3684 Mon Sep 17 00:00:00 2001 From: jjung Date: Thu, 4 Dec 2014 11:18:11 -0600 Subject: [PATCH 10/12] Minor fix to make sure to add back in an end of line character when the split occurs so that there is some indicator the split is there. --- lib/Tokenizer.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/Tokenizer.js b/lib/Tokenizer.js index 9f658aa..9aa9943 100644 --- a/lib/Tokenizer.js +++ b/lib/Tokenizer.js @@ -84,7 +84,7 @@ Tokenizer.prototype._tokenize = function _tokenize(data, endofstream) { data = data.substr(len); } - str = ix != -1 ? data.substr(0, ix) : data; + str = ix != -1 ? data.substr(0, ix) + '\n' : data; data = ix != -1 ? 
data.substr(ix) : undefined; } else { @@ -158,7 +158,7 @@ Tokenizer.prototype.addRule = function addRule(regex, type) { * set some tokens to be ignored. these won't be emitted */ Tokenizer.prototype.ignore = function ignore(ignored) { - if(ignored instanceof Array) + if (ignored instanceof Array) return ignored.forEach(this.ignore.bind(this)); this._ignored[ignored] = true; }; From 91555e7d30f7414f4e205b863cb939720eab54a5 Mon Sep 17 00:00:00 2001 From: jjung Date: Thu, 4 Dec 2014 12:36:46 -0600 Subject: [PATCH 11/12] Greatly improving the speed at which we are checking tokens. --- lib/Tokenizer.js | 63 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 46 insertions(+), 17 deletions(-) diff --git a/lib/Tokenizer.js b/lib/Tokenizer.js index 9aa9943..fd09547 100644 --- a/lib/Tokenizer.js +++ b/lib/Tokenizer.js @@ -49,12 +49,35 @@ Tokenizer.prototype._transform = function _transform(chunk, encoding, callback) }) }; -Tokenizer.prototype._getMatchingRule = function _getMatchingRule(str) { +Tokenizer.prototype._getLongestMatch = function _getMatchingRule(str) { + var bestMatch = undefined, + longestMatchLen = 0; + + // Find the longest match that matches at the beginning of the string. for (var i = 0; i < this._regexes.length; i++) - if(str.search(this._regexes[i].regex) == 0) - return this._regexes[i]; + { + var match = undefined, + matches = str.match(this._regexes[i].regex); - return null; + if (matches && matches.length) + { + if ((match = matches[0]).length > longestMatchLen) + { + longestMatchLen = match.length; + bestMatch = { + rule: this._regexes[i], + match: match, + length: match.length, + matchesAll: longestMatchLen == str.length + }; + + if (longestMatchLen == str.length) + break; + } + } + } + + return bestMatch; }; Tokenizer.prototype._firstMatchLength = function(str, regex) { @@ -68,12 +91,14 @@ Tokenizer.prototype._tokenize = function _tokenize(data, endofstream) { // Did we buffered data on previous writes? data = this._buffered + data; this._buffered = ''; - + while (data && data.length) { - var rule = undefined, + + var match = undefined, str = undefined, - ix = -1; + ix = -1, + removeEOL = false; if (this.options.split) { while ((ix = data.search(this.options.split)) == 0) @@ -84,6 +109,8 @@ Tokenizer.prototype._tokenize = function _tokenize(data, endofstream) { data = data.substr(len); } + if (ix != -1) + removeEOL = true; str = ix != -1 ? data.substr(0, ix) + '\n' : data; data = ix != -1 ? 
data.substr(ix) : undefined; } @@ -92,19 +119,21 @@ Tokenizer.prototype._tokenize = function _tokenize(data, endofstream) { data = undefined; } - for (var i = str.length; i > 0; i--) - if (rule = this._getMatchingRule(str.substr(0, i))) break; + match = this._getLongestMatch(str); - if (!rule) throw new SyntaxError('No rules found to match any part of \'' + str.toString() + '\''); - else if (i == str.length && !endofstream && (!data || !data.length)) { + if (!match) throw new SyntaxError('No rules found to match any part of \'' + str.toString() + '\''); + else if (match.matchesAll && !endofstream && (!data || !data.length)) { this._buffered = str; return; } - data = str.substr(i) + (data || ''); - str = str.substr(0, i); + if (removeEOL) + str = str.substr(0, str.length - 1); + + data = str.substr(match.length) + (data || ''); + str = str.substr(0, match.length); - this._gotToken(str, rule); + this._gotToken(str, match.rule); } // while }; @@ -166,6 +195,6 @@ Tokenizer.prototype.ignore = function ignore(ignored) { module.exports = Tokenizer; // built-in rules -Tokenizer.whitespace = [/^(\s)+$/, 'whitespace']; -Tokenizer.word = [/^\w+$/, 'word']; -Tokenizer.number = [/^\d+(\.\d+)?$/, 'number']; \ No newline at end of file +Tokenizer.whitespace = [/^(\s)+/, 'whitespace']; +Tokenizer.word = [/^\w+/, 'word']; +Tokenizer.number = [/^\d+(\.\d+)?/, 'number']; \ No newline at end of file From 71207ac03b2701eb82780caaa9ab64a308c937fa Mon Sep 17 00:00:00 2001 From: jjung Date: Thu, 11 Dec 2014 11:55:54 -0600 Subject: [PATCH 12/12] Fixing issues to get working with slim-to-jade --- lib/Tokenizer.js | 65 ++++++++++++++++++++++++++++++------------------ 1 file changed, 41 insertions(+), 24 deletions(-) diff --git a/lib/Tokenizer.js b/lib/Tokenizer.js index fd09547..1846045 100644 --- a/lib/Tokenizer.js +++ b/lib/Tokenizer.js @@ -1,3 +1,5 @@ +// Using a fork of npm tokenizer by JFloby. (c) JFloby with modifications by me. 
+ var EventEmitter = require('events').EventEmitter, util = require('util'), assert = require('assert'), @@ -5,7 +7,7 @@ var EventEmitter = require('events').EventEmitter, function noop(){} -function Tokenizer (check_token_cb, options) { +function Tokenizer (check_token_cb, options, error_cb) { if(!(this instanceof Tokenizer)) { return new Tokenizer(check_token_cb, options); } @@ -20,6 +22,7 @@ function Tokenizer (check_token_cb, options) { this._regexes = []; // should contain objects with regex[RegExp] and type[String] this._ignored = {}; // a hash of ignored token types these will be parsed but not emitted this._checkToken = check_token_cb || noop; + this._error = error_cb; } util.inherits(Tokenizer, Transform); @@ -29,33 +32,32 @@ Tokenizer.prototype._transform = function _transform(chunk, encoding, callback) var self = this; process.nextTick(function () { - try { - var index = 0, - step = self.options.stepSize; + var index = 0, + step = self.options.stepSize; - if (self.options.stepSize > 0) - { - while (index < chunk.length) { - self._tokenize(chunk.substr(index, step)); - index += step; - } + if (self.options.stepSize > 0) + { + while (index < chunk.length) { + self._tokenize(chunk.substr(index, step)); + index += step; } - else self._tokenize(chunk); - - callback(); - } catch(e) { - callback(e, chunk); } + else self._tokenize(chunk); + + callback(undefined, chunk); }) }; Tokenizer.prototype._getLongestMatch = function _getMatchingRule(str) { var bestMatch = undefined, - longestMatchLen = 0; + longestMatchLen = 0; // Find the longest match that matches at the beginning of the string. for (var i = 0; i < this._regexes.length; i++) { + if (this._regexes[i].filter && !this._regexes[i].filter(str)) + continue; + var match = undefined, matches = str.match(this._regexes[i].regex); @@ -94,7 +96,6 @@ Tokenizer.prototype._tokenize = function _tokenize(data, endofstream) { while (data && data.length) { - var match = undefined, str = undefined, ix = -1, @@ -104,9 +105,14 @@ Tokenizer.prototype._tokenize = function _tokenize(data, endofstream) { while ((ix = data.search(this.options.split)) == 0) { var len = this._firstMatchLength(data, this.options.split); - this.emit('split', data.substr(0, len)); - data = data.substr(len); + if (len != -1) + { + this.emit('split', data.substr(0, len)); + + data = data.substr(len); + } + else return; } if (ix != -1) @@ -121,7 +127,14 @@ Tokenizer.prototype._tokenize = function _tokenize(data, endofstream) { match = this._getLongestMatch(str); - if (!match) throw new SyntaxError('No rules found to match any part of \'' + str.toString() + '\''); + if (!match) { + var err = new SyntaxError('No rules found to match any part of \'' + str.toString() + '\''); + + if (this._error) + this._error(err); + else + throw err; + } else if (match.matchesAll && !endofstream && (!data || !data.length)) { this._buffered = str; return; @@ -165,14 +178,14 @@ Tokenizer.prototype._gotToken = function _gotToken(str, rule) { this.emit('token', token, type); }; -Tokenizer.prototype.addRule = function addRule(regex, type) { +Tokenizer.prototype.addRule = function addRule(regex, type, filter) { // this is useful for built-in rules if(!type) { if(Array.isArray(regex)) { - return this.addRule(regex[0], regex[1]); + return this.addRule(regex[0], regex[1], filter); } else if(regex) { - return this.addRule(Tokenizer[regex]); + return this.addRule(Tokenizer[regex], filter); } else { throw new Error('No parameters specified'); @@ -180,7 +193,11 @@ Tokenizer.prototype.addRule = function 
addRule(regex, type) { } assert.ok((regex instanceof RegExp) || (typeof regex === 'function')); assert.equal(typeof type, 'string'); - this._regexes.push({regex:regex,type:type}); + this._regexes.push({ + regex:regex, + type:type, + filter: filter + }); }; /**
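Taken together, the patches above rework construction (`options.split`, `options.stepSize`, and an optional error callback), matching (`_getLongestMatch` with per-rule filters), and event dispatch (the `'split'` event). The sketch below is illustrative only and is not part of any commit in this series; the require path and the comma-separated sample input are assumptions made for the example.

``` javascript
// Illustrative sketch only; not part of the patch series. Assumes the patched
// lib/Tokenizer.js from the commits above; the require path and the sample
// input are invented for the example.
var Tokenizer = require('./lib/Tokenizer');

var t = new Tokenizer(undefined, {
  split: /\,/,   // tokenize each comma-separated piece on its own (PATCH 01/06)
  stepSize: 0    // 0 tokenizes whole chunks; > 0 caps characters handled per pass (PATCH 04)
}, function (err) {
  // optional error callback (PATCH 12): called instead of throwing when no rule matches
  console.error(err.message);
});

t.addRule('number');      // built-in /^\d+(\.\d+)?/ after PATCH 11
t.addRule('word');        // built-in /^\w+/
t.addRule('whitespace');  // built-in /^(\s)+/
t.ignore('whitespace');

t.on('token', function (token, type) {
  console.log(type + ' -> ' + token); // per the README above, the token carries the matched text
});
t.on('split', function (sep) {
  // emitted once per matched split boundary; sep is the separator text (PATCH 05/12)
});
t.on('data', function () {});                      // drain the readable side so 'end' fires
t.on('end', function () { console.log('done'); });

t.write('abc,123,4.5');
t.end();
```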