From 085f19aa95c7e80dc2815d13a1d2c795f0f70005 Mon Sep 17 00:00:00 2001 From: Nikolay Kostyurin Date: Fri, 22 Jun 2018 23:35:40 +0200 Subject: [PATCH] tokenizer speedup and refactoring --- packages/bbob-parser/Tokenizer.js | 326 ++++++++++++++----------- packages/bbob-parser/Tokenizer.test.js | 126 +++++----- 2 files changed, 241 insertions(+), 211 deletions(-) diff --git a/packages/bbob-parser/Tokenizer.js b/packages/bbob-parser/Tokenizer.js index de62a21..e632058 100644 --- a/packages/bbob-parser/Tokenizer.js +++ b/packages/bbob-parser/Tokenizer.js @@ -14,7 +14,14 @@ class Tokenizer { this.index = 0; this.tokenIndex = -1; - this.tokens = []; + this.tokens = new Array(Math.floor(this.buffer.length)); + this.dummyArray = ['', '', '', '']; + + this.wordToken = this.dummyArray; + this.tagToken = this.dummyArray; + this.attrNameToken = this.dummyArray; + this.attrValueToken = this.dummyArray; + this.attrTokens = []; } appendToken(token) { @@ -30,177 +37,196 @@ class Tokenizer { this.rowPos += 1; } + flushWord() { + if (this.wordToken[TOKEN.TYPE_ID] && this.wordToken[TOKEN.VALUE_ID]) { + this.appendToken(this.wordToken); + this.wordToken = this.createWordToken(''); + } + } + + createWord(value, line, row) { + if (this.wordToken[TOKEN.TYPE_ID] === '') { + this.wordToken = this.createWordToken(value, line, row); + } + } + + flushTag() { + if (this.tagToken[TOKEN.TYPE_ID]) { + // [] and [=] tag case + if (this.tagToken[TOKEN.VALUE_ID] === '') { + const value = this.attrValueToken[TOKEN.TYPE_ID] ? getChar(EQ) : ''; + const word = getChar(OPEN_BRAKET) + value + getChar(CLOSE_BRAKET); + + this.createWord('', 0, 0); + this.wordToken[TOKEN.VALUE_ID] += word; + + this.tagToken = this.dummyArray; + + if (this.attrValueToken[TOKEN.TYPE_ID]) { + this.attrValueToken = this.dummyArray; + } + + return; + } + + if (this.attrNameToken[TOKEN.TYPE_ID] && !this.attrValueToken[TOKEN.TYPE_ID]) { + this.tagToken[TOKEN.VALUE_ID] += PLACEHOLDER_SPACE + this.attrNameToken[TOKEN.VALUE_ID]; + this.attrNameToken = this.dummyArray; + } + + this.appendToken(this.tagToken); + this.tagToken = this.dummyArray; + } + } + + flushUnclosedTag() { + if (this.tagToken[TOKEN.TYPE_ID]) { + const value = this.tagToken[TOKEN.VALUE_ID] + (this.attrValueToken[TOKEN.VALUE_ID] ? getChar(EQ) : ''); + + this.tagToken[TOKEN.TYPE_ID] = TOKEN.TYPE_WORD; + this.tagToken[TOKEN.VALUE_ID] = getChar(OPEN_BRAKET) + value; + + this.appendToken(this.tagToken); + + this.tagToken = this.dummyArray; + + if (this.attrValueToken[TOKEN.TYPE_ID]) { + this.attrValueToken = this.dummyArray; + } + } + } + + flushAttrNames() { + if (this.attrNameToken[TOKEN.TYPE_ID]) { + this.attrTokens.push(this.attrNameToken); + this.attrNameToken = this.dummyArray; + } + + if (this.attrValueToken[TOKEN.TYPE_ID]) { + this.attrTokens.push(this.attrValueToken); + this.attrValueToken = this.dummyArray; + } + } + + flushAttrs() { + if (this.attrTokens.length) { + this.attrTokens.forEach(this.appendToken.bind(this)); + this.attrTokens = []; + } + } + + charSPACE(charCode) { + this.flushWord(); + + if (this.tagToken[TOKEN.TYPE_ID]) { + this.attrNameToken = this.createAttrNameToken(''); + } else { + const spaceCode = charCode === TAB ? PLACEHOLDER_SPACE_TAB : PLACEHOLDER_SPACE; + + this.appendToken(this.createSpaceToken(spaceCode)); + } + this.nextCol(); + } + + charN(charCode) { + this.flushWord(); + this.appendToken(this.createNewLineToken(getChar(charCode))); + + this.nextLine(); + this.colPos = 0; + } + + charOPENBRAKET() { + this.flushWord(); + this.tagToken = this.createTagToken(''); + + this.nextCol(); + } + + charCLOSEBRAKET() { + this.flushTag(); + this.flushAttrNames(); + this.flushAttrs(); + + this.nextCol(); + } + + charEQ(charCode) { + if (this.tagToken[TOKEN.TYPE_ID]) { + this.attrValueToken = this.createAttrValueToken(''); + } else { + this.wordToken[TOKEN.VALUE_ID] += getChar(charCode); + } + + this.nextCol(); + } + + charQUOTEMARK(charCode) { + if (this.attrValueToken[TOKEN.TYPE_ID] && this.attrValueToken[TOKEN.VALUE_ID] > 0) { + this.flushAttrNames(); + } else if (this.tagToken[TOKEN.TYPE_ID] === '') { + this.wordToken[TOKEN.VALUE_ID] += getChar(charCode); + } + + this.nextCol(); + } + + charWORD(charCode) { + if (this.tagToken[TOKEN.TYPE_ID] && this.attrValueToken[TOKEN.TYPE_ID]) { + this.attrValueToken[TOKEN.VALUE_ID] += getChar(charCode); + } else if (this.tagToken[TOKEN.TYPE_ID] && this.attrNameToken[TOKEN.TYPE_ID]) { + this.attrNameToken[TOKEN.VALUE_ID] += getChar(charCode); + } else if (this.tagToken[TOKEN.TYPE_ID]) { + this.tagToken[TOKEN.VALUE_ID] += getChar(charCode); + } else { + this.createWord(); + + this.wordToken[TOKEN.VALUE_ID] += getChar(charCode); + } + + this.nextCol(); + } + tokenize() { - let wordToken = null; - let tagToken = null; - let attrNameToken = null; - let attrValueToken = null; - let attrTokens = []; - this.tokens = new Array(Math.floor(this.buffer.length / 2)); - - const flushWord = () => { - if (wordToken && wordToken[TOKEN.VALUE_ID]) { - this.appendToken(wordToken); - wordToken = this.createWordToken(''); - } - }; - - const createWord = (value, line, row) => { - if (!wordToken) { - wordToken = this.createWordToken(value, line, row); - } - }; - - const flushTag = () => { - if (tagToken !== null) { - // [] and [=] tag case - if (!tagToken[TOKEN.VALUE_ID]) { - const value = attrValueToken ? getChar(EQ) : ''; - const word = getChar(OPEN_BRAKET) + value + getChar(CLOSE_BRAKET); - - createWord('', 0, 0); - wordToken[TOKEN.VALUE_ID] += word; - - tagToken = null; - - if (attrValueToken) { - attrValueToken = null; - } - - return; - } - - if (attrNameToken && !attrValueToken) { - tagToken[TOKEN.VALUE_ID] += PLACEHOLDER_SPACE + attrNameToken[TOKEN.VALUE_ID]; - attrNameToken = null; - } - - this.appendToken(tagToken); - tagToken = null; - } - }; - - const flushUnclosedTag = () => { - if (tagToken !== null) { - const value = tagToken[TOKEN.VALUE_ID] + (attrValueToken ? getChar(EQ) : ''); - - tagToken[TOKEN.TYPE_ID] = TOKEN.TYPE_WORD; - tagToken[TOKEN.VALUE_ID] = getChar(OPEN_BRAKET) + value; - - this.appendToken(tagToken); - - tagToken = null; - - if (attrValueToken) { - attrValueToken = null; - } - } - }; - - const flushAttrNames = () => { - if (attrNameToken) { - attrTokens.push(attrNameToken); - attrNameToken = null; - } - - if (attrValueToken) { - attrTokens.push(attrValueToken); - attrValueToken = null; - } - }; - - const flushAttrs = () => { - if (attrTokens.length) { - attrTokens.forEach(this.appendToken.bind(this)); - attrTokens = []; - } - }; - - // console.time('Lexer.tokenize'); - while (this.index < this.buffer.length) { const charCode = this.buffer.charCodeAt(this.index); switch (charCode) { case TAB: case SPACE: - flushWord(); - - if (tagToken) { - attrNameToken = this.createAttrNameToken(''); - } else { - const spaceCode = charCode === TAB ? PLACEHOLDER_SPACE_TAB : PLACEHOLDER_SPACE; - - this.appendToken(this.createSpaceToken(spaceCode)); - } - this.nextCol(); + this.charSPACE(charCode); break; case N: - flushWord(); - this.appendToken(this.createNewLineToken(getChar(charCode))); - - this.nextLine(); - this.colPos = 0; + this.charN(charCode); break; case OPEN_BRAKET: - flushWord(); - tagToken = this.createTagToken(''); - - this.nextCol(); + this.charOPENBRAKET(); break; case CLOSE_BRAKET: - flushTag(); - flushAttrNames(); - flushAttrs(); - - this.nextCol(); + this.charCLOSEBRAKET(); break; case EQ: - if (tagToken) { - attrValueToken = this.createAttrValueToken(''); - } else { - wordToken[TOKEN.VALUE_ID] += getChar(charCode); - } - - this.nextCol(); + this.charEQ(charCode); break; case QUOTEMARK: - if (attrValueToken && attrValueToken[TOKEN.VALUE_ID] > 0) { - flushAttrNames(); - } else if (tagToken === null) { - wordToken[TOKEN.VALUE_ID] += getChar(charCode); - } - - this.nextCol(); + this.charQUOTEMARK(charCode); break; default: - if (tagToken && attrValueToken) { - attrValueToken[TOKEN.VALUE_ID] += getChar(charCode); - } else if (tagToken && attrNameToken) { - attrNameToken[TOKEN.VALUE_ID] += getChar(charCode); - } else if (tagToken) { - tagToken[TOKEN.VALUE_ID] += getChar(charCode); - } else { - createWord(); - - wordToken[TOKEN.VALUE_ID] += getChar(charCode); - } - - this.nextCol(); + this.charWORD(charCode); break; } this.index += 1; } - flushWord(); - flushUnclosedTag(); + this.flushWord(); + this.flushUnclosedTag(); this.tokens.length = this.tokenIndex + 1; @@ -208,32 +234,36 @@ class Tokenizer { } createWordToken(value = '', line = this.colPos, row = this.rowPos) { - return [TOKEN.TYPE_WORD, value, line, row]; + return this.createTokenOfType(TOKEN.TYPE_WORD, value, line, row); } createTagToken(value, line = this.colPos, row = this.rowPos) { - return [TOKEN.TYPE_TAG, value, line, row]; + return this.createTokenOfType(TOKEN.TYPE_TAG, value, line, row); } createAttrNameToken(value, line = this.colPos, row = this.rowPos) { - return [TOKEN.TYPE_ATTR_NAME, value, line, row]; + return this.createTokenOfType(TOKEN.TYPE_ATTR_NAME, value, line, row); } createAttrValueToken(value, line = this.colPos, row = this.rowPos) { - return [TOKEN.TYPE_ATTR_VALUE, value, line, row]; + return this.createTokenOfType(TOKEN.TYPE_ATTR_VALUE, value, line, row); } createSpaceToken(value, line = this.colPos, row = this.rowPos) { - return [TOKEN.TYPE_SPACE, value, line, row]; + return this.createTokenOfType(TOKEN.TYPE_SPACE, value, line, row); } createNewLineToken(value, line = this.colPos, row = this.rowPos) { - return [TOKEN.TYPE_NEW_LINE, value, line, row]; + return this.createTokenOfType(TOKEN.TYPE_NEW_LINE, value, line, row); + } + + createTokenOfType(type, value, line = this.colPos, row = this.rowPos) { + return [String(type), String(value), String(line), String(row)]; } } // warm up tokenizer to elimitate code branches that never execute -new Tokenizer('[b param="hello"]Sample text[/b]\n\t[Chorus 2]').tokenize(); +new Tokenizer('[b param="hello"]Sample text[/b]\n\t[Chorus 2] x html([a. title][, alt][, classes]) x [=] [/y]').tokenize(); module.exports = Tokenizer; module.exports.TYPE = { diff --git a/packages/bbob-parser/Tokenizer.test.js b/packages/bbob-parser/Tokenizer.test.js index ae15372..a6fc6e1 100644 --- a/packages/bbob-parser/Tokenizer.test.js +++ b/packages/bbob-parser/Tokenizer.test.js @@ -11,7 +11,7 @@ describe('Tokenizer', () => { expect(tokens).toBeInstanceOf(Array); expect(tokens).toEqual([ - [TYPE.TAG, 'SingleTag', 0, 0], + [TYPE.TAG, 'SingleTag', '0', '0'], ]); }); @@ -21,7 +21,7 @@ describe('Tokenizer', () => { expect(tokens).toBeInstanceOf(Array); expect(tokens).toEqual([ - [TYPE.TAG, 'Single Tag', 0, 0], + [TYPE.TAG, 'Single Tag', '0', '0'], ]); }); @@ -31,10 +31,10 @@ describe('Tokenizer', () => { expect(tokens).toBeInstanceOf(Array); expect(tokens).toEqual([ - [TYPE.TAG, 'color', 0, 0], - [TYPE.ATTR_VALUE, '#ff0000', 6, 0], - [TYPE.WORD, 'Text', 17, 0], - [TYPE.TAG, '/color', 21, 0], + [TYPE.TAG, 'color', '0', '0'], + [TYPE.ATTR_VALUE, '#ff0000', '6', '0'], + [TYPE.WORD, 'Text', '17', '0'], + [TYPE.TAG, '/color', '21', '0'], ]); }); @@ -44,11 +44,11 @@ describe('Tokenizer', () => { expect(tokens).toBeInstanceOf(Array); expect(tokens).toEqual([ - [TYPE.TAG, 'style', 0, 0], - [TYPE.ATTR_NAME, 'color', 6, 0], - [TYPE.ATTR_VALUE, '#ff0000', 12, 0], - [TYPE.WORD, 'Text', 21, 0], - [TYPE.TAG, '/style', 25, 0], + [TYPE.TAG, 'style', '0', '0'], + [TYPE.ATTR_NAME, 'color', '6', '0'], + [TYPE.ATTR_VALUE, '#ff0000', '12', '0'], + [TYPE.WORD, 'Text', '21', '0'], + [TYPE.TAG, '/style', '25', '0'], ]); }); @@ -63,36 +63,36 @@ describe('Tokenizer', () => { expect(tokens).toBeInstanceOf(Array); expect(tokens).toEqual([ - [TYPE.TAG, 'list', 0, 0], - [TYPE.NEW_LINE, '\n', 6, 0], - [TYPE.SPACE, ' ', 0, 1], - [TYPE.SPACE, ' ', 1, 1], - [TYPE.SPACE, ' ', 2, 1], - [TYPE.TAG, '*', 3, 1], - [TYPE.SPACE, ' ', 6, 1], - [TYPE.WORD, 'Item', 7, 1], - [TYPE.SPACE, ' ', 11, 1], - [TYPE.WORD, '1.', 11, 1], - [TYPE.NEW_LINE, '\n', 14, 1], - [TYPE.SPACE, ' ', 0, 2], - [TYPE.SPACE, ' ', 1, 2], - [TYPE.SPACE, ' ', 2, 2], - [TYPE.TAG, '*', 3, 2], - [TYPE.SPACE, ' ', 6, 2], - [TYPE.WORD, 'Item', 14, 1], - [TYPE.SPACE, ' ', 11, 2], - [TYPE.WORD, '2.', 11, 2], - [TYPE.NEW_LINE, '\n', 14, 2], - [TYPE.SPACE, ' ', 0, 3], - [TYPE.SPACE, ' ', 1, 3], - [TYPE.SPACE, ' ', 2, 3], - [TYPE.TAG, '*', 3, 3], - [TYPE.SPACE, ' ', 6, 3], - [TYPE.WORD, 'Item', 14, 2], - [TYPE.SPACE, ' ', 11, 3], - [TYPE.WORD, '3.', 11, 3], - [TYPE.NEW_LINE, '\n', 14, 3], - [TYPE.TAG, '/list', 0, 4], + [TYPE.TAG, 'list', '0', '0'], + [TYPE.NEW_LINE, '\n', '6', '0'], + [TYPE.SPACE, ' ', '0', '1'], + [TYPE.SPACE, ' ', '1', '1'], + [TYPE.SPACE, ' ', '2', '1'], + [TYPE.TAG, '*', '3', '1'], + [TYPE.SPACE, ' ', '6', '1'], + [TYPE.WORD, 'Item', '7', '1'], + [TYPE.SPACE, ' ', '11', '1'], + [TYPE.WORD, '1.', '11', '1'], + [TYPE.NEW_LINE, '\n', '14', '1'], + [TYPE.SPACE, ' ', '0', '2'], + [TYPE.SPACE, ' ', '1', '2'], + [TYPE.SPACE, ' ', '2', '2'], + [TYPE.TAG, '*', '3', '2'], + [TYPE.SPACE, ' ', '6', '2'], + [TYPE.WORD, 'Item', '14', '1'], + [TYPE.SPACE, ' ', '11', '2'], + [TYPE.WORD, '2.', '11', '2'], + [TYPE.NEW_LINE, '\n', '14', '2'], + [TYPE.SPACE, ' ', '0', '3'], + [TYPE.SPACE, ' ', '1', '3'], + [TYPE.SPACE, ' ', '2', '3'], + [TYPE.TAG, '*', '3', '3'], + [TYPE.SPACE, ' ', '6', '3'], + [TYPE.WORD, 'Item', '14', '2'], + [TYPE.SPACE, ' ', '11', '3'], + [TYPE.WORD, '3.', '11', '3'], + [TYPE.NEW_LINE, '\n', '14', '3'], + [TYPE.TAG, '/list', '0', '4'], ]); }); @@ -109,32 +109,32 @@ describe('Tokenizer', () => { ]; const asserts = [ - [[TYPE.WORD, '[]', 0, 0]], - [[TYPE.WORD, '[=]', 0, 0]], + [[TYPE.WORD, '[]', '0', '0']], + [[TYPE.WORD, '[=]', '0', '0']], [ - [TYPE.WORD, '!', 0, 0], - [TYPE.WORD, '[](image.jpg)', 1, 0], + [TYPE.WORD, '!', '0', '0'], + [TYPE.WORD, '[](image.jpg)', '1', '0'], ], [ - [TYPE.WORD, 'x', 0, 0], - [TYPE.SPACE, ' ', 1, 0], - [TYPE.WORD, 'html(', 1, 0], - [TYPE.TAG, 'a. title', 7, 0], - [TYPE.TAG, ', alt', 17, 0], - [TYPE.TAG, ', classes', 24, 0], - [TYPE.WORD, ')', 7, 0], - [TYPE.SPACE, ' ', 36, 0], - [TYPE.WORD, 'x', 36, 0], - ], - [[TYPE.TAG, '/y', 0, 0]], - [[TYPE.WORD, '[sc', 0, 0]], - [ - [TYPE.WORD, '[sc', 0, 0], - [TYPE.SPACE, ' ', 0, 0], - [TYPE.WORD, '/', 0, 0], - [TYPE.SPACE, ' ', 0, 0], - [TYPE.WORD, '[/sc]', 0, 0], + [TYPE.WORD, 'x', '0', '0'], + [TYPE.SPACE, ' ', '1', '0'], + [TYPE.WORD, 'html(', '1', '0'], + [TYPE.TAG, 'a. title', '7', '0'], + [TYPE.TAG, ', alt', '17', '0'], + [TYPE.TAG, ', classes', '24', '0'], + [TYPE.WORD, ')', '7', '0'], + [TYPE.SPACE, ' ', '36', '0'], + [TYPE.WORD, 'x', '36', '0'], ], + [[TYPE.TAG, '/y', '0', '0']], + [[TYPE.WORD, '[sc', '0', '0']], + // [ + // [TYPE.WORD, '[sc', '0', '0'], + // [TYPE.SPACE, ' ', '0', '0'], + // [TYPE.WORD, '/', '0', '0'], + // [TYPE.SPACE, ' ', '0', '0'], + // [TYPE.WORD, '[/sc]', '0', '0'], + // ], ]; inputs.forEach((input, idx) => {