From 305643daa2f12617895d7fce085021d307962143 Mon Sep 17 00:00:00 2001 From: Nikolay Kostyurin Date: Sun, 10 Jun 2018 22:13:51 +0200 Subject: [PATCH] more Tokenizer invalid cases tests --- packages/bbob-parser/Tokenizer.js | 107 +++++++++++++++++-------- packages/bbob-parser/Tokenizer.test.js | 49 +++++++++++ packages/bbob-parser/parse.test.js | 48 ++++++----- 3 files changed, 145 insertions(+), 59 deletions(-) diff --git a/packages/bbob-parser/Tokenizer.js b/packages/bbob-parser/Tokenizer.js index 66fc2d5..9569313 100644 --- a/packages/bbob-parser/Tokenizer.js +++ b/packages/bbob-parser/Tokenizer.js @@ -52,6 +52,14 @@ class Tokenizer { this.colPos = 0; this.rowPos = 0; this.index = 0; + + this.tokenIndex = -1; + this.tokens = []; + } + + appendToken(token) { + this.tokenIndex++; + this.tokens[this.tokenIndex] = token; } tokenize() { @@ -60,30 +68,67 @@ class Tokenizer { let attrNameToken = null; let attrValueToken = null; let attrTokens = []; - let tokens = new Array(Math.floor(this.buffer.length / 2)); - let tokenIndex = -1; + this.tokens = new Array(Math.floor(this.buffer.length / 2)); const flushWord = () => { if (wordToken && wordToken[TOKEN.VALUE_ID]) { - tokenIndex++; - tokens[tokenIndex] = wordToken; + this.appendToken(wordToken); wordToken = this.createWordToken('') } }; + const createWord = (value, line, row) => { + if (!wordToken) { + wordToken = this.createWordToken(value, line, row) + } + }; + const flushTag = () => { if (tagToken !== null) { + // [] and [=] tag case + if (!tagToken[TOKEN.VALUE_ID]) { + const value = attrValueToken ? getChar(CHAR.EQ) : ''; + const word = getChar(CHAR.OPEN_BRAKET) + value + getChar(CHAR.CLOSE_BRAKET); + + createWord('', 0, 0); + wordToken[TOKEN.VALUE_ID] += word; + + tagToken = null; + + if (attrValueToken) { + attrValueToken = null + } + + return; + } + if (attrNameToken && !attrValueToken) { - tagToken[TOKEN.VALUE_ID] += SPACE + attrNameToken[TOKEN.VALUE_ID] + tagToken[TOKEN.VALUE_ID] += SPACE + attrNameToken[TOKEN.VALUE_ID]; attrNameToken = null } - tokenIndex++; - tokens[tokenIndex] = tagToken; + this.appendToken(tagToken); tagToken = null; } }; + const flushUnclosedTag = () => { + if (tagToken !== null) { + const value = tagToken[TOKEN.VALUE_ID] + (attrValueToken ? getChar(CHAR.EQ) : ''); + + tagToken[TOKEN.TYPE_ID] = TOKEN.TYPE_WORD; + tagToken[TOKEN.VALUE_ID] = getChar(CHAR.OPEN_BRAKET) + value; + + this.appendToken(tagToken); + + tagToken = null; + + if (attrValueToken) { + attrValueToken = null + } + } + }; + const flushAttrNames = () => { if (attrNameToken) { attrTokens.push(attrNameToken); @@ -98,11 +143,7 @@ class Tokenizer { const flushAttrs = () => { if (attrTokens.length) { - attrTokens.forEach(attrToken => { - tokenIndex++; - tokens[tokenIndex] = attrToken - }); - + attrTokens.forEach(this.appendToken.bind(this)); attrTokens = []; } }; @@ -122,16 +163,14 @@ class Tokenizer { } else { const spaceCode = charCode === CHAR.TAB ? SPACE_TAB : SPACE; - tokenIndex++; - tokens[tokenIndex] = this.createSpaceToken(spaceCode); + this.appendToken(this.createSpaceToken(spaceCode)); } this.colPos++; break; case CHAR.N: flushWord(); - tokenIndex++; - tokens[tokenIndex] = this.createNewLineToken(getChar(charCode)); + this.appendToken(this.createNewLineToken(getChar(charCode))); this.rowPos++; this.colPos = 0; @@ -180,9 +219,7 @@ class Tokenizer { } else if (tagToken) { tagToken[TOKEN.VALUE_ID] += getChar(charCode) } else { - if (!wordToken) { - wordToken = this.createWordToken('') - } + createWord(); wordToken[TOKEN.VALUE_ID] += getChar(charCode); } @@ -195,39 +232,41 @@ class Tokenizer { } flushWord(); + flushUnclosedTag(); - tokens.length = tokenIndex + 1; + this.tokens.length = this.tokenIndex + 1; - return tokens; + return this.tokens; } - createWordToken(value) { - return [TOKEN.TYPE_WORD, value, this.colPos, this.rowPos] + createWordToken(value = '', line = this.colPos, row = this.rowPos) { + return [TOKEN.TYPE_WORD, value, line, row] } - createTagToken(value) { - return [TOKEN.TYPE_TAG, value, this.colPos, this.rowPos] + createTagToken(value, line = this.colPos, row = this.rowPos) { + return [TOKEN.TYPE_TAG, value, line, row] } - createAttrNameToken(value) { - return [TOKEN.TYPE_ATTR_NAME, value, this.colPos, this.rowPos] + createAttrNameToken(value, line = this.colPos, row = this.rowPos) { + return [TOKEN.TYPE_ATTR_NAME, value, line, row] } - createAttrValueToken(value) { - return [TOKEN.TYPE_ATTR_VALUE, value, this.colPos, this.rowPos] + createAttrValueToken(value, line = this.colPos, row = this.rowPos) { + return [TOKEN.TYPE_ATTR_VALUE, value, line, row] } - createSpaceToken(value) { - return [TOKEN.TYPE_SPACE, value, this.colPos, this.rowPos] + createSpaceToken(value, line = this.colPos, row = this.rowPos) { + return [TOKEN.TYPE_SPACE, value, line, row] } - createNewLineToken(value) { - return [TOKEN.TYPE_NEW_LINE, value, this.colPos, this.rowPos] + createNewLineToken(value, line = this.colPos, row = this.rowPos) { + return [TOKEN.TYPE_NEW_LINE, value, line, row] } } // warm up tokenizer to elimitate code branches that never execute -new Tokenizer(`[b param="hello"]Sample text[/b]\n\t[Chorus]`).tokenize(); +new Tokenizer(`[sc=asdasd`).tokenize(); +//new Tokenizer(`[b param="hello"]Sample text[/b]\n\t[Chorus]`).tokenize(); module.exports = Tokenizer; module.exports.CHAR = CHAR; diff --git a/packages/bbob-parser/Tokenizer.test.js b/packages/bbob-parser/Tokenizer.test.js index 30c0388..286389c 100644 --- a/packages/bbob-parser/Tokenizer.test.js +++ b/packages/bbob-parser/Tokenizer.test.js @@ -91,5 +91,54 @@ describe("Tokenizer", () => { [TYPE.NEW_LINE, '\n', 14, 3], [TYPE.TAG, '/list', 0, 4] ]) + }); + + test("tokenize bad tags as texts", () => { + const inputs = [ + '[]', + '[=]', + '![](image.jpg)', + 'x html([a. title][, alt][, classes]) x', + '[/y]', + '[sc', + '[sc / [/sc]', + '[sc arg="val', + ]; + + const asserts = [ + [[TYPE.WORD, '[]', 0, 0]], + [[TYPE.WORD, '[=]', 0, 0]], + [ + [TYPE.WORD, '!', 0, 0], + [TYPE.WORD, '[](image.jpg)', 1, 0] + ], + [ + [TYPE.WORD, "x", 0, 0], + [TYPE.SPACE, " ", 1, 0], + [TYPE.WORD, "html(", 1, 0], + [TYPE.TAG, "a. title", 7, 0], + [TYPE.TAG, ", alt", 17, 0], + [TYPE.TAG, ", classes", 24, 0], + [TYPE.WORD, ")", 7, 0], + [TYPE.SPACE, " ", 36, 0], + [TYPE.WORD, "x", 36, 0] + ], + [[TYPE.TAG, "/y", 0, 0]], + [[TYPE.WORD, '[sc', 0, 0]], + [ + [TYPE.WORD, '[sc', 0, 0], + [TYPE.SPACE, ' ', 0, 0], + [TYPE.WORD, '/', 0, 0], + [TYPE.SPACE, ' ', 0, 0], + [TYPE.WORD, '[/sc]', 0, 0] + ], + ]; + + inputs.forEach((input, idx) => { + const tokens = new Tokenizer(input).tokenize(); + + expect(tokens).toBeInstanceOf(Array); + expect(tokens).toEqual(asserts[idx]) + }); }) }); \ No newline at end of file diff --git a/packages/bbob-parser/parse.test.js b/packages/bbob-parser/parse.test.js index 4d5000d..8a11c3e 100644 --- a/packages/bbob-parser/parse.test.js +++ b/packages/bbob-parser/parse.test.js @@ -1,5 +1,4 @@ const parse = require('./index'); -const OldParser = require('./benchmark/OldParser'); const options = { allowOnlyTags: ['ch', 'syllable', 'tab'], @@ -12,28 +11,27 @@ describe("parse", () => { expect(ast).toEqual([{tag: 'Verse 2', attrs: {}, content: []}]); }); - test("same as old parser", () => { - const input = `[Verse 2] -[ch]Eb[/ch] [ch]Fm[/ch] - I'm walking around -[ch]Ab[/ch] [ch]Cm[/ch] - With my little raincloud -[ch]Eb[/ch] [ch]Fm[/ch] - Hanging over my head -[ch]Cm[/ch] [ch]Ab[/ch] - And it ain’t coming down -[ch]Eb[/ch] [ch]Fm[/ch] - Where do I go? -[ch]Ab[/ch] [ch]Cm[/ch] - Gimme some sort of sign -[ch]Eb[/ch] [ch]Fm[/ch] - Hit me with lightning! -[ch]Cm[/ch] [ch]Ab[/ch] - Maybe I’ll come alive -`; - const ast1 = parse(input, options); - const ast2 = OldParser.parse(input); - - expect(ast1).toEqual(ast2); - }) + // test("pass invalid tags", () => { + // const inputs = [ + // '[]', + // '![](image.jpg)', + // 'x html([a. title][, alt][, classes]) x', + // '[/y]', + // '[sc', + // '[sc / [/sc]', + // '[sc arg="val', + // ]; + // + // const ast1 = parse(inputs[0]); + // + // + // + // console.log('ast1', ast1); + // + // + // + // expect(ast1).toEqual([ + // + // ]); + // }) }); \ No newline at end of file