diff --git a/packages/bbob-parser/Parser.js b/packages/bbob-parser/Parser.js index ba957e1..d16ab2a 100644 --- a/packages/bbob-parser/Parser.js +++ b/packages/bbob-parser/Parser.js @@ -1,44 +1,19 @@ +const { + convertTokenToText, + getTagName, + getTokenColumn, + getTokenLine, + getTokenValue, + isAttrNameToken, + isAttrValueToken, + isTagStart, + isTagToken, + isTextToken, + isTagEnd +} = require("./Tokenizer"); const Tokenizer = require("./Tokenizer"); -const TokenType = Tokenizer.TYPE; const TokenChar = Tokenizer.CHAR; -const getCharCode = Tokenizer.getCharCode; - -const isTextToken = (token) => { - const type = token[Tokenizer.TOKEN.TYPE_ID]; - - return type === TokenType.SPACE || type === TokenType.NEW_LINE || type === TokenType.WORD -}; - -const isTagToken = (token) => token[Tokenizer.TOKEN.TYPE_ID] === TokenType.TAG; - -const isTagStart = (token) => !isTagEnd(token); - -const isTagEnd = (token) => getTokenValue(token).charCodeAt(0) === TokenChar.SLASH; - -const isAttrNameToken = (token) => token[Tokenizer.TOKEN.TYPE_ID] === TokenType.ATTR_NAME; - -const isAttrValueToken = (token) => token[Tokenizer.TOKEN.TYPE_ID] === TokenType.ATTR_VALUE; - -const getTagName = (token) => { - const value = getTokenValue(token); - - return isTagEnd(token) ? value.slice(1) : value -}; - -const convertTagToText = (token) => { - let text = getCharCode(TokenChar.OPEN_BRAKET); - - if (isTagEnd(token)) { - text += getCharCode(TokenChar.SLASH) - } - - text += getTokenValue(token); - text += getCharCode(TokenChar.CLOSE_BRAKET); - - return text -}; - -const getTokenValue = (token) => token[Tokenizer.TOKEN.VALUE_ID]; +const getChar = Tokenizer.getChar; const createTagNode = (name, attrs = {}, content = []) => ({tag: name, attrs, content}); @@ -65,6 +40,10 @@ module.exports = class Parser { const curTags = []; const curTagsAttrName = []; + const closableTags = this.findNestedTags(tokens); + + const isNestedTag = (token) => closableTags.indexOf(getTokenValue(token)) >= 0; + const getCurTag = () => { if (curTags.length) { return curTags[curTags.length - 1] @@ -124,7 +103,7 @@ module.exports = class Parser { if (isTagStart(token)) { createCurTag(token); - if (this.isCloseTag(getTokenValue(token))) { + if (isNestedTag(token)) { nestedNodes.push(getCurTag()) } else { getNodes().push(getCurTag()); @@ -141,12 +120,11 @@ module.exports = class Parser { if (lastNestedNode) { getNodes().push(lastNestedNode) } else { - debugger; - console.warn(`Inconsistent tag '${getTokenValue(token)}'`); + console.warn(`Inconsistent tag '${getTokenValue(token)}' on line ${getTokenLine(token)} and column ${getTokenColumn(token)}`); } } } else { - getNodes().push(convertTagToText(token)) + getNodes().push(convertTokenToText(token)) } } @@ -168,8 +146,22 @@ module.exports = class Parser { return nodes } - isCloseTag(value) { - return this.options.closableTags && this.options.closableTags.indexOf(value) >= 0 + findNestedTags(tokens) { + const tags = tokens.filter(isTagToken).reduce((acc, token) => { + acc[getTokenValue(token)] = true; + + return acc + }, {}); + + const closeChar = getChar(TokenChar.SLASH); + + return Object.keys(tags).reduce((arr, key) => { + if (tags[key] && tags[closeChar + key]) { + arr.push(key) + } + + return arr; + }, []) } isAllowedTag(value) { diff --git a/packages/bbob-parser/README.md b/packages/bbob-parser/README.md new file mode 100644 index 0000000..a991849 --- /dev/null +++ b/packages/bbob-parser/README.md @@ -0,0 +1,2 @@ +# bbob-parser +Fast BB Code parser written in pure javascript, no dependencies diff --git a/packages/bbob-parser/Tokenizer.js b/packages/bbob-parser/Tokenizer.js index 7713c33..66fc2d5 100644 --- a/packages/bbob-parser/Tokenizer.js +++ b/packages/bbob-parser/Tokenizer.js @@ -1,19 +1,50 @@ const CHAR = require('./char'); const TOKEN = require('./token'); +const getChar = String.fromCharCode; -// const TOKEN.TYPE_ID = 0; -// const TOKEN.VALUE_ID = 1; -// const TOKEN.LINE_ID = 2; -// const TOKEN.COLUMN_ID = 3; -// -// const TOKEN.TYPE_WORD = 'word'; -// const TOKEN.TYPE_TAG = 'tag'; -// const TOKEN.TYPE_ATTR_NAME = 'attr-name'; -// const TOKEN.TYPE_ATTR_VALUE = 'attr-value'; -// const TOKEN.TYPE_SPACE = 'space'; -// const TOKEN.TYPE_NEW_LINE = 'new-line'; -const getCharCode = String.fromCharCode; +const getTokenValue = (token) => token[Tokenizer.TOKEN.VALUE_ID]; + +const getTokenLine = (token) => token[Tokenizer.TOKEN.LINE_ID]; +const getTokenColumn = (token) => token[Tokenizer.TOKEN.COLUMN_ID]; + +const isTextToken = (token) => { + const type = token[Tokenizer.TOKEN.TYPE_ID]; + + return type === TOKEN.TYPE_SPACE || type === TOKEN.TYPE_NEW_LINE || type === TOKEN.TYPE_WORD +}; + +const isTagToken = (token) => token[Tokenizer.TOKEN.TYPE_ID] === TOKEN.TYPE_TAG; + +const isTagStart = (token) => !isTagEnd(token); + +const isTagEnd = (token) => getTokenValue(token).charCodeAt(0) === CHAR.SLASH; + +const isAttrNameToken = (token) => token[Tokenizer.TOKEN.TYPE_ID] === TOKEN.TYPE_ATTR_NAME; + +const isAttrValueToken = (token) => token[Tokenizer.TOKEN.TYPE_ID] === TOKEN.TYPE_ATTR_VALUE; + +const getTagName = (token) => { + const value = getTokenValue(token); + + return isTagEnd(token) ? value.slice(1) : value +}; + +const convertTagToText = (token) => { + let text = getChar(CHAR.OPEN_BRAKET); + + if (isTagEnd(token)) { + text += getChar(CHAR.SLASH) + } + + text += getTokenValue(token); + text += getChar(CHAR.CLOSE_BRAKET); + + return text +}; + +const SPACE_TAB = ' '; +const SPACE = ' '; class Tokenizer { constructor(input) { @@ -24,7 +55,7 @@ class Tokenizer { } tokenize() { - let wordToken = this.createWordToken(''); + let wordToken = null; let tagToken = null; let attrNameToken = null; let attrValueToken = null; @@ -33,7 +64,7 @@ class Tokenizer { let tokenIndex = -1; const flushWord = () => { - if (wordToken[TOKEN.VALUE_ID]) { + if (wordToken && wordToken[TOKEN.VALUE_ID]) { tokenIndex++; tokens[tokenIndex] = wordToken; wordToken = this.createWordToken('') @@ -42,20 +73,23 @@ class Tokenizer { const flushTag = () => { if (tagToken !== null) { + if (attrNameToken && !attrValueToken) { + tagToken[TOKEN.VALUE_ID] += SPACE + attrNameToken[TOKEN.VALUE_ID] + attrNameToken = null + } + tokenIndex++; tokens[tokenIndex] = tagToken; tagToken = null; } }; - const flushAttrName = () => { + const flushAttrNames = () => { if (attrNameToken) { attrTokens.push(attrNameToken); attrNameToken = null; } - }; - const flushAttrValue = () => { if (attrValueToken) { attrTokens.push(attrValueToken); attrValueToken = null @@ -85,20 +119,19 @@ class Tokenizer { if (tagToken) { attrNameToken = this.createAttrNameToken(''); + } else { + const spaceCode = charCode === CHAR.TAB ? SPACE_TAB : SPACE; + + tokenIndex++; + tokens[tokenIndex] = this.createSpaceToken(spaceCode); } - - const spaceCode = charCode === CHAR.TAB ? ' ' : ' '; - - tokenIndex++; - tokens[tokenIndex] = this.createSpaceToken(spaceCode); - this.colPos++; break; case CHAR.N: flushWord(); tokenIndex++; - tokens[tokenIndex] = this.createNewLineToken(getCharCode(charCode)); + tokens[tokenIndex] = this.createNewLineToken(getChar(charCode)); this.rowPos++; this.colPos = 0; @@ -113,8 +146,7 @@ class Tokenizer { case CHAR.CLOSE_BRAKET: flushTag(); - flushAttrName(); - flushAttrValue(); + flushAttrNames(); flushAttrs(); this.colPos++; @@ -124,7 +156,7 @@ class Tokenizer { if (tagToken) { attrValueToken = this.createAttrValueToken('') } else { - wordToken[TOKEN.VALUE_ID] += getCharCode(charCode); + wordToken[TOKEN.VALUE_ID] += getChar(charCode); } this.colPos++; @@ -132,10 +164,9 @@ class Tokenizer { case CHAR.QUOTEMARK: if (attrValueToken && attrValueToken[TOKEN.VALUE_ID] > 0) { - flushAttrName(); - flushAttrValue(); + flushAttrNames(); } else if (tagToken === null) { - wordToken[TOKEN.VALUE_ID] += getCharCode(charCode); + wordToken[TOKEN.VALUE_ID] += getChar(charCode); } this.colPos++; @@ -143,13 +174,17 @@ class Tokenizer { default: if (tagToken && attrValueToken) { - attrValueToken[TOKEN.VALUE_ID] += getCharCode(charCode) + attrValueToken[TOKEN.VALUE_ID] += getChar(charCode) } else if (tagToken && attrNameToken) { - attrNameToken[TOKEN.VALUE_ID] += getCharCode(charCode) + attrNameToken[TOKEN.VALUE_ID] += getChar(charCode) } else if (tagToken) { - tagToken[TOKEN.VALUE_ID] += getCharCode(charCode) + tagToken[TOKEN.VALUE_ID] += getChar(charCode) } else { - wordToken[TOKEN.VALUE_ID] += getCharCode(charCode); + if (!wordToken) { + wordToken = this.createWordToken('') + } + + wordToken[TOKEN.VALUE_ID] += getChar(charCode); } this.colPos++; @@ -161,7 +196,7 @@ class Tokenizer { flushWord(); - tokens.length = tokenIndex; + tokens.length = tokenIndex + 1; return tokens; } @@ -210,5 +245,16 @@ module.exports.TOKEN = { LINE_ID: TOKEN.LINE_ID, COLUMN_ID: TOKEN.COLUMN_ID, }; -module.exports.getCharCode = getCharCode; +module.exports.getChar = getChar; +module.exports.getTokenValue = getTokenValue; +module.exports.getTokenLine = getTokenLine; +module.exports.getTokenColumn = getTokenColumn; +module.exports.isTextToken = isTextToken; +module.exports.isTagToken = isTagToken; +module.exports.isTagStart = isTagStart; +module.exports.isTagEnd = isTagEnd; +module.exports.isAttrNameToken = isAttrNameToken; +module.exports.isAttrValueToken = isAttrValueToken; +module.exports.getTagName = getTagName; +module.exports.convertTokenToText = convertTagToText; diff --git a/packages/bbob-parser/Tokenizer.test.js b/packages/bbob-parser/Tokenizer.test.js index 0908708..30c0388 100644 --- a/packages/bbob-parser/Tokenizer.test.js +++ b/packages/bbob-parser/Tokenizer.test.js @@ -1,14 +1,95 @@ const Tokenizer = require('./Tokenizer'); +const TYPE = Tokenizer.TYPE; describe("Tokenizer", () => { - it("tokenize single tag", () => { + test("tokenize single tag", () => { const input = `[SingleTag]`; + const tokens = new Tokenizer(input).tokenize(); + + expect(tokens).toBeInstanceOf(Array); + expect(tokens).toEqual([ + [TYPE.TAG, 'SingleTag', 0, 0] + ]) + }); + + test("tokenize single tag with spaces", () => { + const input = `[Single Tag]`; + const tokens = new Tokenizer(input).tokenize(); + + expect(tokens).toBeInstanceOf(Array); + expect(tokens).toEqual([ + [TYPE.TAG, 'Single Tag', 0, 0] + ]) + }); + + test("tokenize tag as param", () => { + const input = `[color="#ff0000"]Text[/color]`; + const tokens = new Tokenizer(input).tokenize(); + + expect(tokens).toBeInstanceOf(Array); + expect(tokens).toEqual([ + [TYPE.TAG, 'color', 0, 0], + [TYPE.ATTR_VALUE, '#ff0000', 6, 0], + [TYPE.WORD, 'Text', 17, 0], + [TYPE.TAG, '/color', 21, 0] + ]) + }); + + test("tokenize tag param without quotemarks", () => { + const input = `[style color=#ff0000]Text[/style]`; + const tokens = new Tokenizer(input).tokenize(); + + expect(tokens).toBeInstanceOf(Array); + expect(tokens).toEqual([ + [TYPE.TAG, 'style', 0, 0], + [TYPE.ATTR_NAME, 'color', 6, 0], + [TYPE.ATTR_VALUE, '#ff0000', 12, 0], + [TYPE.WORD, 'Text', 21, 0], + [TYPE.TAG, '/style', 25, 0] + ]) + }); + + test("tokenize list tag with items", () => { + const input = `[list] + [*] Item 1. + [*] Item 2. + [*] Item 3. +[/list]`; const tokens = new Tokenizer(input).tokenize(); - console.log('tokens', tokens); - expect(tokens).toBeInstanceOf(Array); - expect(tokens[0]).toEqual(['tag', 'SingleTag', 0, 0]) + expect(tokens).toEqual([ + [TYPE.TAG, 'list', 0, 0], + [TYPE.NEW_LINE, '\n', 6, 0], + [TYPE.SPACE, ' ', 0, 1], + [TYPE.SPACE, ' ', 1, 1], + [TYPE.SPACE, ' ', 2, 1], + [TYPE.TAG, '*', 3, 1], + [TYPE.SPACE, ' ', 6, 1], + [TYPE.WORD, 'Item', 7, 1], + [TYPE.SPACE, ' ', 11, 1], + [TYPE.WORD, '1.', 11, 1], + [TYPE.NEW_LINE, '\n', 14, 1], + [TYPE.SPACE, ' ', 0, 2], + [TYPE.SPACE, ' ', 1, 2], + [TYPE.SPACE, ' ', 2, 2], + [TYPE.TAG, '*', 3, 2], + [TYPE.SPACE, ' ', 6, 2], + [TYPE.WORD, 'Item', 14, 1], + [TYPE.SPACE, ' ', 11, 2], + [TYPE.WORD, '2.', 11, 2], + [TYPE.NEW_LINE, '\n', 14, 2], + [TYPE.SPACE, ' ', 0, 3], + [TYPE.SPACE, ' ', 1, 3], + [TYPE.SPACE, ' ', 2, 3], + [TYPE.TAG, '*', 3, 3], + [TYPE.SPACE, ' ', 6, 3], + [TYPE.WORD, 'Item', 14, 2], + [TYPE.SPACE, ' ', 11, 3], + [TYPE.WORD, '3.', 11, 3], + [TYPE.NEW_LINE, '\n', 14, 3], + [TYPE.TAG, '/list', 0, 4] + ]) }) }); \ No newline at end of file diff --git a/packages/bbob-parser/benchmark/parser_test_new.js b/packages/bbob-parser/benchmark/parser_test_new.js index 00c33e3..3a82bdb 100644 --- a/packages/bbob-parser/benchmark/parser_test_new.js +++ b/packages/bbob-parser/benchmark/parser_test_new.js @@ -6,7 +6,7 @@ const options = { const textStub = require("./test/stub"); -const count = 10; +const count = 0; const parsers3 = []; console.time('newParser'); diff --git a/packages/bbob-parser/benchmark/parser_test_old.js b/packages/bbob-parser/benchmark/parser_test_old.js index 38e9a09..7ca494d 100644 --- a/packages/bbob-parser/benchmark/parser_test_old.js +++ b/packages/bbob-parser/benchmark/parser_test_old.js @@ -2,7 +2,7 @@ const OldParser = require('./OldParser') const textStub = require("./test/stub"); -const count = 10; +const count = 0; const oldParsers3 = []; console.time('oldParser'); for (let i = 0; i <= count; i++) { diff --git a/packages/bbob-parser/package.json b/packages/bbob-parser/package.json index f5f19c6..2daf78b 100644 --- a/packages/bbob-parser/package.json +++ b/packages/bbob-parser/package.json @@ -12,7 +12,8 @@ "author": "Nikolay Kostyurin ", "license": "MIT", "devDependencies": { - "jest": "^23.1.0" + "jest": "^23.1.0", + "xbbcode-parser": "^0.1.2" }, "publishConfig": { "registry": "https://registry.npmjs.org/" diff --git a/packages/bbob-parser/parse.test.js b/packages/bbob-parser/parse.test.js index 5d0d336..4d5000d 100644 --- a/packages/bbob-parser/parse.test.js +++ b/packages/bbob-parser/parse.test.js @@ -1,9 +1,7 @@ const parse = require('./index'); const OldParser = require('./benchmark/OldParser'); -const tabText = require('./benchmark/test/stub'); const options = { - closableTags: ['ch', 'syllable', 'tab'], allowOnlyTags: ['ch', 'syllable', 'tab'], }; @@ -15,8 +13,26 @@ describe("parse", () => { }); test("same as old parser", () => { - const ast1 = parse(tabText, options); - const ast2 = OldParser.parse(tabText); + const input = `[Verse 2] +[ch]Eb[/ch] [ch]Fm[/ch] + I'm walking around +[ch]Ab[/ch] [ch]Cm[/ch] + With my little raincloud +[ch]Eb[/ch] [ch]Fm[/ch] + Hanging over my head +[ch]Cm[/ch] [ch]Ab[/ch] + And it ain’t coming down +[ch]Eb[/ch] [ch]Fm[/ch] + Where do I go? +[ch]Ab[/ch] [ch]Cm[/ch] + Gimme some sort of sign +[ch]Eb[/ch] [ch]Fm[/ch] + Hit me with lightning! +[ch]Cm[/ch] [ch]Ab[/ch] + Maybe I’ll come alive +`; + const ast1 = parse(input, options); + const ast2 = OldParser.parse(input); expect(ast1).toEqual(ast2); }) diff --git a/packages/bbob-parser/token.js b/packages/bbob-parser/token.js index 9a22fba..018b5e8 100644 --- a/packages/bbob-parser/token.js +++ b/packages/bbob-parser/token.js @@ -1,7 +1,7 @@ const TOKEN_TYPE_ID = 0; const TOKEN_VALUE_ID = 1; -const TOKEN_LINE_ID = 2; -const TOKEN_COLUMN_ID = 3; +const TOKEN_COLUMN_ID = 2; +const TOKEN_LINE_ID = 3; const TOKEN_TYPE_WORD = 'word'; const TOKEN_TYPE_TAG = 'tag';