bbob-parser: quoted attribute values with spaces and escaped quotemarks

Summary of this patch:
* README: renames the heading to @bbob/parser and shows a sample AST.
* Token: adds isEmpty(), which is true when the token has no type.
* Tokenizer: adds skipChar/seekChar and the inTag/inAttrValue/inAttrName helpers,
  keeps spaces inside quoted attribute values, and adds charBACKSLASH so that a
  backslash followed by a quotemark inside a quoted value yields a literal quotemark.
* char: defines and exports the BACKSLASH char code.
* Tests: cover a quoted value with spaces and a value with an escaped quotemark.

NOTE(review): when a backslash is not followed by a quotemark inside a quoted,
non-empty attribute value, charBACKSLASH only calls nextCol(), so that backslash is
not added to any token. Confirm that dropping it is intended.

NOTE(review): indentation and whitespace inside context lines were reconstructed.
Verify string literals such as PLACEHOLDER_SPACE_TAB against the target file before
applying.

diff --git a/packages/bbob-parser/README.md b/packages/bbob-parser/README.md
index a991849..46165ca 100644
--- a/packages/bbob-parser/README.md
+++ b/packages/bbob-parser/README.md
@@ -1,2 +1,15 @@
-# bbob-parser
-Fast BB Code parser written in pure javascript, no dependencies
+# @bbob/parser
+
+Parses BBCode and returns an AST that looks like this:
+
+```js
+[
+  {
+    tag: 'url',
+    attrs: {
+      url: 'https://github.com/JiLiZART/bbob/tree/master/packages/bbob-parser'
+    },
+    content: ['hello', ' ', 'world!']
+  }
+]
+```
diff --git a/packages/bbob-parser/lib/Token.js b/packages/bbob-parser/lib/Token.js
index bf0ae96..a47915c 100644
--- a/packages/bbob-parser/lib/Token.js
+++ b/packages/bbob-parser/lib/Token.js
@@ -61,6 +61,14 @@ class Token {
     this.row = Number(row);
   }
 
+  /**
+   * Tells whether this token has no type.
+   * @returns {boolean} true when `type` is falsy (for example an empty string)
+   */
+  isEmpty() {
+    return !this.type;
+  }
+
   isText() {
     return isTextToken(this);
   }
diff --git a/packages/bbob-parser/lib/Tokenizer.js b/packages/bbob-parser/lib/Tokenizer.js
index 383074b..deaf844 100644
--- a/packages/bbob-parser/lib/Tokenizer.js
+++ b/packages/bbob-parser/lib/Tokenizer.js
@@ -4,6 +4,7 @@ const {
   CLOSE_BRAKET, EQ, TAB, SPACE, N, QUOTEMARK,
   PLACEHOLDER_SPACE, PLACEHOLDER_SPACE_TAB,
   SLASH,
+  BACKSLASH,
 } = require('./char');
 
 const Token = require('./Token');
@@ -41,6 +42,24 @@ class Tokenizer {
     this.emitToken(token);
   }
 
+  /**
+   * Advances `this.index` and `this.colPos` by `num`.
+   * @param {number} num
+   */
+  skipChar(num) {
+    this.index += num;
+    this.colPos += num;
+  }
+
+  /**
+   * Reads the char code at `this.index + num` of the buffer without changing the index.
+   * @param {number} num offset relative to the current index, may be negative
+   * @returns {number}
+   */
+  seekChar(num) {
+    return this.buffer.charCodeAt(this.index + num);
+  }
+
   nextCol() {
     this.colPos += 1;
   }
@@ -115,6 +134,7 @@ class Tokenizer {
     }
 
     if (this.attrValueToken[Token.TYPE_ID]) {
+      delete this.attrValueToken.quoted;
       this.attrTokens.push(this.attrValueToken);
       this.attrValueToken = this.dummyToken;
     }
@@ -129,12 +149,16 @@ class Tokenizer {
 
   charSPACE(charCode) {
     this.flushWord();
+    const spaceCode = charCode === TAB ? PLACEHOLDER_SPACE_TAB : PLACEHOLDER_SPACE;
 
-    if (this.tagToken[Token.TYPE_ID]) {
-      this.attrNameToken = this.createAttrNameToken('');
+    if (this.inTag()) {
+      if (this.inAttrValue() && this.attrValueToken.quoted) {
+        this.attrValueToken[Token.VALUE_ID] += spaceCode;
+      } else {
+        this.flushAttrNames();
+        this.attrNameToken = this.createAttrNameToken('');
+      }
     } else {
-      const spaceCode = charCode === TAB ? PLACEHOLDER_SPACE_TAB : PLACEHOLDER_SPACE;
-
       this.appendToken(this.createSpaceToken(spaceCode));
     }
     this.nextCol();
@@ -156,16 +180,23 @@ class Tokenizer {
   }
 
   charCLOSEBRAKET() {
+    this.nextCol();
     this.flushTag();
     this.flushAttrNames();
     this.flushAttrs();
-
-    this.nextCol();
   }
 
   charEQ(charCode) {
-    if (this.tagToken[Token.TYPE_ID]) {
+    const nextCharCode = this.seekChar(1);
+    const isNextQuotemark = nextCharCode === QUOTEMARK;
+
+    if (this.inTag()) {
       this.attrValueToken = this.createAttrValueToken('');
+
+      if (isNextQuotemark) {
+        this.attrValueToken.quoted = true;
+        this.skipChar(1);
+      }
     } else {
       this.wordToken[Token.VALUE_ID] += getChar(charCode);
     }
@@ -174,7 +205,13 @@ class Tokenizer {
   }
 
   charQUOTEMARK(charCode) {
-    if (this.attrValueToken[Token.TYPE_ID] && this.attrValueToken[Token.VALUE_ID] > 0) {
+    const prevCharCode = this.seekChar(-1);
+    const isPrevBackslash = prevCharCode === BACKSLASH;
+
+    if (this.inAttrValue() &&
+      this.attrValueToken[Token.VALUE_ID] &&
+      this.attrValueToken.quoted &&
+      !isPrevBackslash) {
       this.flushAttrNames();
     } else if (this.tagToken[Token.TYPE_ID] === '') {
       this.wordToken[Token.VALUE_ID] += getChar(charCode);
@@ -183,13 +220,35 @@ class Tokenizer {
     this.nextCol();
   }
 
+  /**
+   * Handles a backslash: while a quoted attribute value that already has content is
+   * being read, a following quotemark is appended to the value and then skipped.
+   */
+  charBACKSLASH() {
+    const nextCharCode = this.seekChar(1);
+    const isNextQuotemark = nextCharCode === QUOTEMARK;
+
+    if (this.inAttrValue() &&
+      this.attrValueToken[Token.VALUE_ID] &&
+      this.attrValueToken.quoted &&
+      isNextQuotemark
+    ) {
+      this.attrValueToken[Token.VALUE_ID] += getChar(nextCharCode);
+      this.skipChar(1);
+    }
+
+    this.nextCol();
+  }
+
   charWORD(charCode) {
-    if (this.tagToken[Token.TYPE_ID] && this.attrValueToken[Token.TYPE_ID]) {
-      this.attrValueToken[Token.VALUE_ID] += getChar(charCode);
-    } else if (this.tagToken[Token.TYPE_ID] && this.attrNameToken[Token.TYPE_ID]) {
-      this.attrNameToken[Token.VALUE_ID] += getChar(charCode);
-    } else if (this.tagToken[Token.TYPE_ID]) {
-      this.tagToken[Token.VALUE_ID] += getChar(charCode);
+    if (this.inTag()) {
+      if (this.inAttrValue()) {
+        this.attrValueToken[Token.VALUE_ID] += getChar(charCode);
+      } else if (this.inAttrName()) {
+        this.attrNameToken[Token.VALUE_ID] += getChar(charCode);
+      } else {
+        this.tagToken[Token.VALUE_ID] += getChar(charCode);
+      }
     } else {
       this.createWord();
 
@@ -214,11 +273,11 @@ class Tokenizer {
         break;
 
       case OPEN_BRAKET:
-        this.charOPENBRAKET();
+        this.charOPENBRAKET(charCode);
         break;
 
       case CLOSE_BRAKET:
-        this.charCLOSEBRAKET();
+        this.charCLOSEBRAKET(charCode);
         break;
 
       case EQ:
@@ -229,6 +288,10 @@ class Tokenizer {
         this.charQUOTEMARK(charCode);
         break;
 
+      case BACKSLASH:
+        this.charBACKSLASH(charCode);
+        break;
+
       default:
         this.charWORD(charCode);
         break;
@@ -245,6 +308,27 @@ class Tokenizer {
     return this.tokens;
   }
 
+  /**
+   * Returns the type of the current tag token; callers use it as a truthy flag.
+   */
+  inTag() {
+    return this.tagToken[Token.TYPE_ID];
+  }
+
+  /**
+   * Returns the type of the current attribute value token; callers use it as a truthy flag.
+   */
+  inAttrValue() {
+    return this.attrValueToken[Token.TYPE_ID];
+  }
+
+  /**
+   * Returns the type of the current attribute name token; callers use it as a truthy flag.
+   */
+  inAttrName() {
+    return this.attrNameToken[Token.TYPE_ID];
+  }
+
   createWordToken(value = '', line = this.colPos, row = this.rowPos) {
     return createTokenOfType(Token.TYPE_WORD, value, line, row);
   }
diff --git a/packages/bbob-parser/lib/char.js b/packages/bbob-parser/lib/char.js
index 2d0fc82..b235d59 100644
--- a/packages/bbob-parser/lib/char.js
+++ b/packages/bbob-parser/lib/char.js
@@ -11,6 +11,7 @@ const OPEN_BRAKET = '['.charCodeAt(0);
 const CLOSE_BRAKET = ']'.charCodeAt(0);
 
 const SLASH = '/'.charCodeAt(0);
+const BACKSLASH = '\\'.charCodeAt(0);
 
 const PLACEHOLDER_SPACE_TAB = ' ';
 const PLACEHOLDER_SPACE = ' ';
@@ -31,4 +32,5 @@ module.exports = {
   SLASH,
   PLACEHOLDER_SPACE_TAB,
   PLACEHOLDER_SPACE,
+  BACKSLASH,
 };
diff --git a/packages/bbob-parser/test/Parser.test.js b/packages/bbob-parser/test/Parser.test.js
index 488bdad..afd0832 100644
--- a/packages/bbob-parser/test/Parser.test.js
+++ b/packages/bbob-parser/test/Parser.test.js
@@ -36,4 +36,21 @@ describe('Parser', () => {
       },
     ]);
   });
+
+  test('parse tag with quoted param with spaces', () => {
+    const ast = parse('[url href=https://ru.wikipedia.org target=_blank text="Foo Bar"]Text[/url]');
+
+    expect(ast).toBeInstanceOf(Array);
+    expect(ast).toEqual([
+      {
+        tag: 'url',
+        attrs: {
+          href: 'https://ru.wikipedia.org',
+          target: '_blank',
+          text: 'Foo Bar',
+        },
+        content: ['Text'],
+      },
+    ]);
+  });
 });
diff --git a/packages/bbob-parser/test/Tokenizer.test.js b/packages/bbob-parser/test/Tokenizer.test.js
index 7c7301b..6da53ee 100644
--- a/packages/bbob-parser/test/Tokenizer.test.js
+++ b/packages/bbob-parser/test/Tokenizer.test.js
@@ -47,6 +47,34 @@ describe('Tokenizer', () => {
     expectOutput(output, tokens);
   });
 
+  test('tokenize tag with quotemark params with spaces', () => {
+    const input = '[url text="Foo Bar"]Text[/url]';
+    const tokens = tokenize(input);
+    const output = [
+      [TYPE.TAG, 'url', '0', '0'],
+      [TYPE.ATTR_NAME, 'text', '4', '0'],
+      [TYPE.ATTR_VALUE, 'Foo Bar', '9', '0'],
+      [TYPE.WORD, 'Text', '20', '0'],
+      [TYPE.TAG, '/url', '24', '0'],
+    ];
+
+    expectOutput(output, tokens);
+  });
+
+  test('tokenize tag with escaped quotemark param', () => {
+    const input = `[url text="Foo \\"Bar"]Text[/url]`;
+    const tokens = tokenize(input);
+    const output = [
+      [TYPE.TAG, 'url', '0', '0'],
+      [TYPE.ATTR_NAME, 'text', '4', '0'],
+      [TYPE.ATTR_VALUE, 'Foo "Bar', '9', '0'],
+      [TYPE.WORD, 'Text', '22', '0'],
+      [TYPE.TAG, '/url', '26', '0'],
+    ];
+
+    expectOutput(output, tokens);
+  });
+
   test('tokenize tag param without quotemarks', () => {
     const input = '[style color=#ff0000]Text[/style]';
     const tokens = tokenize(input);