diff --git a/packages/bbob-parser/lib/Tokenizer.js b/packages/bbob-parser/lib/Tokenizer.js
deleted file mode 100644
index b792d4d..0000000
--- a/packages/bbob-parser/lib/Tokenizer.js
+++ /dev/null
@@ -1,348 +0,0 @@
-const {
-  getChar,
-  OPEN_BRAKET,
-  CLOSE_BRAKET, EQ, TAB, SPACE, N, QUOTEMARK,
-  PLACEHOLDER_SPACE, PLACEHOLDER_SPACE_TAB,
-  SLASH,
-  BACKSLASH,
-} = require('@bbob/plugin-helper/lib/char');
-const Token = require('./Token');
-
-const createTokenOfType = (type, value, line, row) => new Token(type, value, line, row);
-
-class Tokenizer {
-  constructor(input, options = {}) {
-    this.buffer = input;
-    this.colPos = 0;
-    this.rowPos = 0;
-    // eslint-disable-next-line no-bitwise
-    this.index = 2 ** 32;
-
-    this.tokenIndex = -1;
-    this.tokens = new Array(Math.floor(this.buffer.length));
-    this.dummyToken = null; // createTokenOfType('', '', '', '');
-
-    this.wordToken = this.dummyToken;
-    this.tagToken = this.dummyToken;
-    this.attrNameToken = this.dummyToken;
-    this.attrValueToken = this.dummyToken;
-    this.attrTokens = [];
-
-    this.options = options;
-
-    this.charMap = {
-      [TAB]: this.charSPACE.bind(this),
-      [SPACE]: this.charSPACE.bind(this),
-      [N]: this.charN.bind(this),
-      [OPEN_BRAKET]: this.charOPENBRAKET.bind(this),
-      [CLOSE_BRAKET]: this.charCLOSEBRAKET.bind(this),
-      [EQ]: this.charEQ.bind(this),
-      [QUOTEMARK]: this.charQUOTEMARK.bind(this),
-      [BACKSLASH]: this.charBACKSLASH.bind(this),
-      default: this.charWORD.bind(this),
-    };
-  }
-
-  emitToken(token) {
-    if (this.options.onToken) {
-      this.options.onToken(token);
-    }
-  }
-
-  appendToken(token) {
-    this.tokenIndex += 1;
-    this.tokens[this.tokenIndex] = token;
-    this.emitToken(token);
-  }
-
-  skipChar(num) {
-    this.index += num;
-    this.colPos += num;
-  }
-
-  seekChar(num) {
-    return this.buffer.charCodeAt(this.index + num);
-  }
-
-  nextCol() {
-    this.colPos += 1;
-  }
-
-  nextLine() {
-    this.rowPos += 1;
-  }
-
-  flushWord() {
-    if (this.inWord() && this.wordToken[Token.VALUE_ID]) {
-      this.appendToken(this.wordToken);
-      this.wordToken = this.createWordToken('');
-    }
-  }
-
-  createWord(value, line, row) {
-    if (!this.inWord()) {
-      this.wordToken = this.createWordToken(value, line, row);
-    }
-  }
-
-  flushTag() {
-    if (this.inTag()) {
-      // [] and [=] tag case
-      if (this.tagToken[Token.VALUE_ID] === '') {
-        const value = this.inAttrValue() ? getChar(EQ) : '';
-        const word = getChar(OPEN_BRAKET) + value + getChar(CLOSE_BRAKET);
-
-        this.createWord('', 0, 0);
-        this.wordToken[Token.VALUE_ID] += word;
-
-        this.tagToken = this.dummyToken;
-
-        if (this.inAttrValue()) {
-          this.attrValueToken = this.dummyToken;
-        }
-
-        return;
-      }
-
-      if (this.inAttrName() && !this.inAttrValue()) {
-        this.tagToken[Token.VALUE_ID] += PLACEHOLDER_SPACE + this.attrNameToken[Token.VALUE_ID];
-        this.attrNameToken = this.dummyToken;
-      }
-
-      this.appendToken(this.tagToken);
-      this.tagToken = this.dummyToken;
-    }
-  }
-
-  flushUnclosedTag() {
-    if (this.inTag()) {
-      const value = this.tagToken[Token.VALUE_ID] + (this.attrValueToken && this.attrValueToken[Token.VALUE_ID] ? getChar(EQ) : '');
-
-      this.tagToken[Token.TYPE_ID] = Token.TYPE_WORD;
-      this.tagToken[Token.VALUE_ID] = getChar(OPEN_BRAKET) + value;
-
-      this.appendToken(this.tagToken);
-
-      this.tagToken = this.dummyToken;
-
-      if (this.inAttrValue()) {
-        this.attrValueToken = this.dummyToken;
-      }
-    }
-  }
-
-  flushAttrNames() {
-    if (this.inAttrName()) {
-      this.attrTokens.push(this.attrNameToken);
-      this.attrNameToken = this.dummyToken;
-    }
-
-    if (this.inAttrValue()) {
-      this.attrValueToken.quoted = undefined;
-      this.attrTokens.push(this.attrValueToken);
-      this.attrValueToken = this.dummyToken;
-    }
-  }
-
-  flushAttrs() {
-    if (this.attrTokens.length) {
-      this.attrTokens.forEach(this.appendToken.bind(this));
-      this.attrTokens = [];
-    }
-  }
-
-  charSPACE(charCode) {
-    const spaceCode = charCode === TAB ? PLACEHOLDER_SPACE_TAB : PLACEHOLDER_SPACE;
-
-    this.flushWord();
-
-    if (this.inTag()) {
-      if (this.inAttrValue() && this.attrValueToken.quoted) {
-        this.attrValueToken[Token.VALUE_ID] += spaceCode;
-      } else {
-        this.flushAttrNames();
-        this.attrNameToken = this.createAttrNameToken('');
-      }
-    } else {
-      this.appendToken(this.createSpaceToken(spaceCode));
-    }
-    this.nextCol();
-  }
-
-  charN(charCode) {
-    this.flushWord();
-    this.appendToken(this.createNewLineToken(getChar(charCode)));
-
-    this.nextLine();
-    this.colPos = 0;
-  }
-
-  charOPENBRAKET(charCode) {
-    const nextCharCode = this.seekChar(1);
-    const isNextSpace = nextCharCode === SPACE || nextCharCode === TAB;
-
-    if (isNextSpace) {
-      this.createWord();
-      this.wordToken[Token.VALUE_ID] += getChar(charCode);
-    } else {
-      this.flushWord();
-
-      this.tagToken = this.createTagToken('');
-    }
-
-    this.nextCol();
-  }
-
-  charCLOSEBRAKET(charCode) {
-    const prevCharCode = this.seekChar(-1);
-    const isPrevSpace = prevCharCode === SPACE || prevCharCode === TAB;
-
-    if (isPrevSpace) {
-      this.wordToken[Token.VALUE_ID] += getChar(charCode);
-    }
-
-    this.nextCol();
-    this.flushTag();
-    this.flushAttrNames();
-    this.flushAttrs();
-  }
-
-  charEQ(charCode) {
-    const nextCharCode = this.seekChar(1);
-    const isNextQuotemark = nextCharCode === QUOTEMARK;
-
-    if (this.inTag()) {
-      this.attrValueToken = this.createAttrValueToken('');
-
-      if (isNextQuotemark) {
-        this.attrValueToken.quoted = true;
-        this.skipChar(1);
-      }
-    } else {
-      this.wordToken[Token.VALUE_ID] += getChar(charCode);
-    }
-
-    this.nextCol();
-  }
-
-  charQUOTEMARK(charCode) {
-    const prevCharCode = this.seekChar(-1);
-    const isPrevBackslash = prevCharCode === BACKSLASH;
-
-    if (this.inAttrValue() &&
-      this.attrValueToken[Token.VALUE_ID] &&
-      this.attrValueToken.quoted &&
-      !isPrevBackslash) {
-      this.flushAttrNames();
-    } else if (!this.inTag()) {
-      if (!this.wordToken) {
-        this.wordToken = this.createWordToken(getChar(charCode));
-      } else {
-        this.wordToken[Token.VALUE_ID] += getChar(charCode);
-      }
-    }
-
-    this.nextCol();
-  }
-
-  charBACKSLASH() {
-    const nextCharCode = this.seekChar(1);
-    const isNextQuotemark = nextCharCode === QUOTEMARK;
-
-    if (this.inAttrValue() &&
-      this.attrValueToken[Token.VALUE_ID] &&
-      this.attrValueToken.quoted &&
-      isNextQuotemark
-    ) {
-      this.attrValueToken[Token.VALUE_ID] += getChar(nextCharCode);
-      this.skipChar(1);
-    }
-
-    this.nextCol();
-  }
-
-  charWORD(charCode) {
-    if (this.inTag()) {
-      if (this.inAttrValue()) {
-        this.attrValueToken[Token.VALUE_ID] += getChar(charCode);
-      } else if (this.inAttrName()) {
-        this.attrNameToken[Token.VALUE_ID] += getChar(charCode);
-      } else {
-        this.tagToken[Token.VALUE_ID] += getChar(charCode);
-      }
-    } else {
-      this.createWord();
-
-      this.wordToken[Token.VALUE_ID] += getChar(charCode);
-    }
-
-    this.nextCol();
-  }
-
-  tokenize() {
-    this.index = 0;
-    while (this.index < this.buffer.length) {
-      const charCode = this.buffer.charCodeAt(this.index);
-
-      (this.charMap[charCode] || this.charMap.default)(charCode);
-
-      // eslint-disable-next-line no-plusplus
-      ++this.index;
-    }
-
-    this.flushWord();
-    this.flushUnclosedTag();
-
-    this.tokens.length = this.tokenIndex + 1;
-
-    return this.tokens;
-  }
-
-  inWord() {
-    return this.wordToken && this.wordToken[Token.TYPE_ID];
-  }
-
-  inTag() {
-    return this.tagToken && this.tagToken[Token.TYPE_ID];
-  }
-
-  inAttrValue() {
-    return this.attrValueToken && this.attrValueToken[Token.TYPE_ID];
-  }
-
-  inAttrName() {
-    return this.attrNameToken && this.attrNameToken[Token.TYPE_ID];
-  }
-
-  createWordToken(value = '', line = this.colPos, row = this.rowPos) {
-    return createTokenOfType(Token.TYPE_WORD, value, line, row);
-  }
-
-  createTagToken(value, line = this.colPos, row = this.rowPos) {
-    return createTokenOfType(Token.TYPE_TAG, value, line, row);
-  }
-
-  createAttrNameToken(value, line = this.colPos, row = this.rowPos) {
-    return createTokenOfType(Token.TYPE_ATTR_NAME, value, line, row);
-  }
-
-  createAttrValueToken(value, line = this.colPos, row = this.rowPos) {
-    return createTokenOfType(Token.TYPE_ATTR_VALUE, value, line, row);
-  }
-
-  createSpaceToken(value, line = this.colPos, row = this.rowPos) {
-    return createTokenOfType(Token.TYPE_SPACE, value, line, row);
-  }
-
-  createNewLineToken(value, line = this.colPos, row = this.rowPos) {
-    return createTokenOfType(Token.TYPE_NEW_LINE, value, line, row);
-  }
-
-  isTokenNested(token) {
-    const value = getChar(OPEN_BRAKET) + getChar(SLASH) + token.getValue();
-    return this.buffer.indexOf(value) > -1;
-  }
-}
-
-module.exports = Tokenizer;
-module.exports.createTokenOfType = createTokenOfType;
diff --git a/packages/bbob-parser/lib/lexer.js b/packages/bbob-parser/lib/lexer.js
new file mode 100644
index 0000000..dce33ee
--- /dev/null
+++ b/packages/bbob-parser/lib/lexer.js
@@ -0,0 +1,195 @@
+/* eslint-disable no-plusplus,no-param-reassign */
+const c = require('@bbob/plugin-helper/lib/char');
+const Token = require('./Token');
+
+const OPEN_BRAKET = c.getChar(c.OPEN_BRAKET);
+const CLOSE_BRAKET = c.getChar(c.CLOSE_BRAKET);
+const QUOTEMARK = c.getChar(c.QUOTEMARK);
+const BACKSLASH = c.getChar(c.BACKSLASH);
+const SLASH = c.getChar(c.SLASH);
+const SPACE = c.getChar(c.SPACE);
+const TAB = c.getChar(c.TAB);
+const EQ = c.getChar(c.EQ);
+const N = c.getChar(c.N);
+
+const RESERVED_CHARS = [CLOSE_BRAKET, OPEN_BRAKET, QUOTEMARK, BACKSLASH, SPACE, TAB, EQ, N];
+const NOT_CHAR_TOKENS = [OPEN_BRAKET, SPACE, TAB, N];
+const WHITESPACES = [SPACE, TAB];
+
+const isCharReserved = char => (RESERVED_CHARS.indexOf(char) >= 0);
+const isWhiteSpace = char => (WHITESPACES.indexOf(char) >= 0);
+const isCharToken = char => (NOT_CHAR_TOKENS.indexOf(char) === -1);
+
+const createCharGrabber = (source) => {
+  let idx = 0;
+
+  const skip = () => {
+    idx += 1;
+  };
+  const hasNext = () => source.length > idx;
+
+  return {
+    skip,
+    hasNext,
+    isLast: () => (idx === source.length),
+    grabWhile: (cond) => {
+      const start = idx;
+
+      while (hasNext() && cond(source[idx])) {
+        skip();
+      }
+
+      return source.substr(start, idx - start);
+    },
+    getNext: () => source[idx + 1],
+    getPrev: () => source[idx - 1],
+    getCurr: () => source[idx],
+  };
+};
+
+const trimChar = (str, charToRemove) => {
+  while (str.charAt(0) === charToRemove) {
+    str = str.substring(1);
+  }
+
+  while (str.charAt(str.length - 1) === charToRemove) {
+    str = str.substring(0, str.length - 1);
+  }
+
+  return str;
+};
+
+const unquote = str => str.replace(BACKSLASH + QUOTEMARK, QUOTEMARK);
+const createToken = (type, value, r = 0, cl = 0) => new Token(type, value, r, cl);
+
+function createLexer(buffer, options = {}) {
+  let row = 0;
+  let col = 0;
+
+  let tokenIndex = -1;
+  const tokens = new Array(Math.floor(buffer.length));
+  const emitToken = (token) => {
+    if (options.onToken) {
+      options.onToken(token);
+    }
+
+    tokenIndex += 1;
+    tokens[tokenIndex] = token;
+  };
+
+  const parseAttrs = (str) => {
+    let tagName = null;
+    let skipSpaces = false;
+
+    const attrTokens = [];
+    const attrCharGrabber = createCharGrabber(str);
+    const validAttr = (val) => {
+      const isEQ = val === EQ;
+      const isWS = isWhiteSpace(val);
+      const isPrevSLASH = attrCharGrabber.getPrev() === SLASH;
+
+      if (tagName === null) {
+        return !(isEQ || isWS || attrCharGrabber.isLast());
+      }
+
+      if (skipSpaces && isWS) {
+        return true;
+      }
+
+      if (val === QUOTEMARK && !isPrevSLASH) {
+        skipSpaces = !skipSpaces;
+      }
+
+      return !(isEQ || isWS);
+    };
+
+    const nextAttr = () => {
+      const attrStr = attrCharGrabber.grabWhile(validAttr);
+
+      // first string before space is a tag name
+      if (tagName === null) {
+        tagName = attrStr;
+      } else if (isWhiteSpace(attrCharGrabber.getCurr()) || !attrCharGrabber.hasNext()) {
+        const escaped = unquote(trimChar(attrStr, QUOTEMARK));
+        attrTokens.push(createToken(Token.TYPE_ATTR_VALUE, escaped, row, col));
+      } else {
+        attrTokens.push(createToken(Token.TYPE_ATTR_NAME, attrStr, row, col));
+      }
+
+      attrCharGrabber.skip();
+    };
+
+    while (attrCharGrabber.hasNext()) {
+      nextAttr();
+    }
+
+    return { tag: tagName, attrs: attrTokens };
+  };
+
+  const grabber = createCharGrabber(buffer);
+
+  const next = () => {
+    const char = grabber.getCurr();
+
+    if (char === N) {
+      grabber.skip();
+      col = 0;
+      row++;
+
+      emitToken(createToken(Token.TYPE_NEW_LINE, char, row, col));
+    } else if (isWhiteSpace(char)) {
+      const str = grabber.grabWhile(isWhiteSpace);
+      emitToken(createToken(Token.TYPE_SPACE, str, row, col));
+    } else if (char === OPEN_BRAKET) {
+      const nextChar = grabber.getNext();
+      grabber.skip(); // skip [
+
+      if (isCharReserved(nextChar)) {
+        emitToken(createToken(Token.TYPE_WORD, char, row, col));
+      } else {
+        const str = grabber.grabWhile(val => val !== CLOSE_BRAKET);
+        grabber.skip(); // skip ]
+
+        if (!(str.indexOf(EQ) > 0) || str[0] === SLASH) {
+          emitToken(createToken(Token.TYPE_TAG, str, row, col));
+        } else {
+          const parsed = parseAttrs(str);
+
+          emitToken(createToken(Token.TYPE_TAG, parsed.tag, row, col));
+          parsed.attrs.map(emitToken);
+        }
+      }
+    } else if (char === CLOSE_BRAKET) {
+      grabber.skip();
+
+      emitToken(createToken(Token.TYPE_WORD, char, row, col));
+    } else if (isCharToken(char)) {
+      const str = grabber.grabWhile(isCharToken);
+
+      emitToken(createToken(Token.TYPE_WORD, str, row, col));
+    }
+  };
+
+  const tokenize = () => {
+    while (grabber.hasNext()) {
+      next();
+    }
+
+    tokens.length = tokenIndex + 1;
+
+    return tokens;
+  };
+
+  const isTokenNested = (token) => {
+    const value = OPEN_BRAKET + SLASH + token.getValue();
+    return buffer.indexOf(value) > -1;
+  };
+
+  return {
+    tokenize,
+    isTokenNested,
+  };
+}
+
+module.exports = createLexer;
+module.exports.createTokenOfType = createToken;
diff --git a/packages/bbob-parser/lib/parse.js b/packages/bbob-parser/lib/parse.js
index 0f33919..70aa4fc 100644
--- a/packages/bbob-parser/lib/parse.js
+++ b/packages/bbob-parser/lib/parse.js
@@ -1,4 +1,4 @@
-const Tokenizer = require('./Tokenizer');
+const createLexer = require('./lexer');
 const TagNode = require('@bbob/plugin-helper/lib/TagNode');
 
 /**
@@ -28,7 +28,7 @@
 let tokenizer = null;
 // eslint-disable-next-line no-unused-vars
 let tokens = null;
-const createTokenizer = (input, onToken) => new Tokenizer(input, { onToken });
+const createTokenizer = (input, onToken) => createLexer(input, { onToken });
 
 /**
  * @private
diff --git a/packages/bbob-parser/test/Tokenizer.test.js b/packages/bbob-parser/test/lexer.test.js
similarity index 82%
rename from packages/bbob-parser/test/Tokenizer.test.js
rename to packages/bbob-parser/test/lexer.test.js
index cc04b14..a0c3df3 100644
--- a/packages/bbob-parser/test/Tokenizer.test.js
+++ b/packages/bbob-parser/test/lexer.test.js
@@ -1,5 +1,5 @@
-const Tokenizer = require('../lib/Tokenizer');
 const Token = require('../lib/Token');
+const lexer = require('../lib/lexer');
 
 const TYPE = {
   WORD: Token.TYPE_WORD,
@@ -10,14 +10,15 @@
   NEW_LINE: Token.TYPE_NEW_LINE,
 };
 
-const tokenize = input => (new Tokenizer(input).tokenize());
+const tokenize = input => (lexer(input).tokenize());
 
-describe('Tokenizer', () => {
+describe('lexer', () => {
   const expectOutput = (output, tokens) => {
     expect(tokens).toBeInstanceOf(Array);
 
     output.forEach((token, idx) => {
       expect(tokens[idx]).toBeInstanceOf(Object);
-      expect(tokens[idx]).toEqual(Tokenizer.createTokenOfType(...token));
+      expect(tokens[idx].type).toEqual(token[0]);
+      expect(tokens[idx].value).toEqual(token[1]);
     });
   };
@@ -92,12 +93,14 @@
   });
 
   test('tokenize tag with quotemark params with spaces', () => {
-    const input = '[url text="Foo Bar"]Text[/url]';
+    const input = '[url text="Foo Bar" text2="Foo Bar 2"]Text[/url]';
     const tokens = tokenize(input);
     const output = [
      [TYPE.TAG, 'url', '0', '0'],
      [TYPE.ATTR_NAME, 'text', '4', '0'],
      [TYPE.ATTR_VALUE, 'Foo Bar', '9', '0'],
+      [TYPE.ATTR_NAME, 'text2', '4', '0'],
+      [TYPE.ATTR_VALUE, 'Foo Bar 2', '9', '0'],
      [TYPE.WORD, 'Text', '20', '0'],
      [TYPE.TAG, '/url', '24', '0'],
     ];
@@ -144,27 +147,21 @@
     const output = [
      [TYPE.TAG, 'list', '0', '0'],
      [TYPE.NEW_LINE, '\n', '6', '0'],
-      [TYPE.SPACE, ' ', '0', '1'],
-      [TYPE.SPACE, ' ', '1', '1'],
-      [TYPE.SPACE, ' ', '2', '1'],
+      [TYPE.SPACE, '   ', '0', '1'],
      [TYPE.TAG, '*', '3', '1'],
      [TYPE.SPACE, ' ', '6', '1'],
      [TYPE.WORD, 'Item', '7', '1'],
      [TYPE.SPACE, ' ', '11', '1'],
      [TYPE.WORD, '1.', '11', '1'],
      [TYPE.NEW_LINE, '\n', '14', '1'],
-      [TYPE.SPACE, ' ', '0', '2'],
-      [TYPE.SPACE, ' ', '1', '2'],
-      [TYPE.SPACE, ' ', '2', '2'],
+      [TYPE.SPACE, '   ', '0', '2'],
      [TYPE.TAG, '*', '3', '2'],
      [TYPE.SPACE, ' ', '6', '2'],
      [TYPE.WORD, 'Item', '14', '1'],
      [TYPE.SPACE, ' ', '11', '2'],
      [TYPE.WORD, '2.', '11', '2'],
      [TYPE.NEW_LINE, '\n', '14', '2'],
-      [TYPE.SPACE, ' ', '0', '3'],
-      [TYPE.SPACE, ' ', '1', '3'],
-      [TYPE.SPACE, ' ', '2', '3'],
+      [TYPE.SPACE, '   ', '0', '3'],
      [TYPE.TAG, '*', '3', '3'],
      [TYPE.SPACE, ' ', '6', '3'],
      [TYPE.WORD, 'Item', '14', '2'],
@@ -185,16 +182,24 @@
      'x html([a. title][, alt][, classes]) x',
      '[/y]',
      '[sc',
-      // '[sc / [/sc]',
-      // '[sc arg="val',
+      '[sc / [/sc]',
+      '[sc arg="val',
     ];
 
     const asserts = [
-      [[TYPE.WORD, '[]', '0', '0']],
-      [[TYPE.WORD, '[=]', '0', '0']],
+      [
+        [TYPE.WORD, '[', '0', '0'],
+        [TYPE.WORD, ']', '0', '0']
+      ],
+      [
+        [TYPE.WORD, '[', '0', '0'],
+        [TYPE.WORD, '=]', '0', '0']
+      ],
      [
        [TYPE.WORD, '!', '0', '0'],
-        [TYPE.WORD, '[](image.jpg)', '1', '0'],
+        [TYPE.WORD, '[', '1', '0'],
+        [TYPE.WORD, ']', '1', '0'],
+        [TYPE.WORD, '(image.jpg)', '1', '0'],
      ],
      [
        [TYPE.WORD, 'x', '0', '0'],
@@ -207,15 +212,20 @@
        [TYPE.SPACE, ' ', '36', '0'],
        [TYPE.WORD, 'x', '36', '0'],
      ],
-      [[TYPE.TAG, '/y', '0', '0']],
-      [[TYPE.WORD, '[sc', '0', '0']],
-      // [
-      //   [TYPE.WORD, '[sc', '0', '0'],
-      //   [TYPE.SPACE, ' ', '0', '0'],
-      //   [TYPE.WORD, '/', '0', '0'],
-      //   [TYPE.SPACE, ' ', '0', '0'],
-      //   [TYPE.WORD, '[/sc]', '0', '0'],
-      // ],
+      [
+        [TYPE.TAG, '/y', '0', '0']
+      ],
+      [
+        [TYPE.TAG, 'sc', '0', '0']
+      ],
+      [
+        [TYPE.TAG, 'sc / [/sc', '0', '0']
+      ],
+      [
+        [TYPE.TAG, 'sc', '0', '0'],
+        [TYPE.ATTR_NAME, 'arg', '0', '0'],
+        [TYPE.ATTR_VALUE, 'val', '0', '0']
+      ]
     ];
 
     inputs.forEach((input, idx) => {
diff --git a/packages/bbob-parser/test/parse.test.js b/packages/bbob-parser/test/parse.test.js
index 716fdb2..318a09d 100644
--- a/packages/bbob-parser/test/parse.test.js
+++ b/packages/bbob-parser/test/parse.test.js
@@ -61,7 +61,7 @@ describe('Parser', () => {
          content: []
        },
        ' ',
-        '/h1',
+        '/h1]',
      ]
    );
  });
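
Usage sketch (not part of the commit): the call sites in parse.js and lexer.test.js above show how consumers change with this refactor. A minimal example, assuming it runs next to lib/lexer.js and using an illustrative BBCode string:

    const createLexer = require('./lexer');

    // Old API (removed above): tokens came from a class instance.
    // const tokens = new Tokenizer(input, { onToken }).tokenize();

    // New API: createLexer(buffer, options) returns { tokenize, isTokenNested }.
    const input = '[url text="Foo Bar"]Text[/url]';
    const lexer = createLexer(input, { onToken: (token) => console.log(token) });
    const tokens = lexer.tokenize();

    console.log(tokens.map((token) => [token.type, token.value]));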