2
0
mirror of https://github.com/tenrok/BBob.git synced 2026-05-15 11:59:37 +03:00

feat(lexer): new lexer (#1)

This commit is contained in:
Nikolay Kostyurin
2018-08-14 18:12:58 +02:00
committed by GitHub
parent 4dd7177d89
commit 88826512e7
5 changed files with 236 additions and 379 deletions
-348
View File
@@ -1,348 +0,0 @@
const {
getChar,
OPEN_BRAKET,
CLOSE_BRAKET, EQ, TAB, SPACE, N, QUOTEMARK,
PLACEHOLDER_SPACE, PLACEHOLDER_SPACE_TAB,
SLASH,
BACKSLASH,
} = require('@bbob/plugin-helper/lib/char');
const Token = require('./Token');
// Factory helper: wraps the Token constructor so call sites never `new` directly.
const createTokenOfType = function createTokenOfType(type, value, line, row) {
  return new Token(type, value, line, row);
};
// Legacy character-state tokenizer: consumes the input one char code at a
// time via the `charMap` dispatch table, keeping partially built
// word/tag/attr tokens as instance state and flushing them into `tokens`
// when a boundary character ([, ], =, ", space, newline) is seen.
class Tokenizer {
  constructor(input, options = {}) {
    this.buffer = input;
    this.colPos = 0; // current column (reset to 0 on newline)
    this.rowPos = 0; // current row (incremented on newline)
    // eslint-disable-next-line no-bitwise
    // Sentinel "past the end" value; tokenize() resets it to 0 before scanning.
    this.index = 2 ** 32;
    this.tokenIndex = -1;
    // Upper bound: the input can never produce more tokens than characters.
    this.tokens = new Array(Math.floor(this.buffer.length));
    // `null` marks "no token in progress"; the in*() predicates rely on this.
    this.dummyToken = null; // createTokenOfType('', '', '', '');
    this.wordToken = this.dummyToken;
    this.tagToken = this.dummyToken;
    this.attrNameToken = this.dummyToken;
    this.attrValueToken = this.dummyToken;
    this.attrTokens = [];
    this.options = options;
    // Per-character handlers, bound once so dispatch needs no call-site binding.
    this.charMap = {
      [TAB]: this.charSPACE.bind(this),
      [SPACE]: this.charSPACE.bind(this),
      [N]: this.charN.bind(this),
      [OPEN_BRAKET]: this.charOPENBRAKET.bind(this),
      [CLOSE_BRAKET]: this.charCLOSEBRAKET.bind(this),
      [EQ]: this.charEQ.bind(this),
      [QUOTEMARK]: this.charQUOTEMARK.bind(this),
      [BACKSLASH]: this.charBACKSLASH.bind(this),
      default: this.charWORD.bind(this),
    };
  }
  // Streaming hook: forwards every finished token to options.onToken, if set.
  emitToken(token) {
    if (this.options.onToken) {
      this.options.onToken(token);
    }
  }
  // Stores a finished token and notifies the streaming hook.
  appendToken(token) {
    this.tokenIndex += 1;
    this.tokens[this.tokenIndex] = token;
    this.emitToken(token);
  }
  // Advances the scan position by `num` characters (used to consume lookahead).
  skipChar(num) {
    this.index += num;
    this.colPos += num;
  }
  // Peeks at the char code `num` positions away without consuming it.
  seekChar(num) {
    return this.buffer.charCodeAt(this.index + num);
  }
  nextCol() {
    this.colPos += 1;
  }
  nextLine() {
    this.rowPos += 1;
  }
  // Emits the in-progress word token (if non-empty) and starts a fresh one.
  flushWord() {
    if (this.inWord() && this.wordToken[Token.VALUE_ID]) {
      this.appendToken(this.wordToken);
      this.wordToken = this.createWordToken('');
    }
  }
  // Starts a word token only when none is in progress (idempotent).
  createWord(value, line, row) {
    if (!this.inWord()) {
      this.wordToken = this.createWordToken(value, line, row);
    }
  }
  // Finishes the in-progress tag token at a closing bracket.
  flushTag() {
    if (this.inTag()) {
      // [] and [=] tag case: an empty tag name is not a tag — re-emit the
      // literal bracket text as a word instead.
      if (this.tagToken[Token.VALUE_ID] === '') {
        const value = this.inAttrValue() ? getChar(EQ) : '';
        const word = getChar(OPEN_BRAKET) + value + getChar(CLOSE_BRAKET);
        this.createWord('', 0, 0);
        this.wordToken[Token.VALUE_ID] += word;
        this.tagToken = this.dummyToken;
        if (this.inAttrValue()) {
          this.attrValueToken = this.dummyToken;
        }
        return;
      }
      // A dangling attr name with no value is folded back into the tag value,
      // separated by the placeholder space marker.
      if (this.inAttrName() && !this.inAttrValue()) {
        this.tagToken[Token.VALUE_ID] += PLACEHOLDER_SPACE + this.attrNameToken[Token.VALUE_ID];
        this.attrNameToken = this.dummyToken;
      }
      this.appendToken(this.tagToken);
      this.tagToken = this.dummyToken;
    }
  }
  // End-of-input recovery: an unterminated "[tag..." is demoted to a word
  // token so no input text is lost.
  flushUnclosedTag() {
    if (this.inTag()) {
      const value = this.tagToken[Token.VALUE_ID] + (this.attrValueToken && this.attrValueToken[Token.VALUE_ID] ? getChar(EQ) : '');
      this.tagToken[Token.TYPE_ID] = Token.TYPE_WORD;
      this.tagToken[Token.VALUE_ID] = getChar(OPEN_BRAKET) + value;
      this.appendToken(this.tagToken);
      this.tagToken = this.dummyToken;
      if (this.inAttrValue()) {
        this.attrValueToken = this.dummyToken;
      }
    }
  }
  // Moves finished attr name/value tokens into the pending attr queue.
  flushAttrNames() {
    if (this.inAttrName()) {
      this.attrTokens.push(this.attrNameToken);
      this.attrNameToken = this.dummyToken;
    }
    if (this.inAttrValue()) {
      this.attrValueToken.quoted = undefined;
      this.attrTokens.push(this.attrValueToken);
      this.attrValueToken = this.dummyToken;
    }
  }
  // Emits all queued attr tokens (in order) after their tag token.
  flushAttrs() {
    if (this.attrTokens.length) {
      this.attrTokens.forEach(this.appendToken.bind(this));
      this.attrTokens = [];
    }
  }
  // Space/tab: inside a quoted attr value it is literal text (as a placeholder
  // marker); inside a tag it separates attrs; outside it is a SPACE token.
  charSPACE(charCode) {
    const spaceCode = charCode === TAB ? PLACEHOLDER_SPACE_TAB : PLACEHOLDER_SPACE;
    this.flushWord();
    if (this.inTag()) {
      if (this.inAttrValue() && this.attrValueToken.quoted) {
        this.attrValueToken[Token.VALUE_ID] += spaceCode;
      } else {
        this.flushAttrNames();
        this.attrNameToken = this.createAttrNameToken('');
      }
    } else {
      this.appendToken(this.createSpaceToken(spaceCode));
    }
    this.nextCol();
  }
  // Newline: always a token boundary; resets the column counter.
  charN(charCode) {
    this.flushWord();
    this.appendToken(this.createNewLineToken(getChar(charCode)));
    this.nextLine();
    this.colPos = 0;
  }
  // '[': starts a tag unless immediately followed by whitespace, in which
  // case it is treated as literal word text.
  charOPENBRAKET(charCode) {
    const nextCharCode = this.seekChar(1);
    const isNextSpace = nextCharCode === SPACE || nextCharCode === TAB;
    if (isNextSpace) {
      this.createWord();
      this.wordToken[Token.VALUE_ID] += getChar(charCode);
    } else {
      this.flushWord();
      this.tagToken = this.createTagToken('');
    }
    this.nextCol();
  }
  // ']': closes the current tag and flushes its queued attrs. When preceded
  // by whitespace the bracket is literal word text.
  // NOTE(review): the isPrevSpace branch appends to wordToken without a
  // createWord() guard — presumably a word is always in progress here; confirm.
  charCLOSEBRAKET(charCode) {
    const prevCharCode = this.seekChar(-1);
    const isPrevSpace = prevCharCode === SPACE || prevCharCode === TAB;
    if (isPrevSpace) {
      this.wordToken[Token.VALUE_ID] += getChar(charCode);
    }
    this.nextCol();
    this.flushTag();
    this.flushAttrNames();
    this.flushAttrs();
  }
  // '=': inside a tag it starts an attr value (consuming an opening quote
  // if present); outside a tag it is literal word text.
  charEQ(charCode) {
    const nextCharCode = this.seekChar(1);
    const isNextQuotemark = nextCharCode === QUOTEMARK;
    if (this.inTag()) {
      this.attrValueToken = this.createAttrValueToken('');
      if (isNextQuotemark) {
        this.attrValueToken.quoted = true;
        this.skipChar(1);
      }
    } else {
      this.wordToken[Token.VALUE_ID] += getChar(charCode);
    }
    this.nextCol();
  }
  // '"': an unescaped quote closes a quoted attr value; outside a tag it is
  // literal word text.
  charQUOTEMARK(charCode) {
    const prevCharCode = this.seekChar(-1);
    const isPrevBackslash = prevCharCode === BACKSLASH;
    if (this.inAttrValue() &&
      this.attrValueToken[Token.VALUE_ID] &&
      this.attrValueToken.quoted &&
      !isPrevBackslash) {
      this.flushAttrNames();
    } else if (!this.inTag()) {
      if (!this.wordToken) {
        this.wordToken = this.createWordToken(getChar(charCode));
      } else {
        this.wordToken[Token.VALUE_ID] += getChar(charCode);
      }
    }
    this.nextCol();
  }
  // '\': inside a quoted attr value, '\"' unescapes to '"' and the quote is
  // consumed; everywhere else the backslash is silently dropped.
  charBACKSLASH() {
    const nextCharCode = this.seekChar(1);
    const isNextQuotemark = nextCharCode === QUOTEMARK;
    if (this.inAttrValue() &&
      this.attrValueToken[Token.VALUE_ID] &&
      this.attrValueToken.quoted &&
      isNextQuotemark
    ) {
      this.attrValueToken[Token.VALUE_ID] += getChar(nextCharCode);
      this.skipChar(1);
    }
    this.nextCol();
  }
  // Default handler: routes an ordinary character into whichever token is
  // currently being built (attr value > attr name > tag > word).
  charWORD(charCode) {
    if (this.inTag()) {
      if (this.inAttrValue()) {
        this.attrValueToken[Token.VALUE_ID] += getChar(charCode);
      } else if (this.inAttrName()) {
        this.attrNameToken[Token.VALUE_ID] += getChar(charCode);
      } else {
        this.tagToken[Token.VALUE_ID] += getChar(charCode);
      }
    } else {
      this.createWord();
      this.wordToken[Token.VALUE_ID] += getChar(charCode);
    }
    this.nextCol();
  }
  // Main loop: dispatches every char code through charMap, then flushes
  // whatever is still in progress. Returns the trimmed token array.
  tokenize() {
    this.index = 0;
    while (this.index < this.buffer.length) {
      const charCode = this.buffer.charCodeAt(this.index);
      (this.charMap[charCode] || this.charMap.default)(charCode);
      // eslint-disable-next-line no-plusplus
      ++this.index;
    }
    this.flushWord();
    this.flushUnclosedTag();
    // Drop the unused tail of the pre-sized array.
    this.tokens.length = this.tokenIndex + 1;
    return this.tokens;
  }
  // in*() predicates: a token is "in progress" when it is non-null AND has a
  // truthy type field (dummyToken is null, so both checks are needed).
  inWord() {
    return this.wordToken && this.wordToken[Token.TYPE_ID];
  }
  inTag() {
    return this.tagToken && this.tagToken[Token.TYPE_ID];
  }
  inAttrValue() {
    return this.attrValueToken && this.attrValueToken[Token.TYPE_ID];
  }
  inAttrName() {
    return this.attrNameToken && this.attrNameToken[Token.TYPE_ID];
  }
  // Token factories: default position is the current scan position.
  // NOTE(review): the parameter named `line` receives colPos and `row`
  // receives rowPos — the names look swapped relative to their values; the
  // Token constructor presumably expects (value, col, row). Confirm.
  createWordToken(value = '', line = this.colPos, row = this.rowPos) {
    return createTokenOfType(Token.TYPE_WORD, value, line, row);
  }
  createTagToken(value, line = this.colPos, row = this.rowPos) {
    return createTokenOfType(Token.TYPE_TAG, value, line, row);
  }
  createAttrNameToken(value, line = this.colPos, row = this.rowPos) {
    return createTokenOfType(Token.TYPE_ATTR_NAME, value, line, row);
  }
  createAttrValueToken(value, line = this.colPos, row = this.rowPos) {
    return createTokenOfType(Token.TYPE_ATTR_VALUE, value, line, row);
  }
  createSpaceToken(value, line = this.colPos, row = this.rowPos) {
    return createTokenOfType(Token.TYPE_SPACE, value, line, row);
  }
  createNewLineToken(value, line = this.colPos, row = this.rowPos) {
    return createTokenOfType(Token.TYPE_NEW_LINE, value, line, row);
  }
  // Heuristic: a tag is "nested" when its closing form "[/name" appears
  // anywhere in the buffer (substring scan, not position-aware).
  isTokenNested(token) {
    const value = getChar(OPEN_BRAKET) + getChar(SLASH) + token.getValue();
    return this.buffer.indexOf(value) > -1;
  }
}
// Public API: the Tokenizer class, plus the raw token factory for tests.
module.exports = Tokenizer;
module.exports.createTokenOfType = createTokenOfType;
+195
View File
@@ -0,0 +1,195 @@
/* eslint-disable no-plusplus,no-param-reassign */
const c = require('@bbob/plugin-helper/lib/char');
const Token = require('./Token');
// Single-character strings resolved once from the shared char-code table.
const OPEN_BRAKET = c.getChar(c.OPEN_BRAKET);
const CLOSE_BRAKET = c.getChar(c.CLOSE_BRAKET);
const QUOTEMARK = c.getChar(c.QUOTEMARK);
const BACKSLASH = c.getChar(c.BACKSLASH);
const SLASH = c.getChar(c.SLASH);
const SPACE = c.getChar(c.SPACE);
const TAB = c.getChar(c.TAB);
const EQ = c.getChar(c.EQ);
const N = c.getChar(c.N);
// Characters that cannot start a tag name.
const RESERVED_CHARS = [CLOSE_BRAKET, OPEN_BRAKET, QUOTEMARK, BACKSLASH, SPACE, TAB, EQ, N];
// Characters that terminate a plain word run.
const NOT_CHAR_TOKENS = [OPEN_BRAKET, SPACE, TAB, N];
// Whitespace handled by the SPACE branch (newline is handled separately).
const WHITESPACES = [SPACE, TAB];
// True when `char` has special meaning for the lexer and cannot start a tag name.
const isCharReserved = (char) => RESERVED_CHARS.includes(char);
// True for a plain space or tab (newline has its own branch in the lexer).
const isWhiteSpace = (char) => WHITESPACES.includes(char);
// True when `char` may be accumulated into a TYPE_WORD token.
const isCharToken = (char) => !NOT_CHAR_TOKENS.includes(char);
/**
 * Creates a forward-only cursor over `source` with one-character lookahead
 * and lookbehind.
 *
 * @param {string} source - the text to walk.
 * @returns {{skip: Function, hasNext: Function, isLast: Function,
 *            grabWhile: Function, getNext: Function, getPrev: Function,
 *            getCurr: Function}}
 */
const createCharGrabber = (source) => {
  let idx = 0;
  const skip = () => {
    idx += 1;
  };
  const hasNext = () => source.length > idx;
  return {
    skip,
    hasNext,
    // True once the cursor has consumed every character.
    isLast: () => (idx === source.length),
    // Consumes characters while `cond(char)` holds and returns the run.
    grabWhile: (cond) => {
      const start = idx;
      while (hasNext() && cond(source[idx])) {
        skip();
      }
      // slice(start, idx) replaces the deprecated substr(start, idx - start).
      return source.slice(start, idx);
    },
    getNext: () => source[idx + 1],
    getPrev: () => source[idx - 1],
    getCurr: () => source[idx],
  };
};
/**
 * Strips every leading and trailing occurrence of `charToRemove` from `str`.
 * Scans with two indices instead of repeatedly slicing the string, and does
 * not reassign its parameters.
 *
 * @param {string} str
 * @param {string} charToRemove - a single character.
 * @returns {string} the trimmed string (possibly empty).
 */
const trimChar = (str, charToRemove) => {
  let start = 0;
  let end = str.length;
  while (start < end && str.charAt(start) === charToRemove) {
    start += 1;
  }
  while (end > start && str.charAt(end - 1) === charToRemove) {
    end -= 1;
  }
  return str.substring(start, end);
};
// Unescapes every `\"` into `"`. String.prototype.replace with a string
// pattern only replaces the FIRST match, which left later escaped quotes
// untouched (e.g. 'a\\"b\\"c'); split/join performs a global replacement.
const unquote = str => str.split(BACKSLASH + QUOTEMARK).join(QUOTEMARK);
// Token factory shared by the lexer and exported for tests; row/col default to 0.
const createToken = (type, value, r = 0, cl = 0) => new Token(type, value, r, cl);
/**
 * Creates a lexer over `buffer`. Unlike the old char-state Tokenizer this
 * grabs whole runs of characters at once (via createCharGrabber) and parses
 * the complete "[...]" body in one pass.
 *
 * @param {string} buffer - the BBCode source text.
 * @param {Object} [options] - { onToken } streaming callback per token.
 * @returns {{tokenize: Function, isTokenNested: Function}}
 */
function createLexer(buffer, options = {}) {
  let row = 0;
  // NOTE(review): col is reset to 0 on newline but never incremented anywhere
  // in this function, so column positions in emitted tokens are always 0.
  let col = 0;
  let tokenIndex = -1;
  // Upper bound: the input can never produce more tokens than characters.
  const tokens = new Array(Math.floor(buffer.length));
  // Appends a finished token and forwards it to the streaming callback.
  const emitToken = (token) => {
    if (options.onToken) {
      options.onToken(token);
    }
    tokenIndex += 1;
    tokens[tokenIndex] = token;
  };
  // Parses the text between '[' and ']' into { tag, attrs[] }.
  const parseAttrs = (str) => {
    let tagName = null;
    let skipSpaces = false; // true while inside a quoted attr value
    const attrTokens = [];
    const attrCharGrabber = createCharGrabber(str);
    // grabWhile predicate; also toggles skipSpaces on unescaped quotes
    // (a stateful side effect inside the condition).
    const validAttr = (val) => {
      const isEQ = val === EQ;
      const isWS = isWhiteSpace(val);
      // NOTE(review): compares against SLASH ('/') but quotes are escaped
      // with BACKSLASH ('\') elsewhere (see unquote) — likely should be
      // BACKSLASH; confirm against the escaped-quote test cases.
      const isPrevSLASH = attrCharGrabber.getPrev() === SLASH;
      if (tagName === null) {
        return !(isEQ || isWS || attrCharGrabber.isLast());
      }
      if (skipSpaces && isWS) {
        return true;
      }
      if (val === QUOTEMARK && !isPrevSLASH) {
        skipSpaces = !skipSpaces;
      }
      return !(isEQ || isWS);
    };
    // Grabs the next name/value run and classifies it.
    const nextAttr = () => {
      const attrStr = attrCharGrabber.grabWhile(validAttr);
      // first string before space is a tag name
      if (tagName === null) {
        tagName = attrStr;
      } else if (isWhiteSpace(attrCharGrabber.getCurr()) || !attrCharGrabber.hasNext()) {
        // A run ending at whitespace or end-of-string is an attr VALUE;
        // strip surrounding quotes and unescape inner ones.
        const escaped = unquote(trimChar(attrStr, QUOTEMARK));
        attrTokens.push(createToken(Token.TYPE_ATTR_VALUE, escaped, row, col));
      } else {
        // Otherwise the run stopped at '=' — it is an attr NAME.
        attrTokens.push(createToken(Token.TYPE_ATTR_NAME, attrStr, row, col));
      }
      attrCharGrabber.skip(); // consume the separator ('=' or whitespace)
    };
    while (attrCharGrabber.hasNext()) {
      nextAttr();
    }
    return { tag: tagName, attrs: attrTokens };
  };
  const grabber = createCharGrabber(buffer);
  // Emits the token(s) starting at the current cursor position.
  const next = () => {
    const char = grabber.getCurr();
    if (char === N) {
      grabber.skip();
      col = 0;
      row++;
      emitToken(createToken(Token.TYPE_NEW_LINE, char, row, col));
    } else if (isWhiteSpace(char)) {
      // A whole run of spaces/tabs becomes one SPACE token.
      const str = grabber.grabWhile(isWhiteSpace);
      emitToken(createToken(Token.TYPE_SPACE, str, row, col));
    } else if (char === OPEN_BRAKET) {
      const nextChar = grabber.getNext();
      grabber.skip(); // skip [
      if (isCharReserved(nextChar)) {
        // "[[", "[ ", "[]" etc.: the bracket is literal word text.
        emitToken(createToken(Token.TYPE_WORD, char, row, col));
      } else {
        const str = grabber.grabWhile(val => val !== CLOSE_BRAKET);
        grabber.skip(); // skip ]
        // No '=' past position 0, or a closing tag: plain tag with no attrs.
        if (!(str.indexOf(EQ) > 0) || str[0] === SLASH) {
          emitToken(createToken(Token.TYPE_TAG, str, row, col));
        } else {
          const parsed = parseAttrs(str);
          emitToken(createToken(Token.TYPE_TAG, parsed.tag, row, col));
          // NOTE(review): .map used purely for its side effect — forEach
          // would express the intent better.
          parsed.attrs.map(emitToken);
        }
      }
    } else if (char === CLOSE_BRAKET) {
      // A ']' with no matching '[' is literal word text.
      grabber.skip();
      emitToken(createToken(Token.TYPE_WORD, char, row, col));
    } else if (isCharToken(char)) {
      const str = grabber.grabWhile(isCharToken);
      emitToken(createToken(Token.TYPE_WORD, str, row, col));
    }
  };
  // Runs the lexer to completion and returns the trimmed token array.
  const tokenize = () => {
    while (grabber.hasNext()) {
      next();
    }
    tokens.length = tokenIndex + 1;
    return tokens;
  };
  // Heuristic: a tag is "nested" when its closing form "[/name" appears
  // anywhere in the buffer (substring scan, not position-aware).
  const isTokenNested = (token) => {
    const value = OPEN_BRAKET + SLASH + token.getValue();
    return buffer.indexOf(value) > -1;
  };
  return {
    tokenize,
    isTokenNested,
  };
}
// Public API: the lexer factory; createTokenOfType is exported under the
// same name the old Tokenizer module used, for backward compatibility.
module.exports = createLexer;
module.exports.createTokenOfType = createToken;
+2 -2
View File
@@ -1,4 +1,4 @@
const Tokenizer = require('./Tokenizer');
const createLexer = require('./lexer');
const TagNode = require('@bbob/plugin-helper/lib/TagNode');
/**
@@ -28,7 +28,7 @@ let tokenizer = null;
// eslint-disable-next-line no-unused-vars
let tokens = null;
const createTokenizer = (input, onToken) => new Tokenizer(input, { onToken });
const createTokenizer = (input, onToken) => createLexer(input, { onToken });
/**
* @private
@@ -1,5 +1,5 @@
const Tokenizer = require('../lib/Tokenizer');
const Token = require('../lib/Token');
const lexer = require('../lib/lexer');
const TYPE = {
WORD: Token.TYPE_WORD,
@@ -10,14 +10,15 @@ const TYPE = {
NEW_LINE: Token.TYPE_NEW_LINE,
};
const tokenize = input => (new Tokenizer(input).tokenize());
const tokenize = input => (lexer(input).tokenize());
describe('Tokenizer', () => {
describe('lexer', () => {
const expectOutput = (output, tokens) => {
expect(tokens).toBeInstanceOf(Array);
output.forEach((token, idx) => {
expect(tokens[idx]).toBeInstanceOf(Object);
expect(tokens[idx]).toEqual(Tokenizer.createTokenOfType(...token));
expect(tokens[idx].type).toEqual(token[0]);
expect(tokens[idx].value).toEqual(token[1]);
});
};
@@ -92,12 +93,14 @@ describe('Tokenizer', () => {
});
test('tokenize tag with quotemark params with spaces', () => {
const input = '[url text="Foo Bar"]Text[/url]';
const input = '[url text="Foo Bar" text2="Foo Bar 2"]Text[/url]';
const tokens = tokenize(input);
const output = [
[TYPE.TAG, 'url', '0', '0'],
[TYPE.ATTR_NAME, 'text', '4', '0'],
[TYPE.ATTR_VALUE, 'Foo Bar', '9', '0'],
[TYPE.ATTR_NAME, 'text2', '4', '0'],
[TYPE.ATTR_VALUE, 'Foo Bar 2', '9', '0'],
[TYPE.WORD, 'Text', '20', '0'],
[TYPE.TAG, '/url', '24', '0'],
];
@@ -144,27 +147,21 @@ describe('Tokenizer', () => {
const output = [
[TYPE.TAG, 'list', '0', '0'],
[TYPE.NEW_LINE, '\n', '6', '0'],
[TYPE.SPACE, ' ', '0', '1'],
[TYPE.SPACE, ' ', '1', '1'],
[TYPE.SPACE, ' ', '2', '1'],
[TYPE.SPACE, ' ', '0', '1'],
[TYPE.TAG, '*', '3', '1'],
[TYPE.SPACE, ' ', '6', '1'],
[TYPE.WORD, 'Item', '7', '1'],
[TYPE.SPACE, ' ', '11', '1'],
[TYPE.WORD, '1.', '11', '1'],
[TYPE.NEW_LINE, '\n', '14', '1'],
[TYPE.SPACE, ' ', '0', '2'],
[TYPE.SPACE, ' ', '1', '2'],
[TYPE.SPACE, ' ', '2', '2'],
[TYPE.SPACE, ' ', '0', '2'],
[TYPE.TAG, '*', '3', '2'],
[TYPE.SPACE, ' ', '6', '2'],
[TYPE.WORD, 'Item', '14', '1'],
[TYPE.SPACE, ' ', '11', '2'],
[TYPE.WORD, '2.', '11', '2'],
[TYPE.NEW_LINE, '\n', '14', '2'],
[TYPE.SPACE, ' ', '0', '3'],
[TYPE.SPACE, ' ', '1', '3'],
[TYPE.SPACE, ' ', '2', '3'],
[TYPE.SPACE, ' ', '0', '3'],
[TYPE.TAG, '*', '3', '3'],
[TYPE.SPACE, ' ', '6', '3'],
[TYPE.WORD, 'Item', '14', '2'],
@@ -185,16 +182,24 @@ describe('Tokenizer', () => {
'x html([a. title][, alt][, classes]) x',
'[/y]',
'[sc',
// '[sc / [/sc]',
// '[sc arg="val',
'[sc / [/sc]',
'[sc arg="val',
];
const asserts = [
[[TYPE.WORD, '[]', '0', '0']],
[[TYPE.WORD, '[=]', '0', '0']],
[
[TYPE.WORD, '[', '0', '0'],
[TYPE.WORD, ']', '0', '0']
],
[
[TYPE.WORD, '[', '0', '0'],
[TYPE.WORD, '=]', '0', '0']
],
[
[TYPE.WORD, '!', '0', '0'],
[TYPE.WORD, '[](image.jpg)', '1', '0'],
[TYPE.WORD, '[', '1', '0'],
[TYPE.WORD, ']', '1', '0'],
[TYPE.WORD, '(image.jpg)', '1', '0'],
],
[
[TYPE.WORD, 'x', '0', '0'],
@@ -207,15 +212,20 @@ describe('Tokenizer', () => {
[TYPE.SPACE, ' ', '36', '0'],
[TYPE.WORD, 'x', '36', '0'],
],
[[TYPE.TAG, '/y', '0', '0']],
[[TYPE.WORD, '[sc', '0', '0']],
// [
// [TYPE.WORD, '[sc', '0', '0'],
// [TYPE.SPACE, ' ', '0', '0'],
// [TYPE.WORD, '/', '0', '0'],
// [TYPE.SPACE, ' ', '0', '0'],
// [TYPE.WORD, '[/sc]', '0', '0'],
// ],
[
[TYPE.TAG, '/y', '0', '0']
],
[
[TYPE.TAG, 'sc', '0', '0']
],
[
[TYPE.TAG, 'sc / [/sc', '0', '0']
],
[
[TYPE.TAG, 'sc', '0', '0'],
[TYPE.ATTR_NAME, 'arg', '0', '0'],
[TYPE.ATTR_VALUE, 'val', '0', '0']
]
];
inputs.forEach((input, idx) => {
+1 -1
View File
@@ -61,7 +61,7 @@ describe('Parser', () => {
content: []
},
' ',
'/h1',
'/h1]',
]
);
});