2
0
mirror of https://github.com/tenrok/BBob.git synced 2026-05-15 11:59:37 +03:00
Files
bbob/packages/bbob-parser/Tokenizer.js
T
Nikolay Kostyurin edf90de445 initial
2018-06-04 23:18:50 +02:00

215 lines
6.0 KiB
JavaScript

// Project-local modules: character-code constants (TAB, SPACE, brackets, …)
// and token field-index / type-name constants.
const CHAR = require('./char');
const TOKEN = require('./token');
// Reference copy of the constants exported by './token', kept here so the
// token array layout can be read without opening that file:
// const TOKEN.TYPE_ID = 0;
// const TOKEN.VALUE_ID = 1;
// const TOKEN.LINE_ID = 2;
// const TOKEN.COLUMN_ID = 3;
//
// const TOKEN.TYPE_WORD = 'word';
// const TOKEN.TYPE_TAG = 'tag';
// const TOKEN.TYPE_ATTR_NAME = 'attr-name';
// const TOKEN.TYPE_ATTR_VALUE = 'attr-value';
// const TOKEN.TYPE_SPACE = 'space';
// const TOKEN.TYPE_NEW_LINE = 'new-line';
// NOTE(review): despite the name, this converts a char CODE into a
// one-character string (String.fromCharCode) — it does not return a code.
const getCharCode = String.fromCharCode;
/**
 * Single-pass BBCode tokenizer. Walks the input string character by
 * character and emits flat token arrays of the shape
 * [type, value, colPos, rowPos] (indices per the TOKEN.*_ID constants).
 *
 * NOTE(review): the create*Token helpers store colPos at index 2 and
 * rowPos at index 3, while the commented constants above suggest
 * LINE_ID === 2 and COLUMN_ID === 3 — verify against './token' whether
 * line/column are swapped; preserved as-is here to avoid changing output.
 */
class Tokenizer {
  /**
   * @param {string} input raw BBCode source to tokenize
   */
  constructor(input) {
    this.buffer = input;
    this.colPos = 0;
    this.rowPos = 0;
    this.index = 0;
  }

  /**
   * Tokenizes the entire buffer.
   * @returns {Array[]} dense array of token arrays, in source order
   */
  tokenize() {
    let wordToken = this.createWordToken('');
    let tagToken = null;
    let attrNameToken = null;
    let attrValueToken = null;
    let attrTokens = [];
    // Overallocate (~1 token per 2 chars), then trim to the real count.
    let tokens = new Array(Math.floor(this.buffer.length / 2));
    let tokenIndex = -1;

    // Emit the accumulated word token, if it has any content.
    const flushWord = () => {
      if (wordToken[TOKEN.VALUE_ID]) {
        tokenIndex++;
        tokens[tokenIndex] = wordToken;
        wordToken = this.createWordToken('');
      }
    };
    // Emit the pending tag token, if one is open.
    const flushTag = () => {
      if (tagToken !== null) {
        tokenIndex++;
        tokens[tokenIndex] = tagToken;
        tagToken = null;
      }
    };
    // Queue the pending attribute-name token (emitted later by flushAttrs).
    const flushAttrName = () => {
      if (attrNameToken) {
        attrTokens.push(attrNameToken);
        attrNameToken = null;
      }
    };
    // Queue the pending attribute-value token (emitted later by flushAttrs).
    const flushAttrValue = () => {
      if (attrValueToken) {
        attrTokens.push(attrValueToken);
        attrValueToken = null;
      }
    };
    // Emit all queued attribute tokens, preserving their order.
    const flushAttrs = () => {
      if (attrTokens.length) {
        attrTokens.forEach((attrToken) => {
          tokenIndex++;
          tokens[tokenIndex] = attrToken;
        });
        attrTokens = [];
      }
    };

    while (this.index < this.buffer.length) {
      const charCode = this.buffer.charCodeAt(this.index);
      switch (charCode) {
        case CHAR.TAB:
        case CHAR.SPACE: {
          // Braces scope `spaceCode` to this case (lexical declarations in
          // an unbraced `case` leak across the whole switch).
          flushWord();
          if (tagToken) {
            // Whitespace inside a tag starts a new attribute name.
            attrNameToken = this.createAttrNameToken('');
          }
          // TODO(review): both branches currently yield a single space; the
          // TAB branch likely lost its original replacement string — confirm.
          const spaceCode = charCode === CHAR.TAB ? ' ' : ' ';
          tokenIndex++;
          tokens[tokenIndex] = this.createSpaceToken(spaceCode);
          this.colPos++;
          break;
        }
        case CHAR.N: // newline
          flushWord();
          tokenIndex++;
          tokens[tokenIndex] = this.createNewLineToken(getCharCode(charCode));
          this.rowPos++;
          this.colPos = 0;
          break;
        case CHAR.OPEN_BRAKET:
          flushWord();
          tagToken = this.createTagToken('');
          this.colPos++;
          break;
        case CHAR.CLOSE_BRAKET:
          // Tag first, then its attributes, matching original emit order.
          flushTag();
          flushAttrName();
          flushAttrValue();
          flushAttrs();
          this.colPos++;
          break;
        case CHAR.EQ:
          if (tagToken) {
            // '=' inside a tag begins an attribute value.
            attrValueToken = this.createAttrValueToken('');
          } else {
            wordToken[TOKEN.VALUE_ID] += getCharCode(charCode);
          }
          this.colPos++;
          break;
        case CHAR.QUOTEMARK:
          // BUGFIX: the original compared the string itself to 0
          // ('hello' > 0 is false), so a closing quote never flushed a
          // non-numeric attribute value; compare its length instead.
          if (attrValueToken && attrValueToken[TOKEN.VALUE_ID].length > 0) {
            flushAttrName();
            flushAttrValue();
          } else if (tagToken === null) {
            // Quotes outside tags are plain word content.
            wordToken[TOKEN.VALUE_ID] += getCharCode(charCode);
          }
          this.colPos++;
          break;
        default:
          // Ordinary character: append to the innermost open accumulator.
          if (tagToken && attrValueToken) {
            attrValueToken[TOKEN.VALUE_ID] += getCharCode(charCode);
          } else if (tagToken && attrNameToken) {
            attrNameToken[TOKEN.VALUE_ID] += getCharCode(charCode);
          } else if (tagToken) {
            tagToken[TOKEN.VALUE_ID] += getCharCode(charCode);
          } else {
            wordToken[TOKEN.VALUE_ID] += getCharCode(charCode);
          }
          this.colPos++;
          break;
      }
      this.index++;
    }
    flushWord();
    // BUGFIX: tokenIndex is the index of the LAST used slot, so the live
    // length is tokenIndex + 1; the original `= tokenIndex` silently
    // dropped the final token of every input.
    tokens.length = tokenIndex + 1;
    return tokens;
  }

  createWordToken(value) {
    return [TOKEN.TYPE_WORD, value, this.colPos, this.rowPos];
  }

  createTagToken(value) {
    return [TOKEN.TYPE_TAG, value, this.colPos, this.rowPos];
  }

  createAttrNameToken(value) {
    return [TOKEN.TYPE_ATTR_NAME, value, this.colPos, this.rowPos];
  }

  createAttrValueToken(value) {
    return [TOKEN.TYPE_ATTR_VALUE, value, this.colPos, this.rowPos];
  }

  createSpaceToken(value) {
    return [TOKEN.TYPE_SPACE, value, this.colPos, this.rowPos];
  }

  createNewLineToken(value) {
    return [TOKEN.TYPE_NEW_LINE, value, this.colPos, this.rowPos];
  }
}
// Warm up the tokenizer to eliminate code branches that never execute:
// this sample input exercises every switch case (tag, attr name/value,
// quotes, word, space, tab, newline) once before real input arrives.
new Tokenizer(`[b param="hello"]Sample text[/b]\n\t[Chorus]`).tokenize();
// Primary export is the class itself; the constants consumers need to
// interpret token arrays are attached as properties below.
module.exports = Tokenizer;
module.exports.CHAR = CHAR;
// Token type names re-exported under short keys (e.g. Tokenizer.TYPE.WORD).
module.exports.TYPE = {
WORD: TOKEN.TYPE_WORD,
TAG: TOKEN.TYPE_TAG,
ATTR_NAME: TOKEN.TYPE_ATTR_NAME,
ATTR_VALUE: TOKEN.TYPE_ATTR_VALUE,
SPACE: TOKEN.TYPE_SPACE,
NEW_LINE: TOKEN.TYPE_NEW_LINE,
};
// Indices into a token array — see './token' for the authoritative values.
module.exports.TOKEN = {
TYPE_ID: TOKEN.TYPE_ID,
VALUE_ID: TOKEN.VALUE_ID,
LINE_ID: TOKEN.LINE_ID,
COLUMN_ID: TOKEN.COLUMN_ID,
};
// NOTE(review): misleading name — this is String.fromCharCode (code → char).
module.exports.getCharCode = getCharCode;