2
0
mirror of https://github.com/tenrok/BBob.git synced 2026-05-15 11:59:37 +03:00
Files
bbob/packages/bbob-parser/Tokenizer.js
T
2018-06-11 22:31:02 +02:00

300 lines
8.8 KiB
JavaScript

// Character-code constants; this file compares them against charCodeAt() results.
const CHAR = require('./char');
// Token type constants (TYPE_*) and token-array slot indices (TYPE_ID, VALUE_ID, ...).
const TOKEN = require('./token');
// Shorthand for turning a character code back into a string.
const getChar = String.fromCharCode;
/**
 * Reads the value slot of a token array.
 *
 * @param {Array} token - token produced by the tokenizer
 * @returns {*} the entry stored at index `Tokenizer.TOKEN.VALUE_ID`
 */
const getTokenValue = (token) => {
  const { VALUE_ID } = Tokenizer.TOKEN;
  return token[VALUE_ID];
};

/**
 * Reads the slot of a token array addressed by `Tokenizer.TOKEN.LINE_ID`.
 *
 * @param {Array} token - token produced by the tokenizer
 * @returns {*} the entry stored at that index
 */
const getTokenLine = (token) => {
  const { LINE_ID } = Tokenizer.TOKEN;
  return token[LINE_ID];
};

/**
 * Reads the slot of a token array addressed by `Tokenizer.TOKEN.COLUMN_ID`.
 *
 * @param {Array} token - token produced by the tokenizer
 * @returns {*} the entry stored at that index
 */
const getTokenColumn = (token) => {
  const { COLUMN_ID } = Tokenizer.TOKEN;
  return token[COLUMN_ID];
};
/**
 * Tells whether a token is plain text: a space, a new line or a word.
 *
 * @param {Array} token - token produced by the tokenizer
 * @returns {boolean} true for space, new-line and word tokens
 */
const isTextToken = (token) => {
  switch (token[Tokenizer.TOKEN.TYPE_ID]) {
    case TOKEN.TYPE_SPACE:
    case TOKEN.TYPE_NEW_LINE:
    case TOKEN.TYPE_WORD:
      return true;
    default:
      return false;
  }
};
/**
 * @param {Array} token - token produced by the tokenizer
 * @returns {boolean} true when the token's type slot holds the tag type
 */
const isTagToken = (token) => {
  const kind = token[Tokenizer.TOKEN.TYPE_ID];
  return kind === TOKEN.TYPE_TAG;
};

/**
 * @param {Array} token - token produced by the tokenizer
 * @returns {boolean} true when the token value does not start with a slash
 */
const isTagStart = (token) => {
  const closing = isTagEnd(token);
  return !closing;
};

/**
 * @param {Array} token - token produced by the tokenizer
 * @returns {boolean} true when the first character of the token value is a slash
 */
const isTagEnd = (token) => {
  const firstCode = getTokenValue(token).charCodeAt(0);
  return firstCode === CHAR.SLASH;
};

/**
 * @param {Array} token - token produced by the tokenizer
 * @returns {boolean} true when the token's type slot holds the attribute-name type
 */
const isAttrNameToken = (token) => {
  const kind = token[Tokenizer.TOKEN.TYPE_ID];
  return kind === TOKEN.TYPE_ATTR_NAME;
};

/**
 * @param {Array} token - token produced by the tokenizer
 * @returns {boolean} true when the token's type slot holds the attribute-value type
 */
const isAttrValueToken = (token) => {
  const kind = token[Tokenizer.TOKEN.TYPE_ID];
  return kind === TOKEN.TYPE_ATTR_VALUE;
};
/**
 * Returns the tag name carried by a tag token, without the leading slash
 * that marks a closing tag.
 *
 * @param {Array} token - tag token
 * @returns {string} the token value, minus its first character for closing tags
 */
const getTagName = (token) => {
  const raw = getTokenValue(token);
  if (isTagEnd(token)) {
    return raw.slice(1);
  }
  return raw;
};
/**
 * Renders a tag token back into its bracketed text form, e.g. `[b]` or `[/b]`.
 *
 * The value of a closing-tag token already begins with the slash (that is how
 * `isTagEnd` recognises it), so the value is emitted verbatim between the
 * brackets. Adding another slash here would produce `[//b]`.
 *
 * Only the tag token's own value is rendered; attribute tokens are separate
 * tokens and are not part of the output.
 *
 * @param {Array} token - tag token
 * @returns {string} the token value wrapped in an opening and a closing bracket
 */
const convertTagToText = (token) => {
  const open = getChar(CHAR.OPEN_BRAKET);
  const close = getChar(CHAR.CLOSE_BRAKET);
  return `${open}${getTokenValue(token)}${close}`;
};
// Values stored in space tokens: SPACE_TAB is used when the input character is a tab, SPACE otherwise.
// NOTE(review): both literals look identical in this view — confirm SPACE_TAB holds the intended tab representation.
const SPACE_TAB = ' ';
const SPACE = ' ';
/**
 * Splits a BBCode-like string into a flat list of tokens.
 *
 * Every token is an array `[type, value, column, row]`, where `column` and
 * `row` are the tokenizer's `colPos` / `rowPos` at the moment the token was
 * started. Tags are delimited by square brackets; inside a tag a space starts
 * an attribute name and `=` starts an attribute value.
 *
 * Known limitation: when a tag is never closed, or has an empty name, only the
 * bracket, the tag name and an `=` marker are turned back into text — the
 * characters of a pending attribute value are not restored.
 */
class Tokenizer {
  /**
   * @param {string} input - text to tokenize
   */
  constructor(input) {
    this.buffer = input;
    this.colPos = 0;
    this.rowPos = 0;
    this.index = 0;
    this.tokenIndex = -1;
    this.tokens = [];
  }

  /**
   * Stores a token in the next free slot of `this.tokens`.
   *
   * @param {Array} token - token to store
   */
  appendToken(token) {
    this.tokenIndex += 1;
    this.tokens[this.tokenIndex] = token;
  }

  /**
   * Scans the whole buffer and returns the resulting tokens.
   *
   * @returns {Array<Array>} the token list (also kept in `this.tokens`)
   */
  tokenize() {
    // Reset the cursor so that a repeated call re-scans the buffer instead of
    // returning an array of empty slots.
    this.colPos = 0;
    this.rowPos = 0;
    this.index = 0;
    this.tokenIndex = -1;

    let wordToken = null;
    let tagToken = null;
    let attrNameToken = null;
    let attrValueToken = null;
    let attrTokens = [];

    // Pre-sized as a rough guess; trimmed to the real token count at the end.
    this.tokens = new Array(Math.floor(this.buffer.length / 2));

    // Emits the word collected so far. The word is dropped afterwards so the
    // next word is created at, and records, its own start position.
    const flushWord = () => {
      if (wordToken && wordToken[TOKEN.VALUE_ID]) {
        this.appendToken(wordToken);
      }
      wordToken = null;
    };

    // Starts a word token unless one is already being collected.
    const createWord = (value, line, row) => {
      if (!wordToken) {
        wordToken = this.createWordToken(value, line, row);
      }
    };

    // Emits the pending tag, or turns `[]` / `[=]` back into plain text.
    const flushTag = () => {
      if (tagToken === null) {
        return;
      }

      // [] and [=] tag case
      if (!tagToken[TOKEN.VALUE_ID]) {
        const value = attrValueToken ? getChar(CHAR.EQ) : '';
        const word = getChar(CHAR.OPEN_BRAKET) + value + getChar(CHAR.CLOSE_BRAKET);
        // The text starts where the opening bracket was seen.
        const [, , tagColumn, tagRow] = tagToken;

        createWord('', tagColumn, tagRow);
        wordToken[TOKEN.VALUE_ID] += word;
        tagToken = null;
        attrValueToken = null;
        return;
      }

      // A bare word after the tag name (no `=`) stays part of the tag value.
      if (attrNameToken && !attrValueToken) {
        tagToken[TOKEN.VALUE_ID] += SPACE + attrNameToken[TOKEN.VALUE_ID];
        attrNameToken = null;
      }

      this.appendToken(tagToken);
      tagToken = null;
    };

    // At end of input: a tag that never saw its closing bracket becomes a word.
    const flushUnclosedTag = () => {
      if (tagToken === null) {
        return;
      }

      const value = tagToken[TOKEN.VALUE_ID] + (attrValueToken ? getChar(CHAR.EQ) : '');

      tagToken[TOKEN.TYPE_ID] = TOKEN.TYPE_WORD;
      tagToken[TOKEN.VALUE_ID] = getChar(CHAR.OPEN_BRAKET) + value;
      this.appendToken(tagToken);
      tagToken = null;
      attrValueToken = null;
    };

    // Queues the pending attribute name/value; they are emitted after the tag.
    const flushAttrNames = () => {
      if (attrNameToken) {
        attrTokens.push(attrNameToken);
        attrNameToken = null;
      }
      if (attrValueToken) {
        attrTokens.push(attrValueToken);
        attrValueToken = null;
      }
    };

    // Emits all queued attribute tokens.
    const flushAttrs = () => {
      for (const attrToken of attrTokens) {
        this.appendToken(attrToken);
      }
      attrTokens = [];
    };

    while (this.index < this.buffer.length) {
      const charCode = this.buffer.charCodeAt(this.index);

      switch (charCode) {
        case CHAR.TAB:
        case CHAR.SPACE:
          flushWord();
          if (tagToken) {
            // Inside a tag a space starts a new attribute name.
            attrNameToken = this.createAttrNameToken('');
          } else {
            const spaceCode = charCode === CHAR.TAB ? SPACE_TAB : SPACE;
            this.appendToken(this.createSpaceToken(spaceCode));
          }
          this.colPos++;
          break;

        case CHAR.N:
          flushWord();
          this.appendToken(this.createNewLineToken(getChar(charCode)));
          this.rowPos++;
          this.colPos = 0;
          break;

        case CHAR.OPEN_BRAKET:
          flushWord();
          tagToken = this.createTagToken('');
          this.colPos++;
          break;

        case CHAR.CLOSE_BRAKET:
          flushTag();
          flushAttrNames();
          flushAttrs();
          this.colPos++;
          break;

        case CHAR.EQ:
          if (tagToken) {
            attrValueToken = this.createAttrValueToken('');
          } else {
            // Plain text may begin with `=`; make sure a word exists first.
            createWord();
            wordToken[TOKEN.VALUE_ID] += getChar(charCode);
          }
          this.colPos++;
          break;

        case CHAR.QUOTEMARK:
          if (attrValueToken && attrValueToken[TOKEN.VALUE_ID].length > 0) {
            // Closing quote: the attribute is complete, so queue it before a
            // following attribute can overwrite it.
            flushAttrNames();
          } else if (tagToken === null) {
            // Plain text may begin with a quote; make sure a word exists first.
            createWord();
            wordToken[TOKEN.VALUE_ID] += getChar(charCode);
          }
          this.colPos++;
          break;

        default:
          if (tagToken && attrValueToken) {
            attrValueToken[TOKEN.VALUE_ID] += getChar(charCode);
          } else if (tagToken && attrNameToken) {
            attrNameToken[TOKEN.VALUE_ID] += getChar(charCode);
          } else if (tagToken) {
            tagToken[TOKEN.VALUE_ID] += getChar(charCode);
          } else {
            createWord();
            wordToken[TOKEN.VALUE_ID] += getChar(charCode);
          }
          this.colPos++;
          break;
      }

      this.index++;
    }

    flushWord();
    flushUnclosedTag();

    this.tokens.length = this.tokenIndex + 1;

    return this.tokens;
  }

  /**
   * @param {string} [value=''] - initial token value
   * @param {number} [line=this.colPos] - stored in the third slot (defaults to the current column)
   * @param {number} [row=this.rowPos] - stored in the fourth slot (defaults to the current row)
   * @returns {Array} a word token
   */
  createWordToken(value = '', line = this.colPos, row = this.rowPos) {
    return [TOKEN.TYPE_WORD, value, line, row];
  }

  /**
   * @param {string} value - initial token value
   * @param {number} [line=this.colPos] - stored in the third slot (defaults to the current column)
   * @param {number} [row=this.rowPos] - stored in the fourth slot (defaults to the current row)
   * @returns {Array} a tag token
   */
  createTagToken(value, line = this.colPos, row = this.rowPos) {
    return [TOKEN.TYPE_TAG, value, line, row];
  }

  /**
   * @param {string} value - initial token value
   * @param {number} [line=this.colPos] - stored in the third slot (defaults to the current column)
   * @param {number} [row=this.rowPos] - stored in the fourth slot (defaults to the current row)
   * @returns {Array} an attribute-name token
   */
  createAttrNameToken(value, line = this.colPos, row = this.rowPos) {
    return [TOKEN.TYPE_ATTR_NAME, value, line, row];
  }

  /**
   * @param {string} value - initial token value
   * @param {number} [line=this.colPos] - stored in the third slot (defaults to the current column)
   * @param {number} [row=this.rowPos] - stored in the fourth slot (defaults to the current row)
   * @returns {Array} an attribute-value token
   */
  createAttrValueToken(value, line = this.colPos, row = this.rowPos) {
    return [TOKEN.TYPE_ATTR_VALUE, value, line, row];
  }

  /**
   * @param {string} value - token value
   * @param {number} [line=this.colPos] - stored in the third slot (defaults to the current column)
   * @param {number} [row=this.rowPos] - stored in the fourth slot (defaults to the current row)
   * @returns {Array} a space token
   */
  createSpaceToken(value, line = this.colPos, row = this.rowPos) {
    return [TOKEN.TYPE_SPACE, value, line, row];
  }

  /**
   * @param {string} value - token value
   * @param {number} [line=this.colPos] - stored in the third slot (defaults to the current column)
   * @param {number} [row=this.rowPos] - stored in the fourth slot (defaults to the current row)
   * @returns {Array} a new-line token
   */
  createNewLineToken(value, line = this.colPos, row = this.rowPos) {
    return [TOKEN.TYPE_NEW_LINE, value, line, row];
  }
}
// warm up tokenizer to eliminate code branches that never execute
new Tokenizer(`[sc=asdasd`).tokenize();
//new Tokenizer(`[b param="hello"]Sample text[/b]\n\t[Chorus]`).tokenize();
// Public surface: the class itself is the module export, and the constants and
// helper functions are attached to it as properties. The helpers above read
// their slot indices through `Tokenizer.TOKEN`, which is set here.
module.exports = Tokenizer;

Object.assign(module.exports, {
  CHAR,
  TYPE: {
    WORD: TOKEN.TYPE_WORD,
    TAG: TOKEN.TYPE_TAG,
    ATTR_NAME: TOKEN.TYPE_ATTR_NAME,
    ATTR_VALUE: TOKEN.TYPE_ATTR_VALUE,
    SPACE: TOKEN.TYPE_SPACE,
    NEW_LINE: TOKEN.TYPE_NEW_LINE,
  },
  TOKEN: {
    TYPE_ID: TOKEN.TYPE_ID,
    VALUE_ID: TOKEN.VALUE_ID,
    LINE_ID: TOKEN.LINE_ID,
    COLUMN_ID: TOKEN.COLUMN_ID,
  },
  getChar,
  getTokenValue,
  getTokenLine,
  getTokenColumn,
  isTextToken,
  isTagToken,
  isTagStart,
  isTagEnd,
  isAttrNameToken,
  isAttrValueToken,
  getTagName,
  // Exported under a different name than the local function.
  convertTokenToText: convertTagToText,
});