2
0
mirror of https://github.com/tenrok/BBob.git synced 2026-05-15 11:59:37 +03:00
Files
bbob/packages/bbob-parser/Tokenizer.js
T
Nikolay Kostyurin edf90de445 initial
2018-06-04 23:18:50 +02:00

215 lines
6.0 KiB
JavaScript

// Project-local modules: character-code constants (TAB, SPACE, brackets, …)
// and token field-index / type-name constants.
const CHAR = require('./char');
const TOKEN = require('./token');
// Reference copy of the constants exported by './token', kept here so the
// token array layout can be read without opening that file:
// const TOKEN.TYPE_ID = 0;
// const TOKEN.VALUE_ID = 1;
// const TOKEN.LINE_ID = 2;
// const TOKEN.COLUMN_ID = 3;
//
// const TOKEN.TYPE_WORD = 'word';
// const TOKEN.TYPE_TAG = 'tag';
// const TOKEN.TYPE_ATTR_NAME = 'attr-name';
// const TOKEN.TYPE_ATTR_VALUE = 'attr-value';
// const TOKEN.TYPE_SPACE = 'space';
// const TOKEN.TYPE_NEW_LINE = 'new-line';
// NOTE(review): despite the name, this converts a char CODE into a
// one-character string (String.fromCharCode) — it does not return a code.
const getCharCode = String.fromCharCode;
/**
 * Single-pass BBCode tokenizer. Walks the input string character by
 * character and emits flat token arrays of the shape
 * [type, value, colPos, rowPos] (indices per the TOKEN.*_ID constants).
 *
 * NOTE(review): the create*Token helpers store colPos at index 2 and
 * rowPos at index 3, while the commented constants above suggest
 * LINE_ID === 2 and COLUMN_ID === 3 — verify against './token' whether
 * line/column are swapped; preserved as-is here to avoid changing output.
 */
class Tokenizer {
  /**
   * @param {string} input raw BBCode source to tokenize
   */
  constructor(input) {
    this.buffer = input;
    this.colPos = 0;
    this.rowPos = 0;
    this.index = 0;
  }

  /**
   * Tokenizes the entire buffer.
   * @returns {Array[]} dense array of token arrays, in source order
   */
  tokenize() {
    let wordToken = this.createWordToken('');
    let tagToken = null;
    let attrNameToken = null;
    let attrValueToken = null;
    let attrTokens = [];
    // Overallocate (~1 token per 2 chars), then trim to the real count.
    let tokens = new Array(Math.floor(this.buffer.length / 2));
    let tokenIndex = -1;

    // Emit the accumulated word token, if it has any content.
    const flushWord = () => {
      if (wordToken[TOKEN.VALUE_ID]) {
        tokenIndex++;
        tokens[tokenIndex] = wordToken;
        wordToken = this.createWordToken('');
      }
    };
    // Emit the pending tag token, if one is open.
    const flushTag = () => {
      if (tagToken !== null) {
        tokenIndex++;
        tokens[tokenIndex] = tagToken;
        tagToken = null;
      }
    };
    // Queue the pending attribute-name token (emitted later by flushAttrs).
    const flushAttrName = () => {
      if (attrNameToken) {
        attrTokens.push(attrNameToken);
        attrNameToken = null;
      }
    };
    // Queue the pending attribute-value token (emitted later by flushAttrs).
    const flushAttrValue = () => {
      if (attrValueToken) {
        attrTokens.push(attrValueToken);
        attrValueToken = null;
      }
    };
    // Emit all queued attribute tokens, preserving their order.
    const flushAttrs = () => {
      if (attrTokens.length) {
        attrTokens.forEach((attrToken) => {
          tokenIndex++;
          tokens[tokenIndex] = attrToken;
        });
        attrTokens = [];
      }
    };

    while (this.index < this.buffer.length) {
      const charCode = this.buffer.charCodeAt(this.index);
      switch (charCode) {
        case CHAR.TAB:
        case CHAR.SPACE: {
          // Braces scope `spaceCode` to this case (lexical declarations in
          // an unbraced `case` leak across the whole switch).
          flushWord();
          if (tagToken) {
            // Whitespace inside a tag starts a new attribute name.
            attrNameToken = this.createAttrNameToken('');
          }
          // TODO(review): both branches currently yield a single space; the
          // TAB branch likely lost its original replacement string — confirm.
          const spaceCode = charCode === CHAR.TAB ? ' ' : ' ';
          tokenIndex++;
          tokens[tokenIndex] = this.createSpaceToken(spaceCode);
          this.colPos++;
          break;
        }
        case CHAR.N: // newline
          flushWord();
          tokenIndex++;
          tokens[tokenIndex] = this.createNewLineToken(getCharCode(charCode));
          this.rowPos++;
          this.colPos = 0;
          break;
        case CHAR.OPEN_BRAKET:
          flushWord();
          tagToken = this.createTagToken('');
          this.colPos++;
          break;
        case CHAR.CLOSE_BRAKET:
          // Tag first, then its attributes, matching original emit order.
          flushTag();
          flushAttrName();
          flushAttrValue();
          flushAttrs();
          this.colPos++;
          break;
        case CHAR.EQ:
          if (tagToken) {
            // '=' inside a tag begins an attribute value.
            attrValueToken = this.createAttrValueToken('');
          } else {
            wordToken[TOKEN.VALUE_ID] += getCharCode(charCode);
          }
          this.colPos++;
          break;
        case CHAR.QUOTEMARK:
          // BUGFIX: the original compared the string itself to 0
          // ('hello' > 0 is false), so a closing quote never flushed a
          // non-numeric attribute value; compare its length instead.
          if (attrValueToken && attrValueToken[TOKEN.VALUE_ID].length > 0) {
            flushAttrName();
            flushAttrValue();
          } else if (tagToken === null) {
            // Quotes outside tags are plain word content.
            wordToken[TOKEN.VALUE_ID] += getCharCode(charCode);
          }
          this.colPos++;
          break;
        default:
          // Ordinary character: append to the innermost open accumulator.
          if (tagToken && attrValueToken) {
            attrValueToken[TOKEN.VALUE_ID] += getCharCode(charCode);
          } else if (tagToken && attrNameToken) {
            attrNameToken[TOKEN.VALUE_ID] += getCharCode(charCode);
          } else if (tagToken) {
            tagToken[TOKEN.VALUE_ID] += getCharCode(charCode);
          } else {
            wordToken[TOKEN.VALUE_ID] += getCharCode(charCode);
          }
          this.colPos++;
          break;
      }
      this.index++;
    }
    flushWord();
    // BUGFIX: tokenIndex is the index of the LAST used slot, so the live
    // length is tokenIndex + 1; the original `= tokenIndex` silently
    // dropped the final token of every input.
    tokens.length = tokenIndex + 1;
    return tokens;
  }

  createWordToken(value) {
    return [TOKEN.TYPE_WORD, value, this.colPos, this.rowPos];
  }

  createTagToken(value) {
    return [TOKEN.TYPE_TAG, value, this.colPos, this.rowPos];
  }

  createAttrNameToken(value) {
    return [TOKEN.TYPE_ATTR_NAME, value, this.colPos, this.rowPos];
  }

  createAttrValueToken(value) {
    return [TOKEN.TYPE_ATTR_VALUE, value, this.colPos, this.rowPos];
  }

  createSpaceToken(value) {
    return [TOKEN.TYPE_SPACE, value, this.colPos, this.rowPos];
  }

  createNewLineToken(value) {
    return [TOKEN.TYPE_NEW_LINE, value, this.colPos, this.rowPos];
  }
}
// Warm up the tokenizer to eliminate code branches that never execute:
// this sample input exercises every switch case (tag, attr name/value,
// quotes, word, space, tab, newline) once before real input arrives.
new Tokenizer(`[b param="hello"]Sample text[/b]\n\t[Chorus]`).tokenize();
// Primary export is the class itself; the constants consumers need to
// interpret token arrays are attached as properties below.
module.exports = Tokenizer;
module.exports.CHAR = CHAR;
// Token type names re-exported under short keys (e.g. Tokenizer.TYPE.WORD).
module.exports.TYPE = {
WORD: TOKEN.TYPE_WORD,
TAG: TOKEN.TYPE_TAG,
ATTR_NAME: TOKEN.TYPE_ATTR_NAME,
ATTR_VALUE: TOKEN.TYPE_ATTR_VALUE,
SPACE: TOKEN.TYPE_SPACE,
NEW_LINE: TOKEN.TYPE_NEW_LINE,
};
// Indices into a token array — see './token' for the authoritative values.
module.exports.TOKEN = {
TYPE_ID: TOKEN.TYPE_ID,
VALUE_ID: TOKEN.VALUE_ID,
LINE_ID: TOKEN.LINE_ID,
COLUMN_ID: TOKEN.COLUMN_ID,
};
// NOTE(review): misleading name — this is String.fromCharCode (code → char).
module.exports.getCharCode = getCharCode;