mirror of
https://github.com/tenrok/BBob.git
synced 2026-05-15 11:59:37 +03:00
216 lines
5.8 KiB
JavaScript
216 lines
5.8 KiB
JavaScript
|
|
// Character class for attribute names, e.g. `app` in [ch app="123"].
const attrNameChars = '[a-zA-Z0-9\\.\\-_:;/]'

// Character class for quoted attribute values (also allows '#' and whitespace).
const attrValueChars = '[a-zA-Z0-9\\.\\-_:;#/\\s]'

// Matches either a closing tag `[/name]` (group 1) or an opening tag
// `[name]` / `[name="value"]` / `[name attr="value" ...]`:
//   group 2 - tag name, group 3/4 - optional ="value" attached to the tag itself,
//   groups 5-8 - repeated ` name="value"` pairs (\4 and \7 are the quote backrefs).
// NOTE: the tag-name group was `(\\w*)+` — a nested quantifier that matched the
// same input and kept the same capture, but allowed catastrophic backtracking
// (ReDoS) on long unclosed brackets like `[aaaa…x`. `(\\w*)` is equivalent and safe.
const pattern = `\\[(\/\\w*)\\]|\\[(\\w*)(=(["])${attrValueChars}*\\4)?( (${attrNameChars}+)?=(["])(${attrValueChars}+)\\7)*\\]`

// Global regex: exec() is iterated statefully via lastIndex in tokenize().
const TAG_RE = new RegExp(pattern, 'g')

const EOL = '\n'

const WHITESPACE = ' '
|
|
// Type predicates for AST entries: an entry is either a tag node
// ({ tag, attrs, content }) or a plain string.
// NOTE: guards against null — `typeof null === 'object'`, so the original
// `isNode(null)` threw a TypeError on `null.tag`.
const isNode = el => el !== null && typeof el === 'object' && el.tag

const isStringNode = el => typeof el === 'string'

// The following predicates assume `el` is a tag node (they read `el.tag`).
const isChordNode = el => el.tag === 'ch'

const isTabNode = el => el.tag === 'tab'

const isSyllableNode = el => el.tag === 'syllable'

const isTextNode = el => el.tag === 'text'

const isEOL = el => el === EOL

/**
 * Returns the total text length of an AST entry: the length of a string
 * node, or the recursive sum of the lengths of a tag node's children.
 * Anything else (null, numbers, …) counts as 0.
 *
 * @param {Object|string} node - AST node or plain string
 * @returns {number} total character count
 */
const getNodeLength = node => {
  if (isNode(node)) {
    // BUGFIX: the original was missing `return` here, so tag nodes always
    // fell through to the final `return 0`.
    return node.content.reduce((count, contentNode) => count + getNodeLength(contentNode), 0)
  } else if (isStringNode(node)) {
    return node.length
  }

  return 0
}
|
|
|
|
// Default tag set used by parse() when no custom definitions are supplied.
// `closable: true` means the tag expects a matching [/name] closing tag.
const tagsDefinition = {
  ch: { closable: true },
  syllable: { closable: true },
  tab: { closable: true },
}
|
|
|
|
// @TODO: Split into a parser and a tokenizer — nodes and tokens should live separately

/**
 * Parses tab content with BB codes into an AST tree [{tag:'ch', attrs:{..}, content:[...]}]
 *
 * @example
 *
 * textTabParser
 *   .parse('[Intro] [ch app=123]G[/ch] hello world', {ch: {closable: true}})
 *
 */
|
|
module.exports = {
  /**
   * Parses a BB-coded string into an AST (array of strings and tag nodes).
   *
   * @param {string} str - input text, e.g. '[ch app="123"]G[/ch]'
   * @param {Object} [tags=tagsDefinition] - supported tag definitions; unknown
   *   tags are kept as plain text
   * @returns {Array} AST: mix of strings and {tag, attrs, content} nodes
   */
  parse(str, tags = tagsDefinition) {
    // NOTE(review): tag definitions are stored on the shared module object,
    // so interleaved parse() calls with different `tags` would interfere.
    this.tags = tags

    const tokens = this.tokenize(str)
    const ast = this.parseTokens(tokens)

    return ast
  },

  /**
   * Splits the input into a flat list of tag tokens and text tokens.
   * Text between tag matches is further split on spaces/newlines.
   *
   * @param {string} str
   * @returns {Array<Object>} tokens ({isText,...} or {isTag,...})
   */
  tokenize(str) {
    let tokens = []
    let match
    let lastIndex = 0

    // console.time('tokenize')
    // TAG_RE is global; exec() advances TAG_RE.lastIndex on each iteration
    // and resets it to 0 when it finally returns null.
    while (match = TAG_RE.exec(str)) {
      // Length of the plain-text gap between the previous match and this one.
      const delta = match.index - lastIndex

      if (delta > 0) {
        tokens = tokens.concat(this.toTextTokens(str.substr(lastIndex, delta)))
      }

      tokens.push(this.tagToken(match))
      lastIndex = TAG_RE.lastIndex
    }

    // Trailing plain text after the last tag (or the whole string if no tags).
    const delta = str.length - lastIndex

    if (delta > 0) {
      tokens = tokens.concat(this.toTextTokens(str.substr(lastIndex, delta)))
    }
    // console.timeEnd('tokenize')

    return tokens
  },

  /**
   * Builds the AST from the token list. Consumes (mutates) `tokens` via shift().
   * Open closable tags are pushed onto `nestedNodes`; their content accumulates
   * until the matching end token pops them back out.
   *
   * @param {Array<Object>} tokens - output of tokenize(); emptied by this call
   * @returns {Array} top-level AST nodes
   */
  parseTokens(tokens) {
    const nodes = []
    let curToken
    const nestedNodes = []

    // Returns the content array currently being filled: the innermost open
    // node's content, or the top-level `nodes` list.
    function getNodes() {
      if (nestedNodes.length) {
        const nestedNode = nestedNodes[nestedNodes.length - 1]
        return nestedNode.content
      }

      return nodes
    }

    // console.time('parseTokens')
    // Tokens are always truthy objects, so the loop runs until the list is empty.
    while (curToken = tokens.shift()) {
      // Unknown/unsupported tags are demoted to plain text.
      curToken = this.isTokenSupported(curToken) ? curToken : this.asTextToken(curToken)

      if (curToken.isText) {
        getNodes().push(curToken.text)
      }

      if (curToken.isTag) {
        const node = this.tagNode(curToken.tagName, curToken.attributes)

        if (curToken.isStart) {
          if (this.isTokenHasCloseTag(curToken)) {
            // Closable tag: keep open and collect children until its end token.
            nestedNodes.push(node)
          } else {
            // Self-contained tag: attach immediately.
            getNodes().push(node)
          }
        }

        if (curToken.isEnd) {
          // NOTE(review): pops the innermost open node without checking that
          // its tag name matches curToken.tagName — mismatched nesting is
          // silently accepted.
          const lastNestedNode = nestedNodes.pop()

          if (lastNestedNode) {
            getNodes().push(lastNestedNode)
          } else {
            console.error(`Inconsistent tag '${curToken.tagName}'`)
          }
        }
      }
    }
    // console.timeEnd('parseTokens')

    return nodes
  },

  /**
   * True-ish when the token is a tag present in the current tag definitions.
   * Returns the definition object itself (truthy), not a strict boolean.
   */
  isTokenSupported(token) {
    return token.isTag && this.tags && this.tags[token.tagName]
  },

  /**
   * True-ish when the token's tag is defined as closable (expects [/name]).
   */
  isTokenHasCloseTag(token) {
    return this.tags && this.tags[token.tagName] && this.tags[token.tagName].closable
  },

  /**
   * Creates an AST tag node.
   *
   * @param {string} name - tag name
   * @param {Object} attrs - attribute map
   * @param {Array} [content=[]] - child nodes
   */
  tagNode(name, attrs, content = []) {
    return { tag: name, attrs, content }
  },

  /**
   * Splits plain text into text tokens, emitting each space/newline as its
   * own single-character token and each run of other characters as one token.
   *
   * @param {string} text
   * @returns {Array<Object>} text tokens
   */
  toTextTokens(text) {
    const tokens = []
    const chars = text.split('')
    let currText = ''

    // Emits the accumulated word (if any) as a token and resets the buffer.
    const flushText = () => {
      if (currText) {
        tokens.push(this.textToken(currText))
        currText = ''
      }
    }

    chars.forEach((char) => {
      if (char === EOL || char === WHITESPACE) {
        flushText()
        tokens.push(this.textToken(char))
      } else {
        currText += char
      }
    })

    // Flush the trailing word after the last separator.
    if (currText) {
      tokens.push(this.textToken(currText))
    }

    return tokens
  },

  /**
   * Creates a plain-text token.
   */
  textToken(text) {
    return { isText: true, text }
  },

  /**
   * Converts a TAG_RE match into a tag token.
   * match[1] is set only for the closing-tag alternative `[/name]`.
   *
   * @param {Array} match - result of TAG_RE.exec()
   * @returns {Object} {isStart,isTag,tagName,attributes,text} or {isEnd,isTag,tagName}
   */
  tagToken(match) {
    if (typeof match[1] === 'undefined') { // Start tag
      const tagName = match[2]
      const attributes = {}
      // Local attribute regex: group 1 = optional attr name, group 3 = quoted value.
      const ATTR_RE = new RegExp(`(${attrNameChars}+)?=(["])(${attrValueChars}+)\\2`, 'g')
      // Inner text of the tag after the name, without the surrounding brackets.
      const attrStr = match[0].substr(1 + tagName.length, match[0].length - 2 - tagName.length)

      let attrMatch

      while (attrMatch = ATTR_RE.exec(attrStr)) {
        if (typeof attrMatch[1] === 'undefined') { // The tag attribute
          // `[ch="C#"]` style: the value is stored under the tag's own name.
          attributes[tagName] = attrMatch[3]
        } else { // Normal attribute
          attributes[attrMatch[1]] = attrMatch[3]
        }
      }

      return { isStart: true, isTag: true, tagName, attributes, text: match[0] }
    }

    // End tag
    // match[1] is e.g. "/ch"; strip the leading slash to get the name.
    return { isEnd: true, isTag: true, tagName: match[1].substr(1, match[1].length - 1) }
  },

  /**
   * Demotes an unsupported tag token to a plain-text token that reproduces
   * its original source text; non-tag tokens pass through unchanged.
   */
  asTextToken(token) {
    if (token.isTag && token.isStart) {
      return this.textToken(token.text)
    }

    if (token.isTag && token.isEnd) {
      return this.textToken(`[/${token.tagName}]`)
    }

    return token
  },
}
|