2
0
mirror of https://github.com/tenrok/BBob.git synced 2026-05-24 14:04:06 +03:00

add Tokenizer tests

This commit is contained in:
Nikolay Kostyurin
2018-06-09 00:06:36 +02:00
parent d739ec8ffd
commit 5e34dd9d43
9 changed files with 232 additions and 94 deletions
+37 -45
View File
@@ -1,44 +1,19 @@
const {
convertTokenToText,
getTagName,
getTokenColumn,
getTokenLine,
getTokenValue,
isAttrNameToken,
isAttrValueToken,
isTagStart,
isTagToken,
isTextToken,
isTagEnd
} = require("./Tokenizer");
const Tokenizer = require("./Tokenizer");
const TokenType = Tokenizer.TYPE;
const TokenChar = Tokenizer.CHAR;
const getCharCode = Tokenizer.getCharCode;
const isTextToken = (token) => {
const type = token[Tokenizer.TOKEN.TYPE_ID];
return type === TokenType.SPACE || type === TokenType.NEW_LINE || type === TokenType.WORD
};
const isTagToken = (token) => token[Tokenizer.TOKEN.TYPE_ID] === TokenType.TAG;
const isTagStart = (token) => !isTagEnd(token);
const isTagEnd = (token) => getTokenValue(token).charCodeAt(0) === TokenChar.SLASH;
const isAttrNameToken = (token) => token[Tokenizer.TOKEN.TYPE_ID] === TokenType.ATTR_NAME;
const isAttrValueToken = (token) => token[Tokenizer.TOKEN.TYPE_ID] === TokenType.ATTR_VALUE;
const getTagName = (token) => {
const value = getTokenValue(token);
return isTagEnd(token) ? value.slice(1) : value
};
const convertTagToText = (token) => {
let text = getCharCode(TokenChar.OPEN_BRAKET);
if (isTagEnd(token)) {
text += getCharCode(TokenChar.SLASH)
}
text += getTokenValue(token);
text += getCharCode(TokenChar.CLOSE_BRAKET);
return text
};
const getTokenValue = (token) => token[Tokenizer.TOKEN.VALUE_ID];
const getChar = Tokenizer.getChar;
/**
 * Builds a plain AST node object for a tag.
 *
 * @param {*} name - value stored under the node's `tag` key
 * @param {Object} [attrs={}] - attributes object, stored by reference
 * @param {Array} [content=[]] - child list, stored by reference
 * @returns {{tag: *, attrs: Object, content: Array}} the new node
 */
const createTagNode = (name, attrs = {}, content = []) => {
  const node = { tag: name };
  node.attrs = attrs;
  node.content = content;
  return node;
};
@@ -65,6 +40,10 @@ module.exports = class Parser {
const curTags = [];
const curTagsAttrName = [];
const closableTags = this.findNestedTags(tokens);
const isNestedTag = (token) => closableTags.indexOf(getTokenValue(token)) >= 0;
const getCurTag = () => {
if (curTags.length) {
return curTags[curTags.length - 1]
@@ -124,7 +103,7 @@ module.exports = class Parser {
if (isTagStart(token)) {
createCurTag(token);
if (this.isCloseTag(getTokenValue(token))) {
if (isNestedTag(token)) {
nestedNodes.push(getCurTag())
} else {
getNodes().push(getCurTag());
@@ -141,12 +120,11 @@ module.exports = class Parser {
if (lastNestedNode) {
getNodes().push(lastNestedNode)
} else {
debugger;
console.warn(`Inconsistent tag '${getTokenValue(token)}'`);
console.warn(`Inconsistent tag '${getTokenValue(token)}' on line ${getTokenLine(token)} and column ${getTokenColumn(token)}`);
}
}
} else {
getNodes().push(convertTagToText(token))
getNodes().push(convertTokenToText(token))
}
}
@@ -168,8 +146,22 @@ module.exports = class Parser {
return nodes
}
isCloseTag(value) {
return this.options.closableTags && this.options.closableTags.indexOf(value) >= 0
findNestedTags(tokens) {
const tags = tokens.filter(isTagToken).reduce((acc, token) => {
acc[getTokenValue(token)] = true;
return acc
}, {});
const closeChar = getChar(TokenChar.SLASH);
return Object.keys(tags).reduce((arr, key) => {
if (tags[key] && tags[closeChar + key]) {
arr.push(key)
}
return arr;
}, [])
}
isAllowedTag(value) {
+2
View File
@@ -0,0 +1,2 @@
# bbob-parser
Fast BB Code parser written in pure JavaScript, with no dependencies.
+82 -36
View File
@@ -1,19 +1,50 @@
const CHAR = require('./char');
const TOKEN = require('./token');
const getChar = String.fromCharCode;
// const TOKEN.TYPE_ID = 0;
// const TOKEN.VALUE_ID = 1;
// const TOKEN.LINE_ID = 2;
// const TOKEN.COLUMN_ID = 3;
//
// const TOKEN.TYPE_WORD = 'word';
// const TOKEN.TYPE_TAG = 'tag';
// const TOKEN.TYPE_ATTR_NAME = 'attr-name';
// const TOKEN.TYPE_ATTR_VALUE = 'attr-value';
// const TOKEN.TYPE_SPACE = 'space';
// const TOKEN.TYPE_NEW_LINE = 'new-line';
const getCharCode = String.fromCharCode;
// Token field accessors. They read the slot indices from the `TOKEN` table
// imported at the top of this module rather than from `Tokenizer.TOKEN`,
// which only exists once the class has been exported and decorated at the
// bottom of the file.

/**
 * @param {Array} token - tokenizer token tuple
 * @returns {*} the entry stored at the token's value slot
 */
const getTokenValue = (token) => token[TOKEN.VALUE_ID];

/**
 * @param {Array} token - tokenizer token tuple
 * @returns {*} the entry stored at the token's line slot
 */
const getTokenLine = (token) => token[TOKEN.LINE_ID];

/**
 * @param {Array} token - tokenizer token tuple
 * @returns {*} the entry stored at the token's column slot
 */
const getTokenColumn = (token) => token[TOKEN.COLUMN_ID];
/**
 * Tells whether a token is plain text: a space, a new line or a word.
 *
 * @param {Array} token - tokenizer token tuple
 * @returns {boolean} true for space, new-line and word tokens
 */
const isTextToken = (token) => {
  switch (token[Tokenizer.TOKEN.TYPE_ID]) {
    case TOKEN.TYPE_SPACE:
    case TOKEN.TYPE_NEW_LINE:
    case TOKEN.TYPE_WORD:
      return true;
    default:
      return false;
  }
};
/** @returns {boolean} true when the token's type slot holds the tag type */
const isTagToken = (token) => {
  const type = token[Tokenizer.TOKEN.TYPE_ID];
  return type === TOKEN.TYPE_TAG;
};

/**
 * @returns {boolean} true when the token is not a closing tag.
 * `isTagEnd` is declared just below; it is only looked up when this runs.
 */
const isTagStart = (token) => {
  if (isTagEnd(token)) {
    return false;
  }
  return true;
};

/** @returns {boolean} true when the token's value begins with the slash character */
const isTagEnd = (token) => {
  const firstCode = getTokenValue(token).charCodeAt(0);
  return firstCode === CHAR.SLASH;
};

/** @returns {boolean} true when the token's type slot holds the attribute-name type */
const isAttrNameToken = (token) => {
  const type = token[Tokenizer.TOKEN.TYPE_ID];
  return type === TOKEN.TYPE_ATTR_NAME;
};

/** @returns {boolean} true when the token's type slot holds the attribute-value type */
const isAttrValueToken = (token) => {
  const type = token[Tokenizer.TOKEN.TYPE_ID];
  return type === TOKEN.TYPE_ATTR_VALUE;
};
/**
 * Returns the tag name carried by a token, dropping the first character
 * of the value when the token is a closing tag.
 *
 * @param {Array} token - tokenizer token tuple
 * @returns {string} the token value, minus its first character for closing tags
 */
const getTagName = (token) => {
  const raw = getTokenValue(token);
  if (isTagEnd(token)) {
    return raw.slice(1);
  }
  return raw;
};
/**
 * Renders a tag token back to its literal text form by wrapping the token
 * value in the opening and closing bracket characters.
 *
 * A closing tag's value already begins with the slash (that is exactly what
 * `isTagEnd` tests for), so no extra slash is added here; adding one would
 * produce `[//tag]` instead of `[/tag]`.
 *
 * @param {Array} token - tokenizer token tuple
 * @returns {string} the bracketed token value
 */
const convertTagToText = (token) => {
  const open = getChar(CHAR.OPEN_BRAKET);
  const close = getChar(CHAR.CLOSE_BRAKET);

  return open + getTokenValue(token) + close;
};
const SPACE_TAB = ' ';
const SPACE = ' ';
class Tokenizer {
constructor(input) {
@@ -24,7 +55,7 @@ class Tokenizer {
}
tokenize() {
let wordToken = this.createWordToken('');
let wordToken = null;
let tagToken = null;
let attrNameToken = null;
let attrValueToken = null;
@@ -33,7 +64,7 @@ class Tokenizer {
let tokenIndex = -1;
const flushWord = () => {
if (wordToken[TOKEN.VALUE_ID]) {
if (wordToken && wordToken[TOKEN.VALUE_ID]) {
tokenIndex++;
tokens[tokenIndex] = wordToken;
wordToken = this.createWordToken('')
@@ -42,20 +73,23 @@ class Tokenizer {
const flushTag = () => {
if (tagToken !== null) {
if (attrNameToken && !attrValueToken) {
tagToken[TOKEN.VALUE_ID] += SPACE + attrNameToken[TOKEN.VALUE_ID]
attrNameToken = null
}
tokenIndex++;
tokens[tokenIndex] = tagToken;
tagToken = null;
}
};
const flushAttrName = () => {
const flushAttrNames = () => {
if (attrNameToken) {
attrTokens.push(attrNameToken);
attrNameToken = null;
}
};
const flushAttrValue = () => {
if (attrValueToken) {
attrTokens.push(attrValueToken);
attrValueToken = null
@@ -85,20 +119,19 @@ class Tokenizer {
if (tagToken) {
attrNameToken = this.createAttrNameToken('');
} else {
const spaceCode = charCode === CHAR.TAB ? SPACE_TAB : SPACE;
tokenIndex++;
tokens[tokenIndex] = this.createSpaceToken(spaceCode);
}
const spaceCode = charCode === CHAR.TAB ? ' ' : ' ';
tokenIndex++;
tokens[tokenIndex] = this.createSpaceToken(spaceCode);
this.colPos++;
break;
case CHAR.N:
flushWord();
tokenIndex++;
tokens[tokenIndex] = this.createNewLineToken(getCharCode(charCode));
tokens[tokenIndex] = this.createNewLineToken(getChar(charCode));
this.rowPos++;
this.colPos = 0;
@@ -113,8 +146,7 @@ class Tokenizer {
case CHAR.CLOSE_BRAKET:
flushTag();
flushAttrName();
flushAttrValue();
flushAttrNames();
flushAttrs();
this.colPos++;
@@ -124,7 +156,7 @@ class Tokenizer {
if (tagToken) {
attrValueToken = this.createAttrValueToken('')
} else {
wordToken[TOKEN.VALUE_ID] += getCharCode(charCode);
wordToken[TOKEN.VALUE_ID] += getChar(charCode);
}
this.colPos++;
@@ -132,10 +164,9 @@ class Tokenizer {
case CHAR.QUOTEMARK:
if (attrValueToken && attrValueToken[TOKEN.VALUE_ID] > 0) {
flushAttrName();
flushAttrValue();
flushAttrNames();
} else if (tagToken === null) {
wordToken[TOKEN.VALUE_ID] += getCharCode(charCode);
wordToken[TOKEN.VALUE_ID] += getChar(charCode);
}
this.colPos++;
@@ -143,13 +174,17 @@ class Tokenizer {
default:
if (tagToken && attrValueToken) {
attrValueToken[TOKEN.VALUE_ID] += getCharCode(charCode)
attrValueToken[TOKEN.VALUE_ID] += getChar(charCode)
} else if (tagToken && attrNameToken) {
attrNameToken[TOKEN.VALUE_ID] += getCharCode(charCode)
attrNameToken[TOKEN.VALUE_ID] += getChar(charCode)
} else if (tagToken) {
tagToken[TOKEN.VALUE_ID] += getCharCode(charCode)
tagToken[TOKEN.VALUE_ID] += getChar(charCode)
} else {
wordToken[TOKEN.VALUE_ID] += getCharCode(charCode);
if (!wordToken) {
wordToken = this.createWordToken('')
}
wordToken[TOKEN.VALUE_ID] += getChar(charCode);
}
this.colPos++;
@@ -161,7 +196,7 @@ class Tokenizer {
flushWord();
tokens.length = tokenIndex;
tokens.length = tokenIndex + 1;
return tokens;
}
@@ -210,5 +245,16 @@ module.exports.TOKEN = {
LINE_ID: TOKEN.LINE_ID,
COLUMN_ID: TOKEN.COLUMN_ID,
};
module.exports.getCharCode = getCharCode;
module.exports.getChar = getChar;
module.exports.getTokenValue = getTokenValue;
module.exports.getTokenLine = getTokenLine;
module.exports.getTokenColumn = getTokenColumn;
module.exports.isTextToken = isTextToken;
module.exports.isTagToken = isTagToken;
module.exports.isTagStart = isTagStart;
module.exports.isTagEnd = isTagEnd;
module.exports.isAttrNameToken = isAttrNameToken;
module.exports.isAttrValueToken = isAttrValueToken;
module.exports.getTagName = getTagName;
module.exports.convertTokenToText = convertTagToText;
+85 -4
View File
@@ -1,14 +1,95 @@
const Tokenizer = require('./Tokenizer');
const TYPE = Tokenizer.TYPE;
describe("Tokenizer", () => {
it("tokenize single tag", () => {
test("tokenize single tag", () => {
const input = `[SingleTag]`;
const tokens = new Tokenizer(input).tokenize();
expect(tokens).toBeInstanceOf(Array);
expect(tokens).toEqual([
[TYPE.TAG, 'SingleTag', 0, 0]
])
});
// A bracketed name containing a space but no attribute value is expected to
// come out as one TAG token whose value keeps the space ('Single Tag').
test("tokenize single tag with spaces", () => {
const input = `[Single Tag]`;
const tokens = new Tokenizer(input).tokenize();
expect(tokens).toBeInstanceOf(Array);
expect(tokens).toEqual([
[TYPE.TAG, 'Single Tag', 0, 0]
])
});
// `[tag="value"]` form: the expectation is a TAG token followed directly by
// an ATTR_VALUE token (no ATTR_NAME), with the quote marks absent from the
// value, then the inner text as a WORD and the closing tag as a TAG whose
// value keeps its leading slash.
test("tokenize tag as param", () => {
const input = `[color="#ff0000"]Text[/color]`;
const tokens = new Tokenizer(input).tokenize();
expect(tokens).toBeInstanceOf(Array);
expect(tokens).toEqual([
[TYPE.TAG, 'color', 0, 0],
[TYPE.ATTR_VALUE, '#ff0000', 6, 0],
[TYPE.WORD, 'Text', 17, 0],
[TYPE.TAG, '/color', 21, 0]
])
});
// `[tag name=value]` form with an unquoted value: the expectation is a TAG
// token, then separate ATTR_NAME and ATTR_VALUE tokens, then the inner text
// and the closing tag.
test("tokenize tag param without quotemarks", () => {
const input = `[style color=#ff0000]Text[/style]`;
const tokens = new Tokenizer(input).tokenize();
expect(tokens).toBeInstanceOf(Array);
expect(tokens).toEqual([
[TYPE.TAG, 'style', 0, 0],
[TYPE.ATTR_NAME, 'color', 6, 0],
[TYPE.ATTR_VALUE, '#ff0000', 12, 0],
[TYPE.WORD, 'Text', 21, 0],
[TYPE.TAG, '/style', 25, 0]
])
});
test("tokenize list tag with items", () => {
const input = `[list]
[*] Item 1.
[*] Item 2.
[*] Item 3.
[/list]`;
const tokens = new Tokenizer(input).tokenize();
console.log('tokens', tokens);
expect(tokens).toBeInstanceOf(Array);
expect(tokens[0]).toEqual(['tag', 'SingleTag', 0, 0])
expect(tokens).toEqual([
[TYPE.TAG, 'list', 0, 0],
[TYPE.NEW_LINE, '\n', 6, 0],
[TYPE.SPACE, ' ', 0, 1],
[TYPE.SPACE, ' ', 1, 1],
[TYPE.SPACE, ' ', 2, 1],
[TYPE.TAG, '*', 3, 1],
[TYPE.SPACE, ' ', 6, 1],
[TYPE.WORD, 'Item', 7, 1],
[TYPE.SPACE, ' ', 11, 1],
[TYPE.WORD, '1.', 11, 1],
[TYPE.NEW_LINE, '\n', 14, 1],
[TYPE.SPACE, ' ', 0, 2],
[TYPE.SPACE, ' ', 1, 2],
[TYPE.SPACE, ' ', 2, 2],
[TYPE.TAG, '*', 3, 2],
[TYPE.SPACE, ' ', 6, 2],
[TYPE.WORD, 'Item', 14, 1],
[TYPE.SPACE, ' ', 11, 2],
[TYPE.WORD, '2.', 11, 2],
[TYPE.NEW_LINE, '\n', 14, 2],
[TYPE.SPACE, ' ', 0, 3],
[TYPE.SPACE, ' ', 1, 3],
[TYPE.SPACE, ' ', 2, 3],
[TYPE.TAG, '*', 3, 3],
[TYPE.SPACE, ' ', 6, 3],
[TYPE.WORD, 'Item', 14, 2],
[TYPE.SPACE, ' ', 11, 3],
[TYPE.WORD, '3.', 11, 3],
[TYPE.NEW_LINE, '\n', 14, 3],
[TYPE.TAG, '/list', 0, 4]
])
})
});
@@ -6,7 +6,7 @@ const options = {
const textStub = require("./test/stub");
const count = 10;
const count = 0;
const parsers3 = [];
console.time('newParser');
@@ -2,7 +2,7 @@ const OldParser = require('./OldParser')
const textStub = require("./test/stub");
const count = 10;
const count = 0;
const oldParsers3 = [];
console.time('oldParser');
for (let i = 0; i <= count; i++) {
+2 -1
View File
@@ -12,7 +12,8 @@
"author": "Nikolay Kostyurin <jilizart@gmail.com>",
"license": "MIT",
"devDependencies": {
"jest": "^23.1.0"
"jest": "^23.1.0",
"xbbcode-parser": "^0.1.2"
},
"publishConfig": {
"registry": "https://registry.npmjs.org/"
+20 -4
View File
@@ -1,9 +1,7 @@
const parse = require('./index');
const OldParser = require('./benchmark/OldParser');
const tabText = require('./benchmark/test/stub');
const options = {
closableTags: ['ch', 'syllable', 'tab'],
allowOnlyTags: ['ch', 'syllable', 'tab'],
};
@@ -15,8 +13,26 @@ describe("parse", () => {
});
test("same as old parser", () => {
const ast1 = parse(tabText, options);
const ast2 = OldParser.parse(tabText);
const input = `[Verse 2]
[ch]Eb[/ch] [ch]Fm[/ch]
I'm walking around
[ch]Ab[/ch] [ch]Cm[/ch]
With my little raincloud
[ch]Eb[/ch] [ch]Fm[/ch]
Hanging over my head
[ch]Cm[/ch] [ch]Ab[/ch]
And it aint coming down
[ch]Eb[/ch] [ch]Fm[/ch]
Where do I go?
[ch]Ab[/ch] [ch]Cm[/ch]
Gimme some sort of sign
[ch]Eb[/ch] [ch]Fm[/ch]
Hit me with lightning!
[ch]Cm[/ch] [ch]Ab[/ch]
Maybe Ill come alive
`;
const ast1 = parse(input, options);
const ast2 = OldParser.parse(input);
expect(ast1).toEqual(ast2);
})
+2 -2
View File
@@ -1,7 +1,7 @@
const TOKEN_TYPE_ID = 0;
const TOKEN_VALUE_ID = 1;
const TOKEN_LINE_ID = 2;
const TOKEN_COLUMN_ID = 3;
const TOKEN_COLUMN_ID = 2;
const TOKEN_LINE_ID = 3;
const TOKEN_TYPE_WORD = 'word';
const TOKEN_TYPE_TAG = 'tag';