feat(parser): rewrite lexer to make it faster (#50)

* feat(parser): first iteration of new lexer
* feat(parser): convert token string props to number props
* refactor(parser): optimize char grabber
* refactor(parser): working on new lexer
* refactor(parser): convert token string props to number props
* refactor(parser): rebuild lexer, add tag attrs parsing
* refactor(parser): rework word parsing and tag parsing
* refactor(parser): rework to pass tests
* refactor(parser): rework tag parsing
* refactor(parser): rework escape tags parsing
* refactor(parser): rework tests
* refactor(parser): all test pass
* refactor(parser): make lexer faster by move mode switching in loop
* refactor(parser): remove all state map objects
* refactor(parser): order of parsing states
* refactor(parser): state switching without return
* refactor(parser): rename buffers to chars
* refactor(lexer): reduce function calls
* feat(lexer): add new parser tests and code to pass it
* fix(utils): remove unused variable in char grabber
* feat(lexer): add test for new lexer bug
* chore(*): add lexer and lexer2 to benchmark
* chore(lexer): add some debug info for char grabber
* feat(parser): add new test for single attributes without values
* fix(lexer): paired tags tests
* refactor(lexer): comment breaking changes tests for future releases
* feat(core): improve tests
* refactor(parser): add more tests, reduce char grabber size
* refactor(parser): reduce utils size
* refactor(parser): remove unused code from tag parsing code
* refactor(parser): remove unused code from word to tag transforming code
* chore(benchmark): fix benchmark imports
2026-06-14 18:42:24 +03:00 · 2020-12-09 01:03:48 +02:00
parent fda6ddd6ee
commit 772d422d77
13 changed files with 998 additions and 359 deletions
@@ -1,4 +1,4 @@
-import Token from '../src/Token'
+import Token, { TYPE_WORD, TYPE_TAG, TYPE_ATTR_NAME, TYPE_ATTR_VALUE, TYPE_SPACE, TYPE_NEW_LINE } from '../src/Token'

 describe('Token', () => {
  test('isEmpty', () => {
@@ -7,61 +7,61 @@ describe('Token', () => {
    expect(token.isEmpty()).toBeTruthy()
  });
  test('isText', () => {
-    const token = new Token('word');
+    const token = new Token(TYPE_WORD);

    expect(token.isText()).toBeTruthy();
  });
  test('isTag', () => {
-    const token = new Token('tag');
+    const token = new Token(TYPE_TAG);

    expect(token.isTag()).toBeTruthy();
  });
  test('isAttrName', () => {
-    const token = new Token('attr-name');
+    const token = new Token(TYPE_ATTR_NAME);

    expect(token.isAttrName()).toBeTruthy();
  });
  test('isAttrValue', () => {
-    const token = new Token('attr-value');
+    const token = new Token(TYPE_ATTR_VALUE);

    expect(token.isAttrValue()).toBeTruthy();
  });
  test('isStart', () => {
-    const token = new Token('tag', 'my-tag');
+    const token = new Token(TYPE_TAG, 'my-tag');

    expect(token.isStart()).toBeTruthy();
  });
  test('isEnd', () => {
-    const token = new Token('tag', '/my-tag');
+    const token = new Token(TYPE_TAG, '/my-tag');

    expect(token.isEnd()).toBeTruthy();
  });
  test('getName', () => {
-    const token = new Token('tag', '/my-tag');
+    const token = new Token(TYPE_TAG, '/my-tag');

    expect(token.getName()).toBe('my-tag');
  });
  test('getValue', () => {
-    const token = new Token('tag', '/my-tag');
+    const token = new Token(TYPE_TAG, '/my-tag');

    expect(token.getValue()).toBe('/my-tag');
  });
  test('getLine', () => {
-    const token = new Token('tag', '/my-tag', 12);
+    const token = new Token(TYPE_TAG, '/my-tag', 12);

    expect(token.getLine()).toBe(12);
  });
  test('getColumn', () => {
-    const token = new Token('tag', '/my-tag', 12, 14);
+    const token = new Token(TYPE_TAG, '/my-tag', 12, 14);

    expect(token.getColumn()).toBe(14);
  });
  test('toString', () => {
-    const tokenEnd = new Token('tag', '/my-tag', 12, 14);
+    const tokenEnd = new Token(TYPE_TAG, '/my-tag', 12, 14);

    expect(tokenEnd.toString()).toBe('[/my-tag]');

-    const tokenStart = new Token('tag', 'my-tag', 12, 14);
+    const tokenStart = new Token(TYPE_TAG, 'my-tag', 12, 14);

    expect(tokenStart.toString()).toBe('[my-tag]');
  });
@@ -1,4 +1,4 @@
-import {TYPE_WORD, TYPE_TAG, TYPE_ATTR_NAME, TYPE_ATTR_VALUE, TYPE_SPACE, TYPE_NEW_LINE} from '../src/Token'
+import { TYPE_ID, VALUE_ID, TYPE_WORD, TYPE_TAG, TYPE_ATTR_NAME, TYPE_ATTR_VALUE, TYPE_SPACE, TYPE_NEW_LINE} from '../src/Token'
 import { createLexer } from '../src/lexer'

 const TYPE = {
@@ -10,19 +10,58 @@ const TYPE = {
  NEW_LINE: TYPE_NEW_LINE,
 };

+const TYPE_NAMES = Object.fromEntries(Object.keys(TYPE).map(key => [TYPE[key], key]));
+
 const tokenize = input => (createLexer(input).tokenize());
 const tokenizeEscape = input => (createLexer(input, { enableEscapeTags: true }).tokenize());

 describe('lexer', () => {
-  const expectOutput = (output, tokens) => {
-    expect(tokens.length).toBe(output.length);
-    expect(tokens).toBeInstanceOf(Array);
-    tokens.forEach((token, idx) => {
-      expect(token).toBeInstanceOf(Object);
-      expect(token.type).toEqual(output[idx][0]);
-      expect(token.value).toEqual(output[idx][1]);
-    });
-  };
+  expect.extend({
+    toBeMantchOutput(tokens, output) {
+      if (tokens.length !== output.length) {
+        return {
+          message: () =>
+              `expected tokens length ${tokens.length} to be ${output.length}`,
+          pass: false,
+        };
+      }
+
+      for (let idx = 0; idx < tokens.length; idx++) {
+        const token = tokens[idx];
+        const [type, value] = output[idx];
+
+        if (typeof token !== 'object') {
+          return {
+            message: () =>
+                `token must to be Object`,
+            pass: false,
+          };
+        }
+
+        if (token[TYPE_ID] !== type) {
+          return {
+            message: () =>
+                `expected token type ${TYPE_NAMES[type]} but recieved ${TYPE_NAMES[token[TYPE_ID]]} for ${JSON.stringify(output[idx])}`,
+            pass: false,
+          };
+        }
+
+        if (token[VALUE_ID] !== value) {
+          return {
+            message: () =>
+                `expected token value ${value} but recieved ${token[VALUE_ID]} for ${JSON.stringify(output[idx])}`,
+            pass: false,
+          };
+        }
+      }
+
+      return {
+        message: () =>
+            `no valid output`,
+        pass: true,
+      };
+    },
+  });

  test('single tag', () => {
    const input = '[SingleTag]';
@@ -31,7 +70,7 @@ describe('lexer', () => {
      [TYPE.TAG, 'SingleTag', '0', '0'],
    ];

-    expectOutput(output, tokens);
+    expect(tokens).toBeMantchOutput(output);
  });

  test('single tag with params', () => {
@@ -42,7 +81,19 @@ describe('lexer', () => {
      [TYPE.ATTR_VALUE, '111', '0', '0'],
    ];

-    expectOutput(output, tokens);
+    expect(tokens).toBeMantchOutput(output);
+  });
+
+  test('single fake tag', () => {
+    const input = '[ user=111]';
+    const tokens = tokenize(input);
+    const output = [
+      [TYPE.WORD, '[', '0', '0'],
+      [TYPE.SPACE, ' ', '0', '0'],
+      [TYPE.WORD, 'user=111]', '0', '0'],
+    ];
+
+    expect(tokens).toBeMantchOutput(output);
  });

  test('single tag with spaces', () => {
@@ -53,9 +104,25 @@ describe('lexer', () => {
      [TYPE.TAG, 'Single Tag', '0', '0'],
    ];

-    expectOutput(output, tokens);
+    expect(tokens).toBeMantchOutput(output);
  });

+  // @TODO: breaking change (valueless attrs like `disabled`) - skipped until a future release
+  test.skip('tags with single attrs like disabled', () => {
+    const input = '[textarea disabled]world[/textarea]';
+    const tokens = tokenize(input);
+
+    const output = [
+      [TYPE.TAG, 'textarea', '0', '0'],
+      [TYPE.ATTR_VALUE, 'disabled', '0', '0'],
+      [TYPE.WORD, 'world', '0', '0'],
+      [TYPE.TAG, '/textarea', '0', '0'],
+    ];
+
+    expect(tokens).toBeMantchOutput(output);
+  });
+
+
  test('string with quotemarks', () => {
    const input = '"Someone Like You" by Adele';
    const tokens = tokenize(input);
@@ -72,7 +139,7 @@ describe('lexer', () => {
      [TYPE.WORD, 'Adele', '21', '0'],
    ];

-    expectOutput(output, tokens);
+    expect(tokens).toBeMantchOutput(output);
  });

  test('tags in brakets', () => {
@@ -89,7 +156,7 @@ describe('lexer', () => {
      [TYPE.WORD, ']', '7', '0'],
    ];

-    expectOutput(output, tokens);
+    expect(tokens).toBeMantchOutput(output);
  });

  test('tag as param', () => {
@@ -102,7 +169,7 @@ describe('lexer', () => {
      [TYPE.TAG, '/color', '21', '0'],
    ];

-    expectOutput(output, tokens);
+    expect(tokens).toBeMantchOutput(output);
  });

  test('tag with quotemark params with spaces', () => {
@@ -118,7 +185,7 @@ describe('lexer', () => {
      [TYPE.TAG, '/url', '24', '0'],
    ];

-    expectOutput(output, tokens);
+    expect(tokens).toBeMantchOutput(output);
  });

  test('tag with escaped quotemark param', () => {
@@ -132,7 +199,7 @@ describe('lexer', () => {
      [TYPE.TAG, '/url', '26', '0'],
    ];

-    expectOutput(output, tokens);
+    expect(tokens).toBeMantchOutput(output);
  });

  test('tag param without quotemarks', () => {
@@ -146,7 +213,7 @@ describe('lexer', () => {
      [TYPE.TAG, '/style', '25', '0'],
    ];

-    expectOutput(output, tokens);
+    expect(tokens).toBeMantchOutput(output);
  });

  test('list tag with items', () => {
@@ -184,7 +251,29 @@ describe('lexer', () => {
      [TYPE.TAG, '/list', '0', '4'],
    ];

-    expectOutput(output, tokens);
+    expect(tokens).toBeMantchOutput(output);
+  });
+
+  test('few tags without spaces', () => {
+    const input = '[mytag1 size="15"]Tag1[/mytag1][mytag2 size="16"]Tag2[/mytag2][mytag3]Tag3[/mytag3]';
+    const tokens = tokenize(input);
+    const output = [
+      [TYPE.TAG, 'mytag1', 0, 0],
+      [TYPE.ATTR_NAME, 'size', 0, 0],
+      [TYPE.ATTR_VALUE, '15', 0, 0],
+      [TYPE.WORD, 'Tag1', 0, 0],
+      [TYPE.TAG, '/mytag1', 0, 0],
+      [TYPE.TAG, 'mytag2', 0, 0],
+      [TYPE.ATTR_NAME, 'size', 0, 0],
+      [TYPE.ATTR_VALUE, '16', 0, 0],
+      [TYPE.WORD, 'Tag2', 0, 0],
+      [TYPE.TAG, '/mytag2', 0, 0],
+      [TYPE.TAG, 'mytag3', 0, 0],
+      [TYPE.WORD, 'Tag3', 0, 0],
+      [TYPE.TAG, '/mytag3', 0, 0],
+    ];
+
+    expect(tokens).toBeMantchOutput(output);
  });

  test('bad tags as texts', () => {
@@ -211,8 +300,8 @@ describe('lexer', () => {
      [
        [TYPE.WORD, '!', '0', '0'],
        [TYPE.WORD, '[', '1', '0'],
-        [TYPE.WORD, ']', '1', '0'],
-        [TYPE.WORD, '(image.jpg)', '1', '0'],
+        [TYPE.WORD, '](image.jpg)', '1', '0'],
+        // [TYPE.WORD, '', '1', '0'],
      ],
      [
        [TYPE.WORD, 'x', '0', '0'],
@@ -253,7 +342,7 @@ describe('lexer', () => {
      const tokens = tokenize(input);
      const output = asserts[idx];

-      expectOutput(output, tokens);
+      expect(tokens).toBeMantchOutput(output);
    });
  });

@@ -271,7 +360,7 @@ describe('lexer', () => {
      [TYPE.TAG, 'Finger', '0', '0']
    ];

-    expectOutput(output, tokens);
+    expect(tokens).toBeMantchOutput(output);
  });

  test('no close tag', () => {
@@ -286,7 +375,7 @@ describe('lexer', () => {
      [TYPE.WORD, 'A', '0', '0'],
    ];

-    expectOutput(output, tokens);
+    expect(tokens).toBeMantchOutput(output);
  });

  test('escaped tag', () => {
@@ -301,7 +390,7 @@ describe('lexer', () => {
      [TYPE.WORD, '[', '0', '0'],
    ];

-    expectOutput(output, tokens);
+    expect(tokens).toBeMantchOutput(output);
  });

  test('escaped tag and escaped backslash', () => {
@@ -321,7 +410,7 @@ describe('lexer', () => {
      [TYPE.WORD, ']', '0', '0'],
    ];

-    expectOutput(output, tokens);
+    expect(tokens).toBeMantchOutput(output);
  });

  test('bad closed tag with escaped backslash', () => {
@@ -335,7 +424,7 @@ describe('lexer', () => {
      [TYPE.WORD, 'b]', '0', '11'],
    ];

-    expectOutput(output, tokens);
+    expect(tokens).toBeMantchOutput(output);
  });

  describe('html', () => {
@@ -358,7 +447,7 @@ describe('lexer', () => {
        [TYPE.TAG, '/button', 2, 0]
      ];

-      expectOutput(output, tokens);
+      expect(tokens).toBeMantchOutput(output);
    });

    test('attributes with no quotes or value', () => {
@@ -377,7 +466,7 @@ describe('lexer', () => {
        [TYPE.TAG, '/button', 2, 0]
      ];

-      expectOutput(output, tokens);
+      expect(tokens).toBeMantchOutput(output);
    });

    test('attributes with no space between them. No valid, but accepted by the browser', () => {
@@ -395,7 +484,7 @@ describe('lexer', () => {
        [TYPE.TAG, '/button', 2, 0]
      ];

-      expectOutput(output, tokens);
+      expect(tokens).toBeMantchOutput(output);
    });

    test.skip('style tag', () => {
@@ -416,10 +505,10 @@ input.medium{width:100px;height:18px}
 input.buttonred{cursor:hand;font-family:verdana;background:#d12124;color:#fff;height:1.4em;font-weight:bold;font-size:9pt;padding:0px 2px;margin:0px;border:0px none #000}
 -->
 </style>`
-    const tokens = tokenizeHTML(content);
-    const output = [];
+      const tokens = tokenizeHTML(content);
+      const output = [];

-    expectOutput(output, tokens);
+      expect(tokens).toBeMantchOutput(output);
    });

    test.skip('script tag', () => {
@@ -432,7 +521,7 @@ input.buttonred{cursor:hand;font-family:verdana;background:#d12124;color:#fff;he
      const tokens = tokenizeHTML(content);
      const output = [];

-      expectOutput(output, tokens);
+      expect(tokens).toBeMantchOutput(output);
    })
  })
 });
@@ -8,8 +8,7 @@ describe('Parser', () => {

  test('parse paired tags tokens', () => {
    const ast = parse('[best name=value]Foo Bar[/best]');
-
-    expectOutput(ast, [
+    const output = [
      {
        tag: 'best',
        attrs: {
@@ -21,15 +20,16 @@ describe('Parser', () => {
          'Bar',
        ],
      },
-    ]);
+    ];
+
+    expectOutput(ast, output);
  });

  test('parse only allowed tags', () => {
    const ast = parse('[h1 name=value]Foo [Bar] [/h1]', {
      onlyAllowTags: ['h1']
    });
-
-    expectOutput(ast, [
+    const output = [
      {
        tag: 'h1',
        attrs: {
@@ -42,13 +42,14 @@ describe('Parser', () => {
          ' '
        ],
      },
-    ]);
+    ];
+
+    expectOutput(ast, output);
  });

  test('parse inconsistent tags', () => {
    const ast = parse('[h1 name=value]Foo [Bar] /h1]');
-
-    expectOutput(ast, [
+    const output = [
      {
        attrs: {
          name: 'value'
@@ -65,13 +66,14 @@ describe('Parser', () => {
      },
      ' ',
      '/h1]',
-    ]);
+    ];
+
+    expectOutput(ast, output);
  });

  test('parse tag with value param', () => {
    const ast = parse('[url=https://github.com/jilizart/bbob]BBob[/url]');
-
-    expectOutput(ast, [
+    const output = [
      {
        tag: 'url',
        attrs: {
@@ -79,13 +81,14 @@ describe('Parser', () => {
        },
        content: ['BBob'],
      },
-    ]);
+    ];
+
+    expectOutput(ast, output);
  });

  test('parse tag with quoted param with spaces', () => {
    const ast = parse('[url href=https://ru.wikipedia.org target=_blank text="Foo Bar"]Text[/url]');
-
-    expectOutput(ast, [
+    const output = [
      {
        tag: 'url',
        attrs: {
@@ -95,13 +98,14 @@ describe('Parser', () => {
        },
        content: ['Text'],
      },
-    ]);
+    ];
+
+    expectOutput(ast, output);
  });

  test('parse single tag with params', () => {
    const ast = parse('[url=https://github.com/jilizart/bbob]');
-
-    expectOutput(ast, [
+    const output = [
      {
        tag: 'url',
        attrs: {
@@ -109,12 +113,15 @@ describe('Parser', () => {
        },
        content: [],
      },
-    ]);
+    ];
+
+    expectOutput(ast, output);
  });

  test('detect inconsistent tag', () => {
    const onError = jest.fn();
-    const ast = parse('[c][/c][b]hello[/c][/b][b]', { onError });
+
+    parse('[c][/c][b]hello[/c][/b][b]', { onError });

    expect(onError).toHaveBeenCalled();
  });
@@ -145,6 +152,82 @@ describe('Parser', () => {
    ])
  });

+  test('parse few tags without spaces', () => {
+    const ast = parse('[mytag1 size="15"]Tag1[/mytag1][mytag2 size="16"]Tag2[/mytag2][mytag3]Tag3[/mytag3]');
+    const output = [
+      {
+        tag: 'mytag1',
+        attrs: {
+          size: '15',
+        },
+        content: ['Tag1'],
+      },
+      {
+        tag: 'mytag2',
+        attrs: {
+          size: '16',
+        },
+        content: ['Tag2'],
+      },
+      {
+        tag: 'mytag3',
+        attrs: {},
+        content: ['Tag3'],
+      },
+    ];
+
+    expectOutput(ast, output);
+  });
+
+  // @TODO: this is breaking change behavior
+  test.skip('parse tags with single attributes like disabled', () => {
+    const ast = parse('[b]hello[/b] [textarea disabled]world[/textarea]');
+
+    expectOutput(ast, [
+      {
+        tag: 'b',
+        attrs: {},
+        content: ['hello'],
+      },
+        ' ',
+      {
+        tag: 'textarea',
+        attrs: {
+          disabled: 'disabled',
+        },
+        content: ['world'],
+      },
+    ]);
+  });
+
+  test('parse url tag with get params', () => {
+    const ast = parse('[url=https://github.com/JiLiZART/bbob/search?q=any&unscoped_q=any]GET[/url]');
+
+    expectOutput(ast, [
+      {
+        tag: 'url',
+        attrs: {
+          'https://github.com/JiLiZART/bbob/search?q=any&unscoped_q=any': 'https://github.com/JiLiZART/bbob/search?q=any&unscoped_q=any',
+        },
+        content: ['GET'],
+      },
+    ]);
+  });
+
+  test('parse url tag with # and = symbols [google docs]', () => {
+    const ast = parse('[url href=https://docs.google.com/spreadsheets/d/1W9VPUESF_NkbSa_HtRFrQNl0nYo8vPCxJFy7jD3Tpio/edit#gid=0]Docs[/url]');
+
+    expectOutput(ast, [
+      {
+        tag: 'url',
+        attrs: {
+          href: 'https://docs.google.com/spreadsheets/d/1W9VPUESF_NkbSa_HtRFrQNl0nYo8vPCxJFy7jD3Tpio/edit#gid=0',
+        },
+        content: ['Docs'],
+      },
+    ]);
+  });
+
  describe('html', () => {
    const parseHTML = input => parse(input, { openTag: '<', closeTag: '>' });

@@ -0,0 +1,91 @@
+import { createCharGrabber } from '../src/utils';
+
+
+describe('utils', () => {
+  describe('createCharGrabber', () => {
+
+    test('#substrUntilChar ] 1', () => {
+      /**
+       * The result runs from the start of the source up to, but not
+       * including, the first `]`.
+       */
+      const bufferGrabber = createCharGrabber('[h1 name=value]Foo [Bar] [/h1]');
+      const substr = bufferGrabber.substrUntilChar(']');
+
+      expect(substr).toBe('[h1 name=value');
+    });
+
+    test('#substrUntilChar ] 2', () => {
+      /**
+       console.log src/utils.js:95
+       substrUntilChar { char: ']', indexOfChar: 63, curPos: 0 } {
+        result: '[url href=https://ru.wikipedia.org target=_blank text="Foo Bar"',
+        source: '[url href=https://ru.wikipedia.org target=_blank text="Foo Bar"]Text[/url]'
+      }
+       console.log src/utils.js:104
+       substrUntilChar.new { char: ']', indexOfCharNew: 63, curPos: 0 } {
+        result: '[url href=https://ru.wikipedia.org target=_blank text="Foo Bar"]',
+        source: '[url href=https://ru.wikipedia.org target=_blank text="Foo Bar"]Text[/url]'
+      }
+       */
+      const bufferGrabber = createCharGrabber('[url href=https://ru.wikipedia.org target=_blank text="Foo Bar"]Text[/url]');
+      const substr = bufferGrabber.substrUntilChar(']');
+
+      expect(substr).toBe('[url href=https://ru.wikipedia.org target=_blank text="Foo Bar"');
+    });
+
+    test('#substrUntilChar ] 3', () => {
+      /**
+       console.log src/utils.js:95
+       substrUntilChar { char: ']', indexOfChar: 14, curPos: 7 } {
+        result: 'blah foo="bar"',
+        source: 'hello [blah foo="bar"]world[/blah]'
+      }
+       console.log src/utils.js:104
+       substrUntilChar.new { char: ']', indexOfCharNew: 21, curPos: 7 } {
+        result: 'blah foo="bar"]world[/',
+        source: 'hello [blah foo="bar"]world[/blah]'
+      }
+       */
+      const bufferGrabber = createCharGrabber('hello [blah foo="bar"]world[/blah]');
+      const substr = bufferGrabber.substrUntilChar('[');
+
+      expect(substr).toBe('hello ');
+    });
+
+    test('#substrUntilChar not existed', () => {
+      /**
+       * When the requested char never occurs in the source there is
+       * nothing to cut at, so the grabber is expected to return an
+       * empty string rather than the rest of the input.
+       *
+       * Example:
+       *   source: 'hello'
+       *   char:   '['
+       *   result: ''
+       *
+       * (Contrast with getRest(), covered below.)
+       */
+      const bufferGrabber = createCharGrabber('hello');
+      const substr = bufferGrabber.substrUntilChar('[');
+
+      expect(substr).toBe('');
+    });
+
+    test('getPrev is null', () => {
+      const bufferGrabber = createCharGrabber('');
+      const prev = bufferGrabber.getPrev();
+
+      expect(prev).toBe(null);
+    });
+
+    test('getRest', () => {
+      const bufferGrabber = createCharGrabber('hello [blah foo="bar"]world[/blah]');
+      bufferGrabber.skip();
+      const rest = bufferGrabber.getRest();
+
+      expect(rest).toBe('ello [blah foo="bar"]world[/blah]');
+    });
+
+  })
+});