From 0566241e2315cae0879ecb3ab467c83e99f0cc49 Mon Sep 17 00:00:00 2001 From: Nikolay Kost Date: Sun, 9 Mar 2025 02:23:03 +0200 Subject: [PATCH] feat(#271): whitespaceInTags mode (#272) * fix: test for buggy behavior * feat: implement whitespaceInTags mode * feat: move all char arrays to Map * feat: revert Map for char arrays --- .changeset/tiny-dolls-raise.md | 29 +++++++++++++++++++ packages/bbob-parser/src/lexer.ts | 13 ++++++--- packages/bbob-parser/src/parse.ts | 1 + packages/bbob-parser/test/parse.test.ts | 37 +++++++++++++++++++++++++ packages/bbob-types/src/parser.ts | 1 + 5 files changed, 77 insertions(+), 4 deletions(-) create mode 100644 .changeset/tiny-dolls-raise.md diff --git a/.changeset/tiny-dolls-raise.md b/.changeset/tiny-dolls-raise.md new file mode 100644 index 0000000..3ccfa5a --- /dev/null +++ b/.changeset/tiny-dolls-raise.md @@ -0,0 +1,29 @@ +--- +"@bbob/parser": minor +"@bbob/types": minor +"@bbob/cli": minor +"@bbob/core": minor +"@bbob/html": minor +"@bbob/plugin-helper": minor +"@bbob/preset": minor +"@bbob/preset-html5": minor +"@bbob/preset-react": minor +"@bbob/preset-vue": minor +"@bbob/react": minor +"@bbob/vue2": minor +"@bbob/vue3": minor +--- + +Added the `whitespaceInTags` parsing option (`true` by default). Setting it to `false` disables parsing of `[tags with spaces]`; such input is treated as plain text instead. + +```js +import html5 from '@bbob/preset-html5' +import parse from '@bbob/html' + +const html = parse('[b]lorem[/b] [foo bar] [i]ipsum[/i]', html5(), { + whitespaceInTags: false +}) + +console.log(html) // lorem [foo bar] ipsum +``` + diff --git a/packages/bbob-parser/src/lexer.ts b/packages/bbob-parser/src/lexer.ts index c57ba7e..d5d1ce7 100644 --- a/packages/bbob-parser/src/lexer.ts +++ b/packages/bbob-parser/src/lexer.ts @@ -67,7 +67,6 @@ export function createLexer(buffer: string, options: LexerOptions = {}): LexerTo const NOT_CHAR_TOKENS = [ openTag, SPACE, TAB, N, ]; - const isCharReserved = (char: string) => 
(RESERVED_CHARS.indexOf(char) >= 0); const isCharToken = (char: string) => (NOT_CHAR_TOKENS.indexOf(char) === -1); const isEscapableChar = (char: string) => (char === openTag || char === closeTag || char === BACKSLASH); @@ -198,13 +197,19 @@ export function createLexer(buffer: string, options: LexerOptions = {}): LexerTo const currChar = chars.getCurr(); const nextChar = chars.getNext(); - chars.skip(); + chars.skip(); // skip openTag // detect case where we have '[My word [tag][/tag]' or we have '[My last line word' const substr = chars.substrUntilChar(closeTag); - const hasInvalidChars = substr.length === 0 || substr.indexOf(openTag) >= 0; - if ((nextChar && isCharReserved(nextChar)) || hasInvalidChars || chars.isLast()) { + + const hasInvalidChars = substr.length === 0 || substr.indexOf(openTag) >= 0; + const isNextCharReserved = nextChar && isCharReserved(nextChar) + const isLastChar = chars.isLast() + const hasSpace = substr.indexOf(SPACE) >= 0; + const isSpaceRestricted = hasSpace && options.whitespaceInTags === false; + + if (isNextCharReserved || hasInvalidChars || isLastChar || isSpaceRestricted) { emitToken(TYPE_WORD, currChar); return STATE_WORD; diff --git a/packages/bbob-parser/src/parse.ts b/packages/bbob-parser/src/parse.ts index 1217614..7e77dbd 100644 --- a/packages/bbob-parser/src/parse.ts +++ b/packages/bbob-parser/src/parse.ts @@ -321,6 +321,7 @@ function parse(input: string, opts: ParseOptions = {}) { contextFreeTags: options.contextFreeTags, caseFreeTags: options.caseFreeTags, enableEscapeTags: options.enableEscapeTags, + whitespaceInTags: options.whitespaceInTags, }); // eslint-disable-next-line no-unused-vars diff --git a/packages/bbob-parser/test/parse.test.ts b/packages/bbob-parser/test/parse.test.ts index ec558ca..ca4dca6 100644 --- a/packages/bbob-parser/test/parse.test.ts +++ b/packages/bbob-parser/test/parse.test.ts @@ -869,6 +869,43 @@ sdfasdfasdf ]); }); + test('parse invalid tags', () => { + const input = parse('[b]Press Release[/b] 
[statement redacted] [i]This is more content[/i]', { + whitespaceInTags: false + }) + + expectOutput(input, [ + { + tag: 'b', + attrs: {}, + content: [ + 'Press', + ' ', + 'Release' + ], + }, + ' ', + '[', + 'statement', + ' ', + 'redacted]', + ' ', + { + tag: 'i', + attrs: {}, + content: [ + 'This', + ' ', + 'is', + ' ', + 'more', + ' ', + 'content' + ], + }, + ]); + }) + describe('html', () => { const parseHTML = (input: string) => parse(input, { openTag: '<', closeTag: '>' }); diff --git a/packages/bbob-types/src/parser.ts b/packages/bbob-types/src/parser.ts index c845dea..6c7463f 100644 --- a/packages/bbob-types/src/parser.ts +++ b/packages/bbob-types/src/parser.ts @@ -32,6 +32,7 @@ export interface CommonOptions { onlyAllowTags?: string[]; enableEscapeTags?: boolean; caseFreeTags?: boolean; + whitespaceInTags?: boolean; contextFreeTags?: string[]; }