From 8f057c63fea6f06e921f2134d881192002dd23bc Mon Sep 17 00:00:00 2001 From: toasted-nutbread Date: Sun, 28 Feb 2021 16:38:01 -0500 Subject: Improve text parser (#1469) * Merge ungrouped characters * Update iteration * Fix incorrect code point handling * Simplify text * Specify language * Update how parsed status is represented --- ext/js/background/backend.js | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'ext/js/background') diff --git a/ext/js/background/backend.js b/ext/js/background/backend.js index 1238673b..715b916b 100644 --- a/ext/js/background/backend.js +++ b/ext/js/background/backend.js @@ -1038,31 +1038,41 @@ class Backend { const {scanning: {length: scanningLength}, parsing: {readingMode}} = options; const findTermsOptions = this._getTranslatorFindTermsOptions({wildcard: null}, options); const results = []; - while (text.length > 0) { - const term = []; + let previousUngroupedSegment = null; + let i = 0; + const ii = text.length; + while (i < ii) { const [definitions, sourceLength] = await this._translator.findTerms( 'simple', - text.substring(0, scanningLength), + text.substring(i, i + scanningLength), findTermsOptions ); + const codePoint = text.codePointAt(i); + const character = String.fromCodePoint(codePoint); if ( definitions.length > 0 && sourceLength > 0 && - (sourceLength !== 1 || this._japaneseUtil.isCodePointJapanese(text[0])) + (sourceLength !== character.length || this._japaneseUtil.isCodePointJapanese(codePoint)) ) { + previousUngroupedSegment = null; const {expression, reading} = definitions[0]; - const source = text.substring(0, sourceLength); + const source = text.substring(i, i + sourceLength); + const term = []; for (const {text: text2, furigana} of jp.distributeFuriganaInflected(expression, reading, source)) { const reading2 = jp.convertReading(text2, furigana, readingMode); term.push({text: text2, reading: reading2}); } - text = text.substring(source.length); + results.push(term); + i += sourceLength; } else { - const reading = jp.convertReading(text[0], '', readingMode); - term.push({text: text[0], reading}); - text = text.substring(1); + if (previousUngroupedSegment === null) { + previousUngroupedSegment = {text: character, reading: ''}; + results.push([previousUngroupedSegment]); + } else { + previousUngroupedSegment.text += character; + } + i += character.length; } - results.push(term); } return results; } -- cgit v1.2.3