Improve text parser (#1469)

* Merge ungrouped characters
* Update iteration
* Fix incorrect code point handling
* Simplify text
* Specify language
* Update how parsed status is represented
author: toasted-nutbread <toasted-nutbread@users.noreply.github.com> 2021-02-28 16:38:01 -0500
committer: GitHub <noreply@github.com> 2021-02-28 16:38:01 -0500
commit: 8f057c63fea6f06e921f2134d881192002dd23bc (patch)
tree: dbdaf163493d2f54e6420297ddfbd5f81c50b596 /ext/js/background
parent: ec1a8380b5dd1b091fcdbb96edcdab56c9df9f9e (diff)
1 files changed, 20 insertions, 10 deletions
diff --git a/ext/js/background/backend.js b/ext/js/background/backend.js
index 1238673b..715b916b 100644
--- a/ext/js/background/backend.js
+++ b/ext/js/background/backend.js
@@ -1038,31 +1038,41 @@ class Backend {
         const {scanning: {length: scanningLength}, parsing: {readingMode}} = options;
         const findTermsOptions = this._getTranslatorFindTermsOptions({wildcard: null}, options);
         const results = [];
-        while (text.length > 0) {
-            const term = [];
+        let previousUngroupedSegment = null;
+        let i = 0;
+        const ii = text.length;
+        while (i < ii) {
             const [definitions, sourceLength] = await this._translator.findTerms(
                 'simple',
-                text.substring(0, scanningLength),
+                text.substring(i, i + scanningLength),
                 findTermsOptions
             );
+            const codePoint = text.codePointAt(i);
+            const character = String.fromCodePoint(codePoint);
             if (
                 definitions.length > 0 &&
                 sourceLength > 0 &&
-                (sourceLength !== 1 || this._japaneseUtil.isCodePointJapanese(text[0]))
+                (sourceLength !== character.length || this._japaneseUtil.isCodePointJapanese(codePoint))
             ) {
+                previousUngroupedSegment = null;
                 const {expression, reading} = definitions[0];
-                const source = text.substring(0, sourceLength);
+                const source = text.substring(i, i + sourceLength);
+                const term = [];
                 for (const {text: text2, furigana} of jp.distributeFuriganaInflected(expression, reading, source)) {
                     const reading2 = jp.convertReading(text2, furigana, readingMode);
                     term.push({text: text2, reading: reading2});
                 }
-                text = text.substring(source.length);
+                results.push(term);
+                i += sourceLength;
             } else {
-                const reading = jp.convertReading(text[0], '', readingMode);
-                term.push({text: text[0], reading});
-                text = text.substring(1);
+                if (previousUngroupedSegment === null) {
+                    previousUngroupedSegment = {text: character, reading: ''};
+                    results.push([previousUngroupedSegment]);
+                } else {
+                    previousUngroupedSegment.text += character;
+                }
+                i += character.length;
             }
-            results.push(term);
         }
         return results;
     }
author	toasted-nutbread <toasted-nutbread@users.noreply.github.com>	2021-02-28 16:38:01 -0500
committer	GitHub <noreply@github.com>	2021-02-28 16:38:01 -0500
commit	8f057c63fea6f06e921f2134d881192002dd23bc (patch)
tree	dbdaf163493d2f54e6420297ddfbd5f81c50b596 /ext/js/background
parent	ec1a8380b5dd1b091fcdbb96edcdab56c9df9f9e (diff)