diff options
author | toasted-nutbread <toasted-nutbread@users.noreply.github.com> | 2021-02-28 16:38:01 -0500 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-02-28 16:38:01 -0500 |
commit | 8f057c63fea6f06e921f2134d881192002dd23bc (patch) | |
tree | dbdaf163493d2f54e6420297ddfbd5f81c50b596 /ext/js/background | |
parent | ec1a8380b5dd1b091fcdbb96edcdab56c9df9f9e (diff) |
Improve text parser (#1469)
* Merge ungrouped characters
* Update iteration
* Fix incorrect code point handling
* Simplify text
* Specify language
* Update how parsed status is represented
Diffstat (limited to 'ext/js/background')
-rw-r--r-- | ext/js/background/backend.js | 30 |
1 file changed, 20 insertions, 10 deletions
diff --git a/ext/js/background/backend.js b/ext/js/background/backend.js index 1238673b..715b916b 100644 --- a/ext/js/background/backend.js +++ b/ext/js/background/backend.js @@ -1038,31 +1038,41 @@ class Backend { const {scanning: {length: scanningLength}, parsing: {readingMode}} = options; const findTermsOptions = this._getTranslatorFindTermsOptions({wildcard: null}, options); const results = []; - while (text.length > 0) { - const term = []; + let previousUngroupedSegment = null; + let i = 0; + const ii = text.length; + while (i < ii) { const [definitions, sourceLength] = await this._translator.findTerms( 'simple', - text.substring(0, scanningLength), + text.substring(i, i + scanningLength), findTermsOptions ); + const codePoint = text.codePointAt(i); + const character = String.fromCodePoint(codePoint); if ( definitions.length > 0 && sourceLength > 0 && - (sourceLength !== 1 || this._japaneseUtil.isCodePointJapanese(text[0])) + (sourceLength !== character.length || this._japaneseUtil.isCodePointJapanese(codePoint)) ) { + previousUngroupedSegment = null; const {expression, reading} = definitions[0]; - const source = text.substring(0, sourceLength); + const source = text.substring(i, i + sourceLength); + const term = []; for (const {text: text2, furigana} of jp.distributeFuriganaInflected(expression, reading, source)) { const reading2 = jp.convertReading(text2, furigana, readingMode); term.push({text: text2, reading: reading2}); } - text = text.substring(source.length); + results.push(term); + i += sourceLength; } else { - const reading = jp.convertReading(text[0], '', readingMode); - term.push({text: text[0], reading}); - text = text.substring(1); + if (previousUngroupedSegment === null) { + previousUngroupedSegment = {text: character, reading: ''}; + results.push([previousUngroupedSegment]); + } else { + previousUngroupedSegment.text += character; + } + i += character.length; } - results.push(term); } return results; } |