diff options
author | StefanVukovic99 <stefanvukovic44@gmail.com> | 2024-02-17 02:45:24 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-02-17 01:45:24 +0000 |
commit | 4aaa9f15d97668203741c1731f15e710ae8b8294 (patch) | |
tree | d1885f7fbd7d1510a71176597169d6847ae26572 /ext/js/language/translator.js | |
parent | 4e77741d22778bd09b772fc53f1cbd64107e3d24 (diff) |
add language select, abstract text transformations (#584)
* Copy functions from JapaneseUtil
* Remove JapaneseUtil
* Update usages of JapaneseUtil functions
* part1
* frotend done?
* fix tests
* offscreen and type complications
* add tests
* start fixing tests
* keep fixing tests
* fix tests
* Copy functions from JapaneseUtil
* Remove JapaneseUtil
* Update usages of JapaneseUtil functions
* delete pt
* renames
* add tests
* kebab-case filenames
* lint
* minor fixes
* merge
* fixes
* fix part of comments
* fix more comments
* delete unused types
* comment
* comment
* do backend
* other files
* move fetch utils to own file
* remove extra line
* add extra line
* remove unnecessary export
* simplify folder structure
* remove redundant async
* fix param type in api
* fix language index
* undo changes to cssStyleApplier
* undo changes to utilities.js
* undo changes to utilities.js
* simplify language util
* lint
* undo phantom changes to anki integration
* require textTransformations options
* explicit locale in localeCompare
* punctuate notes
* prefer early exit
* rename LanguageOptionsObjectMap
* rename to textPreprocessor
* tuple with names instead of boolean array
* safe data setting
* optional chaining
* simplify LanguageOptions
* encapsulate languages
* delete language util
* nullable language in text preprocessors controller
* rename transform to process
* remove settings
* make translation advanced again
* remove unused getTextTransformations api call
* comments
* change language types
* RIP flags
* comments
* fix tests
* lint
* Text preprocessor type changes (#10)
* Add types
* Update types
* Simplify type check
* Refactor typing and structuring of language definitions
* lint
* update translator benchmark
* undo markdown changes
* undo markdown changes
* undo markdown changes
* more merge
* simplify language controller
---------
Co-authored-by: toasted-nutbread <toasted-nutbread@users.noreply.github.com>
Co-authored-by: Darius Jahandarie <djahandarie@gmail.com>
Diffstat (limited to 'ext/js/language/translator.js')
-rw-r--r-- | ext/js/language/translator.js | 134 |
1 files changed, 57 insertions, 77 deletions
diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js index b2342e8d..4f9304b5 100644 --- a/ext/js/language/translator.js +++ b/ext/js/language/translator.js @@ -18,9 +18,9 @@ import {applyTextReplacement} from '../general/regex-util.js'; import {TextSourceMap} from '../general/text-source-map.js'; -import {convertAlphabeticToKana} from './ja/japanese-wanakana.js'; -import {collapseEmphaticSequences, convertHalfWidthKanaToFullWidth, convertHiraganaToKatakana, convertKatakanaToHiragana, convertNumericToFullWidth, isCodePointJapanese} from './ja/japanese.js'; +import {isCodePointJapanese} from './ja/japanese.js'; import {LanguageTransformer} from './language-transformer.js'; +import {getAllLanguageTextPreprocessors} from './languages.js'; /** * Class which finds term and kanji dictionary entries for text. @@ -41,6 +41,8 @@ export class Translator { this._stringComparer = new Intl.Collator('en-US'); // Invariant locale /** @type {RegExp} */ this._numberRegex = /[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?/; + /** @type {Map<string, {textPreprocessors: import('language').TextPreprocessorWithId<unknown>[], optionSpace: import('translation-internal').PreprocessorOptionsSpace}>} */ + this._textPreprocessors = new Map(); } /** @@ -49,6 +51,14 @@ export class Translator { */ prepare(descriptor) { this._languageTransformer.addDescriptor(descriptor); + for (const {iso, textPreprocessors} of getAllLanguageTextPreprocessors()) { + /** @type {Map<string, import('language').TextPreprocessorOptions<unknown>>} */ + const optionSpace = new Map(); + for (const {id, textPreprocessor} of textPreprocessors) { + optionSpace.set(id, textPreprocessor.options); + } + this._textPreprocessors.set(iso, {textPreprocessors, optionSpace}); + } } /** @@ -415,51 +425,45 @@ export class Translator { } } - // Deinflections and text transformations + // Deinflections and text preprocessing /** * @param {string} text * @param {import('translation').FindTermsOptions} options * @returns {import('translation-internal').DatabaseDeinflection[]} + * @throws {Error} */ _getAlgorithmDeinflections(text, options) { - /** @type {import('translation-internal').TextDeinflectionOptionsArrays} */ - const textOptionVariantArray = [ - this._getTextReplacementsVariants(options), - this._getTextOptionEntryVariants(options.convertHalfWidthCharacters), - this._getTextOptionEntryVariants(options.convertNumericCharacters), - this._getTextOptionEntryVariants(options.convertAlphabeticCharacters), - this._getTextOptionEntryVariants(options.convertHiraganaToKatakana), - this._getTextOptionEntryVariants(options.convertKatakanaToHiragana), - this._getCollapseEmphaticOptions(options) - ]; + const {language} = options; + const info = this._textPreprocessors.get(language); + if (typeof info === 'undefined') { throw new Error(`Unsupported language: ${language}`); } + const {textPreprocessors, optionSpace: textPreprocessorOptionsSpace} = info; + + /** @type {Map<string, import('language').TextPreprocessorOptions<unknown>>} */ + const variantSpace = new Map(); + variantSpace.set('textReplacements', this._getTextReplacementsVariants(options)); + for (const [key, value] of textPreprocessorOptionsSpace) { + variantSpace.set(key, value); + } /** @type {import('translation-internal').DatabaseDeinflection[]} */ const deinflections = []; const used = new Set(); - for (const [textReplacements, halfWidth, numeric, alphabetic, katakana, hiragana, [collapseEmphatic, collapseEmphaticFull]] of /** @type {Generator<import('translation-internal').TextDeinflectionOptions, void, unknown>} */ (this._getArrayVariants(textOptionVariantArray))) { + + for (const arrayVariant of this._generateArrayVariants(variantSpace)) { + const textReplacements = /** @type {import('translation').FindTermsTextReplacement[] | null} */ (arrayVariant.get('textReplacements')); + let text2 = text; const sourceMap = new TextSourceMap(text2); + if (textReplacements !== null) { text2 = this._applyTextReplacements(text2, sourceMap, textReplacements); } - if (halfWidth) { - text2 = convertHalfWidthKanaToFullWidth(text2, sourceMap); - } - if (numeric) { - text2 = convertNumericToFullWidth(text2); - } - if (alphabetic) { - text2 = convertAlphabeticToKana(text2, sourceMap); - } - if (katakana) { - text2 = convertHiraganaToKatakana(text2); - } - if (hiragana) { - text2 = convertKatakanaToHiragana(text2); - } - if (collapseEmphatic) { - text2 = collapseEmphaticSequences(text2, collapseEmphaticFull, sourceMap); + + for (const preprocessor of textPreprocessors.values()) { + const {id, textPreprocessor} = preprocessor; + const setting = arrayVariant.get(id); + text2 = textPreprocessor.process(text2, setting, sourceMap); } for ( @@ -527,36 +531,6 @@ export class Translator { } /** - * @param {import('translation').FindTermsVariantMode} value - * @returns {boolean[]} - */ - _getTextOptionEntryVariants(value) { - switch (value) { - case 'true': return [true]; - case 'variant': return [false, true]; - default: return [false]; - } - } - - /** - * @param {import('translation').FindTermsOptions} options - * @returns {[collapseEmphatic: boolean, collapseEmphaticFull: boolean][]} - */ - _getCollapseEmphaticOptions(options) { - /** @type {[collapseEmphatic: boolean, collapseEmphaticFull: boolean][]} */ - const collapseEmphaticOptions = [[false, false]]; - switch (options.collapseEmphaticSequences) { - case 'true': - collapseEmphaticOptions.push([true, false]); - break; - case 'full': - collapseEmphaticOptions.push([true, false], [true, true]); - break; - } - return collapseEmphaticOptions; - } - - /** * @param {import('translation').FindTermsOptions} options * @returns {(import('translation').FindTermsTextReplacement[] | null)[]} */ @@ -1343,26 +1317,32 @@ export class Translator { } /** - * @param {[...args: unknown[][]]} arrayVariants - * @yields {[...args: unknown[]]} - * @returns {Generator<unknown[], void, unknown>} + * @param {Map<string, unknown[]>} arrayVariants + * @yields {Map<string, unknown>} + * @returns {Generator<Map<string, unknown>, void, void>} */ - *_getArrayVariants(arrayVariants) { - const ii = arrayVariants.length; - - let total = 1; - for (let i = 0; i < ii; ++i) { - total *= arrayVariants[i].length; + *_generateArrayVariants(arrayVariants) { + const variantKeys = [...arrayVariants.keys()]; + const entryVariantLengths = []; + for (const key of variantKeys) { + const entryVariants = /** @type {unknown[]} */ (arrayVariants.get(key)); + entryVariantLengths.push(entryVariants.length); } + const totalVariants = entryVariantLengths.reduce((acc, length) => acc * length, 1); + + for (let variantIndex = 0; variantIndex < totalVariants; ++variantIndex) { + /** @type {Map<string, unknown>} */ + const variant = new Map(); + let remainingIndex = variantIndex; - for (let a = 0; a < total; ++a) { - const variant = []; - let index = a; - for (let i = 0; i < ii; ++i) { - const entryVariants = arrayVariants[i]; - variant.push(entryVariants[index % entryVariants.length]); - index = Math.floor(index / entryVariants.length); + for (let keyIndex = 0; keyIndex < variantKeys.length; ++keyIndex) { + const key = variantKeys[keyIndex]; + const entryVariants = /** @type {unknown[]} */ (arrayVariants.get(key)); + const entryIndex = remainingIndex % entryVariants.length; + variant.set(key, entryVariants[entryIndex]); + remainingIndex = Math.floor(remainingIndex / entryVariants.length); } + yield variant; } } |