diff options
| author | StefanVukovic99 <stefanvukovic44@gmail.com> | 2024-02-17 02:45:24 +0100 | 
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-02-17 01:45:24 +0000 | 
| commit | 4aaa9f15d97668203741c1731f15e710ae8b8294 (patch) | |
| tree | d1885f7fbd7d1510a71176597169d6847ae26572 /ext/js/language | |
| parent | 4e77741d22778bd09b772fc53f1cbd64107e3d24 (diff) | |
add language select, abstract text transformations (#584)
* Copy functions from JapaneseUtil
* Remove JapaneseUtil
* Update usages of JapaneseUtil functions
* part1
* frontend done?
* fix tests
* offscreen and type complications
* add tests
* start fixing tests
* keep fixing tests
* fix tests
* Copy functions from JapaneseUtil
* Remove JapaneseUtil
* Update usages of JapaneseUtil functions
* delete pt
* renames
* add tests
* kebab-case filenames
* lint
* minor fixes
* merge
* fixes
* fix part of comments
* fix more comments
* delete unused types
* comment
* comment
* do backend
* other files
* move fetch utils to own file
* remove extra line
* add extra line
* remove unnecessary export
* simplify folder structure
* remove redundant async
* fix param type in api
* fix language index
* undo changes to cssStyleApplier
* undo changes to utilities.js
* undo changes to utilities.js
* simplify language util
* lint
* undo phantom changes to anki integration
* require textTransformations options
* explicit locale in localeCompare
* punctuate notes
* prefer early exit
* rename LanguageOptionsObjectMap
* rename to textPreprocessor
* tuple with names instead of boolean array
* safe data setting
* optional chaining
* simplify LanguageOptions
* encapsulate languages
* delete language util
* nullable language in text preprocessors controller
* rename transform to process
* remove settings
* make translation advanced again
* remove unused getTextTransformations api call
* comments
* change language types
* RIP flags
* comments
* fix tests
* lint
* Text preprocessor type changes (#10)
* Add types
* Update types
* Simplify type check
* Refactor typing and structuring of language definitions
* lint
* update translator benchmark
* undo markdown changes
* undo markdown changes
* undo markdown changes
* more merge
* simplify language controller
---------
Co-authored-by: toasted-nutbread <toasted-nutbread@users.noreply.github.com>
Co-authored-by: Darius Jahandarie <djahandarie@gmail.com>
Diffstat (limited to 'ext/js/language')
| -rw-r--r-- | ext/js/language/en/language-english.js | 29 | ||||
| -rw-r--r-- | ext/js/language/ja/language-japanese.js | 77 | ||||
| -rwxr-xr-x | ext/js/language/languages.js | 61 | ||||
| -rwxr-xr-x | ext/js/language/text-preprocessors.js | 35 | ||||
| -rw-r--r-- | ext/js/language/translator.js | 134 | 
5 files changed, 259 insertions, 77 deletions
| diff --git a/ext/js/language/en/language-english.js b/ext/js/language/en/language-english.js new file mode 100644 index 00000000..8268653f --- /dev/null +++ b/ext/js/language/en/language-english.js @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2024  Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <https://www.gnu.org/licenses/>. + */ + +import {capitalizeFirstLetter, decapitalize} from '../text-preprocessors.js'; + +/** @type {import('language-english').EnglishLanguageDescriptor} */ +export const descriptor = { +    name: 'English', +    iso: 'en', +    exampleText: 'read', +    textPreprocessors: { +        capitalizeFirstLetter, +        decapitalize +    } +}; diff --git a/ext/js/language/ja/language-japanese.js b/ext/js/language/ja/language-japanese.js new file mode 100644 index 00000000..ced34bcd --- /dev/null +++ b/ext/js/language/ja/language-japanese.js @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2024  Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <https://www.gnu.org/licenses/>. + */ + +import {basicTextPreprocessorOptions} from '../text-preprocessors.js'; +import {convertAlphabeticToKana} from './japanese-wanakana.js'; +import {collapseEmphaticSequences, convertHalfWidthKanaToFullWidth, convertHiraganaToKatakana, convertKatakanaToHiragana, convertNumericToFullWidth} from './japanese.js'; + +/** @type {import('language-japanese').JapaneseLanguageDescriptor} */ +export const descriptor = { +    name: 'Japanese', +    iso: 'ja', +    exampleText: '読め', +    textPreprocessors: { +        convertHalfWidthCharacters: { +            name: 'Convert half width characters to full width', +            description: 'ヨミチャン → ヨミチャン', +            options: basicTextPreprocessorOptions, +            /** @type {import('language').TextPreprocessorFunction<boolean>} */ +            process: (str, setting, sourceMap) => (setting ? convertHalfWidthKanaToFullWidth(str, sourceMap) : str) +        }, +        convertNumericCharacters: { +            name: 'Convert numeric characters to full width', +            description: '1234 → 1234', +            options: basicTextPreprocessorOptions, +            /** @type {import('language').TextPreprocessorFunction<boolean>} */ +            process: (str, setting) => (setting ? convertNumericToFullWidth(str) : str) +        }, +        convertAlphabeticCharacters: { +            name: 'Convert alphabetic characters to hiragana', +            description: 'yomichan → よみちゃん', +            options: basicTextPreprocessorOptions, +            /** @type {import('language').TextPreprocessorFunction<boolean>} */ +            process: (str, setting, sourceMap) => (setting ? 
convertAlphabeticToKana(str, sourceMap) : str) +        }, +        convertHiraganaToKatakana: { +            name: 'Convert hiragana to katakana', +            description: 'よみちゃん → ヨミチャン', +            options: basicTextPreprocessorOptions, +            /** @type {import('language').TextPreprocessorFunction<boolean>} */ +            process: (str, setting) => (setting ? convertHiraganaToKatakana(str) : str) +        }, +        convertKatakanaToHiragana: { +            name: 'Convert katakana to hiragana', +            description: 'ヨミチャン → よみちゃん', +            options: basicTextPreprocessorOptions, +            /** @type {import('language').TextPreprocessorFunction<boolean>} */ +            process: (str, setting) => (setting ? convertKatakanaToHiragana(str) : str) +        }, +        collapseEmphaticSequences: { +            name: 'Collapse emphatic character sequences', +            description: 'すっっごーーい → すっごーい / すごい', +            options: [[false, false], [true, false], [true, true]], +            /** @type {import('language').TextPreprocessorFunction<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>} */ +            process: (str, setting, sourceMap) => { +                const [collapseEmphatic, collapseEmphaticFull] = setting; +                if (collapseEmphatic) { +                    str = collapseEmphaticSequences(str, collapseEmphaticFull, sourceMap); +                } +                return str; +            } +        } +    } +}; diff --git a/ext/js/language/languages.js b/ext/js/language/languages.js new file mode 100755 index 00000000..f51ca163 --- /dev/null +++ b/ext/js/language/languages.js @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2024  Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <https://www.gnu.org/licenses/>. + */ + +import {descriptor as descriptorEnglish} from './en/language-english.js'; +import {descriptor as descriptorJapanese} from './ja/language-japanese.js'; + +const languageDescriptors = [ +    descriptorEnglish, +    descriptorJapanese +]; + +/** @type {Map<string, typeof languageDescriptors[0]>} */ +const languageDescriptorMap = new Map(); +for (const languageDescriptor of languageDescriptors) { +    languageDescriptorMap.set(languageDescriptor.iso, languageDescriptor); +} + +/** + * @returns {import('language').LanguageSummary[]} + */ +export function getLanguageSummaries() { +    const results = []; +    for (const {name, iso, exampleText} of languageDescriptorMap.values()) { +        results.push({name, iso, exampleText}); +    } +    return results; +} + +/** + * @returns {import('language').LanguageAndPreprocessors[]} + * @throws {Error} + */ +export function getAllLanguageTextPreprocessors() { +    const results = []; +    for (const {iso, textPreprocessors} of languageDescriptorMap.values()) { +        /** @type {import('language').TextPreprocessorWithId<unknown>[]} */ +        const textPreprocessorsArray = []; +        for (const [id, textPreprocessor] of Object.entries(textPreprocessors)) { +            textPreprocessorsArray.push({ +                id, +                textPreprocessor: /** @type {import('language').TextPreprocessor<unknown>} */ (textPreprocessor) +            }); +        } +        results.push({iso, textPreprocessors: textPreprocessorsArray}); +    } +    return results; +} diff --git a/ext/js/language/text-preprocessors.js 
b/ext/js/language/text-preprocessors.js new file mode 100755 index 00000000..12b3d1b6 --- /dev/null +++ b/ext/js/language/text-preprocessors.js @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2024  Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <https://www.gnu.org/licenses/>. + */ + +/** @type {import('language').TextPreprocessorOptions<boolean>} */ +export const basicTextPreprocessorOptions = [false, true]; + +/** @type {import('language').TextPreprocessor<boolean>} */ +export const decapitalize = { +    name: 'Decapitalize text', +    description: 'CAPITALIZED TEXT → capitalized text', +    options: basicTextPreprocessorOptions, +    process: (str, setting) => (setting ? str.toLowerCase() : str) +}; + +/** @type {import('language').TextPreprocessor<boolean>} */ +export const capitalizeFirstLetter = { +    name: 'Capitalize first letter', +    description: 'lowercase text → Lowercase text', +    options: basicTextPreprocessorOptions, +    process: (str, setting) => (setting ? 
str.charAt(0).toUpperCase() + str.slice(1) : str) +}; diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js index b2342e8d..4f9304b5 100644 --- a/ext/js/language/translator.js +++ b/ext/js/language/translator.js @@ -18,9 +18,9 @@  import {applyTextReplacement} from '../general/regex-util.js';  import {TextSourceMap} from '../general/text-source-map.js'; -import {convertAlphabeticToKana} from './ja/japanese-wanakana.js'; -import {collapseEmphaticSequences, convertHalfWidthKanaToFullWidth, convertHiraganaToKatakana, convertKatakanaToHiragana, convertNumericToFullWidth, isCodePointJapanese} from './ja/japanese.js'; +import {isCodePointJapanese} from './ja/japanese.js';  import {LanguageTransformer} from './language-transformer.js'; +import {getAllLanguageTextPreprocessors} from './languages.js';  /**   * Class which finds term and kanji dictionary entries for text. @@ -41,6 +41,8 @@ export class Translator {          this._stringComparer = new Intl.Collator('en-US'); // Invariant locale          /** @type {RegExp} */          this._numberRegex = /[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?/; +        /** @type {Map<string, {textPreprocessors: import('language').TextPreprocessorWithId<unknown>[], optionSpace: import('translation-internal').PreprocessorOptionsSpace}>} */ +        this._textPreprocessors = new Map();      }      /** @@ -49,6 +51,14 @@ export class Translator {       */      prepare(descriptor) {          this._languageTransformer.addDescriptor(descriptor); +        for (const {iso, textPreprocessors} of getAllLanguageTextPreprocessors()) { +            /** @type {Map<string, import('language').TextPreprocessorOptions<unknown>>} */ +            const optionSpace = new Map(); +            for (const {id, textPreprocessor} of textPreprocessors) { +                optionSpace.set(id, textPreprocessor.options); +            } +            this._textPreprocessors.set(iso, {textPreprocessors, optionSpace}); +        }      }      /** @@ -415,51 
+425,45 @@ export class Translator {          }      } -    // Deinflections and text transformations +    // Deinflections and text preprocessing      /**       * @param {string} text       * @param {import('translation').FindTermsOptions} options       * @returns {import('translation-internal').DatabaseDeinflection[]} +     * @throws {Error}       */      _getAlgorithmDeinflections(text, options) { -        /** @type {import('translation-internal').TextDeinflectionOptionsArrays} */ -        const textOptionVariantArray = [ -            this._getTextReplacementsVariants(options), -            this._getTextOptionEntryVariants(options.convertHalfWidthCharacters), -            this._getTextOptionEntryVariants(options.convertNumericCharacters), -            this._getTextOptionEntryVariants(options.convertAlphabeticCharacters), -            this._getTextOptionEntryVariants(options.convertHiraganaToKatakana), -            this._getTextOptionEntryVariants(options.convertKatakanaToHiragana), -            this._getCollapseEmphaticOptions(options) -        ]; +        const {language} = options; +        const info = this._textPreprocessors.get(language); +        if (typeof info === 'undefined') { throw new Error(`Unsupported language: ${language}`); } +        const {textPreprocessors, optionSpace: textPreprocessorOptionsSpace} = info; + +        /** @type {Map<string, import('language').TextPreprocessorOptions<unknown>>} */ +        const variantSpace = new Map(); +        variantSpace.set('textReplacements', this._getTextReplacementsVariants(options)); +        for (const [key, value] of textPreprocessorOptionsSpace) { +            variantSpace.set(key, value); +        }          /** @type {import('translation-internal').DatabaseDeinflection[]} */          const deinflections = [];          const used = new Set(); -        for (const [textReplacements, halfWidth, numeric, alphabetic, katakana, hiragana, [collapseEmphatic, collapseEmphaticFull]] of /** @type 
{Generator<import('translation-internal').TextDeinflectionOptions, void, unknown>} */ (this._getArrayVariants(textOptionVariantArray))) { + +        for (const arrayVariant of this._generateArrayVariants(variantSpace)) { +            const textReplacements = /** @type {import('translation').FindTermsTextReplacement[] | null} */ (arrayVariant.get('textReplacements')); +              let text2 = text;              const sourceMap = new TextSourceMap(text2); +              if (textReplacements !== null) {                  text2 = this._applyTextReplacements(text2, sourceMap, textReplacements);              } -            if (halfWidth) { -                text2 = convertHalfWidthKanaToFullWidth(text2, sourceMap); -            } -            if (numeric) { -                text2 = convertNumericToFullWidth(text2); -            } -            if (alphabetic) { -                text2 = convertAlphabeticToKana(text2, sourceMap); -            } -            if (katakana) { -                text2 = convertHiraganaToKatakana(text2); -            } -            if (hiragana) { -                text2 = convertKatakanaToHiragana(text2); -            } -            if (collapseEmphatic) { -                text2 = collapseEmphaticSequences(text2, collapseEmphaticFull, sourceMap); + +            for (const preprocessor of textPreprocessors.values()) { +                const {id, textPreprocessor} = preprocessor; +                const setting = arrayVariant.get(id); +                text2 = textPreprocessor.process(text2, setting, sourceMap);              }              for ( @@ -527,36 +531,6 @@ export class Translator {      }      /** -     * @param {import('translation').FindTermsVariantMode} value -     * @returns {boolean[]} -     */ -    _getTextOptionEntryVariants(value) { -        switch (value) { -            case 'true': return [true]; -            case 'variant': return [false, true]; -            default: return [false]; -        } -    } - -    /** -     * @param 
{import('translation').FindTermsOptions} options -     * @returns {[collapseEmphatic: boolean, collapseEmphaticFull: boolean][]} -     */ -    _getCollapseEmphaticOptions(options) { -        /** @type {[collapseEmphatic: boolean, collapseEmphaticFull: boolean][]} */ -        const collapseEmphaticOptions = [[false, false]]; -        switch (options.collapseEmphaticSequences) { -            case 'true': -                collapseEmphaticOptions.push([true, false]); -                break; -            case 'full': -                collapseEmphaticOptions.push([true, false], [true, true]); -                break; -        } -        return collapseEmphaticOptions; -    } - -    /**       * @param {import('translation').FindTermsOptions} options       * @returns {(import('translation').FindTermsTextReplacement[] | null)[]}       */ @@ -1343,26 +1317,32 @@ export class Translator {      }      /** -     * @param {[...args: unknown[][]]} arrayVariants -     * @yields {[...args: unknown[]]} -     * @returns {Generator<unknown[], void, unknown>} +     * @param {Map<string, unknown[]>} arrayVariants +     * @yields {Map<string, unknown>} +     * @returns {Generator<Map<string, unknown>, void, void>}       */ -    *_getArrayVariants(arrayVariants) { -        const ii = arrayVariants.length; - -        let total = 1; -        for (let i = 0; i < ii; ++i) { -            total *= arrayVariants[i].length; +    *_generateArrayVariants(arrayVariants) { +        const variantKeys = [...arrayVariants.keys()]; +        const entryVariantLengths = []; +        for (const key of variantKeys) { +            const entryVariants = /** @type {unknown[]} */ (arrayVariants.get(key)); +            entryVariantLengths.push(entryVariants.length);          } +        const totalVariants = entryVariantLengths.reduce((acc, length) => acc * length, 1); + +        for (let variantIndex = 0; variantIndex < totalVariants; ++variantIndex) { +            /** @type {Map<string, unknown>} */ +            
const variant = new Map(); +            let remainingIndex = variantIndex; -        for (let a = 0; a < total; ++a) { -            const variant = []; -            let index = a; -            for (let i = 0; i < ii; ++i) { -                const entryVariants = arrayVariants[i]; -                variant.push(entryVariants[index % entryVariants.length]); -                index = Math.floor(index / entryVariants.length); +            for (let keyIndex = 0; keyIndex < variantKeys.length; ++keyIndex) { +                const key = variantKeys[keyIndex]; +                const entryVariants = /** @type {unknown[]} */ (arrayVariants.get(key)); +                const entryIndex = remainingIndex % entryVariants.length; +                variant.set(key, entryVariants[entryIndex]); +                remainingIndex = Math.floor(remainingIndex / entryVariants.length);              } +              yield variant;          }      } |