diff options
| author | toasted-nutbread <toasted-nutbread@users.noreply.github.com> | 2024-01-31 08:28:05 -0500 | 
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-01-31 13:28:05 +0000 | 
| commit | 3e419aa562aab03ca20421aaf7e4d1a39194a5b4 (patch) | |
| tree | 15e8bfe81fa5e3fae55e54802f14d94a7502a469 /ext/js | |
| parent | 6807b05e9bd41f013364fae0cbcce83cf1ed37b6 (diff) | |
Language transformer (#582)
* Set up new deinflection data file
* Define types
* Test
* Add internal types
* Set up loading for transforms
* Add getPartOfSpeechFlags
* Convert static methods
* Add note
* Add transform function
* Update trace structure
* Add a language tag to the language transform descriptor
* Add clear function
* Add function for multiple parts of speech
* Clarify naming
* Add getConditionFlagsFromConditionType
* Add plural function
* Replace usages of Deinflector
* Update tests
* Update config
* Remove old
* Rename
* Rename files
Diffstat (limited to 'ext/js')
| -rw-r--r-- | ext/js/background/backend.js | 6 | ||||
| -rw-r--r-- | ext/js/background/offscreen-proxy.js | 6 | ||||
| -rw-r--r-- | ext/js/background/offscreen.js | 4 | ||||
| -rw-r--r-- | ext/js/language/deinflector.js | 140 | ||||
| -rw-r--r-- | ext/js/language/language-transformer.js | 245 | ||||
| -rw-r--r-- | ext/js/language/translator.js | 32 | 
6 files changed, 268 insertions, 165 deletions
| diff --git a/ext/js/background/backend.js b/ext/js/background/backend.js index 74c1370c..0773dc4b 100644 --- a/ext/js/background/backend.js +++ b/ext/js/background/backend.js @@ -282,9 +282,9 @@ export class Backend {                  log.error(e);              } -            /** @type {import('deinflector').ReasonsRaw} */ -            const deinflectionReasons = await this._fetchJson('/data/deinflect.json'); -            this._translator.prepare(deinflectionReasons); +            /** @type {import('language-transformer').LanguageTransformDescriptor} */ +            const descriptor = await this._fetchJson('/data/language/japanese-transforms.json'); +            this._translator.prepare(descriptor);              await this._optionsUtil.prepare();              this._defaultAnkiFieldTemplates = (await this._fetchText('/data/templates/default-anki-field-templates.handlebars')).trim(); diff --git a/ext/js/background/offscreen-proxy.js b/ext/js/background/offscreen-proxy.js index 555c3abc..80ff31c0 100644 --- a/ext/js/background/offscreen-proxy.js +++ b/ext/js/background/offscreen-proxy.js @@ -159,10 +159,10 @@ export class TranslatorProxy {      }      /** -     * @param {import('deinflector').ReasonsRaw} deinflectionReasons +     * @param {import('language-transformer').LanguageTransformDescriptor} descriptor       */ -    async prepare(deinflectionReasons) { -        await this._offscreen.sendMessagePromise({action: 'translatorPrepareOffscreen', params: {deinflectionReasons}}); +    async prepare(descriptor) { +        await this._offscreen.sendMessagePromise({action: 'translatorPrepareOffscreen', params: {descriptor}});      }      /** diff --git a/ext/js/background/offscreen.js b/ext/js/background/offscreen.js index a0f5592e..ef05508a 100644 --- a/ext/js/background/offscreen.js +++ b/ext/js/background/offscreen.js @@ -115,8 +115,8 @@ export class Offscreen {      }      /** @type {import('offscreen').ApiHandler<'translatorPrepareOffscreen'>} */ -    _prepareTranslatorHandler({deinflectionReasons}) { -        this._translator.prepare(deinflectionReasons); +    _prepareTranslatorHandler({descriptor}) { +        this._translator.prepare(descriptor);      }      /** @type {import('offscreen').ApiHandler<'findKanjiOffscreen'>} */ diff --git a/ext/js/language/deinflector.js b/ext/js/language/deinflector.js deleted file mode 100644 index b52b7f5b..00000000 --- a/ext/js/language/deinflector.js +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (C) 2023-2024  Yomitan Authors - * Copyright (C) 2016-2022  Yomichan Authors - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program.  If not, see <https://www.gnu.org/licenses/>. - */ - -export class Deinflector { -    /* eslint-disable no-multi-spaces */ -    /** @type {Map<string, import('translation-internal').DeinflectionRuleFlags>} @readonly */ -    static _ruleTypes = new Map([ -        ['v1',    /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b000000011)], // Verb ichidan -        ['v1d',   /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b000000010)], // Verb ichidan dictionary form -        ['v1p',   /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b000000001)], // Verb ichidan progressive or perfect -        ['v5',    /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b000000100)], // Verb godan -        ['vs',    /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b000001000)], // Verb suru -        ['vk',    /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b000010000)], // Verb kuru -        ['vz',    /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b000100000)], // Verb zuru -        ['adj-i', /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b001000000)], // Adjective i -        ['iru',   /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b010000000)] // Intermediate -iru endings for progressive or perfect tense -    ]); -    /* eslint-enable no-multi-spaces */ - -    /** -     * @param {import('deinflector').ReasonsRaw} reasons -     * @example -     * const deinflectionReasons = parseJson( -     *   readFileSync(path.join('ext/data/deinflect.json')).toString(), -     * ); -     * const deinflector = new Deinflector(deinflectionReasons); -     */ -    constructor(reasons) { -        /** @type {import('deinflector').Reason[]} */ -        this.reasons = Deinflector.normalizeReasons(reasons); -    } - -    /** -     * Deinflects a Japanese term to all of its possible dictionary forms. -     * @param {string} source The source term to deinflect. -     * @returns {import('translation-internal').Deinflection[]} -     * @example -     * const deinflector = new Deinflector(deinflectionReasons); -     * // [{ term: '食べた', rules: 0, reasons: [] }, { term: '食べる', rules: 1, reasons: ['past'] }, { term: '食ぶ', rules: 2, reasons: ['potential', 'past'] }] -     * console.log(deinflector.deinflect('食べた')); -     */ -    deinflect(source) { -        const results = [this._createDeinflection(source, 0, [])]; -        for (let i = 0; i < results.length; ++i) { -            const {rules, term, reasons} = results[i]; -            for (const [reason, variants] of this.reasons) { -                for (const [kanaIn, kanaOut, rulesIn, rulesOut] of variants) { -                    if ( -                        !Deinflector.rulesMatch(rules, rulesIn) || -                        !term.endsWith(kanaIn) || -                        (term.length - kanaIn.length + kanaOut.length) <= 0 -                    ) { -                        continue; -                    } - -                    results.push(this._createDeinflection( -                        term.substring(0, term.length - kanaIn.length) + kanaOut, -                        rulesOut, -                        [reason, ...reasons] -                    )); -                } -            } -        } -        return results; -    } - -    /** -     * @param {string} term -     * @param {import('translation-internal').DeinflectionRuleFlags} rules -     * @param {import('dictionary').InflectionRuleChain} reasons -     * @returns {import('translation-internal').Deinflection} -     */ -    _createDeinflection(term, rules, reasons) { -        return {term, rules, reasons}; -    } - -    /** -     * @param {import('deinflector').ReasonsRaw} reasons -     * @returns {import('deinflector').Reason[]} -     */ -    static normalizeReasons(reasons) { -        /** @type {import('deinflector').Reason[]} */ -        const normalizedReasons = []; -        for (const [reason, reasonInfo] of Object.entries(reasons)) { -            /** @type {import('deinflector').ReasonVariant[]} */ -            const variants = []; -            for (const {kanaIn, kanaOut, rulesIn, rulesOut} of reasonInfo) { -                variants.push([ -                    kanaIn, -                    kanaOut, -                    this.rulesToRuleFlags(rulesIn), -                    this.rulesToRuleFlags(rulesOut) -                ]); -            } -            normalizedReasons.push([reason, variants]); -        } -        return normalizedReasons; -    } - -    /** -     * @param {string[]} rules -     * @returns {import('translation-internal').DeinflectionRuleFlags} -     */ -    static rulesToRuleFlags(rules) { -        const ruleTypes = this._ruleTypes; -        let value = 0; -        for (const rule of rules) { -            const ruleBits = ruleTypes.get(rule); -            if (typeof ruleBits === 'undefined') { continue; } -            value |= ruleBits; -        } -        return value; -    } - -    /** -     * If `currentRules` is `0`, then `nextRules` is ignored and `true` is returned. -     * Otherwise, there must be at least one shared rule between `currentRules` and `nextRules`. -     * @param {number} currentRules -     * @param {number} nextRules -     * @returns {boolean} -     */ -    static rulesMatch(currentRules, nextRules) { -        return currentRules === 0 || (currentRules & nextRules) !== 0; -    } -} diff --git a/ext/js/language/language-transformer.js b/ext/js/language/language-transformer.js new file mode 100644 index 00000000..c9e261ea --- /dev/null +++ b/ext/js/language/language-transformer.js @@ -0,0 +1,245 @@ +/* + * Copyright (C) 2024  Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <https://www.gnu.org/licenses/>. + */ + +export class LanguageTransformer { +    constructor() { +        /** @type {number} */ +        this._nextFlagIndex = 0; +        /** @type {import('language-transformer-internal').Transform[]} */ +        this._transforms = []; +        /** @type {Map<string, number>} */ +        this._conditionTypeToConditionFlagsMap = new Map(); +        /** @type {Map<string, number>} */ +        this._partOfSpeechToConditionFlagsMap = new Map(); +    } + +    /** */ +    clear() { +        this._nextFlagIndex = 0; +        this._transforms = []; +        this._conditionTypeToConditionFlagsMap.clear(); +        this._partOfSpeechToConditionFlagsMap.clear(); +    } + +    /** +     * Note: this function does not currently combine properly with previous descriptors, +     * they are treated as completely separate collections. This should eventually be changed. +     * @param {import('language-transformer').LanguageTransformDescriptor} descriptor +     * @throws {Error} +     */ +    addDescriptor(descriptor) { +        const {conditions, transforms} = descriptor; +        const conditionEntries = Object.entries(conditions); +        const {conditionFlagsMap, nextFlagIndex} = this._getConditionFlagsMap(conditionEntries, this._nextFlagIndex); + +        /** @type {import('language-transformer-internal').Transform[]} */ +        const transforms2 = []; +        for (let i = 0, ii = transforms.length; i < ii; ++i) { +            const {name, rules} = transforms[i]; +            /** @type {import('language-transformer-internal').Rule[]} */ +            const rules2 = []; +            for (let j = 0, jj = rules.length; j < jj; ++j) { +                const {suffixIn, suffixOut, conditionsIn, conditionsOut} = rules[j]; +                const conditionFlagsIn = this._getConditionFlags(conditionFlagsMap, conditionsIn); +                if (conditionFlagsIn === null) { throw new Error(`Invalid conditionsIn for transform[${i}].rules[${j}]`); } +                const conditionFlagsOut = this._getConditionFlags(conditionFlagsMap, conditionsOut); +                if (conditionFlagsOut === null) { throw new Error(`Invalid conditionsOut for transform[${i}].rules[${j}]`); } +                rules2.push({ +                    suffixIn, +                    suffixOut, +                    conditionsIn: conditionFlagsIn, +                    conditionsOut: conditionFlagsOut +                }); +            } +            transforms2.push({name, rules: rules2}); +        } + +        this._nextFlagIndex = nextFlagIndex; +        for (const transform of transforms2) { +            this._transforms.push(transform); +        } + +        for (const [type, condition] of conditionEntries) { +            const flags = conditionFlagsMap.get(type); +            if (typeof flags === 'undefined') { continue; } // This case should never happen +            this._conditionTypeToConditionFlagsMap.set(type, flags); +            for (const partOfSpeech of condition.partsOfSpeech) { +                this._partOfSpeechToConditionFlagsMap.set(partOfSpeech, this.getConditionFlagsFromPartOfSpeech(partOfSpeech) | flags); +            } +        } +    } + +    /** +     * @param {string} partOfSpeech +     * @returns {number} +     */ +    getConditionFlagsFromPartOfSpeech(partOfSpeech) { +        const conditionFlags = this._partOfSpeechToConditionFlagsMap.get(partOfSpeech); +        return typeof conditionFlags !== 'undefined' ? conditionFlags : 0; +    } + +    /** +     * @param {string[]} partsOfSpeech +     * @returns {number} +     */ +    getConditionFlagsFromPartsOfSpeech(partsOfSpeech) { +        let result = 0; +        for (const partOfSpeech of partsOfSpeech) { +            result |= this.getConditionFlagsFromPartOfSpeech(partOfSpeech); +        } +        return result; +    } + +    /** +     * @param {string} conditionType +     * @returns {number} +     */ +    getConditionFlagsFromConditionType(conditionType) { +        const conditionFlags = this._conditionTypeToConditionFlagsMap.get(conditionType); +        return typeof conditionFlags !== 'undefined' ? conditionFlags : 0; +    } + +    /** +     * @param {string[]} conditionTypes +     * @returns {number} +     */ +    getConditionFlagsFromConditionTypes(conditionTypes) { +        let result = 0; +        for (const conditionType of conditionTypes) { +            result |= this.getConditionFlagsFromConditionType(conditionType); +        } +        return result; +    } + +    /** +     * @param {string} sourceText +     * @returns {import('language-transformer-internal').TransformedText[]} +     */ +    transform(sourceText) { +        const results = [this._createTransformedText(sourceText, 0, [])]; +        for (let i = 0; i < results.length; ++i) { +            const {text, conditions, trace} = results[i]; +            for (const {name, rules} of this._transforms) { +                for (let j = 0, jj = rules.length; j < jj; ++j) { +                    const rule = rules[j]; +                    if (!LanguageTransformer.conditionsMatch(conditions, rule.conditionsIn)) { continue; } +                    const {suffixIn, suffixOut} = rule; +                    if (!text.endsWith(suffixIn) || (text.length - suffixIn.length + suffixOut.length) <= 0) { continue; } +                    results.push(this._createTransformedText( +                        text.substring(0, text.length - suffixIn.length) + suffixOut, +                        rule.conditionsOut, +                        this._extendTrace(trace, {transform: name, ruleIndex: j}) +                    )); +                } +            } +        } +        return results; +    } + +    /** +     * @param {import('language-transformer').ConditionMapEntries} conditions +     * @param {number} nextFlagIndex +     * @returns {{conditionFlagsMap: Map<string, number>, nextFlagIndex: number}} +     * @throws {Error} +     */ +    _getConditionFlagsMap(conditions, nextFlagIndex) { +        /** @type {Map<string, number>} */ +        const conditionFlagsMap = new Map(); +        /** @type {import('language-transformer').ConditionMapEntries} */ +        let targets = conditions; +        while (targets.length > 0) { +            const nextTargets = []; +            for (const target of targets) { +                const [type, condition] = target; +                const {subConditions} = condition; +                let flags = 0; +                if (typeof subConditions === 'undefined') { +                    if (nextFlagIndex >= 32) { +                        // Flags greater than or equal to 32 don't work because JavaScript only supports up to 32-bit integer operations +                        throw new Error('Maximum number of conditions was exceeded'); +                    } +                    flags = 1 << nextFlagIndex; +                    ++nextFlagIndex; +                } else { +                    const multiFlags = this._getConditionFlags(conditionFlagsMap, subConditions); +                    if (multiFlags === null) { +                        nextTargets.push(target); +                        continue; +                    } else { +                        flags = multiFlags; +                    } +                } +                conditionFlagsMap.set(type, flags); +            } +            if (nextTargets.length === targets.length) { +                // Cycle in subRule declaration +                throw new Error('Maximum number of conditions was exceeded'); +            } +            targets = nextTargets; +        } +        return {conditionFlagsMap, nextFlagIndex}; +    } + +    /** +     * @param {Map<string, number>} conditionFlagsMap +     * @param {string[]} conditionTypes +     * @returns {?number} +     */ +    _getConditionFlags(conditionFlagsMap, conditionTypes) { +        let flags = 0; +        for (const conditionType of conditionTypes) { +            const flags2 = conditionFlagsMap.get(conditionType); +            if (typeof flags2 === 'undefined') { return null; } +            flags |= flags2; +        } +        return flags; +    } + +    /** +     * @param {string} text +     * @param {number} conditions +     * @param {import('language-transformer-internal').Trace} trace +     * @returns {import('language-transformer-internal').TransformedText} +     */ +    _createTransformedText(text, conditions, trace) { +        return {text, conditions, trace}; +    } + +    /** +     * @param {import('language-transformer-internal').Trace} trace +     * @param {import('language-transformer-internal').TraceFrame} newFrame +     * @returns {import('language-transformer-internal').Trace} +     */ +    _extendTrace(trace, newFrame) { +        const newTrace = [newFrame]; +        for (const {transform, ruleIndex} of trace) { +            newTrace.push({transform, ruleIndex}); +        } +        return newTrace; +    } + +    /** +     * If `currentConditions` is `0`, then `nextConditions` is ignored and `true` is returned. +     * Otherwise, there must be at least one shared condition between `currentConditions` and `nextConditions`. +     * @param {number} currentConditions +     * @param {number} nextConditions +     * @returns {boolean} +     */ +    static conditionsMatch(currentConditions, nextConditions) { +        return currentConditions === 0 || (currentConditions & nextConditions) !== 0; +    } +} diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js index 66eeb69f..9d2f18e0 100644 --- a/ext/js/language/translator.js +++ b/ext/js/language/translator.js @@ -18,9 +18,9 @@  import {RegexUtil} from '../general/regex-util.js';  import {TextSourceMap} from '../general/text-source-map.js'; -import {Deinflector} from './deinflector.js';  import {convertAlphabeticToKana} from './japanese-wanakana.js';  import {collapseEmphaticSequences, convertHalfWidthKanaToFullWidth, convertHiraganaToKatakana, convertKatakanaToHiragana, convertNumericToFullWidth, isCodePointJapanese} from './japanese.js'; +import {LanguageTransformer} from './language-transformer.js';  /**   * Class which finds term and kanji dictionary entries for text. @@ -33,8 +33,8 @@ export class Translator {      constructor({database}) {          /** @type {import('../dictionary/dictionary-database.js').DictionaryDatabase} */          this._database = database; -        /** @type {?Deinflector} */ -        this._deinflector = null; +        /** @type {LanguageTransformer} */ +        this._languageTransformer = new LanguageTransformer();          /** @type {import('translator').DictionaryTagCache} */          this._tagCache = new Map();          /** @type {Intl.Collator} */ @@ -44,12 +44,11 @@ export class Translator {      }      /** -     * Initializes the instance for use. The public API should not be used until -     * this function has been called. -     * @param {import('deinflector').ReasonsRaw} deinflectionReasons The raw deinflections reasons data that the Deinflector uses. +     * Initializes the instance for use. The public API should not be used until this function has been called. +     * @param {import('language-transformer').LanguageTransformDescriptor} descriptor       */ -    prepare(deinflectionReasons) { -        this._deinflector = new Deinflector(deinflectionReasons); +    prepare(descriptor) { +        this._languageTransformer.addDescriptor(descriptor);      }      /** @@ -407,10 +406,9 @@ export class Translator {              const entryDictionary = /** @type {import('translation').FindTermDictionary} */ (enabledDictionaryMap.get(databaseEntry.dictionary));              const {partsOfSpeechFilter} = entryDictionary; -            const definitionRules = Deinflector.rulesToRuleFlags(databaseEntry.rules); +            const definitionConditions = this._languageTransformer.getConditionFlagsFromPartsOfSpeech(databaseEntry.rules);              for (const deinflection of uniqueDeinflectionArrays[databaseEntry.index]) { -                const deinflectionRules = deinflection.rules; -                if (!partsOfSpeechFilter || Deinflector.rulesMatch(deinflectionRules, definitionRules)) { +                if (!partsOfSpeechFilter || LanguageTransformer.conditionsMatch(deinflection.conditions, definitionConditions)) {                      deinflection.databaseEntries.push(databaseEntry);                  }              } @@ -473,13 +471,13 @@ export class Translator {                  if (used.has(source)) { break; }                  used.add(source);                  const rawSource = sourceMap.source.substring(0, sourceMap.getSourceLength(i)); -                for (const {term, rules, reasons} of /** @type {Deinflector} */ (this._deinflector).deinflect(source)) { +                for (const {text: transformedText, conditions, trace} of this._languageTransformer.transform(source)) {                      /** @type {import('dictionary').InflectionRuleChainCandidate} */                      const inflectionRuleChainCandidate = {                          source: 'algorithm', -                        inflectionRules: reasons +                        inflectionRules: trace.map((frame) => frame.transform)                      }; -                    deinflections.push(this._createDeinflection(rawSource, source, term, rules, [inflectionRuleChainCandidate])); +                    deinflections.push(this._createDeinflection(rawSource, source, transformedText, conditions, [inflectionRuleChainCandidate]));                  }              }          } @@ -570,12 +568,12 @@ export class Translator {       * @param {string} originalText       * @param {string} transformedText       * @param {string} deinflectedText -     * @param {import('translation-internal').DeinflectionRuleFlags} rules +     * @param {number} conditions       * @param {import('dictionary').InflectionRuleChainCandidate[]} inflectionRuleChainCandidates       * @returns {import('translation-internal').DatabaseDeinflection}       */ -    _createDeinflection(originalText, transformedText, deinflectedText, rules, inflectionRuleChainCandidates) { -        return {originalText, transformedText, deinflectedText, rules, inflectionRuleChainCandidates, databaseEntries: []}; +    _createDeinflection(originalText, transformedText, deinflectedText, conditions, inflectionRuleChainCandidates) { +        return {originalText, transformedText, deinflectedText, conditions, inflectionRuleChainCandidates, databaseEntries: []};      }      // Term dictionary entry grouping |