From 3e419aa562aab03ca20421aaf7e4d1a39194a5b4 Mon Sep 17 00:00:00 2001 From: toasted-nutbread Date: Wed, 31 Jan 2024 08:28:05 -0500 Subject: Language transformer (#582) * Set up new deinflection data file * Define types * Test * Add internal types * Set up loading for transforms * Add getPartOfSpeechFlags * Convert static methods * Add note * Add transform function * Update trace structure * Add a language tag to the language transform descriptor * Add clear function * Add function for multiple parts of speech * Clarify naming * Add getConditionFlagsFromConditionType * Add plural function * Replace usages of Deinflector * Update tests * Update config * Remove old * Rename * Rename files --- ext/js/background/backend.js | 6 +- ext/js/background/offscreen-proxy.js | 6 +- ext/js/background/offscreen.js | 4 +- ext/js/language/deinflector.js | 140 ------------------ ext/js/language/language-transformer.js | 245 ++++++++++++++++++++++++++++++++ ext/js/language/translator.js | 32 ++--- 6 files changed, 268 insertions(+), 165 deletions(-) delete mode 100644 ext/js/language/deinflector.js create mode 100644 ext/js/language/language-transformer.js (limited to 'ext/js') diff --git a/ext/js/background/backend.js b/ext/js/background/backend.js index 74c1370c..0773dc4b 100644 --- a/ext/js/background/backend.js +++ b/ext/js/background/backend.js @@ -282,9 +282,9 @@ export class Backend { log.error(e); } - /** @type {import('deinflector').ReasonsRaw} */ - const deinflectionReasons = await this._fetchJson('/data/deinflect.json'); - this._translator.prepare(deinflectionReasons); + /** @type {import('language-transformer').LanguageTransformDescriptor} */ + const descriptor = await this._fetchJson('/data/language/japanese-transforms.json'); + this._translator.prepare(descriptor); await this._optionsUtil.prepare(); this._defaultAnkiFieldTemplates = (await this._fetchText('/data/templates/default-anki-field-templates.handlebars')).trim(); diff --git 
a/ext/js/background/offscreen-proxy.js b/ext/js/background/offscreen-proxy.js index 555c3abc..80ff31c0 100644 --- a/ext/js/background/offscreen-proxy.js +++ b/ext/js/background/offscreen-proxy.js @@ -159,10 +159,10 @@ export class TranslatorProxy { } /** - * @param {import('deinflector').ReasonsRaw} deinflectionReasons + * @param {import('language-transformer').LanguageTransformDescriptor} descriptor */ - async prepare(deinflectionReasons) { - await this._offscreen.sendMessagePromise({action: 'translatorPrepareOffscreen', params: {deinflectionReasons}}); + async prepare(descriptor) { + await this._offscreen.sendMessagePromise({action: 'translatorPrepareOffscreen', params: {descriptor}}); } /** diff --git a/ext/js/background/offscreen.js b/ext/js/background/offscreen.js index a0f5592e..ef05508a 100644 --- a/ext/js/background/offscreen.js +++ b/ext/js/background/offscreen.js @@ -115,8 +115,8 @@ export class Offscreen { } /** @type {import('offscreen').ApiHandler<'translatorPrepareOffscreen'>} */ - _prepareTranslatorHandler({deinflectionReasons}) { - this._translator.prepare(deinflectionReasons); + _prepareTranslatorHandler({descriptor}) { + this._translator.prepare(descriptor); } /** @type {import('offscreen').ApiHandler<'findKanjiOffscreen'>} */ diff --git a/ext/js/language/deinflector.js b/ext/js/language/deinflector.js deleted file mode 100644 index b52b7f5b..00000000 --- a/ext/js/language/deinflector.js +++ /dev/null @@ -1,140 +0,0 @@ -/* - * Copyright (C) 2023-2024 Yomitan Authors - * Copyright (C) 2016-2022 Yomichan Authors - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -export class Deinflector { - /* eslint-disable no-multi-spaces */ - /** @type {Map} @readonly */ - static _ruleTypes = new Map([ - ['v1', /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b000000011)], // Verb ichidan - ['v1d', /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b000000010)], // Verb ichidan dictionary form - ['v1p', /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b000000001)], // Verb ichidan progressive or perfect - ['v5', /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b000000100)], // Verb godan - ['vs', /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b000001000)], // Verb suru - ['vk', /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b000010000)], // Verb kuru - ['vz', /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b000100000)], // Verb zuru - ['adj-i', /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b001000000)], // Adjective i - ['iru', /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b010000000)] // Intermediate -iru endings for progressive or perfect tense - ]); - /* eslint-enable no-multi-spaces */ - - /** - * @param {import('deinflector').ReasonsRaw} reasons - * @example - * const deinflectionReasons = parseJson( - * readFileSync(path.join('ext/data/deinflect.json')).toString(), - * ); - * const deinflector = new Deinflector(deinflectionReasons); - */ - constructor(reasons) { - /** @type {import('deinflector').Reason[]} */ - this.reasons = Deinflector.normalizeReasons(reasons); - 
} - - /** - * Deinflects a Japanese term to all of its possible dictionary forms. - * @param {string} source The source term to deinflect. - * @returns {import('translation-internal').Deinflection[]} - * @example - * const deinflector = new Deinflector(deinflectionReasons); - * // [{ term: '食べた', rules: 0, reasons: [] }, { term: '食べる', rules: 1, reasons: ['past'] }, { term: '食ぶ', rules: 2, reasons: ['potential', 'past'] }] - * console.log(deinflector.deinflect('食べた')); - */ - deinflect(source) { - const results = [this._createDeinflection(source, 0, [])]; - for (let i = 0; i < results.length; ++i) { - const {rules, term, reasons} = results[i]; - for (const [reason, variants] of this.reasons) { - for (const [kanaIn, kanaOut, rulesIn, rulesOut] of variants) { - if ( - !Deinflector.rulesMatch(rules, rulesIn) || - !term.endsWith(kanaIn) || - (term.length - kanaIn.length + kanaOut.length) <= 0 - ) { - continue; - } - - results.push(this._createDeinflection( - term.substring(0, term.length - kanaIn.length) + kanaOut, - rulesOut, - [reason, ...reasons] - )); - } - } - } - return results; - } - - /** - * @param {string} term - * @param {import('translation-internal').DeinflectionRuleFlags} rules - * @param {import('dictionary').InflectionRuleChain} reasons - * @returns {import('translation-internal').Deinflection} - */ - _createDeinflection(term, rules, reasons) { - return {term, rules, reasons}; - } - - /** - * @param {import('deinflector').ReasonsRaw} reasons - * @returns {import('deinflector').Reason[]} - */ - static normalizeReasons(reasons) { - /** @type {import('deinflector').Reason[]} */ - const normalizedReasons = []; - for (const [reason, reasonInfo] of Object.entries(reasons)) { - /** @type {import('deinflector').ReasonVariant[]} */ - const variants = []; - for (const {kanaIn, kanaOut, rulesIn, rulesOut} of reasonInfo) { - variants.push([ - kanaIn, - kanaOut, - this.rulesToRuleFlags(rulesIn), - this.rulesToRuleFlags(rulesOut) - ]); - } - 
normalizedReasons.push([reason, variants]); - } - return normalizedReasons; - } - - /** - * @param {string[]} rules - * @returns {import('translation-internal').DeinflectionRuleFlags} - */ - static rulesToRuleFlags(rules) { - const ruleTypes = this._ruleTypes; - let value = 0; - for (const rule of rules) { - const ruleBits = ruleTypes.get(rule); - if (typeof ruleBits === 'undefined') { continue; } - value |= ruleBits; - } - return value; - } - - /** - * If `currentRules` is `0`, then `nextRules` is ignored and `true` is returned. - * Otherwise, there must be at least one shared rule between `currentRules` and `nextRules`. - * @param {number} currentRules - * @param {number} nextRules - * @returns {boolean} - */ - static rulesMatch(currentRules, nextRules) { - return currentRules === 0 || (currentRules & nextRules) !== 0; - } -} diff --git a/ext/js/language/language-transformer.js b/ext/js/language/language-transformer.js new file mode 100644 index 00000000..c9e261ea --- /dev/null +++ b/ext/js/language/language-transformer.js @@ -0,0 +1,245 @@ +/* + * Copyright (C) 2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
/**
 * Applies suffix-rewriting transforms to text in order to generate candidate base forms.
 *
 * Transform descriptors declare a set of named conditions and a list of transforms; every
 * transform owns a list of rules of the form "replace `suffixIn` with `suffixOut`".
 * Conditions are compiled to bit flags so that matching a rule against the conditions
 * produced by a previous rule is a single bitwise AND.
 */
export class LanguageTransformer {
    constructor() {
        /**
         * Index of the next unused bit; each condition without sub-conditions consumes one bit.
         * @type {number}
         */
        this._nextFlagIndex = 0;
        /** @type {import('language-transformer-internal').Transform[]} */
        this._transforms = [];
        /** @type {Map<string, number>} */
        this._conditionTypeToConditionFlagsMap = new Map();
        /** @type {Map<string, number>} */
        this._partOfSpeechToConditionFlagsMap = new Map();
    }

    /**
     * Removes every transform and condition that was previously added, and makes all
     * flag bits available again.
     */
    clear() {
        this._nextFlagIndex = 0;
        this._transforms = [];
        this._conditionTypeToConditionFlagsMap.clear();
        this._partOfSpeechToConditionFlagsMap.clear();
    }

    /**
     * Compiles a descriptor and appends its transforms and conditions to this instance.
     * The instance is only modified after the whole descriptor has been validated, so a
     * descriptor that throws leaves the previous state untouched.
     *
     * Note: this function does not currently combine properly with previous descriptors,
     * they are treated as completely separate collections. This should eventually be changed.
     * @param {import('language-transformer').LanguageTransformDescriptor} descriptor
     * @throws {Error} If a rule references an unknown condition, if the sub-conditions cannot
     *   be resolved, or if more than 32 flag bits would be required.
     */
    addDescriptor(descriptor) {
        const {conditions, transforms} = descriptor;
        const conditionEntries = Object.entries(conditions);
        const {conditionFlagsMap, nextFlagIndex} = this._getConditionFlagsMap(conditionEntries, this._nextFlagIndex);

        /** @type {import('language-transformer-internal').Transform[]} */
        const transforms2 = [];
        for (let i = 0, ii = transforms.length; i < ii; ++i) {
            const {name, rules} = transforms[i];
            /** @type {import('language-transformer-internal').Rule[]} */
            const rules2 = [];
            for (let j = 0, jj = rules.length; j < jj; ++j) {
                const {suffixIn, suffixOut, conditionsIn, conditionsOut} = rules[j];
                const conditionFlagsIn = this._getConditionFlags(conditionFlagsMap, conditionsIn);
                if (conditionFlagsIn === null) { throw new Error(`Invalid conditionsIn for transform[${i}].rules[${j}]`); }
                const conditionFlagsOut = this._getConditionFlags(conditionFlagsMap, conditionsOut);
                if (conditionFlagsOut === null) { throw new Error(`Invalid conditionsOut for transform[${i}].rules[${j}]`); }
                rules2.push({
                    suffixIn,
                    suffixOut,
                    conditionsIn: conditionFlagsIn,
                    conditionsOut: conditionFlagsOut
                });
            }
            transforms2.push({name, rules: rules2});
        }

        // Everything validated; commit the new state.
        this._nextFlagIndex = nextFlagIndex;
        for (const transform of transforms2) {
            this._transforms.push(transform);
        }

        for (const [type, condition] of conditionEntries) {
            const flags = conditionFlagsMap.get(type);
            if (typeof flags === 'undefined') { continue; } // This case should never happen
            this._conditionTypeToConditionFlagsMap.set(type, flags);
            for (const partOfSpeech of condition.partsOfSpeech) {
                // A part of speech may be listed by several conditions; accumulate all of their flags.
                this._partOfSpeechToConditionFlagsMap.set(partOfSpeech, this.getConditionFlagsFromPartOfSpeech(partOfSpeech) | flags);
            }
        }
    }

    /**
     * @param {string} partOfSpeech
     * @returns {number} The flags registered for the part of speech, or `0` if it is unknown.
     */
    getConditionFlagsFromPartOfSpeech(partOfSpeech) {
        const conditionFlags = this._partOfSpeechToConditionFlagsMap.get(partOfSpeech);
        return typeof conditionFlags !== 'undefined' ? conditionFlags : 0;
    }

    /**
     * @param {string[]} partsOfSpeech
     * @returns {number} The union of the flags of every listed part of speech; unknown entries contribute `0`.
     */
    getConditionFlagsFromPartsOfSpeech(partsOfSpeech) {
        let result = 0;
        for (const partOfSpeech of partsOfSpeech) {
            result |= this.getConditionFlagsFromPartOfSpeech(partOfSpeech);
        }
        return result;
    }

    /**
     * @param {string} conditionType
     * @returns {number} The flags registered for the condition type, or `0` if it is unknown.
     */
    getConditionFlagsFromConditionType(conditionType) {
        const conditionFlags = this._conditionTypeToConditionFlagsMap.get(conditionType);
        return typeof conditionFlags !== 'undefined' ? conditionFlags : 0;
    }

    /**
     * @param {string[]} conditionTypes
     * @returns {number} The union of the flags of every listed condition type; unknown entries contribute `0`.
     */
    getConditionFlagsFromConditionTypes(conditionTypes) {
        let result = 0;
        for (const conditionType of conditionTypes) {
            result |= this.getConditionFlagsFromConditionType(conditionType);
        }
        return result;
    }

    /**
     * Generates every text that can be reached from `sourceText` by repeatedly applying rules.
     * The first result is always the unmodified source text with conditions `0` and an empty trace.
     * The result array doubles as the work queue: results appended while iterating are themselves
     * expanded, so chains of rules are followed.
     * @param {string} sourceText
     * @returns {import('language-transformer-internal').TransformedText[]}
     */
    transform(sourceText) {
        const results = [this._createTransformedText(sourceText, 0, [])];
        for (let i = 0; i < results.length; ++i) {
            const {text, conditions, trace} = results[i];
            for (const {name, rules} of this._transforms) {
                for (let j = 0, jj = rules.length; j < jj; ++j) {
                    const rule = rules[j];
                    if (!LanguageTransformer.conditionsMatch(conditions, rule.conditionsIn)) { continue; }
                    const {suffixIn, suffixOut} = rule;
                    // Skip rules that do not apply or that would produce an empty string.
                    if (!text.endsWith(suffixIn) || (text.length - suffixIn.length + suffixOut.length) <= 0) { continue; }
                    results.push(this._createTransformedText(
                        text.substring(0, text.length - suffixIn.length) + suffixOut,
                        rule.conditionsOut,
                        this._extendTrace(trace, {transform: name, ruleIndex: j})
                    ));
                }
            }
        }
        return results;
    }

    /**
     * Assigns flags to every condition. Conditions without `subConditions` receive a fresh bit;
     * conditions with `subConditions` receive the union of the flags of their sub-conditions.
     * Conditions whose sub-conditions are not resolved yet are retried on the next pass.
     * @param {import('language-transformer').ConditionMapEntries} conditions
     * @param {number} nextFlagIndex Index of the first bit which may be assigned.
     * @returns {{conditionFlagsMap: Map<string, number>, nextFlagIndex: number}}
     * @throws {Error} If more than 32 bits are required, or if a pass makes no progress.
     */
    _getConditionFlagsMap(conditions, nextFlagIndex) {
        /** @type {Map<string, number>} */
        const conditionFlagsMap = new Map();
        /** @type {import('language-transformer').ConditionMapEntries} */
        let targets = conditions;
        while (targets.length > 0) {
            const nextTargets = [];
            for (const target of targets) {
                const [type, condition] = target;
                const {subConditions} = condition;
                let flags = 0;
                if (typeof subConditions === 'undefined') {
                    if (nextFlagIndex >= 32) {
                        // Flags greater than or equal to 32 don't work because JavaScript only supports up to 32-bit integer operations
                        throw new Error('Maximum number of conditions was exceeded');
                    }
                    // Bit 31 yields a negative number, which is fine since flags are only combined and tested bitwise.
                    flags = 1 << nextFlagIndex;
                    ++nextFlagIndex;
                } else {
                    const multiFlags = this._getConditionFlags(conditionFlagsMap, subConditions);
                    if (multiFlags === null) {
                        // At least one sub-condition has no flags yet; retry on the next pass.
                        nextTargets.push(target);
                        continue;
                    } else {
                        flags = multiFlags;
                    }
                }
                conditionFlagsMap.set(type, flags);
            }
            if (nextTargets.length === targets.length) {
                // No condition could be resolved during this pass: the remaining conditions either
                // reference each other in a cycle or reference a condition type which does not exist.
                const unresolved = nextTargets.map(([type]) => type).join(', ');
                throw new Error(`Cycle or unknown condition type in subConditions of: ${unresolved}`);
            }
            targets = nextTargets;
        }
        return {conditionFlagsMap, nextFlagIndex};
    }

    /**
     * @param {Map<string, number>} conditionFlagsMap
     * @param {string[]} conditionTypes
     * @returns {?number} The union of the flags of all condition types, or `null` if any of them is not in the map.
     */
    _getConditionFlags(conditionFlagsMap, conditionTypes) {
        let flags = 0;
        for (const conditionType of conditionTypes) {
            const flags2 = conditionFlagsMap.get(conditionType);
            if (typeof flags2 === 'undefined') { return null; }
            flags |= flags2;
        }
        return flags;
    }

    /**
     * @param {string} text
     * @param {number} conditions
     * @param {import('language-transformer-internal').Trace} trace
     * @returns {import('language-transformer-internal').TransformedText}
     */
    _createTransformedText(text, conditions, trace) {
        return {text, conditions, trace};
    }

    /**
     * Creates a new trace with `newFrame` first, followed by copies of the existing frames.
     * The input trace is not modified.
     * @param {import('language-transformer-internal').Trace} trace
     * @param {import('language-transformer-internal').TraceFrame} newFrame
     * @returns {import('language-transformer-internal').Trace}
     */
    _extendTrace(trace, newFrame) {
        const newTrace = [newFrame];
        for (const {transform, ruleIndex} of trace) {
            newTrace.push({transform, ruleIndex});
        }
        return newTrace;
    }

    /**
     * If `currentConditions` is `0`, then `nextConditions` is ignored and `true` is returned.
     * Otherwise, there must be at least one shared condition between `currentConditions` and `nextConditions`.
     * @param {number} currentConditions
     * @param {number} nextConditions
     * @returns {boolean}
     */
    static conditionsMatch(currentConditions, nextConditions) {
        return currentConditions === 0 || (currentConditions & nextConditions) !== 0;
    }
}
+ * @param {import('language-transformer').LanguageTransformDescriptor} descriptor */ - prepare(deinflectionReasons) { - this._deinflector = new Deinflector(deinflectionReasons); + prepare(descriptor) { + this._languageTransformer.addDescriptor(descriptor); } /** @@ -407,10 +406,9 @@ export class Translator { const entryDictionary = /** @type {import('translation').FindTermDictionary} */ (enabledDictionaryMap.get(databaseEntry.dictionary)); const {partsOfSpeechFilter} = entryDictionary; - const definitionRules = Deinflector.rulesToRuleFlags(databaseEntry.rules); + const definitionConditions = this._languageTransformer.getConditionFlagsFromPartsOfSpeech(databaseEntry.rules); for (const deinflection of uniqueDeinflectionArrays[databaseEntry.index]) { - const deinflectionRules = deinflection.rules; - if (!partsOfSpeechFilter || Deinflector.rulesMatch(deinflectionRules, definitionRules)) { + if (!partsOfSpeechFilter || LanguageTransformer.conditionsMatch(deinflection.conditions, definitionConditions)) { deinflection.databaseEntries.push(databaseEntry); } } @@ -473,13 +471,13 @@ export class Translator { if (used.has(source)) { break; } used.add(source); const rawSource = sourceMap.source.substring(0, sourceMap.getSourceLength(i)); - for (const {term, rules, reasons} of /** @type {Deinflector} */ (this._deinflector).deinflect(source)) { + for (const {text: transformedText, conditions, trace} of this._languageTransformer.transform(source)) { /** @type {import('dictionary').InflectionRuleChainCandidate} */ const inflectionRuleChainCandidate = { source: 'algorithm', - inflectionRules: reasons + inflectionRules: trace.map((frame) => frame.transform) }; - deinflections.push(this._createDeinflection(rawSource, source, term, rules, [inflectionRuleChainCandidate])); + deinflections.push(this._createDeinflection(rawSource, source, transformedText, conditions, [inflectionRuleChainCandidate])); } } } @@ -570,12 +568,12 @@ export class Translator { * @param {string} originalText * 
@param {string} transformedText * @param {string} deinflectedText - * @param {import('translation-internal').DeinflectionRuleFlags} rules + * @param {number} conditions * @param {import('dictionary').InflectionRuleChainCandidate[]} inflectionRuleChainCandidates * @returns {import('translation-internal').DatabaseDeinflection} */ - _createDeinflection(originalText, transformedText, deinflectedText, rules, inflectionRuleChainCandidates) { - return {originalText, transformedText, deinflectedText, rules, inflectionRuleChainCandidates, databaseEntries: []}; + _createDeinflection(originalText, transformedText, deinflectedText, conditions, inflectionRuleChainCandidates) { + return {originalText, transformedText, deinflectedText, conditions, inflectionRuleChainCandidates, databaseEntries: []}; } // Term dictionary entry grouping -- cgit v1.2.3