summaryrefslogtreecommitdiff
path: root/ext/js/language
diff options
context:
space:
mode:
author: toasted-nutbread <toasted-nutbread@users.noreply.github.com> 2024-01-31 08:28:05 -0500
committer: GitHub <noreply@github.com> 2024-01-31 13:28:05 +0000
commit: 3e419aa562aab03ca20421aaf7e4d1a39194a5b4 (patch)
tree: 15e8bfe81fa5e3fae55e54802f14d94a7502a469 /ext/js/language
parent: 6807b05e9bd41f013364fae0cbcce83cf1ed37b6 (diff)
Language transformer (#582)
* Set up new deinflection data file
* Define types
* Test
* Add internal types
* Set up loading for transforms
* Add getPartOfSpeechFlags
* Convert static methods
* Add note
* Add transform function
* Update trace structure
* Add a language tag to the language transform descriptor
* Add clear function
* Add function for multiple parts of speech
* Clarify naming
* Add getConditionFlagsFromConditionType
* Add plural function
* Replace usages of Deinflector
* Update tests
* Update config
* Remove old
* Rename
* Rename files
Diffstat (limited to 'ext/js/language')
-rw-r--r-- ext/js/language/deinflector.js 140
-rw-r--r-- ext/js/language/language-transformer.js 245
-rw-r--r-- ext/js/language/translator.js 32
3 files changed, 260 insertions, 157 deletions
diff --git a/ext/js/language/deinflector.js b/ext/js/language/deinflector.js
deleted file mode 100644
index b52b7f5b..00000000
--- a/ext/js/language/deinflector.js
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (C) 2023-2024 Yomitan Authors
- * Copyright (C) 2016-2022 Yomichan Authors
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <https://www.gnu.org/licenses/>.
- */
-
-export class Deinflector {
- /* eslint-disable no-multi-spaces */
- /** @type {Map<string, import('translation-internal').DeinflectionRuleFlags>} @readonly */
- static _ruleTypes = new Map([
- ['v1', /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b000000011)], // Verb ichidan
- ['v1d', /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b000000010)], // Verb ichidan dictionary form
- ['v1p', /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b000000001)], // Verb ichidan progressive or perfect
- ['v5', /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b000000100)], // Verb godan
- ['vs', /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b000001000)], // Verb suru
- ['vk', /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b000010000)], // Verb kuru
- ['vz', /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b000100000)], // Verb zuru
- ['adj-i', /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b001000000)], // Adjective i
- ['iru', /** @type {import('translation-internal').DeinflectionRuleFlags} */ (0b010000000)] // Intermediate -iru endings for progressive or perfect tense
- ]);
- /* eslint-enable no-multi-spaces */
-
- /**
- * @param {import('deinflector').ReasonsRaw} reasons
- * @example
- * const deinflectionReasons = parseJson(
- * readFileSync(path.join('ext/data/deinflect.json')).toString(),
- * );
- * const deinflector = new Deinflector(deinflectionReasons);
- */
- constructor(reasons) {
- /** @type {import('deinflector').Reason[]} */
- this.reasons = Deinflector.normalizeReasons(reasons);
- }
-
- /**
- * Deinflects a Japanese term to all of its possible dictionary forms.
- * @param {string} source The source term to deinflect.
- * @returns {import('translation-internal').Deinflection[]}
- * @example
- * const deinflector = new Deinflector(deinflectionReasons);
- * // [{ term: '食べた', rules: 0, reasons: [] }, { term: '食べる', rules: 1, reasons: ['past'] }, { term: '食ぶ', rules: 2, reasons: ['potential', 'past'] }]
- * console.log(deinflector.deinflect('食べた'));
- */
- deinflect(source) {
- const results = [this._createDeinflection(source, 0, [])];
- for (let i = 0; i < results.length; ++i) {
- const {rules, term, reasons} = results[i];
- for (const [reason, variants] of this.reasons) {
- for (const [kanaIn, kanaOut, rulesIn, rulesOut] of variants) {
- if (
- !Deinflector.rulesMatch(rules, rulesIn) ||
- !term.endsWith(kanaIn) ||
- (term.length - kanaIn.length + kanaOut.length) <= 0
- ) {
- continue;
- }
-
- results.push(this._createDeinflection(
- term.substring(0, term.length - kanaIn.length) + kanaOut,
- rulesOut,
- [reason, ...reasons]
- ));
- }
- }
- }
- return results;
- }
-
- /**
- * @param {string} term
- * @param {import('translation-internal').DeinflectionRuleFlags} rules
- * @param {import('dictionary').InflectionRuleChain} reasons
- * @returns {import('translation-internal').Deinflection}
- */
- _createDeinflection(term, rules, reasons) {
- return {term, rules, reasons};
- }
-
- /**
- * @param {import('deinflector').ReasonsRaw} reasons
- * @returns {import('deinflector').Reason[]}
- */
- static normalizeReasons(reasons) {
- /** @type {import('deinflector').Reason[]} */
- const normalizedReasons = [];
- for (const [reason, reasonInfo] of Object.entries(reasons)) {
- /** @type {import('deinflector').ReasonVariant[]} */
- const variants = [];
- for (const {kanaIn, kanaOut, rulesIn, rulesOut} of reasonInfo) {
- variants.push([
- kanaIn,
- kanaOut,
- this.rulesToRuleFlags(rulesIn),
- this.rulesToRuleFlags(rulesOut)
- ]);
- }
- normalizedReasons.push([reason, variants]);
- }
- return normalizedReasons;
- }
-
- /**
- * @param {string[]} rules
- * @returns {import('translation-internal').DeinflectionRuleFlags}
- */
- static rulesToRuleFlags(rules) {
- const ruleTypes = this._ruleTypes;
- let value = 0;
- for (const rule of rules) {
- const ruleBits = ruleTypes.get(rule);
- if (typeof ruleBits === 'undefined') { continue; }
- value |= ruleBits;
- }
- return value;
- }
-
- /**
- * If `currentRules` is `0`, then `nextRules` is ignored and `true` is returned.
- * Otherwise, there must be at least one shared rule between `currentRules` and `nextRules`.
- * @param {number} currentRules
- * @param {number} nextRules
- * @returns {boolean}
- */
- static rulesMatch(currentRules, nextRules) {
- return currentRules === 0 || (currentRules & nextRules) !== 0;
- }
-}
diff --git a/ext/js/language/language-transformer.js b/ext/js/language/language-transformer.js
new file mode 100644
index 00000000..c9e261ea
--- /dev/null
+++ b/ext/js/language/language-transformer.js
@@ -0,0 +1,245 @@
+/*
+ * Copyright (C) 2024 Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/** Applies chained suffix-replacement rules to text, using bit-flag conditions to restrict which rules may follow one another. */
+export class LanguageTransformer {
+    /** Creates an empty transformer; rules are loaded with `addDescriptor`. */
+    constructor() {
+        /** @type {number} */
+        this._nextFlagIndex = 0;
+        /** @type {import('language-transformer-internal').Transform[]} */
+        this._transforms = [];
+        /** @type {Map<string, number>} */
+        this._conditionTypeToConditionFlagsMap = new Map();
+        /** @type {Map<string, number>} */
+        this._partOfSpeechToConditionFlagsMap = new Map();
+    }
+
+    /** Removes all loaded transforms and condition flags and restarts flag allocation at bit 0. */
+    clear() {
+        this._nextFlagIndex = 0;
+        this._transforms = [];
+        this._conditionTypeToConditionFlagsMap.clear();
+        this._partOfSpeechToConditionFlagsMap.clear();
+    }
+
+    /**
+     * Loads a descriptor: assigns bit flags to its conditions and appends its transforms.
+     * Note: this function does not currently combine properly with previous descriptors;
+     * they are treated as completely separate collections. This should eventually be changed.
+     * @param {import('language-transformer').LanguageTransformDescriptor} descriptor
+     * @throws {Error} If conditions cannot be resolved or a rule references an unknown condition.
+     */
+    addDescriptor(descriptor) {
+        const {conditions, transforms} = descriptor;
+        const conditionEntries = Object.entries(conditions);
+        const {conditionFlagsMap, nextFlagIndex} = this._getConditionFlagsMap(conditionEntries, this._nextFlagIndex);
+
+        /** @type {import('language-transformer-internal').Transform[]} */
+        const transforms2 = [];
+        for (let i = 0, ii = transforms.length; i < ii; ++i) {
+            const {name, rules} = transforms[i];
+            /** @type {import('language-transformer-internal').Rule[]} */
+            const rules2 = [];
+            for (let j = 0, jj = rules.length; j < jj; ++j) {
+                const {suffixIn, suffixOut, conditionsIn, conditionsOut} = rules[j];
+                const conditionFlagsIn = this._getConditionFlags(conditionFlagsMap, conditionsIn);
+                if (conditionFlagsIn === null) { throw new Error(`Invalid conditionsIn for transform[${i}].rules[${j}]`); }
+                const conditionFlagsOut = this._getConditionFlags(conditionFlagsMap, conditionsOut);
+                if (conditionFlagsOut === null) { throw new Error(`Invalid conditionsOut for transform[${i}].rules[${j}]`); }
+                rules2.push({
+                    suffixIn,
+                    suffixOut,
+                    conditionsIn: conditionFlagsIn,
+                    conditionsOut: conditionFlagsOut
+                });
+            }
+            transforms2.push({name, rules: rules2});
+        }
+
+        this._nextFlagIndex = nextFlagIndex;
+        for (const transform of transforms2) { this._transforms.push(transform); }
+
+        for (const [type, condition] of conditionEntries) {
+            const flags = conditionFlagsMap.get(type);
+            if (typeof flags === 'undefined') { continue; } // This case should never happen
+            this._conditionTypeToConditionFlagsMap.set(type, flags);
+            for (const partOfSpeech of condition.partsOfSpeech) {
+                this._partOfSpeechToConditionFlagsMap.set(partOfSpeech, this.getConditionFlagsFromPartOfSpeech(partOfSpeech) | flags);
+            }
+        }
+    }
+
+    /**
+     * @param {string} partOfSpeech A part-of-speech tag listed by a loaded condition.
+     * @returns {number} The OR of the flags of every loaded condition listing the tag, or 0 if none does.
+     */
+    getConditionFlagsFromPartOfSpeech(partOfSpeech) {
+        const conditionFlags = this._partOfSpeechToConditionFlagsMap.get(partOfSpeech);
+        return typeof conditionFlags !== 'undefined' ? conditionFlags : 0;
+    }
+
+    /**
+     * @param {string[]} partsOfSpeech
+     * @returns {number} The OR of the flags of each tag; unknown tags contribute 0.
+     */
+    getConditionFlagsFromPartsOfSpeech(partsOfSpeech) {
+        let result = 0;
+        for (const partOfSpeech of partsOfSpeech) {
+            result |= this.getConditionFlagsFromPartOfSpeech(partOfSpeech);
+        }
+        return result;
+    }
+
+    /**
+     * @param {string} conditionType A condition key from a loaded descriptor.
+     * @returns {number} The flags assigned to the condition, or 0 if it is unknown.
+     */
+    getConditionFlagsFromConditionType(conditionType) {
+        const conditionFlags = this._conditionTypeToConditionFlagsMap.get(conditionType);
+        return typeof conditionFlags !== 'undefined' ? conditionFlags : 0;
+    }
+
+    /**
+     * @param {string[]} conditionTypes
+     * @returns {number} The OR of the flags of each condition; unknown conditions contribute 0.
+     */
+    getConditionFlagsFromConditionTypes(conditionTypes) {
+        let result = 0;
+        for (const conditionType of conditionTypes) {
+            result |= this.getConditionFlagsFromConditionType(conditionType);
+        }
+        return result;
+    }
+
+    /**
+     * Expands `sourceText` into every text reachable by chaining rules; the first result is
+     * always the unmodified source (conditions 0, empty trace). Rules are not checked for cycles.
+     * @param {string} sourceText
+     * @returns {import('language-transformer-internal').TransformedText[]}
+     */
+    transform(sourceText) {
+        const results = [this._createTransformedText(sourceText, 0, [])];
+        // `results` doubles as the work queue: entries pushed below are visited by this same loop
+        for (let i = 0; i < results.length; ++i) {
+            const {text, conditions, trace} = results[i];
+            for (const {name, rules} of this._transforms) {
+                for (let j = 0, jj = rules.length; j < jj; ++j) {
+                    const {suffixIn, suffixOut, conditionsIn, conditionsOut} = rules[j];
+                    if (!LanguageTransformer.conditionsMatch(conditions, conditionsIn)) { continue; }
+                    if (!text.endsWith(suffixIn) || (text.length - suffixIn.length + suffixOut.length) <= 0) { continue; }
+                    results.push(this._createTransformedText(
+                        text.substring(0, text.length - suffixIn.length) + suffixOut,
+                        conditionsOut,
+                        this._extendTrace(trace, {transform: name, ruleIndex: j})
+                    ));
+                }
+            }
+        }
+        return results;
+    }
+
+    /**
+     * Gives each condition without `subConditions` a new bit; others get the OR of their sub-conditions' flags.
+     * @param {import('language-transformer').ConditionMapEntries} conditions
+     * @param {number} nextFlagIndex The first unused bit index.
+     * @returns {{conditionFlagsMap: Map<string, number>, nextFlagIndex: number}}
+     * @throws {Error} If more than 32 bits are needed, or if `subConditions` cannot be resolved.
+     */
+    _getConditionFlagsMap(conditions, nextFlagIndex) {
+        /** @type {Map<string, number>} */
+        const conditionFlagsMap = new Map();
+        /** @type {import('language-transformer').ConditionMapEntries} */
+        let targets = conditions;
+        while (targets.length > 0) {
+            const nextTargets = [];
+            for (const target of targets) {
+                const [type, {subConditions}] = target;
+                let flags = 0;
+                if (typeof subConditions === 'undefined') {
+                    if (nextFlagIndex >= 32) {
+                        // Flags greater than or equal to 32 don't work because JavaScript only supports up to 32-bit integer operations
+                        throw new Error('Maximum number of conditions was exceeded');
+                    }
+                    flags = 1 << nextFlagIndex;
+                    ++nextFlagIndex;
+                } else {
+                    const multiFlags = this._getConditionFlags(conditionFlagsMap, subConditions);
+                    if (multiFlags === null) {
+                        // Some sub-conditions have no flags yet; retry this entry on the next pass
+                        nextTargets.push(target);
+                        continue;
+                    }
+                    flags = multiFlags;
+                }
+                conditionFlagsMap.set(type, flags);
+            }
+            if (nextTargets.length === targets.length) {
+                // No entry was resolved in this pass: the remaining subConditions are cyclic or name unknown conditions
+                throw new Error(`Unable to resolve subConditions for: ${nextTargets.map(([type]) => type).join(', ')}`);
+            }
+            targets = nextTargets;
+        }
+        return {conditionFlagsMap, nextFlagIndex};
+    }
+
+    /**
+     * @param {Map<string, number>} conditionFlagsMap
+     * @param {string[]} conditionTypes
+     * @returns {?number} The OR of the named conditions' flags, or `null` if any name is missing from the map.
+     */
+    _getConditionFlags(conditionFlagsMap, conditionTypes) {
+        let flags = 0;
+        for (const conditionType of conditionTypes) {
+            const flags2 = conditionFlagsMap.get(conditionType);
+            if (typeof flags2 === 'undefined') { return null; }
+            flags |= flags2;
+        }
+        return flags;
+    }
+
+    /**
+     * @param {string} text
+     * @param {number} conditions
+     * @param {import('language-transformer-internal').Trace} trace
+     * @returns {import('language-transformer-internal').TransformedText}
+     */
+    _createTransformedText(text, conditions, trace) {
+        return {text, conditions, trace};
+    }
+
+    /**
+     * Returns a new trace with `newFrame` first, followed by copies of the existing frames.
+     * @param {import('language-transformer-internal').Trace} trace
+     * @param {import('language-transformer-internal').TraceFrame} newFrame
+     * @returns {import('language-transformer-internal').Trace}
+     */
+    _extendTrace(trace, newFrame) {
+        return [newFrame, ...trace.map(({transform, ruleIndex}) => ({transform, ruleIndex}))];
+    }
+
+    /**
+     * If `currentConditions` is `0`, then `nextConditions` is ignored and `true` is returned.
+     * Otherwise, there must be at least one shared condition between `currentConditions` and `nextConditions`.
+     * @param {number} currentConditions
+     * @param {number} nextConditions
+     * @returns {boolean}
+     */
+    static conditionsMatch(currentConditions, nextConditions) {
+        return currentConditions === 0 || (currentConditions & nextConditions) !== 0;
+    }
+}
diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js
index 66eeb69f..9d2f18e0 100644
--- a/ext/js/language/translator.js
+++ b/ext/js/language/translator.js
@@ -18,9 +18,9 @@
import {RegexUtil} from '../general/regex-util.js';
import {TextSourceMap} from '../general/text-source-map.js';
-import {Deinflector} from './deinflector.js';
import {convertAlphabeticToKana} from './japanese-wanakana.js';
import {collapseEmphaticSequences, convertHalfWidthKanaToFullWidth, convertHiraganaToKatakana, convertKatakanaToHiragana, convertNumericToFullWidth, isCodePointJapanese} from './japanese.js';
+import {LanguageTransformer} from './language-transformer.js';
/**
* Class which finds term and kanji dictionary entries for text.
@@ -33,8 +33,8 @@ export class Translator {
constructor({database}) {
/** @type {import('../dictionary/dictionary-database.js').DictionaryDatabase} */
this._database = database;
- /** @type {?Deinflector} */
- this._deinflector = null;
+ /** @type {LanguageTransformer} */
+ this._languageTransformer = new LanguageTransformer();
/** @type {import('translator').DictionaryTagCache} */
this._tagCache = new Map();
/** @type {Intl.Collator} */
@@ -44,12 +44,11 @@ export class Translator {
}
/**
- * Initializes the instance for use. The public API should not be used until
- * this function has been called.
- * @param {import('deinflector').ReasonsRaw} deinflectionReasons The raw deinflections reasons data that the Deinflector uses.
+ * Initializes the instance for use. The public API should not be used until this function has been called.
+ * @param {import('language-transformer').LanguageTransformDescriptor} descriptor
*/
- prepare(deinflectionReasons) {
- this._deinflector = new Deinflector(deinflectionReasons);
+ prepare(descriptor) {
+ this._languageTransformer.addDescriptor(descriptor);
}
/**
@@ -407,10 +406,9 @@ export class Translator {
const entryDictionary = /** @type {import('translation').FindTermDictionary} */ (enabledDictionaryMap.get(databaseEntry.dictionary));
const {partsOfSpeechFilter} = entryDictionary;
- const definitionRules = Deinflector.rulesToRuleFlags(databaseEntry.rules);
+ const definitionConditions = this._languageTransformer.getConditionFlagsFromPartsOfSpeech(databaseEntry.rules);
for (const deinflection of uniqueDeinflectionArrays[databaseEntry.index]) {
- const deinflectionRules = deinflection.rules;
- if (!partsOfSpeechFilter || Deinflector.rulesMatch(deinflectionRules, definitionRules)) {
+ if (!partsOfSpeechFilter || LanguageTransformer.conditionsMatch(deinflection.conditions, definitionConditions)) {
deinflection.databaseEntries.push(databaseEntry);
}
}
@@ -473,13 +471,13 @@ export class Translator {
if (used.has(source)) { break; }
used.add(source);
const rawSource = sourceMap.source.substring(0, sourceMap.getSourceLength(i));
- for (const {term, rules, reasons} of /** @type {Deinflector} */ (this._deinflector).deinflect(source)) {
+ for (const {text: transformedText, conditions, trace} of this._languageTransformer.transform(source)) {
/** @type {import('dictionary').InflectionRuleChainCandidate} */
const inflectionRuleChainCandidate = {
source: 'algorithm',
- inflectionRules: reasons
+ inflectionRules: trace.map((frame) => frame.transform)
};
- deinflections.push(this._createDeinflection(rawSource, source, term, rules, [inflectionRuleChainCandidate]));
+ deinflections.push(this._createDeinflection(rawSource, source, transformedText, conditions, [inflectionRuleChainCandidate]));
}
}
}
@@ -570,12 +568,12 @@ export class Translator {
* @param {string} originalText
* @param {string} transformedText
* @param {string} deinflectedText
- * @param {import('translation-internal').DeinflectionRuleFlags} rules
+ * @param {number} conditions
* @param {import('dictionary').InflectionRuleChainCandidate[]} inflectionRuleChainCandidates
* @returns {import('translation-internal').DatabaseDeinflection}
*/
- _createDeinflection(originalText, transformedText, deinflectedText, rules, inflectionRuleChainCandidates) {
- return {originalText, transformedText, deinflectedText, rules, inflectionRuleChainCandidates, databaseEntries: []};
+ _createDeinflection(originalText, transformedText, deinflectedText, conditions, inflectionRuleChainCandidates) {
+ return {originalText, transformedText, deinflectedText, conditions, inflectionRuleChainCandidates, databaseEntries: []};
}
// Term dictionary entry grouping