aboutsummaryrefslogtreecommitdiff
path: root/ext/js/language
diff options
context:
space:
mode:
authorStefanVukovic99 <stefanvukovic44@gmail.com>2024-01-20 02:25:23 +0100
committerGitHub <noreply@github.com>2024-01-20 01:25:23 +0000
commit2b87c919bcd879c7f356308bc522b95f33e35f3b (patch)
tree1301aa8bf3d9b91ad96bbe9372a66dceba190346 /ext/js/language
parent48f1d012ad5045319d4e492dfbefa39da92817b2 (diff)
Dictionary deinflections (#503)
* wip * wip * fix v3 * wip * fix tests * fix maxitems * hide deinflection definitions * fix anki template * undo unnecessary change * delete console.log * refactor * add set false to handlebars * lint * fix tests * fix comments * fix * use Map in areArraysEqualIgnoreOrder * move inflection source icons to css * lint * improve naming * fix tests * add test * typescript * use for of * wip * comments * anki template upgrade * update descriptions
Diffstat (limited to 'ext/js/language')
-rw-r--r--ext/js/language/deinflector.js2
-rw-r--r--ext/js/language/translator.js235
2 files changed, 195 insertions, 42 deletions
diff --git a/ext/js/language/deinflector.js b/ext/js/language/deinflector.js
index d2d92e53..e2b66cb4 100644
--- a/ext/js/language/deinflector.js
+++ b/ext/js/language/deinflector.js
@@ -80,7 +80,7 @@ export class Deinflector {
/**
* @param {string} term
* @param {import('translation-internal').DeinflectionRuleFlags} rules
- * @param {string[]} reasons
+ * @param {import('dictionary').InflectionRuleChain} reasons
* @returns {import('translation-internal').Deinflection}
*/
_createDeinflection(term, rules, reasons) {
diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js
index 5441294b..89a3e5ec 100644
--- a/ext/js/language/translator.js
+++ b/ext/js/language/translator.js
@@ -70,7 +70,7 @@ export class Translator {
async findTerms(mode, text, options) {
const {enabledDictionaryMap, excludeDictionaryDefinitions, sortFrequencyDictionary, sortFrequencyDictionaryOrder} = options;
const tagAggregator = new TranslatorTagAggregator();
- let {dictionaryEntries, originalTextLength} = await this._findTermsInternalWrapper(text, enabledDictionaryMap, options, tagAggregator);
+ let {dictionaryEntries, originalTextLength} = await this._findTermsInternal(text, enabledDictionaryMap, options, tagAggregator);
switch (mode) {
case 'group':
@@ -208,7 +208,7 @@ export class Translator {
* @param {TranslatorTagAggregator} tagAggregator
* @returns {Promise<import('translator').FindTermsResult>}
*/
- async _findTermsInternalWrapper(text, enabledDictionaryMap, options, tagAggregator) {
+ async _findTermsInternal(text, enabledDictionaryMap, options, tagAggregator) {
if (options.removeNonJapaneseCharacters) {
text = this._getJapaneseOnlyText(text);
}
@@ -216,18 +216,30 @@ export class Translator {
return {dictionaryEntries: [], originalTextLength: 0};
}
- const deinflections = await this._findTermsInternal(text, enabledDictionaryMap, options);
+ const deinflections = await this._getDeinflections(text, enabledDictionaryMap, options);
let originalTextLength = 0;
+ /** @type {import('dictionary').TermDictionaryEntry[]} */
const dictionaryEntries = [];
const ids = new Set();
- for (const {databaseEntries, originalText, transformedText, deinflectedText, reasons} of deinflections) {
+ for (const {databaseEntries, originalText, transformedText, deinflectedText, inflectionRuleChainCandidates} of deinflections) {
if (databaseEntries.length === 0) { continue; }
originalTextLength = Math.max(originalTextLength, originalText.length);
for (const databaseEntry of databaseEntries) {
const {id} = databaseEntry;
- if (ids.has(id)) { continue; }
- const dictionaryEntry = this._createTermDictionaryEntryFromDatabaseEntry(databaseEntry, originalText, transformedText, deinflectedText, reasons, true, enabledDictionaryMap, tagAggregator);
+ if (ids.has(id)) {
+ const existingEntry = dictionaryEntries.find((entry) => {
+ return entry.definitions.some((definition) => definition.id === id);
+ });
+
+ if (existingEntry && transformedText.length >= existingEntry.headwords[0].sources[0].transformedText.length) {
+ this._mergeInflectionRuleChains(existingEntry, inflectionRuleChainCandidates);
+ }
+
+ continue;
+ }
+
+ const dictionaryEntry = this._createTermDictionaryEntryFromDatabaseEntry(databaseEntry, originalText, transformedText, deinflectedText, inflectionRuleChainCandidates, true, enabledDictionaryMap, tagAggregator);
dictionaryEntries.push(dictionaryEntry);
ids.add(id);
}
@@ -237,40 +249,163 @@ export class Translator {
}
/**
+ * @param {import('dictionary').TermDictionaryEntry} existingEntry
+ * @param {import('dictionary').InflectionRuleChainCandidate[]} inflectionRuleChainCandidates
+ */
+ _mergeInflectionRuleChains(existingEntry, inflectionRuleChainCandidates) {
+ const existingChains = existingEntry.inflectionRuleChainCandidates;
+
+ for (const {source, inflectionRules} of inflectionRuleChainCandidates) {
+ const duplicate = existingChains.find((existingChain) => this._areArraysEqualIgnoreOrder(existingChain.inflectionRules, inflectionRules));
+ if (!duplicate) {
+ existingEntry.inflectionRuleChainCandidates.push({source, inflectionRules});
+ } else if (duplicate.source !== source) {
+ duplicate.source = 'both';
+ }
+ }
+ }
+
+ /**
+ * @param {string[]} array1
+ * @param {string[]} array2
+ * @returns {boolean}
+ */
+ _areArraysEqualIgnoreOrder(array1, array2) {
+ if (array1.length !== array2.length) {
+ return false;
+ }
+
+ const frequencyCounter = new Map();
+
+ for (const element of array1) {
+ frequencyCounter.set(element, (frequencyCounter.get(element) || 0) + 1);
+ }
+
+ for (const element of array2) {
+ const frequency = frequencyCounter.get(element);
+ if (!frequency) {
+ return false;
+ }
+ frequencyCounter.set(element, frequency - 1);
+ }
+
+ return true;
+ }
+
+
+ /**
* @param {string} text
* @param {Map<string, import('translation').FindTermDictionary>} enabledDictionaryMap
* @param {import('translation').FindTermsOptions} options
* @returns {Promise<import('translation-internal').DatabaseDeinflection[]>}
*/
- async _findTermsInternal(text, enabledDictionaryMap, options) {
- const deinflections = (
+ async _getDeinflections(text, enabledDictionaryMap, options) {
+ let deinflections = (
options.deinflect ?
- this._getAllDeinflections(text, options) :
+ this._getAlgorithmDeinflections(text, options) :
[this._createDeinflection(text, text, text, 0, [])]
);
if (deinflections.length === 0) { return []; }
- const uniqueDeinflectionTerms = [];
- const uniqueDeinflectionArrays = [];
- const uniqueDeinflectionsMap = new Map();
+ const {matchType} = options;
+
+ await this._addEntriesToDeinflections(deinflections, enabledDictionaryMap, matchType);
+
+ const dictionaryDeinflections = await this._getDictionaryDeinflections(deinflections, enabledDictionaryMap, matchType);
+ deinflections.push(...dictionaryDeinflections);
+
for (const deinflection of deinflections) {
- const term = deinflection.deinflectedText;
- let deinflectionArray = uniqueDeinflectionsMap.get(term);
+ for (const entry of deinflection.databaseEntries) {
+ entry.definitions = entry.definitions.filter((definition) => !Array.isArray(definition));
+ }
+ deinflection.databaseEntries = deinflection.databaseEntries.filter((entry) => entry.definitions.length);
+ }
+ deinflections = deinflections.filter((deinflection) => deinflection.databaseEntries.length);
+
+ return deinflections;
+ }
+
+ /**
+ * @param {import('translation-internal').DatabaseDeinflection[]} deinflections
+ * @param {Map<string, import('translation').FindTermDictionary>} enabledDictionaryMap
+ * @param {import('dictionary').TermSourceMatchType} matchType
+ * @returns {Promise<import('translation-internal').DatabaseDeinflection[]>}
+ */
+ async _getDictionaryDeinflections(deinflections, enabledDictionaryMap, matchType) {
+ /** @type {import('translation-internal').DatabaseDeinflection[]} */
+ const dictionaryDeinflections = [];
+ for (const deinflection of deinflections) {
+ const {originalText, transformedText, inflectionRuleChainCandidates: algorithmChains, databaseEntries} = deinflection;
+ for (const entry of databaseEntries) {
+ const {dictionary, definitions} = entry;
+ const entryDictionary = enabledDictionaryMap.get(dictionary);
+ const useDeinflections = entryDictionary?.useDeinflections ?? true;
+ if (!useDeinflections) { continue; }
+ for (const definition of definitions) {
+ if (Array.isArray(definition)) {
+ const [formOf, inflectionRules] = definition;
+ if (!formOf) { continue; }
+
+ const inflectionRuleChainCandidates = algorithmChains.map(({inflectionRules: algInflections}) => {
+ return {
+ source: /** @type {import('dictionary').InflectionSource} */ (algInflections.length === 0 ? 'dictionary' : 'both'),
+ inflectionRules: [...algInflections, ...inflectionRules]
+ };
+ });
+
+ const dictionaryDeinflection = this._createDeinflection(originalText, transformedText, formOf, 0, inflectionRuleChainCandidates);
+ dictionaryDeinflections.push(dictionaryDeinflection);
+ }
+ }
+ }
+ }
+
+ await this._addEntriesToDeinflections(dictionaryDeinflections, enabledDictionaryMap, matchType);
+
+ return dictionaryDeinflections;
+ }
+
+ /**
+ * @param {import('translation-internal').DatabaseDeinflection[]} deinflections
+ * @param {Map<string, import('translation').FindTermDictionary>} enabledDictionaryMap
+ * @param {import('dictionary').TermSourceMatchType} matchType
+ */
+ async _addEntriesToDeinflections(deinflections, enabledDictionaryMap, matchType) {
+ const uniqueDeinflectionsMap = this._groupDeinflectionsByTerm(deinflections);
+ const uniqueDeinflectionArrays = [...uniqueDeinflectionsMap.values()];
+ const uniqueDeinflectionTerms = [...uniqueDeinflectionsMap.keys()];
+
+ const databaseEntries = await this._database.findTermsBulk(uniqueDeinflectionTerms, enabledDictionaryMap, matchType);
+ this._matchEntriesToDeinflections(databaseEntries, uniqueDeinflectionArrays, enabledDictionaryMap);
+ }
+
+ /**
+ * @param {import('translation-internal').DatabaseDeinflection[]} deinflections
+ * @returns {Map<string, import('translation-internal').DatabaseDeinflection[]>}
+ */
+ _groupDeinflectionsByTerm(deinflections) {
+ const result = new Map();
+ for (const deinflection of deinflections) {
+ const {deinflectedText} = deinflection;
+ let deinflectionArray = result.get(deinflectedText);
if (typeof deinflectionArray === 'undefined') {
deinflectionArray = [];
- uniqueDeinflectionTerms.push(term);
- uniqueDeinflectionArrays.push(deinflectionArray);
- uniqueDeinflectionsMap.set(term, deinflectionArray);
+ result.set(deinflectedText, deinflectionArray);
}
deinflectionArray.push(deinflection);
}
+ return result;
+ }
- const {matchType} = options;
- const databaseEntries = await this._database.findTermsBulk(uniqueDeinflectionTerms, enabledDictionaryMap, matchType);
-
+ /**
+ * @param {import('dictionary-database').TermEntry[]} databaseEntries
+ * @param {import('translation-internal').DatabaseDeinflection[][]} uniqueDeinflectionArrays
+ * @param {Map<string, import('translation').FindTermDictionary>} enabledDictionaryMap
+ */
+ _matchEntriesToDeinflections(databaseEntries, uniqueDeinflectionArrays, enabledDictionaryMap) {
for (const databaseEntry of databaseEntries) {
const entryDictionary = /** @type {import('translation').FindTermDictionary} */ (enabledDictionaryMap.get(databaseEntry.dictionary));
- const partsOfSpeechFilter = entryDictionary.partsOfSpeechFilter;
+ const {partsOfSpeechFilter} = entryDictionary;
const definitionRules = Deinflector.rulesToRuleFlags(databaseEntry.rules);
for (const deinflection of uniqueDeinflectionArrays[databaseEntry.index]) {
@@ -280,8 +415,6 @@ export class Translator {
}
}
}
-
- return deinflections;
}
// Deinflections and text transformations
@@ -291,7 +424,7 @@ export class Translator {
* @param {import('translation').FindTermsOptions} options
* @returns {import('translation-internal').DatabaseDeinflection[]}
*/
- _getAllDeinflections(text, options) {
+ _getAlgorithmDeinflections(text, options) {
/** @type {import('translation-internal').TextDeinflectionOptionsArrays} */
const textOptionVariantArray = [
this._getTextReplacementsVariants(options),
@@ -342,7 +475,12 @@ export class Translator {
used.add(source);
const rawSource = sourceMap.source.substring(0, sourceMap.getSourceLength(i));
for (const {term, rules, reasons} of /** @type {Deinflector} */ (this._deinflector).deinflect(source)) {
- deinflections.push(this._createDeinflection(rawSource, source, term, rules, reasons));
+ /** @type {import('dictionary').InflectionRuleChainCandidate} */
+ const inflectionRuleChainCandidate = {
+ source: 'algorithm',
+ inflectionRules: reasons
+ };
+ deinflections.push(this._createDeinflection(rawSource, source, term, rules, [inflectionRuleChainCandidate]));
}
}
}
@@ -435,11 +573,11 @@ export class Translator {
* @param {string} transformedText
* @param {string} deinflectedText
* @param {import('translation-internal').DeinflectionRuleFlags} rules
- * @param {string[]} reasons
+ * @param {import('dictionary').InflectionRuleChainCandidate[]} inflectionRuleChainCandidates
* @returns {import('translation-internal').DatabaseDeinflection}
*/
- _createDeinflection(originalText, transformedText, deinflectedText, rules, reasons) {
- return {originalText, transformedText, deinflectedText, rules, reasons, databaseEntries: []};
+ _createDeinflection(originalText, transformedText, deinflectedText, rules, inflectionRuleChainCandidates) {
+ return {originalText, transformedText, deinflectedText, rules, inflectionRuleChainCandidates, databaseEntries: []};
}
// Term dictionary entry grouping
@@ -597,8 +735,8 @@ export class Translator {
_groupDictionaryEntriesByHeadword(dictionaryEntries, tagAggregator) {
const groups = new Map();
for (const dictionaryEntry of dictionaryEntries) {
- const {inflections, headwords: [{term, reading}]} = dictionaryEntry;
- const key = this._createMapKey([term, reading, ...inflections]);
+ const {inflectionRuleChainCandidates, headwords: [{term, reading}]} = dictionaryEntry;
+ const key = this._createMapKey([term, reading, ...inflectionRuleChainCandidates]);
let groupDictionaryEntries = groups.get(key);
if (typeof groupDictionaryEntries === 'undefined') {
groupDictionaryEntries = [];
@@ -1370,7 +1508,7 @@ export class Translator {
* @param {number[]} sequences
* @param {boolean} isPrimary
* @param {import('dictionary').Tag[]} tags
- * @param {import('dictionary-data').TermGlossary[]} entries
+ * @param {import('dictionary-data').TermGlossaryContent[]} entries
* @returns {import('dictionary').TermDefinition}
*/
_createTermDefinition(index, headwordIndices, dictionary, dictionaryIndex, dictionaryPriority, id, score, sequences, isPrimary, tags, entries) {
@@ -1421,7 +1559,7 @@ export class Translator {
/**
* @param {boolean} isPrimary
- * @param {string[]} inflections
+ * @param {import('dictionary').InflectionRuleChainCandidate[]} inflectionRuleChainCandidates
* @param {number} score
* @param {number} dictionaryIndex
* @param {number} dictionaryPriority
@@ -1431,11 +1569,11 @@ export class Translator {
* @param {import('dictionary').TermDefinition[]} definitions
* @returns {import('dictionary').TermDictionaryEntry}
*/
- _createTermDictionaryEntry(isPrimary, inflections, score, dictionaryIndex, dictionaryPriority, sourceTermExactMatchCount, maxTransformedTextLength, headwords, definitions) {
+ _createTermDictionaryEntry(isPrimary, inflectionRuleChainCandidates, score, dictionaryIndex, dictionaryPriority, sourceTermExactMatchCount, maxTransformedTextLength, headwords, definitions) {
return {
type: 'term',
isPrimary,
- inflections,
+ inflectionRuleChainCandidates,
score,
frequencyOrder: 0,
dictionaryIndex,
@@ -1454,14 +1592,29 @@ export class Translator {
* @param {string} originalText
* @param {string} transformedText
* @param {string} deinflectedText
- * @param {string[]} reasons
+ * @param {import('dictionary').InflectionRuleChainCandidate[]} inflectionRuleChainCandidates
* @param {boolean} isPrimary
* @param {Map<string, import('translation').FindTermDictionary>} enabledDictionaryMap
* @param {TranslatorTagAggregator} tagAggregator
* @returns {import('dictionary').TermDictionaryEntry}
*/
- _createTermDictionaryEntryFromDatabaseEntry(databaseEntry, originalText, transformedText, deinflectedText, reasons, isPrimary, enabledDictionaryMap, tagAggregator) {
- const {matchType, matchSource, term, reading: rawReading, definitionTags, termTags, definitions, score, dictionary, id, sequence: rawSequence, rules} = databaseEntry;
+ _createTermDictionaryEntryFromDatabaseEntry(databaseEntry, originalText, transformedText, deinflectedText, inflectionRuleChainCandidates, isPrimary, enabledDictionaryMap, tagAggregator) {
+ const {
+ matchType,
+ matchSource,
+ term,
+ reading: rawReading,
+ definitionTags,
+ termTags,
+ definitions,
+ score,
+ dictionary,
+ id,
+ sequence: rawSequence,
+ rules
+ } = databaseEntry;
+ // cast is safe because getDeinflections filters out deinflection definitions
+ const contentDefinitions = /** @type {import('dictionary-data').TermGlossaryContent[]} */ (definitions);
const reading = (rawReading.length > 0 ? rawReading : term);
const {index: dictionaryIndex, priority: dictionaryPriority} = this._getDictionaryOrder(dictionary, enabledDictionaryMap);
const sourceTermExactMatchCount = (isPrimary && deinflectedText === term ? 1 : 0);
@@ -1479,14 +1632,14 @@ export class Translator {
return this._createTermDictionaryEntry(
isPrimary,
- reasons,
+ inflectionRuleChainCandidates,
score,
dictionaryIndex,
dictionaryPriority,
sourceTermExactMatchCount,
maxTransformedTextLength,
[this._createTermHeadword(0, term, reading, [source], headwordTagGroups, rules)],
- [this._createTermDefinition(0, [0], dictionary, dictionaryIndex, dictionaryPriority, id, score, [sequence], isPrimary, definitionTagGroups, definitions)]
+ [this._createTermDefinition(0, [0], dictionary, dictionaryIndex, dictionaryPriority, id, score, [sequence], isPrimary, definitionTagGroups, contentDefinitions)]
);
}
@@ -1530,7 +1683,7 @@ export class Translator {
if (dictionaryEntry.isPrimary) {
isPrimary = true;
maxTransformedTextLength = Math.max(maxTransformedTextLength, dictionaryEntry.maxTransformedTextLength);
- const dictionaryEntryInflections = dictionaryEntry.inflections;
+ const dictionaryEntryInflections = dictionaryEntry.inflectionRuleChainCandidates;
if (inflections === null || dictionaryEntryInflections.length < inflections.length) {
inflections = dictionaryEntryInflections;
}
@@ -1742,7 +1895,7 @@ export class Translator {
if (i !== 0) { return i; }
// Sort by the number of inflection reasons
- i = v1.inflections.length - v2.inflections.length;
+ i = v1.inflectionRuleChainCandidates.length - v2.inflectionRuleChainCandidates.length;
if (i !== 0) { return i; }
// Sort by how many terms exactly match the source (e.g. for exact kana prioritization)