diff options
author | StefanVukovic99 <stefanvukovic44@gmail.com> | 2024-01-20 02:25:23 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-01-20 01:25:23 +0000 |
commit | 2b87c919bcd879c7f356308bc522b95f33e35f3b (patch) | |
tree | 1301aa8bf3d9b91ad96bbe9372a66dceba190346 /ext/js/language/translator.js | |
parent | 48f1d012ad5045319d4e492dfbefa39da92817b2 (diff) |
Dictionary deinflections (#503)
* wip
* wip
* fix v3
* wip
* fix tests
* fix maxitems
* hide deinflection definitions
* fix anki template
* undo unnecessary change
* delete console.log
* refactor
* add set false to handlebars
* lint
* fix tests
* fix comments
* fix
* use Map in areArraysEqualIgnoreOrder
* move inflection source icons to css
* lint
* improve naming
* fix tests
* add test
* typescript
* use for of
* wip
* comments
* anki template upgrade
* update descriptions
Diffstat (limited to 'ext/js/language/translator.js')
-rw-r--r-- | ext/js/language/translator.js | 235 |
1 files changed, 194 insertions, 41 deletions
diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js index 5441294b..89a3e5ec 100644 --- a/ext/js/language/translator.js +++ b/ext/js/language/translator.js @@ -70,7 +70,7 @@ export class Translator { async findTerms(mode, text, options) { const {enabledDictionaryMap, excludeDictionaryDefinitions, sortFrequencyDictionary, sortFrequencyDictionaryOrder} = options; const tagAggregator = new TranslatorTagAggregator(); - let {dictionaryEntries, originalTextLength} = await this._findTermsInternalWrapper(text, enabledDictionaryMap, options, tagAggregator); + let {dictionaryEntries, originalTextLength} = await this._findTermsInternal(text, enabledDictionaryMap, options, tagAggregator); switch (mode) { case 'group': @@ -208,7 +208,7 @@ export class Translator { * @param {TranslatorTagAggregator} tagAggregator * @returns {Promise<import('translator').FindTermsResult>} */ - async _findTermsInternalWrapper(text, enabledDictionaryMap, options, tagAggregator) { + async _findTermsInternal(text, enabledDictionaryMap, options, tagAggregator) { if (options.removeNonJapaneseCharacters) { text = this._getJapaneseOnlyText(text); } @@ -216,18 +216,30 @@ export class Translator { return {dictionaryEntries: [], originalTextLength: 0}; } - const deinflections = await this._findTermsInternal(text, enabledDictionaryMap, options); + const deinflections = await this._getDeinflections(text, enabledDictionaryMap, options); let originalTextLength = 0; + /** @type {import('dictionary').TermDictionaryEntry[]} */ const dictionaryEntries = []; const ids = new Set(); - for (const {databaseEntries, originalText, transformedText, deinflectedText, reasons} of deinflections) { + for (const {databaseEntries, originalText, transformedText, deinflectedText, inflectionRuleChainCandidates} of deinflections) { if (databaseEntries.length === 0) { continue; } originalTextLength = Math.max(originalTextLength, originalText.length); for (const databaseEntry of databaseEntries) { const {id} = databaseEntry; - if (ids.has(id)) { continue; } - const dictionaryEntry = this._createTermDictionaryEntryFromDatabaseEntry(databaseEntry, originalText, transformedText, deinflectedText, reasons, true, enabledDictionaryMap, tagAggregator); + if (ids.has(id)) { + const existingEntry = dictionaryEntries.find((entry) => { + return entry.definitions.some((definition) => definition.id === id); + }); + + if (existingEntry && transformedText.length >= existingEntry.headwords[0].sources[0].transformedText.length) { + this._mergeInflectionRuleChains(existingEntry, inflectionRuleChainCandidates); + } + + continue; + } + + const dictionaryEntry = this._createTermDictionaryEntryFromDatabaseEntry(databaseEntry, originalText, transformedText, deinflectedText, inflectionRuleChainCandidates, true, enabledDictionaryMap, tagAggregator); dictionaryEntries.push(dictionaryEntry); ids.add(id); } @@ -237,40 +249,163 @@ export class Translator { } /** + * @param {import('dictionary').TermDictionaryEntry} existingEntry + * @param {import('dictionary').InflectionRuleChainCandidate[]} inflectionRuleChainCandidates + */ + _mergeInflectionRuleChains(existingEntry, inflectionRuleChainCandidates) { + const existingChains = existingEntry.inflectionRuleChainCandidates; + + for (const {source, inflectionRules} of inflectionRuleChainCandidates) { + const duplicate = existingChains.find((existingChain) => this._areArraysEqualIgnoreOrder(existingChain.inflectionRules, inflectionRules)); + if (!duplicate) { + existingEntry.inflectionRuleChainCandidates.push({source, inflectionRules}); + } else if (duplicate.source !== source) { + duplicate.source = 'both'; + } + } + } + + /** + * @param {string[]} array1 + * @param {string[]} array2 + * @returns {boolean} + */ + _areArraysEqualIgnoreOrder(array1, array2) { + if (array1.length !== array2.length) { + return false; + } + + const frequencyCounter = new Map(); + + for (const element of array1) { + frequencyCounter.set(element, (frequencyCounter.get(element) || 0) + 1); + } + + for (const element of array2) { + const frequency = frequencyCounter.get(element); + if (!frequency) { + return false; + } + frequencyCounter.set(element, frequency - 1); + } + + return true; + } + + + /** * @param {string} text * @param {Map<string, import('translation').FindTermDictionary>} enabledDictionaryMap * @param {import('translation').FindTermsOptions} options * @returns {Promise<import('translation-internal').DatabaseDeinflection[]>} */ - async _findTermsInternal(text, enabledDictionaryMap, options) { - const deinflections = ( + async _getDeinflections(text, enabledDictionaryMap, options) { + let deinflections = ( options.deinflect ? - this._getAllDeinflections(text, options) : + this._getAlgorithmDeinflections(text, options) : [this._createDeinflection(text, text, text, 0, [])] ); if (deinflections.length === 0) { return []; } - const uniqueDeinflectionTerms = []; - const uniqueDeinflectionArrays = []; - const uniqueDeinflectionsMap = new Map(); + const {matchType} = options; + + await this._addEntriesToDeinflections(deinflections, enabledDictionaryMap, matchType); + + const dictionaryDeinflections = await this._getDictionaryDeinflections(deinflections, enabledDictionaryMap, matchType); + deinflections.push(...dictionaryDeinflections); + for (const deinflection of deinflections) { - const term = deinflection.deinflectedText; - let deinflectionArray = uniqueDeinflectionsMap.get(term); + for (const entry of deinflection.databaseEntries) { + entry.definitions = entry.definitions.filter((definition) => !Array.isArray(definition)); + } + deinflection.databaseEntries = deinflection.databaseEntries.filter((entry) => entry.definitions.length); + } + deinflections = deinflections.filter((deinflection) => deinflection.databaseEntries.length); + + return deinflections; + } + + /** + * @param {import('translation-internal').DatabaseDeinflection[]} deinflections + * @param {Map<string, import('translation').FindTermDictionary>} enabledDictionaryMap + * @param {import('dictionary').TermSourceMatchType} matchType + * @returns {Promise<import('translation-internal').DatabaseDeinflection[]>} + */ + async _getDictionaryDeinflections(deinflections, enabledDictionaryMap, matchType) { + /** @type {import('translation-internal').DatabaseDeinflection[]} */ + const dictionaryDeinflections = []; + for (const deinflection of deinflections) { + const {originalText, transformedText, inflectionRuleChainCandidates: algorithmChains, databaseEntries} = deinflection; + for (const entry of databaseEntries) { + const {dictionary, definitions} = entry; + const entryDictionary = enabledDictionaryMap.get(dictionary); + const useDeinflections = entryDictionary?.useDeinflections ?? true; + if (!useDeinflections) { continue; } + for (const definition of definitions) { + if (Array.isArray(definition)) { + const [formOf, inflectionRules] = definition; + if (!formOf) { continue; } + + const inflectionRuleChainCandidates = algorithmChains.map(({inflectionRules: algInflections}) => { + return { + source: /** @type {import('dictionary').InflectionSource} */ (algInflections.length === 0 ? 'dictionary' : 'both'), + inflectionRules: [...algInflections, ...inflectionRules] + }; + }); + + const dictionaryDeinflection = this._createDeinflection(originalText, transformedText, formOf, 0, inflectionRuleChainCandidates); + dictionaryDeinflections.push(dictionaryDeinflection); + } + } + } + } + + await this._addEntriesToDeinflections(dictionaryDeinflections, enabledDictionaryMap, matchType); + + return dictionaryDeinflections; + } + + /** + * @param {import('translation-internal').DatabaseDeinflection[]} deinflections + * @param {Map<string, import('translation').FindTermDictionary>} enabledDictionaryMap + * @param {import('dictionary').TermSourceMatchType} matchType + */ + async _addEntriesToDeinflections(deinflections, enabledDictionaryMap, matchType) { + const uniqueDeinflectionsMap = this._groupDeinflectionsByTerm(deinflections); + const uniqueDeinflectionArrays = [...uniqueDeinflectionsMap.values()]; + const uniqueDeinflectionTerms = [...uniqueDeinflectionsMap.keys()]; + + const databaseEntries = await this._database.findTermsBulk(uniqueDeinflectionTerms, enabledDictionaryMap, matchType); + this._matchEntriesToDeinflections(databaseEntries, uniqueDeinflectionArrays, enabledDictionaryMap); + } + + /** + * @param {import('translation-internal').DatabaseDeinflection[]} deinflections + * @returns {Map<string, import('translation-internal').DatabaseDeinflection[]>} + */ + _groupDeinflectionsByTerm(deinflections) { + const result = new Map(); + for (const deinflection of deinflections) { + const {deinflectedText} = deinflection; + let deinflectionArray = result.get(deinflectedText); if (typeof deinflectionArray === 'undefined') { deinflectionArray = []; - uniqueDeinflectionTerms.push(term); - uniqueDeinflectionArrays.push(deinflectionArray); - uniqueDeinflectionsMap.set(term, deinflectionArray); + result.set(deinflectedText, deinflectionArray); } deinflectionArray.push(deinflection); } + return result; + } - const {matchType} = options; - const databaseEntries = await this._database.findTermsBulk(uniqueDeinflectionTerms, enabledDictionaryMap, matchType); - + /** + * @param {import('dictionary-database').TermEntry[]} databaseEntries + * @param {import('translation-internal').DatabaseDeinflection[][]} uniqueDeinflectionArrays + * @param {Map<string, import('translation').FindTermDictionary>} enabledDictionaryMap + */ + _matchEntriesToDeinflections(databaseEntries, uniqueDeinflectionArrays, enabledDictionaryMap) { for (const databaseEntry of databaseEntries) { const entryDictionary = /** @type {import('translation').FindTermDictionary} */ (enabledDictionaryMap.get(databaseEntry.dictionary)); - const partsOfSpeechFilter = entryDictionary.partsOfSpeechFilter; + const {partsOfSpeechFilter} = entryDictionary; const definitionRules = Deinflector.rulesToRuleFlags(databaseEntry.rules); for (const deinflection of uniqueDeinflectionArrays[databaseEntry.index]) { @@ -280,8 +415,6 @@ export class Translator { } } } - - return deinflections; } // Deinflections and text transformations @@ -291,7 +424,7 @@ export class Translator { * @param {import('translation').FindTermsOptions} options * @returns {import('translation-internal').DatabaseDeinflection[]} */ - _getAllDeinflections(text, options) { + _getAlgorithmDeinflections(text, options) { /** @type {import('translation-internal').TextDeinflectionOptionsArrays} */ const textOptionVariantArray = [ this._getTextReplacementsVariants(options), @@ -342,7 +475,12 @@ export class Translator { used.add(source); const rawSource = sourceMap.source.substring(0, sourceMap.getSourceLength(i)); for (const {term, rules, reasons} of /** @type {Deinflector} */ (this._deinflector).deinflect(source)) { - deinflections.push(this._createDeinflection(rawSource, source, term, rules, reasons)); + /** @type {import('dictionary').InflectionRuleChainCandidate} */ + const inflectionRuleChainCandidate = { + source: 'algorithm', + inflectionRules: reasons + }; + deinflections.push(this._createDeinflection(rawSource, source, term, rules, [inflectionRuleChainCandidate])); } } } @@ -435,11 +573,11 @@ export class Translator { * @param {string} transformedText * @param {string} deinflectedText * @param {import('translation-internal').DeinflectionRuleFlags} rules - * @param {string[]} reasons + * @param {import('dictionary').InflectionRuleChainCandidate[]} inflectionRuleChainCandidates * @returns {import('translation-internal').DatabaseDeinflection} */ - _createDeinflection(originalText, transformedText, deinflectedText, rules, reasons) { - return {originalText, transformedText, deinflectedText, rules, reasons, databaseEntries: []}; + _createDeinflection(originalText, transformedText, deinflectedText, rules, inflectionRuleChainCandidates) { + return {originalText, transformedText, deinflectedText, rules, inflectionRuleChainCandidates, databaseEntries: []}; } // Term dictionary entry grouping @@ -597,8 +735,8 @@ export class Translator { _groupDictionaryEntriesByHeadword(dictionaryEntries, tagAggregator) { const groups = new Map(); for (const dictionaryEntry of dictionaryEntries) { - const {inflections, headwords: [{term, reading}]} = dictionaryEntry; - const key = this._createMapKey([term, reading, ...inflections]); + const {inflectionRuleChainCandidates, headwords: [{term, reading}]} = dictionaryEntry; + const key = this._createMapKey([term, reading, ...inflectionRuleChainCandidates]); let groupDictionaryEntries = groups.get(key); if (typeof groupDictionaryEntries === 'undefined') { groupDictionaryEntries = []; @@ -1370,7 +1508,7 @@ export class Translator { * @param {number[]} sequences * @param {boolean} isPrimary * @param {import('dictionary').Tag[]} tags - * @param {import('dictionary-data').TermGlossary[]} entries + * @param {import('dictionary-data').TermGlossaryContent[]} entries * @returns {import('dictionary').TermDefinition} */ _createTermDefinition(index, headwordIndices, dictionary, dictionaryIndex, dictionaryPriority, id, score, sequences, isPrimary, tags, entries) { @@ -1421,7 +1559,7 @@ export class Translator { /** * @param {boolean} isPrimary - * @param {string[]} inflections + * @param {import('dictionary').InflectionRuleChainCandidate[]} inflectionRuleChainCandidates * @param {number} score * @param {number} dictionaryIndex * @param {number} dictionaryPriority @@ -1431,11 +1569,11 @@ export class Translator { * @param {import('dictionary').TermDefinition[]} definitions * @returns {import('dictionary').TermDictionaryEntry} */ - _createTermDictionaryEntry(isPrimary, inflections, score, dictionaryIndex, dictionaryPriority, sourceTermExactMatchCount, maxTransformedTextLength, headwords, definitions) { + _createTermDictionaryEntry(isPrimary, inflectionRuleChainCandidates, score, dictionaryIndex, dictionaryPriority, sourceTermExactMatchCount, maxTransformedTextLength, headwords, definitions) { return { type: 'term', isPrimary, - inflections, + inflectionRuleChainCandidates, score, frequencyOrder: 0, dictionaryIndex, @@ -1454,14 +1592,29 @@ export class Translator { * @param {string} originalText * @param {string} transformedText * @param {string} deinflectedText - * @param {string[]} reasons + * @param {import('dictionary').InflectionRuleChainCandidate[]} inflectionRuleChainCandidates * @param {boolean} isPrimary * @param {Map<string, import('translation').FindTermDictionary>} enabledDictionaryMap * @param {TranslatorTagAggregator} tagAggregator * @returns {import('dictionary').TermDictionaryEntry} */ - _createTermDictionaryEntryFromDatabaseEntry(databaseEntry, originalText, transformedText, deinflectedText, reasons, isPrimary, enabledDictionaryMap, tagAggregator) { - const {matchType, matchSource, term, reading: rawReading, definitionTags, termTags, definitions, score, dictionary, id, sequence: rawSequence, rules} = databaseEntry; + _createTermDictionaryEntryFromDatabaseEntry(databaseEntry, originalText, transformedText, deinflectedText, inflectionRuleChainCandidates, isPrimary, enabledDictionaryMap, tagAggregator) { + const { + matchType, + matchSource, + term, + reading: rawReading, + definitionTags, + termTags, + definitions, + score, + dictionary, + id, + sequence: rawSequence, + rules + } = databaseEntry; + // cast is safe because getDeinflections filters out deinflection definitions + const contentDefinitions = /** @type {import('dictionary-data').TermGlossaryContent[]} */ (definitions); const reading = (rawReading.length > 0 ? rawReading : term); const {index: dictionaryIndex, priority: dictionaryPriority} = this._getDictionaryOrder(dictionary, enabledDictionaryMap); const sourceTermExactMatchCount = (isPrimary && deinflectedText === term ? 1 : 0); @@ -1479,14 +1632,14 @@ export class Translator { return this._createTermDictionaryEntry( isPrimary, - reasons, + inflectionRuleChainCandidates, score, dictionaryIndex, dictionaryPriority, sourceTermExactMatchCount, maxTransformedTextLength, [this._createTermHeadword(0, term, reading, [source], headwordTagGroups, rules)], - [this._createTermDefinition(0, [0], dictionary, dictionaryIndex, dictionaryPriority, id, score, [sequence], isPrimary, definitionTagGroups, definitions)] + [this._createTermDefinition(0, [0], dictionary, dictionaryIndex, dictionaryPriority, id, score, [sequence], isPrimary, definitionTagGroups, contentDefinitions)] ); } @@ -1530,7 +1683,7 @@ export class Translator { if (dictionaryEntry.isPrimary) { isPrimary = true; maxTransformedTextLength = Math.max(maxTransformedTextLength, dictionaryEntry.maxTransformedTextLength); - const dictionaryEntryInflections = dictionaryEntry.inflections; + const dictionaryEntryInflections = dictionaryEntry.inflectionRuleChainCandidates; if (inflections === null || dictionaryEntryInflections.length < inflections.length) { inflections = dictionaryEntryInflections; } @@ -1742,7 +1895,7 @@ export class Translator { if (i !== 0) { return i; } // Sort by the number of inflection reasons - i = v1.inflections.length - v2.inflections.length; + i = v1.inflectionRuleChainCandidates.length - v2.inflectionRuleChainCandidates.length; if (i !== 0) { return i; } // Sort by how many terms exactly match the source (e.g. for exact kana prioritization) |