diff options
author | toasted-nutbread <toasted-nutbread@users.noreply.github.com> | 2021-03-25 19:55:31 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-03-25 19:55:31 -0400 |
commit | 4be5c8fd9f7860e701d0b7d3c8c0ee934bc60a4f (patch) | |
tree | dcd78316afdf00bbb67d3d1aa6555a9c8ea3efec /ext/js/language/translator.js | |
parent | e7035dcff41d94f20c0bc8865d413412afc7c229 (diff) |
Refactor Translator and dictionary entry format (#1553)
* Update test data
* Move translator.js
* Create new version of Translator
* Update Backend
* Update DictionaryDataUtil
* Update DisplayGenerator
* Create AnkiNoteDataCreator
* Replace AnkiNoteData with AnkiNoteDataCreator
* Update tests
* Remove AnkiNoteData
* Update test data
* Remove translator-old.js
* Add TypeScript interface definitions for the new translator data format
Diffstat (limited to 'ext/js/language/translator.js')
-rw-r--r-- | ext/js/language/translator.js | 1572 |
1 files changed, 750 insertions, 822 deletions
diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js index 151b1172..934c8e4a 100644 --- a/ext/js/language/translator.js +++ b/ext/js/language/translator.js @@ -22,7 +22,7 @@ */ /** - * Class which finds term and kanji definitions for text. + * Class which finds term and kanji dictionary entries for text. */ class Translator { /** @@ -59,6 +59,7 @@ class Translator { * One of: 'group', 'merge', 'split', 'simple' * @param text The text to find terms for. * @param options An object using the following structure: + * ``` * { * wildcard: (enum: null, 'prefix', 'suffix'), * mainDictionary: (string), @@ -85,22 +86,35 @@ class Translator { * } * ]) * } - * @returns An array of [definitions, textLength]. The structure of each definition depends on the - * mode parameter, see the _create?TermDefinition?() functions for structure details. + * ``` + * @returns An object of the structure `{dictionaryEntries, originalTextLength}`. */ async findTerms(mode, text, options) { + const {enabledDictionaryMap} = options; + let {dictionaryEntries, originalTextLength} = await this._findTermsInternal(text, enabledDictionaryMap, options); + switch (mode) { case 'group': - return await this._findTermsGrouped(text, options); + dictionaryEntries = this._groupDictionaryEntriesByHeadword(dictionaryEntries); + break; case 'merge': - return await this._findTermsMerged(text, options); - case 'split': - return await this._findTermsSplit(text, options); - case 'simple': - return await this._findTermsSimple(text, options); - default: - return [[], 0]; + dictionaryEntries = await this._getRelatedDictionaryEntries(dictionaryEntries, options.mainDictionary, enabledDictionaryMap); + break; } + + if (dictionaryEntries.length > 1) { + this._sortTermDictionaryEntries(dictionaryEntries); + } + + if (mode === 'simple') { + this._clearTermTags(dictionaryEntries); + } else { + await this._addTermMeta(dictionaryEntries, enabledDictionaryMap); + await this._expandTermTags(dictionaryEntries); + this._sortTermDictionaryEntryData(dictionaryEntries); + } + + return {dictionaryEntries, originalTextLength}; } /** @@ -127,93 +141,28 @@ class Translator { kanjiUnique.add(c); } - const databaseDefinitions = await this._database.findKanjiBulk([...kanjiUnique], enabledDictionaryMap); - if (databaseDefinitions.length === 0) { return []; } - - this._sortDatabaseDefinitionsByIndex(databaseDefinitions); - - const definitions = []; - for (const {character, onyomi, kunyomi, tags, glossary, stats, dictionary} of databaseDefinitions) { - const expandedStats = await this._expandStats(stats, dictionary); - const expandedTags = await this._expandTags(tags, dictionary); - this._sortTags(expandedTags); - - const definition = this._createKanjiDefinition(character, dictionary, onyomi, kunyomi, glossary, expandedTags, expandedStats); - definitions.push(definition); - } - - await this._buildKanjiMeta(definitions, enabledDictionaryMap); - - return definitions; - } - - // Find terms core functions - - async _findTermsSimple(text, options) { - const {enabledDictionaryMap} = options; - const [definitions, length] = await this._findTermsInternal(text, enabledDictionaryMap, options); - this._sortDefinitions(definitions); - return [definitions, length]; - } - - async _findTermsSplit(text, options) { - const {enabledDictionaryMap} = options; - const [definitions, length] = await this._findTermsInternal(text, enabledDictionaryMap, options); - await this._buildTermMeta(definitions, enabledDictionaryMap); - this._sortDefinitions(definitions); - return [definitions, length]; - } - - async _findTermsGrouped(text, options) { - const {enabledDictionaryMap} = options; - const [definitions, length] = await this._findTermsInternal(text, enabledDictionaryMap, options); - - const groupedDefinitions = this._groupTerms(definitions, enabledDictionaryMap); - await this._buildTermMeta(groupedDefinitions, enabledDictionaryMap); - this._sortDefinitions(groupedDefinitions); + const databaseEntries = await this._database.findKanjiBulk([...kanjiUnique], enabledDictionaryMap); + if (databaseEntries.length === 0) { return []; } - for (const definition of groupedDefinitions) { - this._flagRedundantDefinitionTags(definition.definitions); - } + this._sortDatabaseEntriesByIndex(databaseEntries); - return [groupedDefinitions, length]; - } + const dictionaryEntries = []; + for (const {character, onyomi, kunyomi, tags, glossary, stats, dictionary} of databaseEntries) { + const expandedStats = await this._expandKanjiStats(stats, dictionary); - async _findTermsMerged(text, options) { - const {mainDictionary, enabledDictionaryMap} = options; - const [definitions, length] = await this._findTermsInternal(text, enabledDictionaryMap, options); - const {sequencedDefinitions, unsequencedDefinitions} = await this._getSequencedDefinitions(definitions, mainDictionary, enabledDictionaryMap); - const definitionsMerged = []; + const tagGroups = []; + if (tags.length > 0) { tagGroups.push(this._createTagGroup(dictionary, tags)); } - for (const {relatedDefinitions, secondaryDefinitions} of sequencedDefinitions) { - const mergedDefinition = this._getMergedDefinition(relatedDefinitions, secondaryDefinitions); - definitionsMerged.push(mergedDefinition); + const dictionaryEntry = this._createKanjiDictionaryEntry(character, dictionary, onyomi, kunyomi, tagGroups, expandedStats, glossary); + dictionaryEntries.push(dictionaryEntry); } - for (const groupedDefinition of this._groupTerms(unsequencedDefinitions, enabledDictionaryMap)) { - const {reasons, score, expression, reading, source, rawSource, definitions: definitions2} = groupedDefinition; - const termDetailsList = this._createTermDetailsList(definitions2); - const compatibilityDefinition = this._createMergedTermDefinition( - source, - rawSource, - this._convertTermDefinitionsToMergedGlossaryTermDefinitions(definitions2), - [expression], - [reading], - termDetailsList, - reasons, - score - ); - definitionsMerged.push(compatibilityDefinition); - } - - await this._buildTermMeta(definitionsMerged, enabledDictionaryMap); - this._sortDefinitions(definitionsMerged); + await this._addKanjiMeta(dictionaryEntries, enabledDictionaryMap); + await this._expandKanjiTags(dictionaryEntries); - for (const definition of definitionsMerged) { - this._flagRedundantDefinitionTags(definition.definitions); - } + this._sortKanjiDictionaryEntryData(dictionaryEntries); - return [definitionsMerged, length]; + return dictionaryEntries; } // Find terms internal implementation @@ -225,33 +174,33 @@ class Translator { return [[], 0]; } - const deinflections = ( + const deinflections = await ( wildcard ? - await this._findTermWildcard(text, enabledDictionaryMap, wildcard) : - await this._findTermDeinflections(text, enabledDictionaryMap, options) + this._findTermsWildcard(text, enabledDictionaryMap, wildcard) : + this._findTermDeinflections(text, enabledDictionaryMap, options) ); - let maxLength = 0; - const definitions = []; - const definitionIds = new Set(); - for (const {databaseDefinitions, source, rawSource, term, reasons} of deinflections) { - if (databaseDefinitions.length === 0) { continue; } - maxLength = Math.max(maxLength, rawSource.length); - for (const databaseDefinition of databaseDefinitions) { - const {id} = databaseDefinition; - if (definitionIds.has(id)) { continue; } - const definition = await this._createTermDefinitionFromDatabaseDefinition(databaseDefinition, source, rawSource, term, reasons, true, enabledDictionaryMap); - definitions.push(definition); - definitionIds.add(id); + let originalTextLength = 0; + const dictionaryEntries = []; + const ids = new Set(); + for (const {databaseEntries, originalText, transformedText, deinflectedText, reasons} of deinflections) { + if (databaseEntries.length === 0) { continue; } + originalTextLength = Math.max(originalTextLength, originalText.length); + for (const databaseEntry of databaseEntries) { + const {id} = databaseEntry; + if (ids.has(id)) { continue; } + const dictionaryEntry = this._createTermDictionaryEntryFromDatabaseEntry(databaseEntry, originalText, transformedText, deinflectedText, reasons, true, enabledDictionaryMap); + dictionaryEntries.push(dictionaryEntry); + ids.add(id); } } - return [definitions, maxLength]; + return {dictionaryEntries, originalTextLength}; } - async _findTermWildcard(text, enabledDictionaryMap, wildcard) { - const databaseDefinitions = await this._database.findTermsBulk([text], enabledDictionaryMap, wildcard); - return databaseDefinitions.length > 0 ? [this._createDeinflection(text, text, text, 0, [], databaseDefinitions)] : []; + async _findTermsWildcard(text, enabledDictionaryMap, wildcard) { + const databaseEntries = await this._database.findTermsBulk([text], enabledDictionaryMap, wildcard); + return databaseEntries.length > 0 ? [this._createDeinflection(text, text, text, 0, [], databaseEntries)] : []; } async _findTermDeinflections(text, enabledDictionaryMap, options) { @@ -265,7 +214,7 @@ class Translator { const uniqueDeinflectionArrays = []; const uniqueDeinflectionsMap = new Map(); for (const deinflection of deinflections) { - const term = deinflection.term; + const term = deinflection.deinflectedText; let deinflectionArray = uniqueDeinflectionsMap.get(term); if (typeof deinflectionArray === 'undefined') { deinflectionArray = []; @@ -276,14 +225,14 @@ class Translator { deinflectionArray.push(deinflection); } - const databaseDefinitions = await this._database.findTermsBulk(uniqueDeinflectionTerms, enabledDictionaryMap, null); + const databaseEntries = await this._database.findTermsBulk(uniqueDeinflectionTerms, enabledDictionaryMap, null); - for (const databaseDefinition of databaseDefinitions) { - const definitionRules = Deinflector.rulesToRuleFlags(databaseDefinition.rules); - for (const deinflection of uniqueDeinflectionArrays[databaseDefinition.index]) { + for (const databaseEntry of databaseEntries) { + const definitionRules = Deinflector.rulesToRuleFlags(databaseEntry.rules); + for (const deinflection of uniqueDeinflectionArrays[databaseEntry.index]) { const deinflectionRules = deinflection.rules; if (deinflectionRules === 0 || (definitionRules & deinflectionRules) !== 0) { - deinflection.databaseDefinitions.push(databaseDefinition); + deinflection.databaseEntries.push(databaseEntry); } } } @@ -291,6 +240,8 @@ class Translator { return deinflections; } + // Deinflections and text transformations + _getAllDeinflections(text, options) { const textOptionVariantArray = [ this._getTextReplacementsVariants(options), @@ -336,120 +287,159 @@ class Translator { used.add(source); const rawSource = sourceMap.source.substring(0, sourceMap.getSourceLength(i)); for (const {term, rules, reasons} of this._deinflector.deinflect(source)) { - deinflections.push(this._createDeinflection(source, rawSource, term, rules, reasons, [])); + deinflections.push(this._createDeinflection(rawSource, source, term, rules, reasons, [])); } } } return deinflections; } - _createDeinflection(source, rawSource, term, rules, reasons, databaseDefinitions) { - return {source, rawSource, term, rules, reasons, databaseDefinitions}; + _applyTextReplacements(text, sourceMap, replacements) { + for (const {pattern, replacement} of replacements) { + text = RegexUtil.applyTextReplacement(text, sourceMap, pattern, replacement); + } + return text; + } + + _getSearchableText(text, allowAlphanumericCharacters) { + if (allowAlphanumericCharacters) { return text; } + const jp = this._japaneseUtil; + let length = 0; + for (const c of text) { + if (!jp.isCodePointJapanese(c.codePointAt(0))) { break; } + length += c.length; + } + return length >= text.length ? text : text.substring(0, length); } - /** - * @param definitions An array of 'term' definitions. - * @param mainDictionary The name of the main dictionary. - * @param enabledDictionaryMap The map of enabled dictionaries and their settings. - */ - async _getSequencedDefinitions(definitions, mainDictionary, enabledDictionaryMap) { - const secondarySearchDictionaryMap = this._getSecondarySearchDictionaryMap(enabledDictionaryMap); + _getTextOptionEntryVariants(value) { + switch (value) { + case 'true': return [true]; + case 'variant': return [false, true]; + default: return [false]; + } + } + + _getCollapseEmphaticOptions(options) { + const collapseEmphaticOptions = [[false, false]]; + switch (options.collapseEmphaticSequences) { + case 'true': + collapseEmphaticOptions.push([true, false]); + break; + case 'full': + collapseEmphaticOptions.push([true, false], [true, true]); + break; + } + return collapseEmphaticOptions; + } + + _getTextReplacementsVariants(options) { + return options.textReplacements; + } + + _createDeinflection(originalText, transformedText, deinflectedText, rules, reasons, databaseEntries) { + return {originalText, transformedText, deinflectedText, rules, reasons, databaseEntries}; + } + + // Term dictionary entry grouping + + async _getRelatedDictionaryEntries(dictionaryEntries, mainDictionary, enabledDictionaryMap) { const sequenceList = []; - const sequencedDefinitionMap = new Map(); - const sequencedDefinitions = []; - const unsequencedDefinitions = new Map(); - for (const definition of definitions) { - const {sequence, dictionary, id} = definition; + const groupedDictionaryEntries = []; + const groupedDictionaryEntriesMap = new Map(); + const ungroupedDictionaryEntriesMap = new Map(); + for (const dictionaryEntry of dictionaryEntries) { + const {id, sequence, definitions: [{dictionary}]} = dictionaryEntry; if (mainDictionary === dictionary && sequence >= 0) { - let sequencedDefinition = sequencedDefinitionMap.get(sequence); - if (typeof sequencedDefinition === 'undefined') { - sequencedDefinition = { - relatedDefinitions: [], - definitionIds: new Set(), - secondaryDefinitions: [] - }; - sequencedDefinitionMap.set(sequence, sequencedDefinition); - sequencedDefinitions.push(sequencedDefinition); - sequenceList.push(sequence); + let group = groupedDictionaryEntriesMap.get(sequence); + if (typeof group === 'undefined') { + group = {ids: new Set(), dictionaryEntries: []}; + sequenceList.push({query: sequence, dictionary}); + groupedDictionaryEntries.push(group); + groupedDictionaryEntriesMap.set(sequence, group); } - sequencedDefinition.relatedDefinitions.push(definition); - sequencedDefinition.definitionIds.add(id); + group.dictionaryEntries.push(dictionaryEntry); + group.ids.add(id); } else { - unsequencedDefinitions.set(id, definition); + ungroupedDictionaryEntriesMap.set(id, dictionaryEntry); } } if (sequenceList.length > 0) { - await this._addRelatedDefinitions(sequencedDefinitions, unsequencedDefinitions, sequenceList, mainDictionary, enabledDictionaryMap); - await this._addSecondaryDefinitions(sequencedDefinitions, unsequencedDefinitions, enabledDictionaryMap, secondarySearchDictionaryMap); + const secondarySearchDictionaryMap = this._getSecondarySearchDictionaryMap(enabledDictionaryMap); + await this._addRelatedDictionaryEntries(groupedDictionaryEntries, ungroupedDictionaryEntriesMap, sequenceList, mainDictionary, enabledDictionaryMap); + for (const group of groupedDictionaryEntries) { + this._sortTermDictionaryEntriesById(group.dictionaryEntries); + } + if (ungroupedDictionaryEntriesMap.size !== 0 || secondarySearchDictionaryMap.size !== 0) { + await this._addSecondaryRelatedDictionaryEntries(groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap); + } } - for (const {relatedDefinitions} of sequencedDefinitions) { - this._sortDefinitionsById(relatedDefinitions); + const newDictionaryEntries = []; + for (const group of groupedDictionaryEntries) { + newDictionaryEntries.push(this._createGroupedDictionaryEntry(group.dictionaryEntries, true)); } - - return {sequencedDefinitions, unsequencedDefinitions: [...unsequencedDefinitions.values()]}; - } - - async _addRelatedDefinitions(sequencedDefinitions, unsequencedDefinitions, sequenceList, mainDictionary, enabledDictionaryMap) { - const items = sequenceList.map((query) => ({query, dictionary: mainDictionary})); - const databaseDefinitions = await this._database.findTermsBySequenceBulk(items); - for (const databaseDefinition of databaseDefinitions) { - const {relatedDefinitions, definitionIds} = sequencedDefinitions[databaseDefinition.index]; - const {id} = databaseDefinition; - if (definitionIds.has(id)) { continue; } - - const {source, rawSource, sourceTerm} = relatedDefinitions[0]; - const definition = await this._createTermDefinitionFromDatabaseDefinition(databaseDefinition, source, rawSource, sourceTerm, [], false, enabledDictionaryMap); - relatedDefinitions.push(definition); - definitionIds.add(id); - unsequencedDefinitions.delete(id); + newDictionaryEntries.push(...this._groupDictionaryEntriesByHeadword(ungroupedDictionaryEntriesMap.values())); + return newDictionaryEntries; + } + + async _addRelatedDictionaryEntries(groupedDictionaryEntries, ungroupedDictionaryEntriesMap, sequenceList, mainDictionary, enabledDictionaryMap) { + const databaseEntries = await this._database.findTermsBySequenceBulk(sequenceList); + for (const databaseEntry of databaseEntries) { + const {dictionaryEntries, ids} = groupedDictionaryEntries[databaseEntry.index]; + const {id} = databaseEntry; + if (ids.has(id)) { continue; } + + const sourceText = databaseEntry.expression; + const dictionaryEntry = this._createTermDictionaryEntryFromDatabaseEntry(databaseEntry, sourceText, sourceText, sourceText, [], false, enabledDictionaryMap); + dictionaryEntries.push(dictionaryEntry); + ids.add(id); + ungroupedDictionaryEntriesMap.delete(id); } } - async _addSecondaryDefinitions(sequencedDefinitions, unsequencedDefinitions, enabledDictionaryMap, secondarySearchDictionaryMap) { - if (unsequencedDefinitions.length === 0 && secondarySearchDictionaryMap.size === 0) { return; } - + async _addSecondaryRelatedDictionaryEntries(groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap) { // Prepare grouping info const termList = []; const targetList = []; const targetMap = new Map(); - for (const sequencedDefinition of sequencedDefinitions) { - const {relatedDefinitions} = sequencedDefinition; - for (const definition of relatedDefinitions) { - const {expressions: [{expression, reading}]} = definition; - const key = this._createMapKey([expression, reading]); + for (const group of groupedDictionaryEntries) { + const {dictionaryEntries} = group; + for (const dictionaryEntry of dictionaryEntries) { + const {term, reading} = dictionaryEntry.headwords[0]; + const key = this._createMapKey([term, reading]); let target = targetMap.get(key); if (typeof target === 'undefined') { target = { - sequencedDefinitions: [], + groups: [], searchSecondary: false }; targetMap.set(key, target); } - target.sequencedDefinitions.push(sequencedDefinition); - if (!definition.isPrimary && !target.searchSecondary) { + target.groups.push(group); + if (!dictionaryEntry.isPrimary && !target.searchSecondary) { target.searchSecondary = true; - termList.push({expression, reading}); + termList.push({expression: term, reading}); targetList.push(target); } } } - // Group unsequenced definitions with sequenced definitions that have a matching [expression, reading]. - for (const [id, definition] of unsequencedDefinitions.entries()) { - const {expressions: [{expression, reading}]} = definition; - const key = this._createMapKey([expression, reading]); + // Group unsequenced dictionary entries with sequenced entries that have a matching [expression, reading]. + for (const [id, dictionaryEntry] of ungroupedDictionaryEntriesMap.entries()) { + const {term, reading} = dictionaryEntry.headwords[0]; + const key = this._createMapKey([term, reading]); const target = targetMap.get(key); if (typeof target === 'undefined') { continue; } - for (const {definitionIds, secondaryDefinitions} of target.sequencedDefinitions) { - if (definitionIds.has(id)) { continue; } + for (const {ids, dictionaryEntries} of target.groups) { + if (ids.has(id)) { continue; } - secondaryDefinitions.push(definition); - definitionIds.add(id); - unsequencedDefinitions.delete(id); + dictionaryEntries.push(dictionaryEntry); + ids.add(id); + ungroupedDictionaryEntriesMap.delete(id); break; } } @@ -457,102 +447,200 @@ class Translator { // Search database for additional secondary terms if (termList.length === 0 || secondarySearchDictionaryMap.size === 0) { return; } - const databaseDefinitions = await this._database.findTermsExactBulk(termList, secondarySearchDictionaryMap); - this._sortDatabaseDefinitionsByIndex(databaseDefinitions); + const databaseEntries = await this._database.findTermsExactBulk(termList, secondarySearchDictionaryMap); + this._sortDatabaseEntriesByIndex(databaseEntries); - for (const databaseDefinition of databaseDefinitions) { - const {index, id} = databaseDefinition; - const source = termList[index].expression; + for (const databaseEntry of databaseEntries) { + const {index, id} = databaseEntry; + const sourceText = termList[index].expression; const target = targetList[index]; - for (const {definitionIds, secondaryDefinitions} of target.sequencedDefinitions) { - if (definitionIds.has(id)) { continue; } + for (const {ids, dictionaryEntries} of target.groups) { + if (ids.has(id)) { continue; } - const definition = await this._createTermDefinitionFromDatabaseDefinition(databaseDefinition, source, source, source, [], false, enabledDictionaryMap); - secondaryDefinitions.push(definition); - definitionIds.add(id); - unsequencedDefinitions.delete(id); + const dictionaryEntry = this._createTermDictionaryEntryFromDatabaseEntry(databaseEntry, sourceText, sourceText, sourceText, [], false, enabledDictionaryMap); + dictionaryEntries.push(dictionaryEntry); + ids.add(id); + ungroupedDictionaryEntriesMap.delete(id); } } } - _getMergedDefinition(relatedDefinitions, secondaryDefinitions) { - const {reasons, source, rawSource} = relatedDefinitions[0]; - const allDefinitions = secondaryDefinitions.length > 0 ? [...relatedDefinitions, ...secondaryDefinitions] : relatedDefinitions; - const score = this._getMaxPrimaryDefinitionScore(allDefinitions); + _groupDictionaryEntriesByHeadword(dictionaryEntries) { + const groups = new Map(); + for (const dictionaryEntry of dictionaryEntries) { + const {inflections, headwords: [{term, reading}]} = dictionaryEntry; + const key = this._createMapKey([term, reading, ...inflections]); + let dictionaryEntries2 = groups.get(key); + if (typeof dictionaryEntries2 === 'undefined') { + dictionaryEntries2 = []; + groups.set(key, dictionaryEntries2); + } + dictionaryEntries2.push(dictionaryEntry); + } - // Merge by glossary - const allExpressions = new Set(); - const allReadings = new Set(); - const glossaryDefinitionGroupMap = new Map(); - for (const definition of allDefinitions) { - const {dictionary, glossary, expressions: [{expression, reading}]} = definition; + const results = []; + for (const dictionaryEntries2 of groups.values()) { + const dictionaryEntry = this._createGroupedDictionaryEntry(dictionaryEntries2, false); + results.push(dictionaryEntry); + } + return results; + } - const key = this._createMapKey([dictionary, ...glossary]); - let group = glossaryDefinitionGroupMap.get(key); - if (typeof group === 'undefined') { - group = { - expressions: new Set(), - readings: new Set(), - definitions: [] - }; - glossaryDefinitionGroupMap.set(key, group); + // Tags + + _getTermTagTargets(dictionaryEntries) { + const tagTargets = []; + for (const {headwords, definitions, pronunciations} of dictionaryEntries) { + this._addTagExpansionTargets(tagTargets, headwords); + this._addTagExpansionTargets(tagTargets, definitions); + for (const {pitches} of pronunciations) { + this._addTagExpansionTargets(tagTargets, pitches); } + } + return tagTargets; + } + + _clearTermTags(dictionaryEntries) { + this._getTermTagTargets(dictionaryEntries); + } + + async _expandTermTags(dictionaryEntries) { + const tagTargets = this._getTermTagTargets(dictionaryEntries); + await this._expandTagGroups(tagTargets); + this._groupTags(tagTargets); + } - allExpressions.add(expression); - allReadings.add(reading); - group.expressions.add(expression); - group.readings.add(reading); - group.definitions.push(definition); + async _expandKanjiTags(dictionaryEntries) { + const tagTargets = []; + this._addTagExpansionTargets(tagTargets, dictionaryEntries); + await this._expandTagGroups(tagTargets); + this._groupTags(tagTargets); + } + + async _expandTagGroups(tagTargets) { + const allItems = []; + const targetMap = new Map(); + for (const {tagGroups, tags} of tagTargets) { + for (const {dictionary, tagNames} of tagGroups) { + let dictionaryItems = targetMap.get(dictionary); + if (typeof dictionaryItems === 'undefined') { + dictionaryItems = new Map(); + targetMap.set(dictionary, dictionaryItems); + } + for (const tagName of tagNames) { + let item = dictionaryItems.get(tagName); + if (typeof item === 'undefined') { + const query = this._getNameBase(tagName); + item = {query, dictionary, tagName, cache: null, databaseTag: null, targets: []}; + dictionaryItems.set(tagName, item); + allItems.push(item); + } + item.targets.push(tags); + } + } } - const glossaryDefinitions = []; - for (const {expressions, readings, definitions} of glossaryDefinitionGroupMap.values()) { - const glossaryDefinition = this._createMergedGlossaryTermDefinition( - source, - rawSource, - definitions, - expressions, - readings, - allExpressions, - allReadings - ); - glossaryDefinitions.push(glossaryDefinition); + const nonCachedItems = []; + const tagCache = this._tagCache; + for (const [dictionary, dictionaryItems] of targetMap.entries()) { + let cache = tagCache.get(dictionary); + if (typeof cache === 'undefined') { + cache = new Map(); + tagCache.set(dictionary, cache); + } + for (const item of dictionaryItems.values()) { + const databaseTag = cache.get(item.query); + if (typeof databaseTag !== 'undefined') { + item.databaseTag = databaseTag; + } else { + item.cache = cache; + nonCachedItems.push(item); + } + } } - this._sortDefinitions(glossaryDefinitions, false); - const termDetailsList = this._createTermDetailsList(allDefinitions); + const nonCachedItemCount = nonCachedItems.length; + if (nonCachedItemCount > 0) { + const databaseTags = await this._database.findTagMetaBulk(nonCachedItems); + for (let i = 0; i < nonCachedItemCount; ++i) { + const item = nonCachedItems[i]; + let databaseTag = databaseTags[i]; + if (typeof databaseTag === 'undefined') { databaseTag = null; } + item.databaseTag = databaseTag; + item.cache.set(item.query, databaseTag); + } + } - return this._createMergedTermDefinition( - source, - rawSource, - glossaryDefinitions, - [...allExpressions], - [...allReadings], - termDetailsList, - reasons, - score - ); + for (const {dictionary, tagName, databaseTag, targets} of allItems) { + for (const tags of targets) { + tags.push(this._createTag(databaseTag, tagName, dictionary)); + } + } } - _getUniqueDefinitionTags(definitions) { - const definitionTagsMap = new Map(); - for (const {definitionTags} of definitions) { - for (const tag of definitionTags) { - const {name} = tag; - if (definitionTagsMap.has(name)) { continue; } - definitionTagsMap.set(name, this._cloneTag(tag)); + _groupTags(tagTargets) { + const stringComparer = this._stringComparer; + const compare = (v1, v2) => { + const i = v1.order - v2.order; + return i !== 0 ? i : stringComparer.compare(v1.name, v2.name); + }; + + for (const {tags} of tagTargets) { + if (tags.length <= 1) { continue; } + this._mergeSimilarTags(tags); + tags.sort(compare); + } + } + + _addTagExpansionTargets(tagTargets, objects) { + for (const value of objects) { + const tagGroups = value.tags; + if (tagGroups.length === 0) { continue; } + const tags = []; + value.tags = tags; + tagTargets.push({tagGroups, tags}); + } + } + + _mergeSimilarTags(tags) { + let tagCount = tags.length; + for (let i = 0; i < tagCount; ++i) { + const tag1 = tags[i]; + const {category, name} = tag1; + for (let j = i + 1; j < tagCount; ++j) { + const tag2 = tags[j]; + if (tag2.name !== name || tag2.category !== category) { continue; } + // Merge tag + tag1.order = Math.min(tag1.order, tag2.order); + tag1.score = Math.max(tag1.score, tag2.score); + tag1.dictionaries.push(...tag2.dictionaries); + this._addUniqueStrings(tag1.content, tag2.content); + tags.splice(j, 1); + --tagCount; + --j; } } - return [...definitionTagsMap.values()]; + } + + _getTagNamesWithCategory(tags, category) { + const results = []; + for (const tag of tags) { + if (tag.category !== category) { continue; } + results.push(tag.name); + } + results.sort(); + return results; } _flagRedundantDefinitionTags(definitions) { + if (definitions.length === 0) { return; } + let lastDictionary = null; let lastPartOfSpeech = ''; const removeCategoriesSet = new Set(); - for (const {dictionary, definitionTags} of definitions) { - const partOfSpeech = this._createMapKey(this._getTagNamesWithCategory(definitionTags, 'partOfSpeech')); + for (const {dictionary, tags} of definitions) { + const partOfSpeech = this._createMapKey(this._getTagNamesWithCategory(tags, 'partOfSpeech')); if (lastDictionary !== dictionary) { lastDictionary = dictionary; @@ -566,87 +654,46 @@ class Translator { } if (removeCategoriesSet.size > 0) { - this._flagTagsWithCategoryAsRedundant(definitionTags, removeCategoriesSet); + for (const tag of tags) { + if (removeCategoriesSet.has(tag.category)) { + tag.redundant = true; + } + } removeCategoriesSet.clear(); } } } - /** - * Groups definitions with the same [source, expression, reading, reasons]. - * @param definitions An array of 'term' definitions. - * @returns An array of 'termGrouped' definitions. - */ - _groupTerms(definitions) { - const groups = new Map(); - for (const definition of definitions) { - const {source, reasons, expressions: [{expression, reading}]} = definition; - const key = this._createMapKey([source, expression, reading, ...reasons]); - let groupDefinitions = groups.get(key); - if (typeof groupDefinitions === 'undefined') { - groupDefinitions = []; - groups.set(key, groupDefinitions); - } - - groupDefinitions.push(definition); - } - - const results = []; - for (const groupDefinitions of groups.values()) { - this._sortDefinitions(groupDefinitions, false); - const definition = this._createGroupedTermDefinition(groupDefinitions); - results.push(definition); - } - - return results; - } - - _convertTermDefinitionsToMergedGlossaryTermDefinitions(definitions) { - const convertedDefinitions = []; - for (const definition of definitions) { - const {source, rawSource, expression, reading} = definition; - const expressions = new Set([expression]); - const readings = new Set([reading]); - const convertedDefinition = this._createMergedGlossaryTermDefinition(source, rawSource, [definition], expressions, readings, expressions, readings); - convertedDefinitions.push(convertedDefinition); - } - return convertedDefinitions; - } - - // Metadata building + // Metadata - async _buildTermMeta(definitions, enabledDictionaryMap) { - const allDefinitions = this._getAllDefinitions(definitions); - const expressionMap = new Map(); - const expressionValues = []; - const expressionKeys = []; + async _addTermMeta(dictionaryEntries, enabledDictionaryMap) { + const headwordMap = new Map(); + const headwordMapKeys = []; + const headwordReadingMaps = []; - for (const {expressions, frequencies: frequencies1, pitches: pitches1} of allDefinitions) { - for (let i = 0, ii = expressions.length; i < ii; ++i) { - const {expression, reading, frequencies: frequencies2, pitches: pitches2} = expressions[i]; - let readingMap = expressionMap.get(expression); + for (const {headwords, pronunciations, frequencies} of dictionaryEntries) { + for (let i = 0, ii = headwords.length; i < ii; ++i) { + const {term, reading} = headwords[i]; + let readingMap = headwordMap.get(term); if (typeof readingMap === 'undefined') { readingMap = new Map(); - expressionMap.set(expression, readingMap); - expressionValues.push(readingMap); - expressionKeys.push(expression); + headwordMap.set(term, readingMap); + headwordMapKeys.push(term); + headwordReadingMaps.push(readingMap); } let targets = readingMap.get(reading); if (typeof targets === 'undefined') { targets = []; readingMap.set(reading, targets); } - targets.push( - {frequencies: frequencies1, pitches: pitches1, index: i}, - {frequencies: frequencies2, pitches: pitches2, index: i} - ); + targets.push({headwordIndex: i, pronunciations, frequencies}); } } - const metas = await this._database.findTermMetaBulk(expressionKeys, enabledDictionaryMap); - for (const {expression, mode, data, dictionary, index} of metas) { - const dictionaryOrder = this._getDictionaryOrder(dictionary, enabledDictionaryMap); - const map2 = expressionValues[index]; + const metas = await this._database.findTermMetaBulk(headwordMapKeys, enabledDictionaryMap); + for (const {mode, data, dictionary, index} of metas) { + const {index: dictionaryIndex, priority: dictionaryPriority} = this._getDictionaryOrder(dictionary, enabledDictionaryMap); + const map2 = headwordReadingMaps[index]; for (const [reading, targets] of map2.entries()) { switch (mode) { case 'freq': @@ -657,171 +704,124 @@ class Translator { if (data.reading !== reading) { continue; } frequency = data.frequency; } - for (const {frequencies, index: expressionIndex} of targets) { - frequencies.push({index: frequencies.length, expressionIndex, dictionary, dictionaryOrder, expression, reading, hasReading, frequency}); + for (const {frequencies, headwordIndex} of targets) { + frequencies.push(this._createTermFrequency( + frequencies.length, + headwordIndex, + dictionary, + dictionaryIndex, + dictionaryPriority, + hasReading, + frequency + )); } } break; case 'pitch': { if (data.reading !== reading) { continue; } - const pitches2 = []; - for (let {position, tags} of data.pitches) { - tags = Array.isArray(tags) ? await this._expandTags(tags, dictionary) : []; - pitches2.push({position, tags}); + const pitches = []; + for (const {position, tags} of data.pitches) { + const tags2 = []; + if (Array.isArray(tags) && tags.length > 0) { + tags2.push(this._createTagGroup(dictionary, tags)); + } + pitches.push({position, tags: tags2}); } - for (const {pitches, index: expressionIndex} of targets) { - pitches.push({index: pitches.length, expressionIndex, dictionary, dictionaryOrder, expression, reading, pitches: pitches2}); + for (const {pronunciations, headwordIndex} of targets) { + pronunciations.push(this._createTermPronunciation( + pronunciations.length, + headwordIndex, + dictionary, + dictionaryIndex, + dictionaryPriority, + pitches + )); } } break; } } } - - for (const definition of allDefinitions) { - this._sortTermDefinitionMeta(definition); - } } - async _buildKanjiMeta(definitions, enabledDictionaryMap) { + async _addKanjiMeta(dictionaryEntries, enabledDictionaryMap) { const kanjiList = []; - for (const {character} of definitions) { + for (const {character} of dictionaryEntries) { kanjiList.push(character); } const metas = await this._database.findKanjiMetaBulk(kanjiList, enabledDictionaryMap); for (const {character, mode, data, dictionary, index} of metas) { - const dictionaryOrder = this._getDictionaryOrder(dictionary, enabledDictionaryMap); + const {index: dictionaryIndex, priority: dictionaryPriority} = this._getDictionaryOrder(dictionary, enabledDictionaryMap); switch (mode) { case 'freq': { - const {frequencies} = definitions[index]; - frequencies.push({index: frequencies.length, dictionary, dictionaryOrder, character, frequency: data}); + const {frequencies} = dictionaryEntries[index]; + frequencies.push(this._createKanjiFrequency( + frequencies.length, + dictionary, + dictionaryIndex, + dictionaryPriority, + character, + data + )); } break; } } - - for (const definition of definitions) { - this._sortKanjiDefinitionMeta(definition); - } } - async _expandTags(names, dictionary) { - const tagMetaList = await this._getTagMetaList(names, dictionary); - const results = []; - for (let i = 0, ii = tagMetaList.length; i < ii; ++i) { - const meta = tagMetaList[i]; - const name = names[i]; - const {category, notes, order, score} = (meta !== null ? meta : {}); - const tag = this._createTag(name, category, notes, order, score, dictionary, false); - results.push(tag); + async _expandKanjiStats(stats, dictionary) { + const statsEntries = Object.entries(stats); + const items = []; + for (const [name] of statsEntries) { + const query = this._getNameBase(name); + items.push({query, dictionary}); } - return results; - } - async _expandStats(items, dictionary) { - const names = Object.keys(items); - const tagMetaList = await this._getTagMetaList(names, dictionary); + const databaseInfos = await this._database.findTagMetaBulk(items); const statsGroups = new Map(); - for (let i = 0; i < names.length; ++i) { - const name = names[i]; - const meta = tagMetaList[i]; - if (meta === null) { continue; } + for (let i = 0, ii = statsEntries.length; i < ii; ++i) { + const databaseInfo = databaseInfos[i]; + if (databaseInfo === null) { continue; } - const {category, notes, order, score} = meta; + const [name, value] = statsEntries[i]; + const {category} = databaseInfo; let group = statsGroups.get(category); if (typeof group === 'undefined') { group = []; statsGroups.set(category, group); } - const value = items[name]; - const stat = this._createKanjiStat(name, category, notes, order, score, dictionary, value); - group.push(stat); + group.push(this._createKanjiStat(name, value, databaseInfo, dictionary)); } - const stats = {}; + const groupedStats = {}; for (const [category, group] of statsGroups.entries()) { this._sortKanjiStats(group); - stats[category] = group; + groupedStats[category] = group; } - return stats; + return groupedStats; } - async _getTagMetaList(names, dictionary) { - const tagMetaList = []; - let cache = this._tagCache.get(dictionary); - if (typeof cache === 'undefined') { - cache = new Map(); - this._tagCache.set(dictionary, cache); - } - - for (const name of names) { - const base = this._getNameBase(name); - - let tagMeta = cache.get(base); - if (typeof tagMeta === 'undefined') { - tagMeta = await this._database.findTagForTitle(base, dictionary); - cache.set(base, tagMeta); - } - - tagMetaList.push(tagMeta); - } - - return tagMetaList; + _sortKanjiStats(stats) { + if (stats.length <= 1) { return; } + const stringComparer = this._stringComparer; + stats.sort((v1, v2) => { + const i = v1.order - v2.order; + return (i !== 0) ? i : stringComparer.compare(v1.content, v2.content); + }); } - // Simple helpers + // Helpers _getNameBase(name) { const pos = name.indexOf(':'); return (pos >= 0 ? name.substring(0, pos) : name); } - _getSearchableText(text, allowAlphanumericCharacters) { - if (allowAlphanumericCharacters) { - return text; - } - - const jp = this._japaneseUtil; - let newText = ''; - for (const c of text) { - if (!jp.isCodePointJapanese(c.codePointAt(0))) { - break; - } - newText += c; - } - return newText; - } - - _getTextOptionEntryVariants(value) { - switch (value) { - case 'true': return [true]; - case 'variant': return [false, true]; - default: return [false]; - } - } - - _getCollapseEmphaticOptions(options) { - const collapseEmphaticOptions = [[false, false]]; - switch (options.collapseEmphaticSequences) { - case 'true': - collapseEmphaticOptions.push([true, false]); - break; - case 'full': - collapseEmphaticOptions.push([true, false], [true, true]); - break; - } - return collapseEmphaticOptions; - } - - _getTextReplacementsVariants(options) { - return options.textReplacements; - } - _getSecondarySearchDictionaryMap(enabledDictionaryMap) { const secondarySearchDictionaryMap = new Map(); for (const [dictionary, details] of enabledDictionaryMap.entries()) { @@ -837,58 +837,6 @@ class Translator { return {index, priority}; } - _getTagNamesWithCategory(tags, category) { - const results = []; - for (const tag of tags) { - if (tag.category !== category) { continue; } - results.push(tag.name); - } - results.sort(); - return results; - } - - _flagTagsWithCategoryAsRedundant(tags, removeCategoriesSet) { - for (const tag of tags) { - if (removeCategoriesSet.has(tag.category)) { - tag.redundant = true; - } - } - } - - _getUniqueDictionaryNames(definitions) { - const uniqueDictionaryNames = new Set(); - for (const {dictionaryNames} of definitions) { - for (const dictionaryName of dictionaryNames) { - uniqueDictionaryNames.add(dictionaryName); - } - } - return [...uniqueDictionaryNames]; - } - - _getUniqueTermTags(definitions) { - const newTermTags = []; - if (definitions.length <= 1) { - for (const {termTags} of definitions) { - for (const tag of termTags) { - newTermTags.push(this._cloneTag(tag)); - } - } - } else { - const tagsSet = new Set(); - let checkTagsMap = false; - for (const {termTags} of definitions) { - for (const tag of termTags) { - const key = this._getTagMapKey(tag); - if (checkTagsMap && tagsSet.has(key)) { continue; } - tagsSet.add(key); - newTermTags.push(this._cloneTag(tag)); - } - checkTagsMap = true; - } - } - return newTermTags; - } - *_getArrayVariants(arrayVariants) { const ii = arrayVariants.length; @@ -909,110 +857,18 @@ class Translator { } } - _areSetsEqual(set1, set2) { - if (set1.size !== set2.size) { - return false; - } - - for (const value of set1) { - if (!set2.has(value)) { - return false; - } - } - - return true; - } - - _getSetIntersection(set1, set2) { - const result = []; - for (const value of set1) { - if (set2.has(value)) { - result.push(value); - } - } - return result; - } - - _getAllDefinitions(definitions) { - definitions = [...definitions]; - for (let i = 0; i < definitions.length; ++i) { - const childDefinitions = definitions[i].definitions; - if (Array.isArray(childDefinitions)) { - definitions.push(...childDefinitions); - } - } - return definitions; - } - - // Reduction functions - - _getSourceTermMatchCountSum(definitions) { - let result = 0; - for (const {sourceTermExactMatchCount} of definitions) { - result += sourceTermExactMatchCount; - } - return result; - } - - _getMaxDefinitionScore(definitions) { - let result = Number.MIN_SAFE_INTEGER; - for (const {score} of definitions) { - if (score > result) { result = score; } - } - return result; - } - - _getMaxPrimaryDefinitionScore(definitions) { - let result = Number.MIN_SAFE_INTEGER; - for (const {isPrimary, score} of definitions) { - if (isPrimary && score > result) { result = score; } - } - return result; - } - - _getBestDictionaryOrder(definitions) { - let index = Number.MAX_SAFE_INTEGER; - let priority = Number.MIN_SAFE_INTEGER; - for (const {dictionaryOrder: {index: index2, priority: priority2}} of definitions) { - if (index2 < index) { index = index2; } - if (priority2 > priority) { priority = priority2; } - } - return {index, priority}; - } - - // Common data creation and cloning functions - - _cloneTag(tag) { - const {name, category, notes, order, score, dictionary, redundant} = tag; - return this._createTag(name, category, notes, order, score, dictionary, redundant); - } - - _getTagMapKey(tag) { - const {name, category, notes} = tag; - return this._createMapKey([name, category, notes]); - } - _createMapKey(array) { return JSON.stringify(array); } - _createTag(name, category, notes, order, score, dictionary, redundant) { - return { - name, - category: (typeof category === 'string' && category.length > 0 ? category : 'default'), - notes: (typeof notes === 'string' ? notes : ''), - order: (typeof order === 'number' ? order : 0), - score: (typeof score === 'number' ? score : 0), - dictionary: (typeof dictionary === 'string' ? dictionary : null), - redundant - }; - } + // Kanji data - _createKanjiStat(name, category, notes, order, score, dictionary, value) { + _createKanjiStat(name, value, databaseInfo, dictionary) { + const {category, notes, order, score} = databaseInfo; return { name, category: (typeof category === 'string' && category.length > 0 ? category : 'default'), - notes: (typeof notes === 'string' ? notes : ''), + content: (typeof notes === 'string' ? notes : ''), order: (typeof order === 'number' ? order : 0), score: (typeof score === 'number' ? score : 0), dictionary: (typeof dictionary === 'string' ? dictionary : null), @@ -1020,322 +876,404 @@ class Translator { }; } - _createKanjiDefinition(character, dictionary, onyomi, kunyomi, glossary, tags, stats) { + _createKanjiFrequency(index, dictionary, dictionaryIndex, dictionaryPriority, character, frequency) { + return {index, dictionary, dictionaryIndex, dictionaryPriority, character, frequency}; + } + + _createKanjiDictionaryEntry(character, dictionary, onyomi, kunyomi, tags, stats, definitions) { return { type: 'kanji', character, dictionary, onyomi, kunyomi, - glossary, tags, stats, + definitions, frequencies: [] }; } - async _createTermDefinitionFromDatabaseDefinition(databaseDefinition, source, rawSource, sourceTerm, reasons, isPrimary, enabledDictionaryMap) { - const {expression, reading: rawReading, definitionTags, termTags, glossary, score, dictionary, id, sequence} = databaseDefinition; - const reading = (rawReading.length > 0 ? rawReading : expression); - const dictionaryOrder = this._getDictionaryOrder(dictionary, enabledDictionaryMap); - const termTagsExpanded = await this._expandTags(termTags, dictionary); - const definitionTagsExpanded = await this._expandTags(definitionTags, dictionary); + // Term data - this._sortTags(definitionTagsExpanded); - this._sortTags(termTagsExpanded); + _createTag(databaseTag, name, dictionary) { + const {category, notes, order, score} = (databaseTag !== null ? databaseTag : {}); + return { + name, + category: (typeof category === 'string' && category.length > 0 ? category : 'default'), + order: (typeof order === 'number' ? order : 0), + score: (typeof score === 'number' ? score : 0), + content: (typeof notes === 'string' && notes.length > 0 ? [notes] : []), + dictionaries: [dictionary], + redundant: false + }; + } - const termDetailsList = [this._createTermDetails(sourceTerm, expression, reading, termTagsExpanded)]; - const sourceTermExactMatchCount = (sourceTerm === expression ? 1 : 0); + _createTagGroup(dictionary, tagNames) { + return {dictionary, tagNames}; + } + + _createSource(originalText, transformedText, deinflectedText, isPrimary) { + return {originalText, transformedText, deinflectedText, isPrimary}; + } + + _createTermHeadword(index, term, reading, sources, tags) { + return {index, term, reading, sources, tags}; + } + + _createTermDefinition(index, headwordIndices, dictionary, tags, entries) { + return {index, headwordIndices, dictionary, tags, entries}; + } + _createTermPronunciation(index, headwordIndex, dictionary, dictionaryIndex, dictionaryPriority, pitches) { + return {index, headwordIndex, dictionary, dictionaryIndex, dictionaryPriority, pitches}; + } + + _createTermFrequency(index, headwordIndex, dictionary, dictionaryIndex, dictionaryPriority, hasReading, frequency) { + return {index, headwordIndex, dictionary, dictionaryIndex, dictionaryPriority, hasReading, frequency}; + } + + _createTermDictionaryEntry(id, isPrimary, sequence, inflections, score, dictionaryIndex, dictionaryPriority, sourceTermExactMatchCount, maxDeinflectedTextLength, headwords, definitions) { return { type: 'term', id, - source, - rawSource, - sourceTerm, - reasons, - score, isPrimary, sequence, - dictionary, - dictionaryOrder, - dictionaryNames: [dictionary], - expression, - reading, - expressions: termDetailsList, - glossary, - definitionTags: definitionTagsExpanded, - termTags: termTagsExpanded, - // definitions - frequencies: [], - pitches: [], - // only - sourceTermExactMatchCount - }; - } - - /** - * Creates a grouped definition from an array of 'term' definitions. - * @param definitions An array of 'term' definitions. - * @returns A single 'termGrouped' definition. - */ - _createGroupedTermDefinition(definitions) { - const {reasons, source, rawSource, sourceTerm, expressions: [{expression, reading}]} = definitions[0]; - const score = this._getMaxDefinitionScore(definitions); - const dictionaryOrder = this._getBestDictionaryOrder(definitions); - const dictionaryNames = this._getUniqueDictionaryNames(definitions); - const termTags = this._getUniqueTermTags(definitions); - const termDetailsList = [this._createTermDetails(sourceTerm, expression, reading, termTags)]; - const sourceTermExactMatchCount = (sourceTerm === expression ? 1 : 0); - return { - type: 'termGrouped', - // id - source, - rawSource, - sourceTerm, - reasons: [...reasons], + inflections, score, - // isPrimary - // sequence - dictionary: dictionaryNames[0], - dictionaryOrder, - dictionaryNames, - expression, - reading, - expressions: termDetailsList, - // glossary - // definitionTags - termTags, - definitions, // type: 'term' - frequencies: [], - pitches: [], - // only - sourceTermExactMatchCount + dictionaryIndex, + dictionaryPriority, + sourceTermExactMatchCount, + maxDeinflectedTextLength, + headwords, + definitions, + pronunciations: [], + frequencies: [] }; } - _createMergedTermDefinition(source, rawSource, definitions, expressions, readings, termDetailsList, reasons, score) { - const dictionaryOrder = this._getBestDictionaryOrder(definitions); - const sourceTermExactMatchCount = this._getSourceTermMatchCountSum(definitions); - const dictionaryNames = this._getUniqueDictionaryNames(definitions); - return { - type: 'termMerged', - // id - source, - rawSource, - // sourceTerm + _createTermDictionaryEntryFromDatabaseEntry(databaseEntry, originalText, transformedText, deinflectedText, reasons, isPrimary, enabledDictionaryMap) { + const {expression, reading: rawReading, definitionTags, termTags, glossary, score, dictionary, id, sequence} = databaseEntry; + const reading = (rawReading.length > 0 ? rawReading : expression); + const {index: dictionaryIndex, priority: dictionaryPriority} = this._getDictionaryOrder(dictionary, enabledDictionaryMap); + const sourceTermExactMatchCount = (isPrimary && deinflectedText === expression ? 1 : 0); + const source = this._createSource(originalText, transformedText, deinflectedText, isPrimary); + const maxDeinflectedTextLength = deinflectedText.length; + + const headwordTagGroups = []; + const definitionTagGroups = []; + if (termTags.length > 0) { headwordTagGroups.push(this._createTagGroup(dictionary, termTags)); } + if (definitionTags.length > 0) { definitionTagGroups.push(this._createTagGroup(dictionary, definitionTags)); } + + return this._createTermDictionaryEntry( + id, + isPrimary, + sequence, reasons, score, - // isPrimary - // sequence - dictionary: dictionaryNames[0], - dictionaryOrder, - dictionaryNames, - expression: expressions, - reading: readings, - expressions: termDetailsList, - // glossary - // definitionTags - // termTags - definitions, // type: 'termMergedByGlossary' - frequencies: [], - pitches: [], - // only - sourceTermExactMatchCount - }; + dictionaryIndex, + dictionaryPriority, + sourceTermExactMatchCount, + maxDeinflectedTextLength, + [this._createTermHeadword(0, expression, reading, [source], headwordTagGroups)], + [this._createTermDefinition(0, [0], dictionary, definitionTagGroups, glossary)] + ); } - _createMergedGlossaryTermDefinition(source, rawSource, definitions, expressions, readings, allExpressions, allReadings) { - const only = []; - if (!this._areSetsEqual(expressions, allExpressions)) { - only.push(...this._getSetIntersection(expressions, allExpressions)); + _createGroupedDictionaryEntry(dictionaryEntries, checkDuplicateDefinitions) { + // Headwords are generated before sorting, so that the order of dictionaryEntries can be maintained + const definitionEntries = []; + const headwords = new Map(); + for (const dictionaryEntry of dictionaryEntries) { + const headwordIndexMap = this._addTermHeadwords(headwords, dictionaryEntry.headwords); + definitionEntries.push({index: definitionEntries.length, dictionaryEntry, headwordIndexMap}); } - if (!this._areSetsEqual(readings, allReadings)) { - only.push(...this._getSetIntersection(readings, allReadings)); + + // Sort + if (definitionEntries.length > 1) { + this._sortTermDefinitionEntries(definitionEntries); + } else { + checkDuplicateDefinitions = false; } - const sourceTermExactMatchCount = this._getSourceTermMatchCountSum(definitions); - const dictionaryNames = this._getUniqueDictionaryNames(definitions); + // Merge dictionary entry data + let score = Number.MIN_SAFE_INTEGER; + let dictionaryIndex = Number.MAX_SAFE_INTEGER; + let dictionaryPriority = Number.MIN_SAFE_INTEGER; + let maxDeinflectedTextLength = 0; + let sourceTermExactMatchCount = 0; + let isPrimary = false; + const definitions = []; + const definitionsMap = checkDuplicateDefinitions ? new Map() : null; + let inflections = null; + + for (const {dictionaryEntry, headwordIndexMap} of definitionEntries) { + score = Math.max(score, dictionaryEntry.score); + dictionaryIndex = Math.min(dictionaryIndex, dictionaryEntry.dictionaryIndex); + dictionaryPriority = Math.max(dictionaryPriority, dictionaryEntry.dictionaryPriority); + if (dictionaryEntry.isPrimary) { + isPrimary = true; + maxDeinflectedTextLength = Math.max(maxDeinflectedTextLength, dictionaryEntry.maxDeinflectedTextLength); + sourceTermExactMatchCount += dictionaryEntry.sourceTermExactMatchCount; + const dictionaryEntryInflections = dictionaryEntry.inflections; + if (inflections === null || dictionaryEntryInflections.length < inflections.length) { + inflections = dictionaryEntryInflections; + } + } + if (checkDuplicateDefinitions) { + this._addTermDefinitions2(definitions, definitionsMap, dictionaryEntry.definitions, headwordIndexMap); + } else { + this._addTermDefinitions(definitions, dictionaryEntry.definitions, headwordIndexMap); + } + } - const termDetailsList = this._createTermDetailsList(definitions); + return this._createTermDictionaryEntry( + -1, + isPrimary, + -1, + inflections !== null ? inflections : [], + score, + dictionaryIndex, + dictionaryPriority, + sourceTermExactMatchCount, + maxDeinflectedTextLength, + [...headwords.values()], + definitions + ); + } - const definitionTags = this._getUniqueDefinitionTags(definitions); - this._sortTags(definitionTags); + // Data collection addition functions - const {glossary} = definitions[0]; - const score = this._getMaxDefinitionScore(definitions); - const dictionaryOrder = this._getBestDictionaryOrder(definitions); - return { - type: 'termMergedByGlossary', - // id - source, - rawSource, - // sourceTerm - reasons: [], - score, - // isPrimary - // sequence - dictionary: dictionaryNames[0], - dictionaryOrder, - dictionaryNames, - expression: [...expressions], - reading: [...readings], - expressions: termDetailsList, - glossary: [...glossary], - definitionTags, - // termTags - definitions, // type: 'term'; contains duplicate data - frequencies: [], - pitches: [], - only, - sourceTermExactMatchCount - }; + _addUniqueStrings(list, newItems) { + for (const item of newItems) { + if (!list.includes(item)) { + list.push(item); + } + } } - /** - * Creates a list of term details from an array of 'term' definitions. - * @param definitions An array of 'term' definitions. - * @returns An array of term details. - */ - _createTermDetailsList(definitions) { - const termInfoMap = new Map(); - for (const {expression, reading, sourceTerm, termTags} of definitions) { - let readingMap = termInfoMap.get(expression); - if (typeof readingMap === 'undefined') { - readingMap = new Map(); - termInfoMap.set(expression, readingMap); + _addUniqueSources(sources, newSources) { + if (newSources.length === 0) { return; } + if (sources.length === 0) { + sources.push(...newSources); + return; + } + for (const newSource of newSources) { + const {originalText, transformedText, deinflectedText, isPrimary} = newSource; + let has = false; + for (const source of sources) { + if ( + source.deinflectedText === deinflectedText && + source.transformedText === transformedText && + source.originalText === originalText + ) { + if (isPrimary) { source.isPrimary = true; } + has = true; + break; + } } + if (!has) { + sources.push(newSource); + } + } + } - let termInfo = readingMap.get(reading); - if (typeof termInfo === 'undefined') { - termInfo = { - sourceTerm, - termTagsMap: new Map() - }; - readingMap.set(reading, termInfo); + _addUniqueTagGroups(tagGroups, newTagGroups) { + if (newTagGroups.length === 0) { return; } + for (const newTagGroup of newTagGroups) { + const {dictionary} = newTagGroup; + const ii = tagGroups.length; + if (ii > 0) { + let i = 0; + for (; i < ii; ++i) { + const tagGroup = tagGroups[i]; + if (tagGroup.dictionary === dictionary) { + this._addUniqueStrings(tagGroup.tagNames, newTagGroup.tagNames); + break; + } + } + if (i < ii) { continue; } } + tagGroups.push(newTagGroup); + } + } - const {termTagsMap} = termInfo; - for (const tag of termTags) { - const {name} = tag; - if (termTagsMap.has(name)) { continue; } - termTagsMap.set(name, this._cloneTag(tag)); + _addTermHeadwords(headwordsMap, headwords) { + const headwordIndexMap = []; + for (const {term, reading, sources, tags} of headwords) { + const key = this._createMapKey([term, reading]); + let headword = headwordsMap.get(key); + if (typeof headword === 'undefined') { + headword = this._createTermHeadword(headwordsMap.size, term, reading, [], []); + headwordsMap.set(key, headword); } + this._addUniqueSources(headword.sources, sources); + this._addUniqueTagGroups(headword.tags, tags); + headwordIndexMap.push(headword.index); + } + return headwordIndexMap; + } + + _addUniqueTermHeadwordIndex(headwordIndices, headwordIndex) { + let end = headwordIndices.length; + if (end === 0) { + headwordIndices.push(headwordIndex); + return; } - const termDetailsList = []; - for (const [expression, readingMap] of termInfoMap.entries()) { - for (const [reading, {termTagsMap, sourceTerm}] of readingMap.entries()) { - const termTags = [...termTagsMap.values()]; - this._sortTags(termTags); - termDetailsList.push(this._createTermDetails(sourceTerm, expression, reading, termTags)); + let start = 0; + while (start < end) { + const mid = Math.floor((start + end) / 2); + const value = headwordIndices[mid]; + if (headwordIndex === value) { return; } + if (headwordIndex > value) { + start = mid + 1; + } else { + end = mid; } } - return termDetailsList; + + if (headwordIndex === headwordIndices[start]) { return; } + headwordIndices.splice(start, 0, headwordIndex); } - _createTermDetails(sourceTerm, expression, reading, termTags) { - return { - sourceTerm, - expression, - reading, - termTags, - frequencies: [], - pitches: [] - }; + _addTermDefinitions(definitions, newDefinitions, headwordIndexMap) { + for (const {headwordIndices, dictionary, tags, entries} of newDefinitions) { + const headwordIndicesNew = []; + for (const headwordIndex of headwordIndices) { + headwordIndicesNew.push(headwordIndexMap[headwordIndex]); + } + definitions.push(this._createTermDefinition(definitions.length, headwordIndicesNew, dictionary, tags, entries)); + } } - // Sorting functions + _addTermDefinitions2(definitions, definitionsMap, newDefinitions, headwordIndexMap) { + for (const {headwordIndices, dictionary, tags, entries} of newDefinitions) { + const key = this._createMapKey([dictionary, ...entries]); + let definition = definitionsMap.get(key); + if (typeof definition === 'undefined') { + definition = this._createTermDefinition(definitions.length, [], dictionary, [], [...entries]); + definitions.push(definition); + definitionsMap.set(key, definition); + } - _sortTags(tags) { - if (tags.length <= 1) { return; } - const stringComparer = this._stringComparer; - tags.sort((v1, v2) => { - const i = v1.order - v2.order; - if (i !== 0) { return i; } + const newHeadwordIndices = definition.headwordIndices; + for (const headwordIndex of headwordIndices) { + this._addUniqueTermHeadwordIndex(newHeadwordIndices, headwordIndexMap[headwordIndex]); + } + this._addUniqueTagGroups(definition.tags, tags); + } + } - return stringComparer.compare(v1.name, v2.name); - }); + // Sorting functions + + _sortDatabaseEntriesByIndex(databaseEntries) { + if (databaseEntries.length <= 1) { return; } + databaseEntries.sort((a, b) => a.index - b.index); } - _sortDefinitions(definitions, topLevel=true) { - if (definitions.length <= 1) { return; } + _sortTermDictionaryEntries(dictionaryEntries) { const stringComparer = this._stringComparer; const compareFunction = (v1, v2) => { - let i; - if (topLevel) { - // Sort by length of source term - i = v2.source.length - v1.source.length; - if (i !== 0) { return i; } + // Sort by length of source term + let i = v2.maxDeinflectedTextLength - v1.maxDeinflectedTextLength; + if (i !== 0) { return i; } - // Sort by the number of inflection reasons - i = v1.reasons.length - v2.reasons.length; - if (i !== 0) { return i; } + // Sort by the number of inflection reasons + i = v1.inflections.length - v2.inflections.length; + if (i !== 0) { return i; } - // Sort by how many terms exactly match the source (e.g. for exact kana prioritization) - i = v2.sourceTermExactMatchCount - v1.sourceTermExactMatchCount; - if (i !== 0) { return i; } - } + // Sort by how many terms exactly match the source (e.g. for exact kana prioritization) + i = v2.sourceTermExactMatchCount - v1.sourceTermExactMatchCount; + if (i !== 0) { return i; } // Sort by dictionary priority - i = v2.dictionaryOrder.priority - v1.dictionaryOrder.priority; + i = v2.dictionaryPriority - v1.dictionaryPriority; if (i !== 0) { return i; } // Sort by term score i = v2.score - v1.score; if (i !== 0) { return i; } - // Sort by expression string comparison (skip if either expression is not a string, e.g. array) - const expression1 = v1.expression; - const expression2 = v2.expression; - if (typeof expression1 === 'string' && typeof expression2 === 'string') { - i = expression2.length - expression1.length; + // Sort by expression text + const headwords1 = v1.headwords; + const headwords2 = v2.headwords; + for (let j = 0, jj = Math.min(headwords1.length, headwords2.length); j < jj; ++j) { + const term1 = headwords1[j].term; + const term2 = headwords2[j].term; + + i = term2.length - term1.length; if (i !== 0) { return i; } - i = stringComparer.compare(expression1, expression2); + i = stringComparer.compare(term1, term2); if (i !== 0) { return i; } } // Sort by dictionary order - i = v1.dictionaryOrder.index - v2.dictionaryOrder.index; + i = v1.dictionaryIndex - v2.dictionaryIndex; return i; }; - definitions.sort(compareFunction); + dictionaryEntries.sort(compareFunction); } - _sortDatabaseDefinitionsByIndex(definitions) { - if (definitions.length <= 1) { return; } - definitions.sort((a, b) => a.index - b.index); - } + _sortTermDefinitionEntries(definitionEntries) { + const compareFunction = (e1, e2) => { + const v1 = e1.dictionaryEntry; + const v2 = e2.dictionaryEntry; - _sortDefinitionsById(definitions) { - if (definitions.length <= 1) { return; } - definitions.sort((a, b) => a.id - b.id); - } + // Sort by dictionary priority + let i = v2.dictionaryPriority - v1.dictionaryPriority; + if (i !== 0) { return i; } - _sortKanjiStats(stats) { - if (stats.length <= 1) { return; } - const stringComparer = this._stringComparer; - stats.sort((v1, v2) => { - const i = v1.order - v2.order; + // Sort by term score + i = v2.score - v1.score; if (i !== 0) { return i; } - return stringComparer.compare(v1.notes, v2.notes); - }); + // Sort by definition headword index + const definitions1 = v1.definitions; + const definitions2 = v2.definitions; + const headwordIndexMap1 = e1.headwordIndexMap; + const headwordIndexMap2 = e2.headwordIndexMap; + for (let j = 0, jj = Math.min(definitions1.length, definitions2.length); j < jj; ++j) { + const headwordIndices1 = definitions1[j].headwordIndices; + const headwordIndices2 = definitions2[j].headwordIndices; + const kk = headwordIndices1.length; + i = headwordIndices2.length - kk; + if (i !== 0) { return i; } + for (let k = 0; k < kk; ++k) { + i = headwordIndexMap1[headwordIndices1[k]] - headwordIndexMap2[headwordIndices2[k]]; + if (i !== 0) { return i; } + } + } + + // Sort by dictionary order + i = v1.dictionaryIndex - v2.dictionaryIndex; + if (i !== 0) { return i; } + + // Sort by original order + i = e1.index - e2.index; + return i; + }; + definitionEntries.sort(compareFunction); } - _sortTermDefinitionMeta(definition) { - const compareFunction = (v1, v2) => { + _sortTermDictionaryEntriesById(dictionaryEntries) { + if (dictionaryEntries.length <= 1) { return; } + dictionaryEntries.sort((a, b) => a.id - b.id); + } + + _sortTermDictionaryEntryData(dictionaryEntries) { + const compare = (v1, v2) => { // Sort by dictionary priority - let i = v2.dictionaryOrder.priority - v1.dictionaryOrder.priority; + let i = v2.dictionaryPriority - v1.dictionaryPriority; if (i !== 0) { return i; } // Sory by expression order - i = v1.expressionIndex - v2.expressionIndex; + i = v1.headwordIndex - v2.headwordIndex; if (i !== 0) { return i; } // Sort by dictionary order - i = v1.dictionaryOrder.index - v2.dictionaryOrder.index; + i = v1.dictionaryIndex - v2.dictionaryIndex; if (i !== 0) { return i; } // Default order @@ -1343,23 +1281,21 @@ class Translator { return i; }; - const {expressions, frequencies: frequencies1, pitches: pitches1} = definition; - frequencies1.sort(compareFunction); - pitches1.sort(compareFunction); - for (const {frequencies: frequencies2, pitches: pitches2} of expressions) { - frequencies2.sort(compareFunction); - pitches2.sort(compareFunction); + for (const {definitions, frequencies, pronunciations} of dictionaryEntries) { + this._flagRedundantDefinitionTags(definitions); + frequencies.sort(compare); + pronunciations.sort(compare); } } - _sortKanjiDefinitionMeta(definition) { - const compareFunction = (v1, v2) => { + _sortKanjiDictionaryEntryData(dictionaryEntries) { + const compare = (v1, v2) => { // Sort by dictionary priority - let i = v2.dictionaryOrder.priority - v1.dictionaryOrder.priority; + let i = v2.dictionaryPriority - v1.dictionaryPriority; if (i !== 0) { return i; } // Sort by dictionary order - i = v1.dictionaryOrder.index - v2.dictionaryOrder.index; + i = v1.dictionaryIndex - v2.dictionaryIndex; if (i !== 0) { return i; } // Default order @@ -1367,16 +1303,8 @@ class Translator { return i; }; - const {frequencies} = definition; - frequencies.sort(compareFunction); - } - - // Regex functions - - _applyTextReplacements(text, sourceMap, replacements) { - for (const {pattern, replacement} of replacements) { - text = RegexUtil.applyTextReplacement(text, sourceMap, pattern, replacement); + for (const {frequencies} of dictionaryEntries) { + frequencies.sort(compare); } - return text; } } |