diff options
author | toasted-nutbread <toasted-nutbread@users.noreply.github.com> | 2021-02-14 11:19:54 -0500 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-02-14 11:19:54 -0500 |
commit | e419a418f6f03ef0a24330b67e7b76c5e3a7c22d (patch) | |
tree | a4c27bdfabc9280d9f6262d93d5152a58de8bc15 /ext/js/language/translator.js | |
parent | 43d1457ebfe23196348649c245dfb942a0f00a1a (diff) |
Move bg/js (#1387)
* Move bg/js/anki.js to js/comm/anki.js
* Move bg/js/mecab.js to js/comm/mecab.js
* Move bg/js/search-main.js to js/display/search-main.js
* Move bg/js/template-patcher.js to js/templates/template-patcher.js
* Move bg/js/template-renderer-frame-api.js to js/templates/template-renderer-frame-api.js
* Move bg/js/template-renderer-frame-main.js to js/templates/template-renderer-frame-main.js
* Move bg/js/template-renderer-proxy.js to js/templates/template-renderer-proxy.js
* Move bg/js/template-renderer.js to js/templates/template-renderer.js
* Move bg/js/media-utility.js to js/media/media-utility.js
* Move bg/js/native-simple-dom-parser.js to js/dom/native-simple-dom-parser.js
* Move bg/js/simple-dom-parser.js to js/dom/simple-dom-parser.js
* Move bg/js/audio-downloader.js to js/media/audio-downloader.js
* Move bg/js/deinflector.js to js/language/deinflector.js
* Move bg/js/backend.js to js/background/backend.js
* Move bg/js/translator.js to js/language/translator.js
* Move bg/js/search-display-controller.js to js/display/search-display-controller.js
* Move bg/js/request-builder.js to js/background/request-builder.js
* Move bg/js/text-source-map.js to js/general/text-source-map.js
* Move bg/js/clipboard-reader.js to js/comm/clipboard-reader.js
* Move bg/js/clipboard-monitor.js to js/comm/clipboard-monitor.js
* Move bg/js/query-parser.js to js/display/query-parser.js
* Move bg/js/profile-conditions.js to js/background/profile-conditions.js
* Move bg/js/dictionary-database.js to js/language/dictionary-database.js
* Move bg/js/dictionary-importer.js to js/language/dictionary-importer.js
* Move bg/js/anki-note-builder.js to js/data/anki-note-builder.js
* Move bg/js/anki-note-data.js to js/data/anki-note-data.js
* Move bg/js/database.js to js/data/database.js
* Move bg/js/json-schema.js to js/data/json-schema.js
* Move bg/js/options.js to js/data/options-util.js
* Move bg/js/background-main.js to js/background/background-main.js
* Move bg/js/permissions-util.js to js/data/permissions-util.js
* Move bg/js/context-main.js to js/pages/action-popup-main.js
* Move bg/js/generic-page-main.js to js/pages/generic-page-main.js
* Move bg/js/info-main.js to js/pages/info-main.js
* Move bg/js/permissions-main.js to js/pages/permissions-main.js
* Move bg/js/welcome-main.js to js/pages/welcome-main.js
Diffstat (limited to 'ext/js/language/translator.js')
-rw-r--r-- | ext/js/language/translator.js | 1397 |
1 files changed, 1397 insertions, 0 deletions
diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js new file mode 100644 index 00000000..729c8294 --- /dev/null +++ b/ext/js/language/translator.js @@ -0,0 +1,1397 @@ +/* + * Copyright (C) 2016-2021 Yomichan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +/* global + * Deinflector + * TextSourceMap + */ + +/** + * Class which finds term and kanji definitions for text. + */ +class Translator { + /** + * Creates a new Translator instance. + * @param details An object of the form {japaneseUtil, database}, where database is an instance of DictionaryDatabase. + */ + constructor({japaneseUtil, database}) { + this._japaneseUtil = japaneseUtil; + this._database = database; + this._deinflector = null; + this._tagCache = new Map(); + this._stringComparer = new Intl.Collator('en-US'); // Invariant locale + } + + /** + * Initializes the instance for use. The public API should not be used until + * this function has been called. + * @param deinflectionReasons The raw deinflection reasons data that the Deinflector uses. + */ + prepare(deinflectionReasons) { + this._deinflector = new Deinflector(deinflectionReasons); + } + + /** + * Clears the database tag cache. This should be executed if the database is changed. + */ + clearDatabaseCaches() { + this._tagCache.clear(); + } + + /** + * Finds term definitions for the given text. 
+ * @param mode The mode to use for finding terms, which determines the format of the resulting array. + * One of: 'group', 'merge', 'split', 'simple' + * @param text The text to find terms for. + * @param options An object using the following structure: + * { + * wildcard: (enum: null, 'prefix', 'suffix'), + * mainDictionary: (string), + * alphanumeric: (boolean), + * convertHalfWidthCharacters: (enum: 'false', 'true', 'variant'), + * convertNumericCharacters: (enum: 'false', 'true', 'variant'), + * convertAlphabeticCharacters: (enum: 'false', 'true', 'variant'), + * convertHiraganaToKatakana: (enum: 'false', 'true', 'variant'), + * convertKatakanaToHiragana: (enum: 'false', 'true', 'variant'), + * collapseEmphaticSequences: (enum: 'false', 'true', 'full'), + * textReplacements: [ + * (null or [ + * {pattern: (RegExp), replacement: (string)} + * ... + * ]) + * ... + * ], + * enabledDictionaryMap: (Map of [ + * (string), + * { + * priority: (number), + * allowSecondarySearches: (boolean) + * } + * ]) + * } + * @returns An array of [definitions, textLength]. The structure of each definition depends on the + * mode parameter, see the _create?TermDefinition?() functions for structure details. + */ + async findTerms(mode, text, options) { + switch (mode) { + case 'group': + return await this._findTermsGrouped(text, options); + case 'merge': + return await this._findTermsMerged(text, options); + case 'split': + return await this._findTermsSplit(text, options); + case 'simple': + return await this._findTermsSimple(text, options); + default: + return [[], 0]; + } + } + + /** + * Finds kanji definitions for the given text. + * @param text The text to find kanji definitions for. This string can be of any length, + * but is typically just one character, which is a single kanji. If the string is multiple + * characters long, each character will be searched in the database. 
+ * @param options An object using the following structure: + * { + * enabledDictionaryMap: (Map of [ + * (string), + * { + * priority: (number) + * } + * ]) + * } + * @returns An array of definitions. See the _createKanjiDefinition() function for structure details. + */ + async findKanji(text, options) { + const {enabledDictionaryMap} = options; + const kanjiUnique = new Set(); + for (const c of text) { + kanjiUnique.add(c); + } + + const databaseDefinitions = await this._database.findKanjiBulk([...kanjiUnique], enabledDictionaryMap); + if (databaseDefinitions.length === 0) { return []; } + + this._sortDatabaseDefinitionsByIndex(databaseDefinitions); + + const definitions = []; + for (const {character, onyomi, kunyomi, tags, glossary, stats, dictionary} of databaseDefinitions) { + const expandedStats = await this._expandStats(stats, dictionary); + const expandedTags = await this._expandTags(tags, dictionary); + this._sortTags(expandedTags); + + const definition = this._createKanjiDefinition(character, dictionary, onyomi, kunyomi, glossary, expandedTags, expandedStats); + definitions.push(definition); + } + + await this._buildKanjiMeta(definitions, enabledDictionaryMap); + + return definitions; + } + + // Find terms core functions + + async _findTermsSimple(text, options) { + const {enabledDictionaryMap} = options; + const [definitions, length] = await this._findTermsInternal(text, enabledDictionaryMap, options); + this._sortDefinitions(definitions, false); + return [definitions, length]; + } + + async _findTermsSplit(text, options) { + const {enabledDictionaryMap} = options; + const [definitions, length] = await this._findTermsInternal(text, enabledDictionaryMap, options); + await this._buildTermMeta(definitions, enabledDictionaryMap); + this._sortDefinitions(definitions, true); + return [definitions, length]; + } + + async _findTermsGrouped(text, options) { + const {enabledDictionaryMap} = options; + const [definitions, length] = await 
this._findTermsInternal(text, enabledDictionaryMap, options); + + const groupedDefinitions = this._groupTerms(definitions, enabledDictionaryMap); + await this._buildTermMeta(groupedDefinitions, enabledDictionaryMap); + this._sortDefinitions(groupedDefinitions, false); + + for (const definition of groupedDefinitions) { + this._flagRedundantDefinitionTags(definition.definitions); + } + + return [groupedDefinitions, length]; + } + + async _findTermsMerged(text, options) { + const {mainDictionary, enabledDictionaryMap} = options; + const secondarySearchDictionaryMap = this._getSecondarySearchDictionaryMap(enabledDictionaryMap); + + const [definitions, length] = await this._findTermsInternal(text, enabledDictionaryMap, options); + const {sequencedDefinitions, unsequencedDefinitions} = await this._getSequencedDefinitions(definitions, mainDictionary, enabledDictionaryMap); + const definitionsMerged = []; + const usedDefinitions = new Set(); + + for (const {sourceDefinitions, relatedDefinitions} of sequencedDefinitions) { + const result = await this._getMergedDefinition( + sourceDefinitions, + relatedDefinitions, + unsequencedDefinitions, + secondarySearchDictionaryMap, + usedDefinitions + ); + definitionsMerged.push(result); + } + + const unusedDefinitions = unsequencedDefinitions.filter((definition) => !usedDefinitions.has(definition)); + for (const groupedDefinition of this._groupTerms(unusedDefinitions, enabledDictionaryMap)) { + const {reasons, score, expression, reading, source, rawSource, sourceTerm, furiganaSegments, termTags, definitions: definitions2} = groupedDefinition; + const termDetailsList = [this._createTermDetails(sourceTerm, expression, reading, furiganaSegments, termTags)]; + const compatibilityDefinition = this._createMergedTermDefinition( + source, + rawSource, + this._convertTermDefinitionsToMergedGlossaryTermDefinitions(definitions2), + [expression], + [reading], + termDetailsList, + reasons, + score + ); + 
definitionsMerged.push(compatibilityDefinition); + } + + await this._buildTermMeta(definitionsMerged, enabledDictionaryMap); + this._sortDefinitions(definitionsMerged, false); + + for (const definition of definitionsMerged) { + this._flagRedundantDefinitionTags(definition.definitions); + } + + return [definitionsMerged, length]; + } + + // Find terms internal implementation + + async _findTermsInternal(text, enabledDictionaryMap, options) { + const {alphanumeric, wildcard} = options; + text = this._getSearchableText(text, alphanumeric); + if (text.length === 0) { + return [[], 0]; + } + + const deinflections = ( + wildcard ? + await this._findTermWildcard(text, enabledDictionaryMap, wildcard) : + await this._findTermDeinflections(text, enabledDictionaryMap, options) + ); + + let maxLength = 0; + const definitions = []; + for (const {databaseDefinitions, source, rawSource, term, reasons} of deinflections) { + if (databaseDefinitions.length === 0) { continue; } + maxLength = Math.max(maxLength, rawSource.length); + for (const databaseDefinition of databaseDefinitions) { + const definition = await this._createTermDefinitionFromDatabaseDefinition(databaseDefinition, source, rawSource, term, reasons, enabledDictionaryMap); + definitions.push(definition); + } + } + + this._removeDuplicateDefinitions(definitions); + return [definitions, maxLength]; + } + + async _findTermWildcard(text, enabledDictionaryMap, wildcard) { + const databaseDefinitions = await this._database.findTermsBulk([text], enabledDictionaryMap, wildcard); + if (databaseDefinitions.length === 0) { + return []; + } + + return [{ + source: text, + rawSource: text, + term: text, + rules: 0, + reasons: [], + databaseDefinitions + }]; + } + + async _findTermDeinflections(text, enabledDictionaryMap, options) { + const deinflections = this._getAllDeinflections(text, options); + + if (deinflections.length === 0) { + return []; + } + + const uniqueDeinflectionTerms = []; + const uniqueDeinflectionArrays = []; + 
const uniqueDeinflectionsMap = new Map(); + for (const deinflection of deinflections) { + const term = deinflection.term; + let deinflectionArray = uniqueDeinflectionsMap.get(term); + if (typeof deinflectionArray === 'undefined') { + deinflectionArray = []; + uniqueDeinflectionTerms.push(term); + uniqueDeinflectionArrays.push(deinflectionArray); + uniqueDeinflectionsMap.set(term, deinflectionArray); + } + deinflectionArray.push(deinflection); + } + + const databaseDefinitions = await this._database.findTermsBulk(uniqueDeinflectionTerms, enabledDictionaryMap, null); + + for (const databaseDefinition of databaseDefinitions) { + const definitionRules = Deinflector.rulesToRuleFlags(databaseDefinition.rules); + for (const deinflection of uniqueDeinflectionArrays[databaseDefinition.index]) { + const deinflectionRules = deinflection.rules; + if (deinflectionRules === 0 || (definitionRules & deinflectionRules) !== 0) { + deinflection.databaseDefinitions.push(databaseDefinition); + } + } + } + + return deinflections; + } + + _getAllDeinflections(text, options) { + const textOptionVariantArray = [ + this._getTextReplacementsVariants(options), + this._getTextOptionEntryVariants(options.convertHalfWidthCharacters), + this._getTextOptionEntryVariants(options.convertNumericCharacters), + this._getTextOptionEntryVariants(options.convertAlphabeticCharacters), + this._getTextOptionEntryVariants(options.convertHiraganaToKatakana), + this._getTextOptionEntryVariants(options.convertKatakanaToHiragana), + this._getCollapseEmphaticOptions(options) + ]; + + const jp = this._japaneseUtil; + const deinflections = []; + const used = new Set(); + for (const [textReplacements, halfWidth, numeric, alphabetic, katakana, hiragana, [collapseEmphatic, collapseEmphaticFull]] of this._getArrayVariants(textOptionVariantArray)) { + let text2 = text; + const sourceMap = new TextSourceMap(text2); + if (textReplacements !== null) { + text2 = this._applyTextReplacements(text2, sourceMap, 
textReplacements); + } + if (halfWidth) { + text2 = jp.convertHalfWidthKanaToFullWidth(text2, sourceMap); + } + if (numeric) { + text2 = jp.convertNumericToFullWidth(text2); + } + if (alphabetic) { + text2 = jp.convertAlphabeticToKana(text2, sourceMap); + } + if (katakana) { + text2 = jp.convertHiraganaToKatakana(text2); + } + if (hiragana) { + text2 = jp.convertKatakanaToHiragana(text2); + } + if (collapseEmphatic) { + text2 = jp.collapseEmphaticSequences(text2, collapseEmphaticFull, sourceMap); + } + + for (let i = text2.length; i > 0; --i) { + const text2Substring = text2.substring(0, i); + if (used.has(text2Substring)) { break; } + used.add(text2Substring); + const rawSource = sourceMap.source.substring(0, sourceMap.getSourceLength(i)); + for (const deinflection of this._deinflector.deinflect(text2Substring, rawSource)) { + deinflections.push(deinflection); + } + } + } + return deinflections; + } + + async _getSequencedDefinitions(definitions, mainDictionary, enabledDictionaryMap) { + const sequenceList = []; + const sequencedDefinitionMap = new Map(); + const sequencedDefinitions = []; + const unsequencedDefinitions = []; + for (const definition of definitions) { + const {sequence, dictionary} = definition; + if (mainDictionary === dictionary && sequence >= 0) { + let sequencedDefinition = sequencedDefinitionMap.get(sequence); + if (typeof sequencedDefinition === 'undefined') { + sequencedDefinition = { + sourceDefinitions: [], + relatedDefinitions: [], + relatedDefinitionIds: new Set() + }; + sequencedDefinitionMap.set(sequence, sequencedDefinition); + sequencedDefinitions.push(sequencedDefinition); + sequenceList.push(sequence); + } + sequencedDefinition.sourceDefinitions.push(definition); + sequencedDefinition.relatedDefinitions.push(definition); + sequencedDefinition.relatedDefinitionIds.add(definition.id); + } else { + unsequencedDefinitions.push(definition); + } + } + + if (sequenceList.length > 0) { + const databaseDefinitions = await 
this._database.findTermsBySequenceBulk(sequenceList, mainDictionary); + for (const databaseDefinition of databaseDefinitions) { + const {relatedDefinitions, relatedDefinitionIds} = sequencedDefinitions[databaseDefinition.index]; + const {id} = databaseDefinition; + if (relatedDefinitionIds.has(id)) { continue; } + + const {source, rawSource, sourceTerm} = relatedDefinitions[0]; + const definition = await this._createTermDefinitionFromDatabaseDefinition(databaseDefinition, source, rawSource, sourceTerm, [], enabledDictionaryMap); + relatedDefinitions.push(definition); + } + } + + for (const {relatedDefinitions} of sequencedDefinitions) { + this._sortDefinitionsById(relatedDefinitions); + } + + return {sequencedDefinitions, unsequencedDefinitions}; + } + + async _getMergedSecondarySearchResults(expressionsMap, secondarySearchDictionaryMap) { + if (secondarySearchDictionaryMap.size === 0) { + return []; + } + + const expressionList = []; + const readingList = []; + for (const [expression, readingMap] of expressionsMap.entries()) { + for (const reading of readingMap.keys()) { + expressionList.push(expression); + readingList.push(reading); + } + } + + const databaseDefinitions = await this._database.findTermsExactBulk(expressionList, readingList, secondarySearchDictionaryMap); + this._sortDatabaseDefinitionsByIndex(databaseDefinitions); + + const definitions = []; + for (const databaseDefinition of databaseDefinitions) { + const source = expressionList[databaseDefinition.index]; + const definition = await this._createTermDefinitionFromDatabaseDefinition(databaseDefinition, source, source, source, [], secondarySearchDictionaryMap); + definitions.push(definition); + } + + return definitions; + } + + async _getMergedDefinition(sourceDefinitions, relatedDefinitions, unsequencedDefinitions, secondarySearchDictionaryMap, usedDefinitions) { + const {reasons, source, rawSource} = sourceDefinitions[0]; + const score = this._getMaxDefinitionScore(sourceDefinitions); + const 
termInfoMap = new Map(); + const glossaryDefinitions = []; + const glossaryDefinitionGroupMap = new Map(); + + this._mergeByGlossary(relatedDefinitions, glossaryDefinitionGroupMap); + this._addUniqueTermInfos(relatedDefinitions, termInfoMap); + + let secondaryDefinitions = await this._getMergedSecondarySearchResults(termInfoMap, secondarySearchDictionaryMap); + secondaryDefinitions = [...unsequencedDefinitions, ...secondaryDefinitions]; + + this._removeUsedDefinitions(secondaryDefinitions, termInfoMap, usedDefinitions); + this._removeDuplicateDefinitions(secondaryDefinitions); + + this._mergeByGlossary(secondaryDefinitions, glossaryDefinitionGroupMap); + + const allExpressions = new Set(); + const allReadings = new Set(); + for (const {expressions, readings} of glossaryDefinitionGroupMap.values()) { + for (const expression of expressions) { allExpressions.add(expression); } + for (const reading of readings) { allReadings.add(reading); } + } + + for (const {expressions, readings, definitions} of glossaryDefinitionGroupMap.values()) { + const glossaryDefinition = this._createMergedGlossaryTermDefinition( + source, + rawSource, + definitions, + expressions, + readings, + allExpressions, + allReadings + ); + glossaryDefinitions.push(glossaryDefinition); + } + + this._sortDefinitions(glossaryDefinitions, true); + + const termDetailsList = this._createTermDetailsListFromTermInfoMap(termInfoMap); + + return this._createMergedTermDefinition( + source, + rawSource, + glossaryDefinitions, + [...allExpressions], + [...allReadings], + termDetailsList, + reasons, + score + ); + } + + _removeUsedDefinitions(definitions, termInfoMap, usedDefinitions) { + for (let i = 0, ii = definitions.length; i < ii; ++i) { + const definition = definitions[i]; + const {expression, reading} = definition; + const expressionMap = termInfoMap.get(expression); + if ( + typeof expressionMap !== 'undefined' && + typeof expressionMap.get(reading) !== 'undefined' + ) { + usedDefinitions.add(definition); 
+ } else { + definitions.splice(i, 1); + --i; + --ii; + } + } + } + + _getUniqueDefinitionTags(definitions) { + const definitionTagsMap = new Map(); + for (const {definitionTags} of definitions) { + for (const tag of definitionTags) { + const {name} = tag; + if (definitionTagsMap.has(name)) { continue; } + definitionTagsMap.set(name, this._cloneTag(tag)); + } + } + return [...definitionTagsMap.values()]; + } + + _removeDuplicateDefinitions(definitions) { + const definitionGroups = new Map(); + for (let i = 0, ii = definitions.length; i < ii; ++i) { + const definition = definitions[i]; + const {id} = definition; + const existing = definitionGroups.get(id); + if (typeof existing === 'undefined') { + definitionGroups.set(id, [i, definition]); + continue; + } + + let removeIndex = i; + if (definition.source.length > existing[1].source.length) { + definitionGroups.set(id, [i, definition]); + removeIndex = existing[0]; + } + + definitions.splice(removeIndex, 1); + --i; + --ii; + } + } + + _flagRedundantDefinitionTags(definitions) { + let lastDictionary = null; + let lastPartOfSpeech = ''; + const removeCategoriesSet = new Set(); + + for (const {dictionary, definitionTags} of definitions) { + const partOfSpeech = this._createMapKey(this._getTagNamesWithCategory(definitionTags, 'partOfSpeech')); + + if (lastDictionary !== dictionary) { + lastDictionary = dictionary; + lastPartOfSpeech = ''; + } + + if (lastPartOfSpeech === partOfSpeech) { + removeCategoriesSet.add('partOfSpeech'); + } else { + lastPartOfSpeech = partOfSpeech; + } + + if (removeCategoriesSet.size > 0) { + this._flagTagsWithCategoryAsRedundant(definitionTags, removeCategoriesSet); + removeCategoriesSet.clear(); + } + } + } + + _groupTerms(definitions) { + const groups = new Map(); + for (const definition of definitions) { + const key = this._createMapKey([definition.source, definition.expression, definition.reading, ...definition.reasons]); + let groupDefinitions = groups.get(key); + if (typeof 
groupDefinitions === 'undefined') { + groupDefinitions = []; + groups.set(key, groupDefinitions); + } + + groupDefinitions.push(definition); + } + + const results = []; + for (const groupDefinitions of groups.values()) { + this._sortDefinitions(groupDefinitions, true); + const definition = this._createGroupedTermDefinition(groupDefinitions); + results.push(definition); + } + + return results; + } + + _mergeByGlossary(definitions, glossaryDefinitionGroupMap) { + for (const definition of definitions) { + const {expression, reading, dictionary, glossary, id} = definition; + + const key = this._createMapKey([dictionary, ...glossary]); + let group = glossaryDefinitionGroupMap.get(key); + if (typeof group === 'undefined') { + group = { + expressions: new Set(), + readings: new Set(), + definitions: [], + definitionIds: new Set() + }; + glossaryDefinitionGroupMap.set(key, group); + } + + const {definitionIds} = group; + if (definitionIds.has(id)) { continue; } + definitionIds.add(id); + group.expressions.add(expression); + group.readings.add(reading); + group.definitions.push(definition); + } + } + + _addUniqueTermInfos(definitions, termInfoMap) { + for (const {expression, reading, sourceTerm, furiganaSegments, termTags} of definitions) { + let readingMap = termInfoMap.get(expression); + if (typeof readingMap === 'undefined') { + readingMap = new Map(); + termInfoMap.set(expression, readingMap); + } + + let termInfo = readingMap.get(reading); + if (typeof termInfo === 'undefined') { + termInfo = { + sourceTerm, + furiganaSegments, + termTagsMap: new Map() + }; + readingMap.set(reading, termInfo); + } + + const {termTagsMap} = termInfo; + for (const tag of termTags) { + const {name} = tag; + if (termTagsMap.has(name)) { continue; } + termTagsMap.set(name, this._cloneTag(tag)); + } + } + } + + _convertTermDefinitionsToMergedGlossaryTermDefinitions(definitions) { + const convertedDefinitions = []; + for (const definition of definitions) { + const {source, rawSource, 
expression, reading} = definition; + const expressions = new Set([expression]); + const readings = new Set([reading]); + const convertedDefinition = this._createMergedGlossaryTermDefinition(source, rawSource, [definition], expressions, readings, expressions, readings); + convertedDefinitions.push(convertedDefinition); + } + return convertedDefinitions; + } + + // Metadata building + + async _buildTermMeta(definitions, enabledDictionaryMap) { + const addMetadataTargetInfo = (targetMap1, target, parents) => { + let {expression, reading} = target; + if (!reading) { reading = expression; } + + let targetMap2 = targetMap1.get(expression); + if (typeof targetMap2 === 'undefined') { + targetMap2 = new Map(); + targetMap1.set(expression, targetMap2); + } + + let targets = targetMap2.get(reading); + if (typeof targets === 'undefined') { + targets = new Set([target, ...parents]); + targetMap2.set(reading, targets); + } else { + targets.add(target); + for (const parent of parents) { + targets.add(parent); + } + } + }; + + const targetMap = new Map(); + const definitionsQueue = definitions.map((definition) => ({definition, parents: []})); + while (definitionsQueue.length > 0) { + const {definition, parents} = definitionsQueue.shift(); + const childDefinitions = definition.definitions; + if (Array.isArray(childDefinitions)) { + for (const definition2 of childDefinitions) { + definitionsQueue.push({definition: definition2, parents: [...parents, definition]}); + } + } else { + addMetadataTargetInfo(targetMap, definition, parents); + } + + for (const target of definition.expressions) { + addMetadataTargetInfo(targetMap, target, []); + } + } + const targetMapEntries = [...targetMap.entries()]; + const uniqueExpressions = targetMapEntries.map(([expression]) => expression); + + const metas = await this._database.findTermMetaBulk(uniqueExpressions, enabledDictionaryMap); + for (const {expression, mode, data, dictionary, index} of metas) { + const targetMap2 = 
targetMapEntries[index][1]; + for (const [reading, targets] of targetMap2) { + switch (mode) { + case 'freq': + { + const frequencyData = this._getTermFrequencyData(expression, reading, dictionary, data); + if (frequencyData === null) { continue; } + for (const {frequencies} of targets) { frequencies.push(frequencyData); } + } + break; + case 'pitch': + { + const pitchData = await this._getPitchData(expression, reading, dictionary, data); + if (pitchData === null) { continue; } + for (const {pitches} of targets) { pitches.push(pitchData); } + } + break; + } + } + } + } + + async _buildKanjiMeta(definitions, enabledDictionaryMap) { + const kanjiList = []; + for (const {character} of definitions) { + kanjiList.push(character); + } + + const metas = await this._database.findKanjiMetaBulk(kanjiList, enabledDictionaryMap); + for (const {character, mode, data, dictionary, index} of metas) { + switch (mode) { + case 'freq': + { + const frequencyData = this._getKanjiFrequencyData(character, dictionary, data); + definitions[index].frequencies.push(frequencyData); + } + break; + } + } + } + + async _expandTags(names, dictionary) { + const tagMetaList = await this._getTagMetaList(names, dictionary); + const results = []; + for (let i = 0, ii = tagMetaList.length; i < ii; ++i) { + const meta = tagMetaList[i]; + const name = names[i]; + const {category, notes, order, score} = (meta !== null ? 
meta : {}); + const tag = this._createTag(name, category, notes, order, score, dictionary, false); + results.push(tag); + } + return results; + } + + async _expandStats(items, dictionary) { + const names = Object.keys(items); + const tagMetaList = await this._getTagMetaList(names, dictionary); + + const statsGroups = new Map(); + for (let i = 0; i < names.length; ++i) { + const name = names[i]; + const meta = tagMetaList[i]; + if (meta === null) { continue; } + + const {category, notes, order, score} = meta; + let group = statsGroups.get(category); + if (typeof group === 'undefined') { + group = []; + statsGroups.set(category, group); + } + + const value = items[name]; + const stat = this._createKanjiStat(name, category, notes, order, score, dictionary, value); + group.push(stat); + } + + const stats = {}; + for (const [category, group] of statsGroups.entries()) { + this._sortKanjiStats(group); + stats[category] = group; + } + return stats; + } + + async _getTagMetaList(names, dictionary) { + const tagMetaList = []; + let cache = this._tagCache.get(dictionary); + if (typeof cache === 'undefined') { + cache = new Map(); + this._tagCache.set(dictionary, cache); + } + + for (const name of names) { + const base = this._getNameBase(name); + + let tagMeta = cache.get(base); + if (typeof tagMeta === 'undefined') { + tagMeta = await this._database.findTagForTitle(base, dictionary); + cache.set(base, tagMeta); + } + + tagMetaList.push(tagMeta); + } + + return tagMetaList; + } + + _getTermFrequencyData(expression, reading, dictionary, data) { + let frequency = data; + const hasReading = (data !== null && typeof data === 'object'); + if (hasReading) { + if (data.reading !== reading) { return null; } + frequency = data.frequency; + } + return {dictionary, expression, reading, hasReading, frequency}; + } + + _getKanjiFrequencyData(character, dictionary, data) { + return {dictionary, character, frequency: data}; + } + + async _getPitchData(expression, reading, dictionary, data) 
{ + if (data.reading !== reading) { return null; } + + const pitches = []; + for (let {position, tags} of data.pitches) { + tags = Array.isArray(tags) ? await this._expandTags(tags, dictionary) : []; + pitches.push({position, tags}); + } + + return {expression, reading, dictionary, pitches}; + } + + // Simple helpers + + _scoreToTermFrequency(score) { + if (score > 0) { + return 'popular'; + } else if (score < 0) { + return 'rare'; + } else { + return 'normal'; + } + } + + _getNameBase(name) { + const pos = name.indexOf(':'); + return (pos >= 0 ? name.substring(0, pos) : name); + } + + _getSearchableText(text, allowAlphanumericCharacters) { + if (allowAlphanumericCharacters) { + return text; + } + + const jp = this._japaneseUtil; + let newText = ''; + for (const c of text) { + if (!jp.isCodePointJapanese(c.codePointAt(0))) { + break; + } + newText += c; + } + return newText; + } + + _getTextOptionEntryVariants(value) { + switch (value) { + case 'true': return [true]; + case 'variant': return [false, true]; + default: return [false]; + } + } + + _getCollapseEmphaticOptions(options) { + const collapseEmphaticOptions = [[false, false]]; + switch (options.collapseEmphaticSequences) { + case 'true': + collapseEmphaticOptions.push([true, false]); + break; + case 'full': + collapseEmphaticOptions.push([true, false], [true, true]); + break; + } + return collapseEmphaticOptions; + } + + _getTextReplacementsVariants(options) { + return options.textReplacements; + } + + _getSecondarySearchDictionaryMap(enabledDictionaryMap) { + const secondarySearchDictionaryMap = new Map(); + for (const [dictionary, details] of enabledDictionaryMap.entries()) { + if (!details.allowSecondarySearches) { continue; } + secondarySearchDictionaryMap.set(dictionary, details); + } + return secondarySearchDictionaryMap; + } + + _getDictionaryPriority(dictionary, enabledDictionaryMap) { + const info = enabledDictionaryMap.get(dictionary); + return typeof info !== 'undefined' ? 
info.priority : 0; + } + + _getTagNamesWithCategory(tags, category) { + const results = []; + for (const tag of tags) { + if (tag.category !== category) { continue; } + results.push(tag.name); + } + results.sort(); + return results; + } + + _flagTagsWithCategoryAsRedundant(tags, removeCategoriesSet) { + for (const tag of tags) { + if (removeCategoriesSet.has(tag.category)) { + tag.redundant = true; + } + } + } + + _getUniqueDictionaryNames(definitions) { + const uniqueDictionaryNames = new Set(); + for (const {dictionaryNames} of definitions) { + for (const dictionaryName of dictionaryNames) { + uniqueDictionaryNames.add(dictionaryName); + } + } + return [...uniqueDictionaryNames]; + } + + _getUniqueTermTags(definitions) { + const newTermTags = []; + if (definitions.length <= 1) { + for (const {termTags} of definitions) { + for (const tag of termTags) { + newTermTags.push(this._cloneTag(tag)); + } + } + } else { + const tagsSet = new Set(); + let checkTagsMap = false; + for (const {termTags} of definitions) { + for (const tag of termTags) { + const key = this._getTagMapKey(tag); + if (checkTagsMap && tagsSet.has(key)) { continue; } + tagsSet.add(key); + newTermTags.push(this._cloneTag(tag)); + } + checkTagsMap = true; + } + } + return newTermTags; + } + + *_getArrayVariants(arrayVariants) { + const ii = arrayVariants.length; + + let total = 1; + for (let i = 0; i < ii; ++i) { + total *= arrayVariants[i].length; + } + + for (let a = 0; a < total; ++a) { + const variant = []; + let index = a; + for (let i = 0; i < ii; ++i) { + const entryVariants = arrayVariants[i]; + variant.push(entryVariants[index % entryVariants.length]); + index = Math.floor(index / entryVariants.length); + } + yield variant; + } + } + + _areSetsEqual(set1, set2) { + if (set1.size !== set2.size) { + return false; + } + + for (const value of set1) { + if (!set2.has(value)) { + return false; + } + } + + return true; + } + + _getSetIntersection(set1, set2) { + const result = []; + for (const value 
of set1) { + if (set2.has(value)) { + result.push(value); + } + } + return result; + } + + // Reduction functions + + _getTermTagsScoreSum(termTags) { + let result = 0; + for (const {score} of termTags) { + result += score; + } + return result; + } + + _getSourceTermMatchCountSum(definitions) { + let result = 0; + for (const {sourceTermExactMatchCount} of definitions) { + result += sourceTermExactMatchCount; + } + return result; + } + + _getMaxDefinitionScore(definitions) { + let result = Number.MIN_SAFE_INTEGER; + for (const {score} of definitions) { + if (score > result) { result = score; } + } + return result; + } + + _getMaxDictionaryPriority(definitions) { + let result = Number.MIN_SAFE_INTEGER; + for (const {dictionaryPriority} of definitions) { + if (dictionaryPriority > result) { result = dictionaryPriority; } + } + return result; + } + + // Common data creation and cloning functions + + _cloneTag(tag) { + const {name, category, notes, order, score, dictionary, redundant} = tag; + return this._createTag(name, category, notes, order, score, dictionary, redundant); + } + + _getTagMapKey(tag) { + const {name, category, notes} = tag; + return this._createMapKey([name, category, notes]); + } + + _createMapKey(array) { + return JSON.stringify(array); + } + + _createTag(name, category, notes, order, score, dictionary, redundant) { + return { + name, + category: (typeof category === 'string' && category.length > 0 ? category : 'default'), + notes: (typeof notes === 'string' ? notes : ''), + order: (typeof order === 'number' ? order : 0), + score: (typeof score === 'number' ? score : 0), + dictionary: (typeof dictionary === 'string' ? dictionary : null), + redundant + }; + } + + _createKanjiStat(name, category, notes, order, score, dictionary, value) { + return { + name, + category: (typeof category === 'string' && category.length > 0 ? category : 'default'), + notes: (typeof notes === 'string' ? notes : ''), + order: (typeof order === 'number' ? 
order : 0), + score: (typeof score === 'number' ? score : 0), + dictionary: (typeof dictionary === 'string' ? dictionary : null), + value + }; + } + + _createKanjiDefinition(character, dictionary, onyomi, kunyomi, glossary, tags, stats) { + return { + type: 'kanji', + character, + dictionary, + onyomi, + kunyomi, + glossary, + tags, + stats, + frequencies: [] + }; + } + + async _createTermDefinitionFromDatabaseDefinition(databaseDefinition, source, rawSource, sourceTerm, reasons, enabledDictionaryMap) { + const {expression, reading, definitionTags, termTags, glossary, score, dictionary, id, sequence} = databaseDefinition; + const dictionaryPriority = this._getDictionaryPriority(dictionary, enabledDictionaryMap); + const termTagsExpanded = await this._expandTags(termTags, dictionary); + const definitionTagsExpanded = await this._expandTags(definitionTags, dictionary); + + this._sortTags(definitionTagsExpanded); + this._sortTags(termTagsExpanded); + + const furiganaSegments = this._japaneseUtil.distributeFurigana(expression, reading); + const termDetailsList = [this._createTermDetails(sourceTerm, expression, reading, furiganaSegments, termTagsExpanded)]; + const sourceTermExactMatchCount = (sourceTerm === expression ? 
1 : 0); + + return { + type: 'term', + id, + source, + rawSource, + sourceTerm, + reasons, + score, + sequence, + dictionary, + dictionaryPriority, + dictionaryNames: [dictionary], + expression, + reading, + expressions: termDetailsList, + furiganaSegments, + glossary, + definitionTags: definitionTagsExpanded, + termTags: termTagsExpanded, + // definitions + frequencies: [], + pitches: [], + // only + sourceTermExactMatchCount + }; + } + + _createGroupedTermDefinition(definitions) { + const {expression, reading, furiganaSegments, reasons, source, rawSource, sourceTerm} = definitions[0]; + const score = this._getMaxDefinitionScore(definitions); + const dictionaryPriority = this._getMaxDictionaryPriority(definitions); + const dictionaryNames = this._getUniqueDictionaryNames(definitions); + const termTags = this._getUniqueTermTags(definitions); + const termDetailsList = [this._createTermDetails(sourceTerm, expression, reading, furiganaSegments, termTags)]; + const sourceTermExactMatchCount = (sourceTerm === expression ? 
1 : 0); + return { + type: 'termGrouped', + // id + source, + rawSource, + sourceTerm, + reasons: [...reasons], + score, + // sequence + dictionary: dictionaryNames[0], + dictionaryPriority, + dictionaryNames, + expression, + reading, + expressions: termDetailsList, + furiganaSegments, // Contains duplicate data + // glossary + // definitionTags + termTags, + definitions, // type: 'term' + frequencies: [], + pitches: [], + // only + sourceTermExactMatchCount + }; + } + + _createMergedTermDefinition(source, rawSource, definitions, expressions, readings, termDetailsList, reasons, score) { + const dictionaryPriority = this._getMaxDictionaryPriority(definitions); + const sourceTermExactMatchCount = this._getSourceTermMatchCountSum(definitions); + const dictionaryNames = this._getUniqueDictionaryNames(definitions); + return { + type: 'termMerged', + // id + source, + rawSource, + // sourceTerm + reasons, + score, + // sequence + dictionary: dictionaryNames[0], + dictionaryPriority, + dictionaryNames, + expression: expressions, + reading: readings, + expressions: termDetailsList, + // furiganaSegments + // glossary + // definitionTags + // termTags + definitions, // type: 'termMergedByGlossary' + frequencies: [], + pitches: [], + // only + sourceTermExactMatchCount + }; + } + + _createMergedGlossaryTermDefinition(source, rawSource, definitions, expressions, readings, allExpressions, allReadings) { + const only = []; + if (!this._areSetsEqual(expressions, allExpressions)) { + only.push(...this._getSetIntersection(expressions, allExpressions)); + } + if (!this._areSetsEqual(readings, allReadings)) { + only.push(...this._getSetIntersection(readings, allReadings)); + } + + const sourceTermExactMatchCount = this._getSourceTermMatchCountSum(definitions); + const dictionaryNames = this._getUniqueDictionaryNames(definitions); + + const termInfoMap = new Map(); + this._addUniqueTermInfos(definitions, termInfoMap); + const termDetailsList = 
this._createTermDetailsListFromTermInfoMap(termInfoMap); + + const definitionTags = this._getUniqueDefinitionTags(definitions); + this._sortTags(definitionTags); + + const {glossary} = definitions[0]; + const score = this._getMaxDefinitionScore(definitions); + const dictionaryPriority = this._getMaxDictionaryPriority(definitions); + return { + type: 'termMergedByGlossary', + // id + source, + rawSource, + // sourceTerm + reasons: [], + score, + // sequence + dictionary: dictionaryNames[0], + dictionaryPriority, + dictionaryNames, + expression: [...expressions], + reading: [...readings], + expressions: termDetailsList, + // furiganaSegments + glossary: [...glossary], + definitionTags, + // termTags + definitions, // type: 'term'; contains duplicate data + frequencies: [], + pitches: [], + only, + sourceTermExactMatchCount + }; + } + + _createTermDetailsListFromTermInfoMap(termInfoMap) { + const termDetailsList = []; + for (const [expression, readingMap] of termInfoMap.entries()) { + for (const [reading, {termTagsMap, sourceTerm, furiganaSegments}] of readingMap.entries()) { + const termTags = [...termTagsMap.values()]; + this._sortTags(termTags); + termDetailsList.push(this._createTermDetails(sourceTerm, expression, reading, furiganaSegments, termTags)); + } + } + return termDetailsList; + } + + _createTermDetails(sourceTerm, expression, reading, furiganaSegments, termTags) { + const termFrequency = this._scoreToTermFrequency(this._getTermTagsScoreSum(termTags)); + return { + sourceTerm, + expression, + reading, + furiganaSegments, // Contains duplicate data + termTags, + termFrequency, + frequencies: [], + pitches: [] + }; + } + + // Sorting functions + + _sortTags(tags) { + if (tags.length <= 1) { return; } + const stringComparer = this._stringComparer; + tags.sort((v1, v2) => { + const i = v1.order - v2.order; + if (i !== 0) { return i; } + + return stringComparer.compare(v1.name, v2.name); + }); + } + + _sortDefinitions(definitions, useDictionaryPriority) { + if 
(definitions.length <= 1) { return; } + const stringComparer = this._stringComparer; + const compareFunction1 = (v1, v2) => { + let i = v2.source.length - v1.source.length; + if (i !== 0) { return i; } + + i = v1.reasons.length - v2.reasons.length; + if (i !== 0) { return i; } + + i = v2.sourceTermExactMatchCount - v1.sourceTermExactMatchCount; + if (i !== 0) { return i; } + + i = v2.score - v1.score; + if (i !== 0) { return i; } + + const expression1 = v1.expression; + const expression2 = v2.expression; + if (typeof expression1 !== 'string' || typeof expression2 !== 'string') { return 0; } // Skip if either is not a string (array) + + i = expression2.length - expression1.length; + if (i !== 0) { return i; } + + return stringComparer.compare(expression1, expression2); + }; + const compareFunction2 = (v1, v2) => { + const i = v2.dictionaryPriority - v1.dictionaryPriority; + return (i !== 0) ? i : compareFunction1(v1, v2); + }; + definitions.sort(useDictionaryPriority ? compareFunction2 : compareFunction1); + } + + _sortDatabaseDefinitionsByIndex(definitions) { + if (definitions.length <= 1) { return; } + definitions.sort((a, b) => a.index - b.index); + } + + _sortDefinitionsById(definitions) { + if (definitions.length <= 1) { return; } + definitions.sort((a, b) => a.id - b.id); + } + + _sortKanjiStats(stats) { + if (stats.length <= 1) { return; } + const stringComparer = this._stringComparer; + stats.sort((v1, v2) => { + const i = v1.order - v2.order; + if (i !== 0) { return i; } + + return stringComparer.compare(v1.notes, v2.notes); + }); + } + + // Regex functions + + _applyTextReplacements(text, sourceMap, replacements) { + for (const {pattern, replacement} of replacements) { + text = this._applyTextReplacement(text, sourceMap, pattern, replacement); + } + return text; + } + + _applyTextReplacement(text, sourceMap, pattern, replacement) { + const isGlobal = pattern.global; + if (isGlobal) { pattern.lastIndex = 0; } + for (let loop = true; loop; loop = isGlobal) { 
+ const match = pattern.exec(text); + if (match === null) { break; } + + const matchText = match[0]; + const index = match.index; + const actualReplacement = this._applyMatchReplacement(replacement, match); + const actualReplacementLength = actualReplacement.length; + const delta = actualReplacementLength - (matchText.length > 0 ? matchText.length : -1); + + text = `${text.substring(0, index)}${actualReplacement}${text.substring(index + matchText.length)}`; + pattern.lastIndex += delta; + + if (actualReplacementLength > 0) { + sourceMap.combine(Math.max(0, index - 1), matchText.length); + sourceMap.insert(index, ...(new Array(actualReplacementLength).fill(0))); + } else { + sourceMap.combine(index, matchText.length); + } + } + return text; + } + + _applyMatchReplacement(replacement, match) { + const pattern = /\$(?:\$|&|`|'|(\d\d?)|<([^>]*)>)/g; + return replacement.replace(pattern, (g0, g1, g2) => { + if (typeof g1 !== 'undefined') { + const matchIndex = Number.parseInt(g1, 10); + if (matchIndex >= 1 && matchIndex <= match.length) { + return match[matchIndex]; + } + } else if (typeof g2 !== 'undefined') { + const {groups} = match; + if (typeof groups === 'object' && groups !== null && Object.prototype.hasOwnProperty.call(groups, g2)) { + return groups[g2]; + } + } else { + switch (g0) { + case '$': return '$'; + case '&': return match[0]; + case '`': return replacement.substring(0, match.index); + case '\'': return replacement.substring(match.index + g0.length); + } + } + return g0; + }); + } +} |