diff options
Diffstat (limited to 'ext/bg/js/translator.js')
-rw-r--r-- | ext/bg/js/translator.js | 1397 |
1 files changed, 0 insertions, 1397 deletions
diff --git a/ext/bg/js/translator.js b/ext/bg/js/translator.js deleted file mode 100644 index 729c8294..00000000 --- a/ext/bg/js/translator.js +++ /dev/null @@ -1,1397 +0,0 @@ -/* - * Copyright (C) 2016-2021 Yomichan Authors - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <https://www.gnu.org/licenses/>. - */ - -/* global - * Deinflector - * TextSourceMap - */ - -/** - * Class which finds term and kanji definitions for text. - */ -class Translator { - /** - * Creates a new Translator instance. - * @param database An instance of DictionaryDatabase. - */ - constructor({japaneseUtil, database}) { - this._japaneseUtil = japaneseUtil; - this._database = database; - this._deinflector = null; - this._tagCache = new Map(); - this._stringComparer = new Intl.Collator('en-US'); // Invariant locale - } - - /** - * Initializes the instance for use. The public API should not be used until - * this function has been called. - * @param deinflectionReasons The raw deinflections reasons data that the Deinflector uses. - */ - prepare(deinflectionReasons) { - this._deinflector = new Deinflector(deinflectionReasons); - } - - /** - * Clears the database tag cache. This should be executed if the database is changed. - */ - clearDatabaseCaches() { - this._tagCache.clear(); - } - - /** - * Finds term definitions for the given text. - * @param mode The mode to use for finding terms, which determines the format of the resulting array. - * One of: 'group', 'merge', 'split', 'simple' - * @param text The text to find terms for. - * @param options An object using the following structure: - * { - * wildcard: (enum: null, 'prefix', 'suffix'), - * mainDictionary: (string), - * alphanumeric: (boolean), - * convertHalfWidthCharacters: (enum: 'false', 'true', 'variant'), - * convertNumericCharacters: (enum: 'false', 'true', 'variant'), - * convertAlphabeticCharacters: (enum: 'false', 'true', 'variant'), - * convertHiraganaToKatakana: (enum: 'false', 'true', 'variant'), - * convertKatakanaToHiragana: (enum: 'false', 'true', 'variant'), - * collapseEmphaticSequences: (enum: 'false', 'true', 'full'), - * textReplacements: [ - * (null or [ - * {pattern: (RegExp), replacement: (string)} - * ... - * ]) - * ... - * ], - * enabledDictionaryMap: (Map of [ - * (string), - * { - * priority: (number), - * allowSecondarySearches: (boolean) - * } - * ]) - * } - * @returns An array of [definitions, textLength]. The structure of each definition depends on the - * mode parameter, see the _create?TermDefinition?() functions for structure details. - */ - async findTerms(mode, text, options) { - switch (mode) { - case 'group': - return await this._findTermsGrouped(text, options); - case 'merge': - return await this._findTermsMerged(text, options); - case 'split': - return await this._findTermsSplit(text, options); - case 'simple': - return await this._findTermsSimple(text, options); - default: - return [[], 0]; - } - } - - /** - * Finds kanji definitions for the given text. - * @param text The text to find kanji definitions for. This string can be of any length, - * but is typically just one character, which is a single kanji. If the string is multiple - * characters long, each character will be searched in the database. - * @param options An object using the following structure: - * { - * enabledDictionaryMap: (Map of [ - * (string), - * { - * priority: (number) - * } - * ]) - * } - * @returns An array of definitions. See the _createKanjiDefinition() function for structure details. - */ - async findKanji(text, options) { - const {enabledDictionaryMap} = options; - const kanjiUnique = new Set(); - for (const c of text) { - kanjiUnique.add(c); - } - - const databaseDefinitions = await this._database.findKanjiBulk([...kanjiUnique], enabledDictionaryMap); - if (databaseDefinitions.length === 0) { return []; } - - this._sortDatabaseDefinitionsByIndex(databaseDefinitions); - - const definitions = []; - for (const {character, onyomi, kunyomi, tags, glossary, stats, dictionary} of databaseDefinitions) { - const expandedStats = await this._expandStats(stats, dictionary); - const expandedTags = await this._expandTags(tags, dictionary); - this._sortTags(expandedTags); - - const definition = this._createKanjiDefinition(character, dictionary, onyomi, kunyomi, glossary, expandedTags, expandedStats); - definitions.push(definition); - } - - await this._buildKanjiMeta(definitions, enabledDictionaryMap); - - return definitions; - } - - // Find terms core functions - - async _findTermsSimple(text, options) { - const {enabledDictionaryMap} = options; - const [definitions, length] = await this._findTermsInternal(text, enabledDictionaryMap, options); - this._sortDefinitions(definitions, false); - return [definitions, length]; - } - - async _findTermsSplit(text, options) { - const {enabledDictionaryMap} = options; - const [definitions, length] = await this._findTermsInternal(text, enabledDictionaryMap, options); - await this._buildTermMeta(definitions, enabledDictionaryMap); - this._sortDefinitions(definitions, true); - return [definitions, length]; - } - - async _findTermsGrouped(text, options) { - const {enabledDictionaryMap} = options; - const [definitions, length] = await this._findTermsInternal(text, enabledDictionaryMap, options); - - const groupedDefinitions = this._groupTerms(definitions, enabledDictionaryMap); - await this._buildTermMeta(groupedDefinitions, enabledDictionaryMap); - this._sortDefinitions(groupedDefinitions, false); - - for (const definition of groupedDefinitions) { - this._flagRedundantDefinitionTags(definition.definitions); - } - - return [groupedDefinitions, length]; - } - - async _findTermsMerged(text, options) { - const {mainDictionary, enabledDictionaryMap} = options; - const secondarySearchDictionaryMap = this._getSecondarySearchDictionaryMap(enabledDictionaryMap); - - const [definitions, length] = await this._findTermsInternal(text, enabledDictionaryMap, options); - const {sequencedDefinitions, unsequencedDefinitions} = await this._getSequencedDefinitions(definitions, mainDictionary, enabledDictionaryMap); - const definitionsMerged = []; - const usedDefinitions = new Set(); - - for (const {sourceDefinitions, relatedDefinitions} of sequencedDefinitions) { - const result = await this._getMergedDefinition( - sourceDefinitions, - relatedDefinitions, - unsequencedDefinitions, - secondarySearchDictionaryMap, - usedDefinitions - ); - definitionsMerged.push(result); - } - - const unusedDefinitions = unsequencedDefinitions.filter((definition) => !usedDefinitions.has(definition)); - for (const groupedDefinition of this._groupTerms(unusedDefinitions, enabledDictionaryMap)) { - const {reasons, score, expression, reading, source, rawSource, sourceTerm, furiganaSegments, termTags, definitions: definitions2} = groupedDefinition; - const termDetailsList = [this._createTermDetails(sourceTerm, expression, reading, furiganaSegments, termTags)]; - const compatibilityDefinition = this._createMergedTermDefinition( - source, - rawSource, - this._convertTermDefinitionsToMergedGlossaryTermDefinitions(definitions2), - [expression], - [reading], - termDetailsList, - reasons, - score - ); - definitionsMerged.push(compatibilityDefinition); - } - - await this._buildTermMeta(definitionsMerged, enabledDictionaryMap); - this._sortDefinitions(definitionsMerged, false); - - for (const definition of definitionsMerged) { - this._flagRedundantDefinitionTags(definition.definitions); - } - - return [definitionsMerged, length]; - } - - // Find terms internal implementation - - async _findTermsInternal(text, enabledDictionaryMap, options) { - const {alphanumeric, wildcard} = options; - text = this._getSearchableText(text, alphanumeric); - if (text.length === 0) { - return [[], 0]; - } - - const deinflections = ( - wildcard ? - await this._findTermWildcard(text, enabledDictionaryMap, wildcard) : - await this._findTermDeinflections(text, enabledDictionaryMap, options) - ); - - let maxLength = 0; - const definitions = []; - for (const {databaseDefinitions, source, rawSource, term, reasons} of deinflections) { - if (databaseDefinitions.length === 0) { continue; } - maxLength = Math.max(maxLength, rawSource.length); - for (const databaseDefinition of databaseDefinitions) { - const definition = await this._createTermDefinitionFromDatabaseDefinition(databaseDefinition, source, rawSource, term, reasons, enabledDictionaryMap); - definitions.push(definition); - } - } - - this._removeDuplicateDefinitions(definitions); - return [definitions, maxLength]; - } - - async _findTermWildcard(text, enabledDictionaryMap, wildcard) { - const databaseDefinitions = await this._database.findTermsBulk([text], enabledDictionaryMap, wildcard); - if (databaseDefinitions.length === 0) { - return []; - } - - return [{ - source: text, - rawSource: text, - term: text, - rules: 0, - reasons: [], - databaseDefinitions - }]; - } - - async _findTermDeinflections(text, enabledDictionaryMap, options) { - const deinflections = this._getAllDeinflections(text, options); - - if (deinflections.length === 0) { - return []; - } - - const uniqueDeinflectionTerms = []; - const uniqueDeinflectionArrays = []; - const uniqueDeinflectionsMap = new Map(); - for (const deinflection of deinflections) { - const term = deinflection.term; - let deinflectionArray = uniqueDeinflectionsMap.get(term); - if (typeof deinflectionArray === 'undefined') { - deinflectionArray = []; - uniqueDeinflectionTerms.push(term); - uniqueDeinflectionArrays.push(deinflectionArray); - uniqueDeinflectionsMap.set(term, deinflectionArray); - } - deinflectionArray.push(deinflection); - } - - const databaseDefinitions = await this._database.findTermsBulk(uniqueDeinflectionTerms, enabledDictionaryMap, null); - - for (const databaseDefinition of databaseDefinitions) { - const definitionRules = Deinflector.rulesToRuleFlags(databaseDefinition.rules); - for (const deinflection of uniqueDeinflectionArrays[databaseDefinition.index]) { - const deinflectionRules = deinflection.rules; - if (deinflectionRules === 0 || (definitionRules & deinflectionRules) !== 0) { - deinflection.databaseDefinitions.push(databaseDefinition); - } - } - } - - return deinflections; - } - - _getAllDeinflections(text, options) { - const textOptionVariantArray = [ - this._getTextReplacementsVariants(options), - this._getTextOptionEntryVariants(options.convertHalfWidthCharacters), - this._getTextOptionEntryVariants(options.convertNumericCharacters), - this._getTextOptionEntryVariants(options.convertAlphabeticCharacters), - this._getTextOptionEntryVariants(options.convertHiraganaToKatakana), - this._getTextOptionEntryVariants(options.convertKatakanaToHiragana), - this._getCollapseEmphaticOptions(options) - ]; - - const jp = this._japaneseUtil; - const deinflections = []; - const used = new Set(); - for (const [textReplacements, halfWidth, numeric, alphabetic, katakana, hiragana, [collapseEmphatic, collapseEmphaticFull]] of this._getArrayVariants(textOptionVariantArray)) { - let text2 = text; - const sourceMap = new TextSourceMap(text2); - if (textReplacements !== null) { - text2 = this._applyTextReplacements(text2, sourceMap, textReplacements); - } - if (halfWidth) { - text2 = jp.convertHalfWidthKanaToFullWidth(text2, sourceMap); - } - if (numeric) { - text2 = jp.convertNumericToFullWidth(text2); - } - if (alphabetic) { - text2 = jp.convertAlphabeticToKana(text2, sourceMap); - } - if (katakana) { - text2 = jp.convertHiraganaToKatakana(text2); - } - if (hiragana) { - text2 = jp.convertKatakanaToHiragana(text2); - } - if (collapseEmphatic) { - text2 = jp.collapseEmphaticSequences(text2, collapseEmphaticFull, sourceMap); - } - - for (let i = text2.length; i > 0; --i) { - const text2Substring = text2.substring(0, i); - if (used.has(text2Substring)) { break; } - used.add(text2Substring); - const rawSource = sourceMap.source.substring(0, sourceMap.getSourceLength(i)); - for (const deinflection of this._deinflector.deinflect(text2Substring, rawSource)) { - deinflections.push(deinflection); - } - } - } - return deinflections; - } - - async _getSequencedDefinitions(definitions, mainDictionary, enabledDictionaryMap) { - const sequenceList = []; - const sequencedDefinitionMap = new Map(); - const sequencedDefinitions = []; - const unsequencedDefinitions = []; - for (const definition of definitions) { - const {sequence, dictionary} = definition; - if (mainDictionary === dictionary && sequence >= 0) { - let sequencedDefinition = sequencedDefinitionMap.get(sequence); - if (typeof sequencedDefinition === 'undefined') { - sequencedDefinition = { - sourceDefinitions: [], - relatedDefinitions: [], - relatedDefinitionIds: new Set() - }; - sequencedDefinitionMap.set(sequence, sequencedDefinition); - sequencedDefinitions.push(sequencedDefinition); - sequenceList.push(sequence); - } - sequencedDefinition.sourceDefinitions.push(definition); - sequencedDefinition.relatedDefinitions.push(definition); - sequencedDefinition.relatedDefinitionIds.add(definition.id); - } else { - unsequencedDefinitions.push(definition); - } - } - - if (sequenceList.length > 0) { - const databaseDefinitions = await this._database.findTermsBySequenceBulk(sequenceList, mainDictionary); - for (const databaseDefinition of databaseDefinitions) { - const {relatedDefinitions, relatedDefinitionIds} = sequencedDefinitions[databaseDefinition.index]; - const {id} = databaseDefinition; - if (relatedDefinitionIds.has(id)) { continue; } - - const {source, rawSource, sourceTerm} = relatedDefinitions[0]; - const definition = await this._createTermDefinitionFromDatabaseDefinition(databaseDefinition, source, rawSource, sourceTerm, [], enabledDictionaryMap); - relatedDefinitions.push(definition); - } - } - - for (const {relatedDefinitions} of sequencedDefinitions) { - this._sortDefinitionsById(relatedDefinitions); - } - - return {sequencedDefinitions, unsequencedDefinitions}; - } - - async _getMergedSecondarySearchResults(expressionsMap, secondarySearchDictionaryMap) { - if (secondarySearchDictionaryMap.size === 0) { - return []; - } - - const expressionList = []; - const readingList = []; - for (const [expression, readingMap] of expressionsMap.entries()) { - for (const reading of readingMap.keys()) { - expressionList.push(expression); - readingList.push(reading); - } - } - - const databaseDefinitions = await this._database.findTermsExactBulk(expressionList, readingList, secondarySearchDictionaryMap); - this._sortDatabaseDefinitionsByIndex(databaseDefinitions); - - const definitions = []; - for (const databaseDefinition of databaseDefinitions) { - const source = expressionList[databaseDefinition.index]; - const definition = await this._createTermDefinitionFromDatabaseDefinition(databaseDefinition, source, source, source, [], secondarySearchDictionaryMap); - definitions.push(definition); - } - - return definitions; - } - - async _getMergedDefinition(sourceDefinitions, relatedDefinitions, unsequencedDefinitions, secondarySearchDictionaryMap, usedDefinitions) { - const {reasons, source, rawSource} = sourceDefinitions[0]; - const score = this._getMaxDefinitionScore(sourceDefinitions); - const termInfoMap = new Map(); - const glossaryDefinitions = []; - const glossaryDefinitionGroupMap = new Map(); - - this._mergeByGlossary(relatedDefinitions, glossaryDefinitionGroupMap); - this._addUniqueTermInfos(relatedDefinitions, termInfoMap); - - let secondaryDefinitions = await this._getMergedSecondarySearchResults(termInfoMap, secondarySearchDictionaryMap); - secondaryDefinitions = [...unsequencedDefinitions, ...secondaryDefinitions]; - - this._removeUsedDefinitions(secondaryDefinitions, termInfoMap, usedDefinitions); - this._removeDuplicateDefinitions(secondaryDefinitions); - - this._mergeByGlossary(secondaryDefinitions, glossaryDefinitionGroupMap); - - const allExpressions = new Set(); - const allReadings = new Set(); - for (const {expressions, readings} of glossaryDefinitionGroupMap.values()) { - for (const expression of expressions) { allExpressions.add(expression); } - for (const reading of readings) { allReadings.add(reading); } - } - - for (const {expressions, readings, definitions} of glossaryDefinitionGroupMap.values()) { - const glossaryDefinition = this._createMergedGlossaryTermDefinition( - source, - rawSource, - definitions, - expressions, - readings, - allExpressions, - allReadings - ); - glossaryDefinitions.push(glossaryDefinition); - } - - this._sortDefinitions(glossaryDefinitions, true); - - const termDetailsList = this._createTermDetailsListFromTermInfoMap(termInfoMap); - - return this._createMergedTermDefinition( - source, - rawSource, - glossaryDefinitions, - [...allExpressions], - [...allReadings], - termDetailsList, - reasons, - score - ); - } - - _removeUsedDefinitions(definitions, termInfoMap, usedDefinitions) { - for (let i = 0, ii = definitions.length; i < ii; ++i) { - const definition = definitions[i]; - const {expression, reading} = definition; - const expressionMap = termInfoMap.get(expression); - if ( - typeof expressionMap !== 'undefined' && - typeof expressionMap.get(reading) !== 'undefined' - ) { - usedDefinitions.add(definition); - } else { - definitions.splice(i, 1); - --i; - --ii; - } - } - } - - _getUniqueDefinitionTags(definitions) { - const definitionTagsMap = new Map(); - for (const {definitionTags} of definitions) { - for (const tag of definitionTags) { - const {name} = tag; - if (definitionTagsMap.has(name)) { continue; } - definitionTagsMap.set(name, this._cloneTag(tag)); - } - } - return [...definitionTagsMap.values()]; - } - - _removeDuplicateDefinitions(definitions) { - const definitionGroups = new Map(); - for (let i = 0, ii = definitions.length; i < ii; ++i) { - const definition = definitions[i]; - const {id} = definition; - const existing = definitionGroups.get(id); - if (typeof existing === 'undefined') { - definitionGroups.set(id, [i, definition]); - continue; - } - - let removeIndex = i; - if (definition.source.length > existing[1].source.length) { - definitionGroups.set(id, [i, definition]); - removeIndex = existing[0]; - } - - definitions.splice(removeIndex, 1); - --i; - --ii; - } - } - - _flagRedundantDefinitionTags(definitions) { - let lastDictionary = null; - let lastPartOfSpeech = ''; - const removeCategoriesSet = new Set(); - - for (const {dictionary, definitionTags} of definitions) { - const partOfSpeech = this._createMapKey(this._getTagNamesWithCategory(definitionTags, 'partOfSpeech')); - - if (lastDictionary !== dictionary) { - lastDictionary = dictionary; - lastPartOfSpeech = ''; - } - - if (lastPartOfSpeech === partOfSpeech) { - removeCategoriesSet.add('partOfSpeech'); - } else { - lastPartOfSpeech = partOfSpeech; - } - - if (removeCategoriesSet.size > 0) { - this._flagTagsWithCategoryAsRedundant(definitionTags, removeCategoriesSet); - removeCategoriesSet.clear(); - } - } - } - - _groupTerms(definitions) { - const groups = new Map(); - for (const definition of definitions) { - const key = this._createMapKey([definition.source, definition.expression, definition.reading, ...definition.reasons]); - let groupDefinitions = groups.get(key); - if (typeof groupDefinitions === 'undefined') { - groupDefinitions = []; - groups.set(key, groupDefinitions); - } - - groupDefinitions.push(definition); - } - - const results = []; - for (const groupDefinitions of groups.values()) { - this._sortDefinitions(groupDefinitions, true); - const definition = this._createGroupedTermDefinition(groupDefinitions); - results.push(definition); - } - - return results; - } - - _mergeByGlossary(definitions, glossaryDefinitionGroupMap) { - for (const definition of definitions) { - const {expression, reading, dictionary, glossary, id} = definition; - - const key = this._createMapKey([dictionary, ...glossary]); - let group = glossaryDefinitionGroupMap.get(key); - if (typeof group === 'undefined') { - group = { - expressions: new Set(), - readings: new Set(), - definitions: [], - definitionIds: new Set() - }; - glossaryDefinitionGroupMap.set(key, group); - } - - const {definitionIds} = group; - if (definitionIds.has(id)) { continue; } - definitionIds.add(id); - group.expressions.add(expression); - group.readings.add(reading); - group.definitions.push(definition); - } - } - - _addUniqueTermInfos(definitions, termInfoMap) { - for (const {expression, reading, sourceTerm, furiganaSegments, termTags} of definitions) { - let readingMap = termInfoMap.get(expression); - if (typeof readingMap === 'undefined') { - readingMap = new Map(); - termInfoMap.set(expression, readingMap); - } - - let termInfo = readingMap.get(reading); - if (typeof termInfo === 'undefined') { - termInfo = { - sourceTerm, - furiganaSegments, - termTagsMap: new Map() - }; - readingMap.set(reading, termInfo); - } - - const {termTagsMap} = termInfo; - for (const tag of termTags) { - const {name} = tag; - if (termTagsMap.has(name)) { continue; } - termTagsMap.set(name, this._cloneTag(tag)); - } - } - } - - _convertTermDefinitionsToMergedGlossaryTermDefinitions(definitions) { - const convertedDefinitions = []; - for (const definition of definitions) { - const {source, rawSource, expression, reading} = definition; - const expressions = new Set([expression]); - const readings = new Set([reading]); - const convertedDefinition = this._createMergedGlossaryTermDefinition(source, rawSource, [definition], expressions, readings, expressions, readings); - convertedDefinitions.push(convertedDefinition); - } - return convertedDefinitions; - } - - // Metadata building - - async _buildTermMeta(definitions, enabledDictionaryMap) { - const addMetadataTargetInfo = (targetMap1, target, parents) => { - let {expression, reading} = target; - if (!reading) { reading = expression; } - - let targetMap2 = targetMap1.get(expression); - if (typeof targetMap2 === 'undefined') { - targetMap2 = new Map(); - targetMap1.set(expression, targetMap2); - } - - let targets = targetMap2.get(reading); - if (typeof targets === 'undefined') { - targets = new Set([target, ...parents]); - targetMap2.set(reading, targets); - } else { - targets.add(target); - for (const parent of parents) { - targets.add(parent); - } - } - }; - - const targetMap = new Map(); - const definitionsQueue = definitions.map((definition) => ({definition, parents: []})); - while (definitionsQueue.length > 0) { - const {definition, parents} = definitionsQueue.shift(); - const childDefinitions = definition.definitions; - if (Array.isArray(childDefinitions)) { - for (const definition2 of childDefinitions) { - definitionsQueue.push({definition: definition2, parents: [...parents, definition]}); - } - } else { - addMetadataTargetInfo(targetMap, definition, parents); - } - - for (const target of definition.expressions) { - addMetadataTargetInfo(targetMap, target, []); - } - } - const targetMapEntries = [...targetMap.entries()]; - const uniqueExpressions = targetMapEntries.map(([expression]) => expression); - - const metas = await this._database.findTermMetaBulk(uniqueExpressions, enabledDictionaryMap); - for (const {expression, mode, data, dictionary, index} of metas) { - const targetMap2 = targetMapEntries[index][1]; - for (const [reading, targets] of targetMap2) { - switch (mode) { - case 'freq': - { - const frequencyData = this._getTermFrequencyData(expression, reading, dictionary, data); - if (frequencyData === null) { continue; } - for (const {frequencies} of targets) { frequencies.push(frequencyData); } - } - break; - case 'pitch': - { - const pitchData = await this._getPitchData(expression, reading, dictionary, data); - if (pitchData === null) { continue; } - for (const {pitches} of targets) { pitches.push(pitchData); } - } - break; - } - } - } - } - - async _buildKanjiMeta(definitions, enabledDictionaryMap) { - const kanjiList = []; - for (const {character} of definitions) { - kanjiList.push(character); - } - - const metas = await this._database.findKanjiMetaBulk(kanjiList, enabledDictionaryMap); - for (const {character, mode, data, dictionary, index} of metas) { - switch (mode) { - case 'freq': - { - const frequencyData = this._getKanjiFrequencyData(character, dictionary, data); - definitions[index].frequencies.push(frequencyData); - } - break; - } - } - } - - async _expandTags(names, dictionary) { - const tagMetaList = await this._getTagMetaList(names, dictionary); - const results = []; - for (let i = 0, ii = tagMetaList.length; i < ii; ++i) { - const meta = tagMetaList[i]; - const name = names[i]; - const {category, notes, order, score} = (meta !== null ? meta : {}); - const tag = this._createTag(name, category, notes, order, score, dictionary, false); - results.push(tag); - } - return results; - } - - async _expandStats(items, dictionary) { - const names = Object.keys(items); - const tagMetaList = await this._getTagMetaList(names, dictionary); - - const statsGroups = new Map(); - for (let i = 0; i < names.length; ++i) { - const name = names[i]; - const meta = tagMetaList[i]; - if (meta === null) { continue; } - - const {category, notes, order, score} = meta; - let group = statsGroups.get(category); - if (typeof group === 'undefined') { - group = []; - statsGroups.set(category, group); - } - - const value = items[name]; - const stat = this._createKanjiStat(name, category, notes, order, score, dictionary, value); - group.push(stat); - } - - const stats = {}; - for (const [category, group] of statsGroups.entries()) { - this._sortKanjiStats(group); - stats[category] = group; - } - return stats; - } - - async _getTagMetaList(names, dictionary) { - const tagMetaList = []; - let cache = this._tagCache.get(dictionary); - if (typeof cache === 'undefined') { - cache = new Map(); - this._tagCache.set(dictionary, cache); - } - - for (const name of names) { - const base = this._getNameBase(name); - - let tagMeta = cache.get(base); - if (typeof tagMeta === 'undefined') { - tagMeta = await this._database.findTagForTitle(base, dictionary); - cache.set(base, tagMeta); - } - - tagMetaList.push(tagMeta); - } - - return tagMetaList; - } - - _getTermFrequencyData(expression, reading, dictionary, data) { - let frequency = data; - const hasReading = (data !== null && typeof data === 'object'); - if (hasReading) { - if (data.reading !== reading) { return null; } - frequency = data.frequency; - } - return {dictionary, expression, reading, hasReading, frequency}; - } - - _getKanjiFrequencyData(character, dictionary, data) { - return {dictionary, character, frequency: data}; - } - - async _getPitchData(expression, reading, dictionary, data) { - if (data.reading !== reading) { return null; } - - const pitches = []; - for (let {position, tags} of data.pitches) { - tags = Array.isArray(tags) ? await this._expandTags(tags, dictionary) : []; - pitches.push({position, tags}); - } - - return {expression, reading, dictionary, pitches}; - } - - // Simple helpers - - _scoreToTermFrequency(score) { - if (score > 0) { - return 'popular'; - } else if (score < 0) { - return 'rare'; - } else { - return 'normal'; - } - } - - _getNameBase(name) { - const pos = name.indexOf(':'); - return (pos >= 0 ? name.substring(0, pos) : name); - } - - _getSearchableText(text, allowAlphanumericCharacters) { - if (allowAlphanumericCharacters) { - return text; - } - - const jp = this._japaneseUtil; - let newText = ''; - for (const c of text) { - if (!jp.isCodePointJapanese(c.codePointAt(0))) { - break; - } - newText += c; - } - return newText; - } - - _getTextOptionEntryVariants(value) { - switch (value) { - case 'true': return [true]; - case 'variant': return [false, true]; - default: return [false]; - } - } - - _getCollapseEmphaticOptions(options) { - const collapseEmphaticOptions = [[false, false]]; - switch (options.collapseEmphaticSequences) { - case 'true': - collapseEmphaticOptions.push([true, false]); - break; - case 'full': - collapseEmphaticOptions.push([true, false], [true, true]); - break; - } - return collapseEmphaticOptions; - } - - _getTextReplacementsVariants(options) { - return options.textReplacements; - } - - _getSecondarySearchDictionaryMap(enabledDictionaryMap) { - const secondarySearchDictionaryMap = new Map(); - for (const [dictionary, details] of enabledDictionaryMap.entries()) { - if (!details.allowSecondarySearches) { continue; } - secondarySearchDictionaryMap.set(dictionary, details); - } - return secondarySearchDictionaryMap; - } - - _getDictionaryPriority(dictionary, enabledDictionaryMap) { - const info = enabledDictionaryMap.get(dictionary); - return typeof info !== 'undefined' ? info.priority : 0; - } - - _getTagNamesWithCategory(tags, category) { - const results = []; - for (const tag of tags) { - if (tag.category !== category) { continue; } - results.push(tag.name); - } - results.sort(); - return results; - } - - _flagTagsWithCategoryAsRedundant(tags, removeCategoriesSet) { - for (const tag of tags) { - if (removeCategoriesSet.has(tag.category)) { - tag.redundant = true; - } - } - } - - _getUniqueDictionaryNames(definitions) { - const uniqueDictionaryNames = new Set(); - for (const {dictionaryNames} of definitions) { - for (const dictionaryName of dictionaryNames) { - uniqueDictionaryNames.add(dictionaryName); - } - } - return [...uniqueDictionaryNames]; - } - - _getUniqueTermTags(definitions) { - const newTermTags = []; - if (definitions.length <= 1) { - for (const {termTags} of definitions) { - for (const tag of termTags) { - newTermTags.push(this._cloneTag(tag)); - } - } - } else { - const tagsSet = new Set(); - let checkTagsMap = false; - for (const {termTags} of definitions) { - for (const tag of termTags) { - const key = this._getTagMapKey(tag); - if (checkTagsMap && tagsSet.has(key)) { continue; } - tagsSet.add(key); - newTermTags.push(this._cloneTag(tag)); - } - checkTagsMap = true; - } - } - return newTermTags; - } - - *_getArrayVariants(arrayVariants) { - const ii = arrayVariants.length; - - let total = 1; - for (let i = 0; i < ii; ++i) { - total *= arrayVariants[i].length; - } - - for (let a = 0; a < total; ++a) { - const variant = []; - let index = a; - for (let i = 0; i < ii; ++i) { - const entryVariants = arrayVariants[i]; - variant.push(entryVariants[index % entryVariants.length]); - index = Math.floor(index / entryVariants.length); - } - yield variant; - } - } - - _areSetsEqual(set1, set2) { - if (set1.size !== set2.size) { - return false; - } - - for (const value of set1) { - if (!set2.has(value)) { - return false; - } - } - - return true; - } - - _getSetIntersection(set1, set2) { - const result = []; - for (const value of set1) { - if (set2.has(value)) { - result.push(value); - } - } - return result; - } - - // Reduction functions - - _getTermTagsScoreSum(termTags) { - let result = 0; - for (const {score} of termTags) { - result += score; - } - return result; - } - - _getSourceTermMatchCountSum(definitions) { - let result = 0; - for (const {sourceTermExactMatchCount} of definitions) { - result += sourceTermExactMatchCount; - } - return result; - } - - _getMaxDefinitionScore(definitions) { - let result = Number.MIN_SAFE_INTEGER; - for (const {score} of definitions) { - if (score > result) { result = score; } - } - return result; - } - - _getMaxDictionaryPriority(definitions) { - let result = Number.MIN_SAFE_INTEGER; - for (const {dictionaryPriority} of definitions) { - if (dictionaryPriority > result) { result = dictionaryPriority; } - } - return result; - } - - // Common data creation and cloning functions - - _cloneTag(tag) { - const {name, category, notes, order, score, dictionary, redundant} = tag; - return this._createTag(name, category, notes, order, score, dictionary, redundant); - } - - _getTagMapKey(tag) { - const {name, category, notes} = tag; - return this._createMapKey([name, category, notes]); - } - - _createMapKey(array) { - return JSON.stringify(array); - } - - _createTag(name, category, notes, order, score, dictionary, redundant) { - return { - name, - category: (typeof category === 'string' && category.length > 0 ? category : 'default'), - notes: (typeof notes === 'string' ? notes : ''), - order: (typeof order === 'number' ? order : 0), - score: (typeof score === 'number' ? score : 0), - dictionary: (typeof dictionary === 'string' ? dictionary : null), - redundant - }; - } - - _createKanjiStat(name, category, notes, order, score, dictionary, value) { - return { - name, - category: (typeof category === 'string' && category.length > 0 ? category : 'default'), - notes: (typeof notes === 'string' ? notes : ''), - order: (typeof order === 'number' ? order : 0), - score: (typeof score === 'number' ? score : 0), - dictionary: (typeof dictionary === 'string' ? dictionary : null), - value - }; - } - - _createKanjiDefinition(character, dictionary, onyomi, kunyomi, glossary, tags, stats) { - return { - type: 'kanji', - character, - dictionary, - onyomi, - kunyomi, - glossary, - tags, - stats, - frequencies: [] - }; - } - - async _createTermDefinitionFromDatabaseDefinition(databaseDefinition, source, rawSource, sourceTerm, reasons, enabledDictionaryMap) { - const {expression, reading, definitionTags, termTags, glossary, score, dictionary, id, sequence} = databaseDefinition; - const dictionaryPriority = this._getDictionaryPriority(dictionary, enabledDictionaryMap); - const termTagsExpanded = await this._expandTags(termTags, dictionary); - const definitionTagsExpanded = await this._expandTags(definitionTags, dictionary); - - this._sortTags(definitionTagsExpanded); - this._sortTags(termTagsExpanded); - - const furiganaSegments = this._japaneseUtil.distributeFurigana(expression, reading); - const termDetailsList = [this._createTermDetails(sourceTerm, expression, reading, furiganaSegments, termTagsExpanded)]; - const sourceTermExactMatchCount = (sourceTerm === expression ? 1 : 0); - - return { - type: 'term', - id, - source, - rawSource, - sourceTerm, - reasons, - score, - sequence, - dictionary, - dictionaryPriority, - dictionaryNames: [dictionary], - expression, - reading, - expressions: termDetailsList, - furiganaSegments, - glossary, - definitionTags: definitionTagsExpanded, - termTags: termTagsExpanded, - // definitions - frequencies: [], - pitches: [], - // only - sourceTermExactMatchCount - }; - } - - _createGroupedTermDefinition(definitions) { - const {expression, reading, furiganaSegments, reasons, source, rawSource, sourceTerm} = definitions[0]; - const score = this._getMaxDefinitionScore(definitions); - const dictionaryPriority = this._getMaxDictionaryPriority(definitions); - const dictionaryNames = this._getUniqueDictionaryNames(definitions); - const termTags = this._getUniqueTermTags(definitions); - const termDetailsList = [this._createTermDetails(sourceTerm, expression, reading, furiganaSegments, termTags)]; - const sourceTermExactMatchCount = (sourceTerm === expression ? 1 : 0); - return { - type: 'termGrouped', - // id - source, - rawSource, - sourceTerm, - reasons: [...reasons], - score, - // sequence - dictionary: dictionaryNames[0], - dictionaryPriority, - dictionaryNames, - expression, - reading, - expressions: termDetailsList, - furiganaSegments, // Contains duplicate data - // glossary - // definitionTags - termTags, - definitions, // type: 'term' - frequencies: [], - pitches: [], - // only - sourceTermExactMatchCount - }; - } - - _createMergedTermDefinition(source, rawSource, definitions, expressions, readings, termDetailsList, reasons, score) { - const dictionaryPriority = this._getMaxDictionaryPriority(definitions); - const sourceTermExactMatchCount = this._getSourceTermMatchCountSum(definitions); - const dictionaryNames = this._getUniqueDictionaryNames(definitions); - return { - type: 'termMerged', - // id - source, - rawSource, - // sourceTerm - reasons, - score, - // sequence - dictionary: dictionaryNames[0], - dictionaryPriority, - dictionaryNames, - expression: expressions, - reading: readings, - expressions: termDetailsList, - // furiganaSegments - // glossary - // definitionTags - // termTags - definitions, // type: 'termMergedByGlossary' - frequencies: [], - pitches: [], - // only - sourceTermExactMatchCount - }; - } - - _createMergedGlossaryTermDefinition(source, rawSource, definitions, expressions, readings, allExpressions, allReadings) { - const only = []; - if (!this._areSetsEqual(expressions, allExpressions)) { - only.push(...this._getSetIntersection(expressions, allExpressions)); - } - if (!this._areSetsEqual(readings, allReadings)) { - only.push(...this._getSetIntersection(readings, allReadings)); - } - - const sourceTermExactMatchCount = this._getSourceTermMatchCountSum(definitions); - const dictionaryNames = this._getUniqueDictionaryNames(definitions); - - const termInfoMap = new Map(); - this._addUniqueTermInfos(definitions, termInfoMap); - const termDetailsList = this._createTermDetailsListFromTermInfoMap(termInfoMap); - - const definitionTags = this._getUniqueDefinitionTags(definitions); - this._sortTags(definitionTags); - - const {glossary} = definitions[0]; - const score = this._getMaxDefinitionScore(definitions); - const dictionaryPriority = this._getMaxDictionaryPriority(definitions); - return { - type: 'termMergedByGlossary', - // id - source, - rawSource, - // sourceTerm - reasons: [], - score, - // sequence - dictionary: dictionaryNames[0], - dictionaryPriority, - dictionaryNames, - expression: [...expressions], - reading: [...readings], - expressions: termDetailsList, - // furiganaSegments - glossary: [...glossary], - definitionTags, - // termTags - definitions, // type: 'term'; contains duplicate data - frequencies: [], - pitches: [], - only, - sourceTermExactMatchCount - }; - } - - _createTermDetailsListFromTermInfoMap(termInfoMap) { - const termDetailsList = []; - for (const [expression, readingMap] of termInfoMap.entries()) { - for (const [reading, {termTagsMap, sourceTerm, furiganaSegments}] of readingMap.entries()) { - const termTags = [...termTagsMap.values()]; - this._sortTags(termTags); - termDetailsList.push(this._createTermDetails(sourceTerm, expression, reading, furiganaSegments, termTags)); - } - } - return termDetailsList; - } - - _createTermDetails(sourceTerm, expression, reading, furiganaSegments, termTags) { - const termFrequency = this._scoreToTermFrequency(this._getTermTagsScoreSum(termTags)); - return { - sourceTerm, - expression, - reading, - furiganaSegments, // Contains duplicate data - termTags, - termFrequency, - frequencies: [], - pitches: [] - }; - } - - // Sorting functions - - _sortTags(tags) { - if (tags.length <= 1) { return; } - const stringComparer = this._stringComparer; - tags.sort((v1, v2) => { - const i = v1.order - v2.order; - if (i !== 0) { return i; } - - return stringComparer.compare(v1.name, v2.name); - }); - } - - _sortDefinitions(definitions, useDictionaryPriority) { - if (definitions.length <= 1) { return; } - const stringComparer = this._stringComparer; - const compareFunction1 = (v1, v2) => { - let i = v2.source.length - v1.source.length; - if (i !== 0) { return i; } - - i = v1.reasons.length - v2.reasons.length; - if (i !== 0) { return i; } - - i = v2.sourceTermExactMatchCount - v1.sourceTermExactMatchCount; - if (i !== 0) { return i; } - - i = v2.score - v1.score; - if (i !== 0) { return i; } - - const expression1 = v1.expression; - const expression2 = v2.expression; - if (typeof expression1 !== 'string' || typeof expression2 !== 'string') { return 0; } // Skip if either is not a string (array) - - i = expression2.length - expression1.length; - if (i !== 0) { return i; } - - return stringComparer.compare(expression1, expression2); - }; - const compareFunction2 = (v1, v2) => { - const i = v2.dictionaryPriority - v1.dictionaryPriority; - return (i !== 0) ? i : compareFunction1(v1, v2); - }; - definitions.sort(useDictionaryPriority ? compareFunction2 : compareFunction1); - } - - _sortDatabaseDefinitionsByIndex(definitions) { - if (definitions.length <= 1) { return; } - definitions.sort((a, b) => a.index - b.index); - } - - _sortDefinitionsById(definitions) { - if (definitions.length <= 1) { return; } - definitions.sort((a, b) => a.id - b.id); - } - - _sortKanjiStats(stats) { - if (stats.length <= 1) { return; } - const stringComparer = this._stringComparer; - stats.sort((v1, v2) => { - const i = v1.order - v2.order; - if (i !== 0) { return i; } - - return stringComparer.compare(v1.notes, v2.notes); - }); - } - - // Regex functions - - _applyTextReplacements(text, sourceMap, replacements) { - for (const {pattern, replacement} of replacements) { - text = this._applyTextReplacement(text, sourceMap, pattern, replacement); - } - return text; - } - - _applyTextReplacement(text, sourceMap, pattern, replacement) { - const isGlobal = pattern.global; - if (isGlobal) { pattern.lastIndex = 0; } - for (let loop = true; loop; loop = isGlobal) { - const match = pattern.exec(text); - if (match === null) { break; } - - const matchText = match[0]; - const index = match.index; - const actualReplacement = this._applyMatchReplacement(replacement, match); - const actualReplacementLength = actualReplacement.length; - const delta = actualReplacementLength - (matchText.length > 0 ? matchText.length : -1); - - text = `${text.substring(0, index)}${actualReplacement}${text.substring(index + matchText.length)}`; - pattern.lastIndex += delta; - - if (actualReplacementLength > 0) { - sourceMap.combine(Math.max(0, index - 1), matchText.length); - sourceMap.insert(index, ...(new Array(actualReplacementLength).fill(0))); - } else { - sourceMap.combine(index, matchText.length); - } - } - return text; - } - - _applyMatchReplacement(replacement, match) { - const pattern = /\$(?:\$|&|`|'|(\d\d?)|<([^>]*)>)/g; - return replacement.replace(pattern, (g0, g1, g2) => { - if (typeof g1 !== 'undefined') { - const matchIndex = Number.parseInt(g1, 10); - if (matchIndex >= 1 && matchIndex <= match.length) { - return match[matchIndex]; - } - } else if (typeof g2 !== 'undefined') { - const {groups} = match; - if (typeof groups === 'object' && groups !== null && Object.prototype.hasOwnProperty.call(groups, g2)) { - return groups[g2]; - } - } else { - switch (g0) { - case '$': return '$'; - case '&': return match[0]; - case '`': return replacement.substring(0, match.index); - case '\'': return replacement.substring(match.index + g0.length); - } - } - return g0; - }); - } -} |