diff options
author | toasted-nutbread <toasted-nutbread@users.noreply.github.com> | 2021-09-26 11:08:16 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-09-26 11:08:16 -0400 |
commit | 9899727d7d53caed4c5b5e68176f7ed7f90a9438 (patch) | |
tree | 3d764007cf8e86cee23be969a2065a644b27f73d /ext/js/language/translator.js | |
parent | 88e71f82232781a1bc16701ce4719d770222ec4c (diff) |
Frequency dictionary sort (#1938)
* Add sortDictionary/sortDictionaryOrder options
* Update options
* Add API.getTermFrequencies
* Add settings
* Implement frequency dictionary sorting
* Update test
* Update test data
* Fix handling of undefined rank-based frequencies
Diffstat (limited to 'ext/js/language/translator.js')
-rw-r--r-- | ext/js/language/translator.js | 112 |
1 file changed, 110 insertions, 2 deletions
diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js index 641c9d57..1abf9f4e 100644 --- a/ext/js/language/translator.js +++ b/ext/js/language/translator.js @@ -64,6 +64,8 @@ class Translator { * { * wildcard: (enum: null, 'prefix', 'suffix'), * mainDictionary: (string), + * sortFrequencyDictionary: (null or string), + * sortFrequencyDictionaryOrder: (enum: 'ascending', 'descending'), * removeNonJapaneseCharacters: (boolean), * convertHalfWidthCharacters: (enum: 'false', 'true', 'variant'), * convertNumericCharacters: (enum: 'false', 'true', 'variant'), @@ -92,7 +94,7 @@ class Translator { * @returns An object of the structure `{dictionaryEntries, originalTextLength}`. */ async findTerms(mode, text, options) { - const {enabledDictionaryMap, excludeDictionaryDefinitions} = options; + const {enabledDictionaryMap, excludeDictionaryDefinitions, sortFrequencyDictionary, sortFrequencyDictionaryOrder} = options; let {dictionaryEntries, originalTextLength} = await this._findTermsInternal(text, enabledDictionaryMap, options); switch (mode) { @@ -115,6 +117,9 @@ class Translator { await this._expandTermTags(dictionaryEntries); } + if (sortFrequencyDictionary !== null) { + this._updateSortFrequencies(dictionaryEntries, sortFrequencyDictionary, sortFrequencyDictionaryOrder === 'ascending'); + } if (dictionaryEntries.length > 1) { this._sortTermDictionaryEntries(dictionaryEntries); } @@ -176,6 +181,48 @@ class Translator { return dictionaryEntries; } + /** + * Gets a list of frequency information for a given list of term-reading pairs + * and a list of dictionaries. + * @param termReadingList An array of `{term, reading}` pairs. If reading is null, + * the reading won't be compared. + * @param dictionaries An array of dictionary names. + * @returns An array of objects with the format + * `{term, reading, dictionary, hasReading, frequency}`. 
+ */ + async getTermFrequencies(termReadingList, dictionaries) { + const dictionarySet = new Set(); + for (const dictionary of dictionaries) { + dictionarySet.add(dictionary); + } + + const termList = termReadingList.map(({term}) => term); + const metas = await this._database.findTermMetaBulk(termList, dictionarySet); + + const results = []; + for (const {mode, data, dictionary, index} of metas) { + if (mode !== 'freq') { continue; } + let {term, reading} = termReadingList[index]; + let frequency = data; + const hasReading = (data !== null && typeof data === 'object'); + if (hasReading) { + if (data.reading !== reading) { + if (reading !== null) { continue; } + reading = data.reading; + } + frequency = data.frequency; + } + results.push({ + term, + reading, + dictionary, + hasReading, + frequency + }); + } + return results; + } + // Find terms internal implementation async _findTermsInternal(text, enabledDictionaryMap, options) { @@ -1035,7 +1082,20 @@ class Translator { } _createTermDefinition(index, headwordIndices, dictionary, dictionaryIndex, dictionaryPriority, id, score, sequences, isPrimary, tags, entries) { - return {index, headwordIndices, dictionary, dictionaryIndex, dictionaryPriority, id, score, sequences, isPrimary, tags, entries}; + return { + index, + headwordIndices, + dictionary, + dictionaryIndex, + dictionaryPriority, + id, + score, + frequencyOrder: 0, + sequences, + isPrimary, + tags, + entries + }; } _createTermPronunciation(index, headwordIndex, dictionary, dictionaryIndex, dictionaryPriority, pitches) { @@ -1052,6 +1112,7 @@ class Translator { isPrimary, inflections, score, + frequencyOrder: 0, dictionaryIndex, dictionaryPriority, sourceTermExactMatchCount, @@ -1314,6 +1375,10 @@ class Translator { i = v2.dictionaryPriority - v1.dictionaryPriority; if (i !== 0) { return i; } + // Sort by frequency order + i = v1.frequencyOrder - v2.frequencyOrder; + if (i !== 0) { return i; } + // Sort by term score i = v2.score - v1.score; if (i !== 0) { 
return i; } @@ -1345,6 +1410,10 @@ class Translator { let i = v2.dictionaryPriority - v1.dictionaryPriority; if (i !== 0) { return i; } + // Sort by frequency order + i = v1.frequencyOrder - v2.frequencyOrder; + if (i !== 0) { return i; } + // Sort by term score i = v2.score - v1.score; if (i !== 0) { return i; } @@ -1416,4 +1485,43 @@ class Translator { frequencies.sort(compare); } } + + _updateSortFrequencies(dictionaryEntries, dictionary, ascending) { + const frequencyMap = new Map(); + for (const dictionaryEntry of dictionaryEntries) { + const {definitions, frequencies} = dictionaryEntry; + let frequencyMin = Number.MAX_SAFE_INTEGER; + let frequencyMax = Number.MIN_SAFE_INTEGER; + for (const item of frequencies) { + if (item.dictionary !== dictionary) { continue; } + const {headwordIndex, frequency} = item; + if (typeof frequency !== 'number') { continue; } + frequencyMap.set(headwordIndex, frequency); + frequencyMin = Math.min(frequencyMin, frequency); + frequencyMax = Math.max(frequencyMax, frequency); + } + dictionaryEntry.frequencyOrder = ( + frequencyMin <= frequencyMax ? + (ascending ? frequencyMin : -frequencyMax) : + (ascending ? Number.MAX_SAFE_INTEGER : 0) + ); + for (const definition of definitions) { + frequencyMin = Number.MAX_SAFE_INTEGER; + frequencyMax = Number.MIN_SAFE_INTEGER; + const {headwordIndices} = definition; + for (const headwordIndex of headwordIndices) { + const frequency = frequencyMap.get(headwordIndex); + if (typeof frequency !== 'number') { continue; } + frequencyMin = Math.min(frequencyMin, frequency); + frequencyMax = Math.max(frequencyMax, frequency); + } + definition.frequencyOrder = ( + frequencyMin <= frequencyMax ? + (ascending ? frequencyMin : -frequencyMax) : + (ascending ? Number.MAX_SAFE_INTEGER : 0) + ); + } + frequencyMap.clear(); + } + } } |