about | summary | refs | log | tree | commit | diff
path: root/ext/js/language
diff options
context:
space:
mode:
author: toasted-nutbread <toasted-nutbread@users.noreply.github.com>, 2021-09-26 11:08:16 -0400
committer: GitHub <noreply@github.com>, 2021-09-26 11:08:16 -0400
commit: 9899727d7d53caed4c5b5e68176f7ed7f90a9438 (patch)
tree: 3d764007cf8e86cee23be969a2065a644b27f73d /ext/js/language
parent: 88e71f82232781a1bc16701ce4719d770222ec4c (diff)
Frequency dictionary sort (#1938)
* Add sortDictionary/sortDictionaryOrder options
* Update options
* Add API.getTermFrequencies
* Add settings
* Implement frequency dictionary sorting
* Update test
* Update test data
* Fix handling of undefined rank-based frequencies
Diffstat (limited to 'ext/js/language')
-rw-r--r-- ext/js/language/translator.js 112
1 files changed, 110 insertions, 2 deletions
diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js
index 641c9d57..1abf9f4e 100644
--- a/ext/js/language/translator.js
+++ b/ext/js/language/translator.js
@@ -64,6 +64,8 @@ class Translator {
* {
* wildcard: (enum: null, 'prefix', 'suffix'),
* mainDictionary: (string),
+ * sortFrequencyDictionary: (null or string),
+ * sortFrequencyDictionaryOrder: (enum: 'ascending', 'descending'),
* removeNonJapaneseCharacters: (boolean),
* convertHalfWidthCharacters: (enum: 'false', 'true', 'variant'),
* convertNumericCharacters: (enum: 'false', 'true', 'variant'),
@@ -92,7 +94,7 @@ class Translator {
* @returns An object of the structure `{dictionaryEntries, originalTextLength}`.
*/
async findTerms(mode, text, options) {
- const {enabledDictionaryMap, excludeDictionaryDefinitions} = options;
+ const {enabledDictionaryMap, excludeDictionaryDefinitions, sortFrequencyDictionary, sortFrequencyDictionaryOrder} = options;
let {dictionaryEntries, originalTextLength} = await this._findTermsInternal(text, enabledDictionaryMap, options);
switch (mode) {
@@ -115,6 +117,9 @@ class Translator {
await this._expandTermTags(dictionaryEntries);
}
+ if (sortFrequencyDictionary !== null) {
+ this._updateSortFrequencies(dictionaryEntries, sortFrequencyDictionary, sortFrequencyDictionaryOrder === 'ascending');
+ }
if (dictionaryEntries.length > 1) {
this._sortTermDictionaryEntries(dictionaryEntries);
}
@@ -176,6 +181,48 @@ class Translator {
return dictionaryEntries;
}
+ /**
+ * Gets a list of frequency information for a given list of term-reading pairs
+ * and a list of dictionaries.
+ *
+ * Metadata is looked up by term only, via `this._database.findTermMetaBulk`;
+ * entries whose mode is not 'freq' are ignored. When the stored data is an
+ * object it supplies its own `reading` and `frequency`; such an entry is
+ * skipped if a non-null reading was requested and the two readings differ.
+ * @param termReadingList An array of `{term, reading}` pairs. If reading is null,
+ * the reading won't be compared, and the reading stored in the data (if any)
+ * is reported in the result instead.
+ * @param dictionaries An array of dictionary names.
+ * @returns An array of objects with the format
+ * `{term, reading, dictionary, hasReading, frequency}`.
+ * `hasReading` is true when the stored data is a non-null object; `frequency`
+ * is `data.frequency` in that case and the raw data value otherwise.
+ */
+ async getTermFrequencies(termReadingList, dictionaries) {
+ const dictionarySet = new Set();
+ for (const dictionary of dictionaries) {
+ dictionarySet.add(dictionary);
+ }
+
+ const termList = termReadingList.map(({term}) => term);
+ const metas = await this._database.findTermMetaBulk(termList, dictionarySet);
+
+ const results = [];
+ for (const {mode, data, dictionary, index} of metas) { // index is used below to select the termReadingList entry
+ if (mode !== 'freq') { continue; } // only 'freq' mode entries are reported
+ let {term, reading} = termReadingList[index];
+ let frequency = data; // used as-is unless data is an object (see below)
+ const hasReading = (data !== null && typeof data === 'object'); // object-form data carries reading + frequency fields
+ if (hasReading) {
+ if (data.reading !== reading) {
+ if (reading !== null) { continue; } // a non-null requested reading must match exactly
+ reading = data.reading; // no reading requested: report the stored one
+ }
+ frequency = data.frequency;
+ }
+ results.push({
+ term,
+ reading,
+ dictionary,
+ hasReading,
+ frequency
+ });
+ }
+ return results;
+ }
+
// Find terms internal implementation
async _findTermsInternal(text, enabledDictionaryMap, options) {
@@ -1035,7 +1082,20 @@ class Translator {
}
_createTermDefinition(index, headwordIndices, dictionary, dictionaryIndex, dictionaryPriority, id, score, sequences, isPrimary, tags, entries) {
- return {index, headwordIndices, dictionary, dictionaryIndex, dictionaryPriority, id, score, sequences, isPrimary, tags, entries};
+ return {
+ index,
+ headwordIndices,
+ dictionary,
+ dictionaryIndex,
+ dictionaryPriority,
+ id,
+ score,
+ frequencyOrder: 0,
+ sequences,
+ isPrimary,
+ tags,
+ entries
+ };
}
_createTermPronunciation(index, headwordIndex, dictionary, dictionaryIndex, dictionaryPriority, pitches) {
@@ -1052,6 +1112,7 @@ class Translator {
isPrimary,
inflections,
score,
+ frequencyOrder: 0,
dictionaryIndex,
dictionaryPriority,
sourceTermExactMatchCount,
@@ -1314,6 +1375,10 @@ class Translator {
i = v2.dictionaryPriority - v1.dictionaryPriority;
if (i !== 0) { return i; }
+ // Sort by frequency order
+ i = v1.frequencyOrder - v2.frequencyOrder;
+ if (i !== 0) { return i; }
+
// Sort by term score
i = v2.score - v1.score;
if (i !== 0) { return i; }
@@ -1345,6 +1410,10 @@ class Translator {
let i = v2.dictionaryPriority - v1.dictionaryPriority;
if (i !== 0) { return i; }
+ // Sort by frequency order
+ i = v1.frequencyOrder - v2.frequencyOrder;
+ if (i !== 0) { return i; }
+
// Sort by term score
i = v2.score - v1.score;
if (i !== 0) { return i; }
@@ -1416,4 +1485,43 @@ class Translator {
frequencies.sort(compare);
}
}
+ /**
+ * Assigns a `frequencyOrder` value to every dictionary entry and to each of
+ * its definitions, based on the numeric frequencies that belong to a single
+ * frequency dictionary. The entries are modified in place; nothing is returned.
+ *
+ * For an entry, all items of `frequencies` whose `dictionary` equals the given
+ * name and whose `frequency` is a number are considered. For a definition, the
+ * frequencies recorded for its `headwordIndices` are considered. In both cases:
+ *   - ascending:  `frequencyOrder` is the smallest frequency found,
+ *                 or `Number.MAX_SAFE_INTEGER` when none was found;
+ *   - descending: `frequencyOrder` is the negated largest frequency found,
+ *                 or `0` when none was found.
+ * NOTE(review): the descending fallback of 0 only ranks after real values if
+ * frequencies are positive - confirm against the dictionary data.
+ * @param dictionaryEntries An array of entries, each with `definitions` and
+ * `frequencies` arrays.
+ * @param dictionary The name compared against each frequency item's `dictionary`.
+ * @param ascending Whether the smallest (true) or the largest (false) frequency
+ * determines the order value.
+ */
+ _updateSortFrequencies(dictionaryEntries, dictionary, ascending) {
+ const frequencyMap = new Map(); // headwordIndex -> frequency for the current entry; cleared after each entry
+ for (const dictionaryEntry of dictionaryEntries) {
+ const {definitions, frequencies} = dictionaryEntry;
+ let frequencyMin = Number.MAX_SAFE_INTEGER; // sentinels: min > max means nothing was found
+ let frequencyMax = Number.MIN_SAFE_INTEGER;
+ for (const item of frequencies) {
+ if (item.dictionary !== dictionary) { continue; }
+ const {headwordIndex, frequency} = item;
+ if (typeof frequency !== 'number') { continue; } // non-numeric frequencies are ignored
+ frequencyMap.set(headwordIndex, frequency); // a later item with the same headwordIndex overwrites an earlier one
+ frequencyMin = Math.min(frequencyMin, frequency);
+ frequencyMax = Math.max(frequencyMax, frequency);
+ }
+ dictionaryEntry.frequencyOrder = (
+ frequencyMin <= frequencyMax ? // false while both sentinels are untouched
+ (ascending ? frequencyMin : -frequencyMax) :
+ (ascending ? Number.MAX_SAFE_INTEGER : 0)
+ );
+ for (const definition of definitions) {
+ frequencyMin = Number.MAX_SAFE_INTEGER; // reset the sentinels for each definition
+ frequencyMax = Number.MIN_SAFE_INTEGER;
+ const {headwordIndices} = definition;
+ for (const headwordIndex of headwordIndices) {
+ const frequency = frequencyMap.get(headwordIndex); // undefined when this headword had no matching frequency
+ if (typeof frequency !== 'number') { continue; }
+ frequencyMin = Math.min(frequencyMin, frequency);
+ frequencyMax = Math.max(frequencyMax, frequency);
+ }
+ definition.frequencyOrder = (
+ frequencyMin <= frequencyMax ?
+ (ascending ? frequencyMin : -frequencyMax) :
+ (ascending ? Number.MAX_SAFE_INTEGER : 0)
+ );
+ }
+ frequencyMap.clear();
+ }
+ }
}