summary | refs | log | tree | commit | diff
path: root/ext/bg/js/translator.js
diff options
context:
space:
mode:
author: Alex Yatskov <alex@foosoft.net>, 2020-01-26 11:29:30 -0800
committer: Alex Yatskov <alex@foosoft.net>, 2020-01-26 11:29:30 -0800
commit: 0c5b9b1fa1599cbf769d96cdebc226310f9dd8bc (patch)
tree: e734e2c3005078dbc248b541d357a934baa8a116 /ext/bg/js/translator.js
parent: 2a12036ca305044291f1f4105d6a8d249848b210 (diff)
parent: 0cf1cf3aa094585bd6db8db2c1f229ba0ea37b6e (diff)
Merge branch 'master' into testing
Diffstat (limited to 'ext/bg/js/translator.js')
-rw-r--r-- ext/bg/js/translator.js 255
1 files changed, 174 insertions, 81 deletions
diff --git a/ext/bg/js/translator.js b/ext/bg/js/translator.js
index 7473c6ad..dfec54ac 100644
--- a/ext/bg/js/translator.js
+++ b/ext/bg/js/translator.js
@@ -121,16 +121,10 @@ class Translator {
dictTermsSort(result.definitions, dictionaries);
const expressions = [];
- for (const expression of result.expressions.keys()) {
- for (const reading of result.expressions.get(expression).keys()) {
- const termTags = result.expressions.get(expression).get(reading);
+ for (const [expression, readingMap] of result.expressions.entries()) {
+ for (const [reading, termTags] of readingMap.entries()) {
const score = termTags.map((tag) => tag.score).reduce((p, v) => p + v, 0);
- expressions.push({
- expression: expression,
- reading: reading,
- termTags: dictTagsSort(termTags),
- termFrequency: Translator.scoreToTermFrequency(score)
- });
+ expressions.push(Translator.createExpression(expression, reading, dictTagsSort(termTags), Translator.scoreToTermFrequency(score)));
}
}
@@ -157,10 +151,10 @@ class Translator {
async findTermsGrouped(text, details, options) {
const dictionaries = dictEnabledSet(options);
const titles = Object.keys(dictionaries);
- const [definitions, length] = await this.findTermsInternal(text, dictionaries, options.scanning.alphanumeric, details);
+ const [definitions, length] = await this.findTermsInternal(text, dictionaries, details, options);
const definitionsGrouped = dictTermsGroup(definitions, dictionaries);
- await this.buildTermFrequencies(definitionsGrouped, titles);
+ await this.buildTermMeta(definitionsGrouped, titles);
if (options.general.compactTags) {
for (const definition of definitionsGrouped) {
@@ -175,7 +169,7 @@ class Translator {
const dictionaries = dictEnabledSet(options);
const secondarySearchTitles = Object.keys(options.dictionaries).filter((dict) => options.dictionaries[dict].allowSecondarySearches);
const titles = Object.keys(dictionaries);
- const [definitions, length] = await this.findTermsInternal(text, dictionaries, options.scanning.alphanumeric, details);
+ const [definitions, length] = await this.findTermsInternal(text, dictionaries, details, options);
const {sequencedDefinitions, defaultDefinitions} = await this.getSequencedDefinitions(definitions, options.general.mainDictionary);
const definitionsMerged = [];
const mergedByTermIndices = new Set();
@@ -194,11 +188,11 @@ class Translator {
const strayDefinitions = defaultDefinitions.filter((definition, index) => !mergedByTermIndices.has(index));
for (const groupedDefinition of dictTermsGroup(strayDefinitions, dictionaries)) {
- groupedDefinition.expressions = [{expression: groupedDefinition.expression, reading: groupedDefinition.reading}];
+ groupedDefinition.expressions = [Translator.createExpression(groupedDefinition.expression, groupedDefinition.reading)];
definitionsMerged.push(groupedDefinition);
}
- await this.buildTermFrequencies(definitionsMerged, titles);
+ await this.buildTermMeta(definitionsMerged, titles);
if (options.general.compactTags) {
for (const definition of definitionsMerged) {
@@ -212,26 +206,24 @@ class Translator {
async findTermsSplit(text, details, options) {
const dictionaries = dictEnabledSet(options);
const titles = Object.keys(dictionaries);
- const [definitions, length] = await this.findTermsInternal(text, dictionaries, options.scanning.alphanumeric, details);
+ const [definitions, length] = await this.findTermsInternal(text, dictionaries, details, options);
- await this.buildTermFrequencies(definitions, titles);
+ await this.buildTermMeta(definitions, titles);
return [definitions, length];
}
- async findTermsInternal(text, dictionaries, alphanumeric, details) {
- if (!alphanumeric && text.length > 0) {
- const c = text[0];
- if (!jpIsKana(c) && !jpIsKanji(c)) {
- return [[], 0];
- }
+ async findTermsInternal(text, dictionaries, details, options) {
+ text = Translator.getSearchableText(text, options);
+ if (text.length === 0) {
+ return [[], 0];
}
const titles = Object.keys(dictionaries);
const deinflections = (
details.wildcard ?
await this.findTermWildcard(text, titles, details.wildcard) :
- await this.findTermDeinflections(text, titles)
+ await this.findTermDeinflections(text, titles, options)
);
let definitions = [];
@@ -241,14 +233,19 @@ class Translator {
definitionTags.push(dictTagBuildSource(definition.dictionary));
const termTags = await this.expandTags(definition.termTags, definition.dictionary);
+ const {expression, reading} = definition;
+ const furiganaSegments = jpDistributeFurigana(expression, reading);
+
definitions.push({
source: deinflection.source,
+ rawSource: deinflection.rawSource,
reasons: deinflection.reasons,
score: definition.score,
id: definition.id,
dictionary: definition.dictionary,
- expression: definition.expression,
- reading: definition.reading,
+ expression,
+ reading,
+ furiganaSegments,
glossary: definition.glossary,
definitionTags: dictTagsSort(definitionTags),
termTags: dictTagsSort(termTags),
@@ -262,7 +259,7 @@ class Translator {
let length = 0;
for (const definition of definitions) {
- length = Math.max(length, definition.source.length);
+ length = Math.max(length, definition.rawSource.length);
}
return [definitions, length];
@@ -276,6 +273,7 @@ class Translator {
return [{
source: text,
+ rawSource: text,
term: text,
rules: 0,
definitions,
@@ -283,9 +281,8 @@ class Translator {
}];
}
- async findTermDeinflections(text, titles) {
- const text2 = jpKatakanaToHiragana(text);
- const deinflections = (text === text2 ? this.getDeinflections(text) : this.getDeinflections2(text, text2));
+ async findTermDeinflections(text, titles, options) {
+ const deinflections = this.getAllDeinflections(text, options);
if (deinflections.length === 0) {
return [];
@@ -293,17 +290,15 @@ class Translator {
const uniqueDeinflectionTerms = [];
const uniqueDeinflectionArrays = [];
- const uniqueDeinflectionsMap = {};
+ const uniqueDeinflectionsMap = new Map();
for (const deinflection of deinflections) {
const term = deinflection.term;
- let deinflectionArray;
- if (hasOwn(uniqueDeinflectionsMap, term)) {
- deinflectionArray = uniqueDeinflectionsMap[term];
- } else {
+ let deinflectionArray = uniqueDeinflectionsMap.get(term);
+ if (typeof deinflectionArray === 'undefined') {
deinflectionArray = [];
uniqueDeinflectionTerms.push(term);
uniqueDeinflectionArrays.push(deinflectionArray);
- uniqueDeinflectionsMap[term] = deinflectionArray;
+ uniqueDeinflectionsMap.set(term, deinflectionArray);
}
deinflectionArray.push(deinflection);
}
@@ -323,30 +318,77 @@ class Translator {
return deinflections.filter((e) => e.definitions.length > 0);
}
- getDeinflections(text) {
+ getAllDeinflections(text, options) {
+ const translationOptions = options.translation;
+ const textOptionVariantArray = [
+ Translator.getTextOptionEntryVariants(translationOptions.convertHalfWidthCharacters),
+ Translator.getTextOptionEntryVariants(translationOptions.convertNumericCharacters),
+ Translator.getTextOptionEntryVariants(translationOptions.convertAlphabeticCharacters),
+ Translator.getTextOptionEntryVariants(translationOptions.convertHiraganaToKatakana),
+ Translator.getTextOptionEntryVariants(translationOptions.convertKatakanaToHiragana)
+ ];
+
const deinflections = [];
+ const used = new Set();
+ for (const [halfWidth, numeric, alphabetic, katakana, hiragana] of Translator.getArrayVariants(textOptionVariantArray)) {
+ let text2 = text;
+ let sourceMapping = null;
+ if (halfWidth) {
+ if (sourceMapping === null) { sourceMapping = Translator.createTextSourceMapping(text2); }
+ text2 = jpConvertHalfWidthKanaToFullWidth(text2, sourceMapping);
+ }
+ if (numeric) {
+ text2 = jpConvertNumericTofullWidth(text2);
+ }
+ if (alphabetic) {
+ if (sourceMapping === null) { sourceMapping = Translator.createTextSourceMapping(text2); }
+ text2 = jpConvertAlphabeticToKana(text2, sourceMapping);
+ }
+ if (katakana) {
+ text2 = jpHiraganaToKatakana(text2);
+ }
+ if (hiragana) {
+ text2 = jpKatakanaToHiragana(text2);
+ }
- for (let i = text.length; i > 0; --i) {
- const textSubstring = text.substring(0, i);
- deinflections.push(...this.deinflector.deinflect(textSubstring));
+ for (let i = text2.length; i > 0; --i) {
+ const text2Substring = text2.substring(0, i);
+ if (used.has(text2Substring)) { break; }
+ used.add(text2Substring);
+ for (const deinflection of this.deinflector.deinflect(text2Substring)) {
+ deinflection.rawSource = Translator.getDeinflectionRawSource(text, i, sourceMapping);
+ deinflections.push(deinflection);
+ }
+ }
}
-
return deinflections;
}
- getDeinflections2(text1, text2) {
- const deinflections = [];
+    /**
+     * Expands a text-conversion option value into the list of on/off states to try.
+     * The string 'true' yields only the enabled state, 'variant' yields both states
+     * (disabled first), and every other value yields only the disabled state.
+     */
+    static getTextOptionEntryVariants(value) {
+        if (value === 'true') {
+            return [true];
+        }
+        if (value === 'variant') {
+            return [false, true];
+        }
+        return [false];
+    }
- for (let i = text1.length; i > 0; --i) {
- const text1Substring = text1.substring(0, i);
- const text2Substring = text2.substring(0, i);
- deinflections.push(...this.deinflector.deinflect(text1Substring));
- if (text1Substring !== text2Substring) {
- deinflections.push(...this.deinflector.deinflect(text2Substring));
- }
+ static getDeinflectionRawSource(source, length, sourceMapping) {
+ if (sourceMapping === null) {
+ return source.substring(0, length);
}
- return deinflections;
+ let result = '';
+ let index = 0;
+ for (let i = 0; i < length; ++i) {
+ const c = sourceMapping[i];
+ result += source.substring(index, index + c);
+ index += c;
+ }
+ return result;
+ }
+
+    /**
+     * Creates the initial source mapping for a text: an array with one entry per
+     * UTF-16 code unit of the text, each entry set to 1.
+     */
+    static createTextSourceMapping(text) {
+        return Array.from({length: text.length}, () => 1);
     }
async findKanji(text, options) {
@@ -370,31 +412,23 @@ class Translator {
definitions.sort((a, b) => a.index - b.index);
}
- const kanjiList2 = [];
for (const definition of definitions) {
- kanjiList2.push(definition.character);
-
const tags = await this.expandTags(definition.tags, definition.dictionary);
tags.push(dictTagBuildSource(definition.dictionary));
+ dictTagsSort(tags);
- definition.tags = dictTagsSort(tags);
- definition.stats = await this.expandStats(definition.stats, definition.dictionary);
- definition.frequencies = [];
- }
+ const stats = await this.expandStats(definition.stats, definition.dictionary);
- for (const meta of await this.database.findKanjiMetaBulk(kanjiList2, titles)) {
- if (meta.mode !== 'freq') { continue; }
- definitions[meta.index].frequencies.push({
- character: meta.character,
- frequency: meta.data,
- dictionary: meta.dictionary
- });
+ definition.tags = tags;
+ definition.stats = stats;
}
+ await this.buildKanjiMeta(definitions, titles);
+
return definitions;
}
- async buildTermFrequencies(definitions, titles) {
+ async buildTermMeta(definitions, titles) {
const terms = [];
for (const definition of definitions) {
if (definition.expressions) {
@@ -411,34 +445,48 @@ class Translator {
// Create mapping of unique terms
const expressionsUnique = [];
const termsUnique = [];
- const termsUniqueMap = {};
+ const termsUniqueMap = new Map();
for (let i = 0, ii = terms.length; i < ii; ++i) {
const term = terms[i];
const expression = term.expression;
- term.frequencies = [];
-
- if (hasOwn(termsUniqueMap, expression)) {
- termsUniqueMap[expression].push(term);
- } else {
- const termList = [term];
+ // Group terms that share an expression so each expression is queried once.
+ let termList = termsUniqueMap.get(expression);
+ if (typeof termList === 'undefined') {
+ termList = [];
expressionsUnique.push(expression);
termsUnique.push(termList);
- termsUniqueMap[expression] = termList;
+ // termsUniqueMap is a Map: the entry must be stored with set(), otherwise the
+ // get() lookup above never finds it and every term becomes its own group.
+ termsUniqueMap.set(expression, termList);
}
+ termList.push(term);
+
+ // New data
+ term.frequencies = [];
}
const metas = await this.database.findTermMetaBulk(expressionsUnique, titles);
- for (const meta of metas) {
- if (meta.mode !== 'freq') {
- continue;
+ for (const {expression, mode, data, dictionary, index} of metas) {
+ switch (mode) {
+ case 'freq':
+ for (const term of termsUnique[index]) {
+ term.frequencies.push({expression, frequency: data, dictionary});
+ }
+ break;
}
+ }
+ }
- for (const term of termsUnique[meta.index]) {
- term.frequencies.push({
- expression: meta.expression,
- frequency: meta.data,
- dictionary: meta.dictionary
- });
+    /**
+     * Resets the `frequencies` list of every kanji definition, then fills it from the
+     * metadata returned by the database for the definitions' characters. Only
+     * metadata entries whose mode is 'freq' are used; each entry is attached to the
+     * definition at the entry's `index`.
+     */
+    async buildKanjiMeta(definitions, titles) {
+        const characters = [];
+        for (const definition of definitions) {
+            definition.frequencies = [];
+            characters.push(definition.character);
+        }
+
+        const metas = await this.database.findKanjiMetaBulk(characters, titles);
+        for (const meta of metas) {
+            if (meta.mode !== 'freq') { continue; }
+            definitions[meta.index].frequencies.push({
+                character: meta.character,
+                frequency: meta.data,
+                dictionary: meta.dictionary
+            });
         }
     }
@@ -504,6 +552,17 @@ class Translator {
return tagMetaList;
}
+    /**
+     * Builds an expression entry holding the expression, its reading, the furigana
+     * segments computed from both by jpDistributeFurigana, and the optional term
+     * tags and term frequency (both default to null).
+     */
+    static createExpression(expression, reading, termTags=null, termFrequency=null) {
+        return {
+            expression,
+            reading,
+            furiganaSegments: jpDistributeFurigana(expression, reading),
+            termTags,
+            termFrequency
+        };
+    }
+
static scoreToTermFrequency(score) {
if (score > 0) {
return 'popular';
@@ -518,4 +577,38 @@ class Translator {
const pos = name.indexOf(':');
return (pos >= 0 ? name.substring(0, pos) : name);
}
+
+    /**
+     * Generator yielding every combination that takes one element from each of the
+     * given arrays. Combinations are enumerated like a mixed-radix counter in which
+     * the first array varies fastest. Nothing is yielded if any array is empty.
+     */
+    static *getArrayVariants(arrayVariants) {
+        const combinationCount = arrayVariants.reduce(
+            (product, entryVariants) => product * entryVariants.length,
+            1
+        );
+
+        for (let combination = 0; combination < combinationCount; ++combination) {
+            // Peel one "digit" per array off the combination number.
+            let remainder = combination;
+            yield arrayVariants.map((entryVariants) => {
+                const choice = entryVariants[remainder % entryVariants.length];
+                remainder = Math.floor(remainder / entryVariants.length);
+                return choice;
+            });
+        }
+    }
+
+    /**
+     * Returns the part of the text that should be searched. When
+     * options.scanning.alphanumeric is truthy the text is returned unchanged;
+     * otherwise it is cut off before the first UTF-16 code unit that
+     * jpIsCharCodeJapanese rejects.
+     */
+    static getSearchableText(text, options) {
+        if (options.scanning.alphanumeric) {
+            return text;
+        }
+
+        let end = 0;
+        while (end < text.length && jpIsCharCodeJapanese(text.charCodeAt(end))) {
+            ++end;
+        }
+        return text.substring(0, end);
+    }
}