summaryrefslogtreecommitdiff
path: root/ext
diff options
context:
space:
mode:
authorCashew <52880648+cashewnuttynuts@users.noreply.github.com>2024-06-24 18:38:39 +0700
committerGitHub <noreply@github.com>2024-06-24 11:38:39 +0000
commitb584c5440721fa7399564ced57f134fd5333d20c (patch)
tree4ea760c9232b4e8d2ab9c0b60bf07f7ed3bad1ef /ext
parent8f32410f34b2c839105eb508da9b9e63f6a89194 (diff)
add cn reading processors (#1120)
* add cn reading processors * remove meow * fix lint * add cn reading processors * remove meow * fix lint * wip * update names * update test * remove vestigial code
Diffstat (limited to 'ext')
-rw-r--r--ext/js/language/language-descriptors.js3
-rwxr-xr-xext/js/language/languages.js12
-rw-r--r--ext/js/language/translator.js49
-rw-r--r--ext/js/language/zh/chinese.js5
4 files changed, 52 insertions, 17 deletions
diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js
index 517c908c..7965ff30 100644
--- a/ext/js/language/language-descriptors.js
+++ b/ext/js/language/language-descriptors.js
@@ -38,7 +38,7 @@ import {removeSerboCroatianAccentMarks} from './sh/serbo-croatian-text-preproces
import {albanianTransforms} from './sq/albanian-transforms.js';
import {normalizeDiacritics} from './vi/viet-text-preprocessors.js';
import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js';
-import {isStringPartiallyChinese} from './zh/chinese.js';
+import {normalizePinyin, isStringPartiallyChinese} from './zh/chinese.js';
const capitalizationPreprocessors = {
decapitalize,
@@ -277,6 +277,7 @@ const languageDescriptors = [
name: 'Chinese',
exampleText: '读',
isTextLookupWorthy: isStringPartiallyChinese,
+ readingNormalizer: normalizePinyin,
},
];
diff --git a/ext/js/language/languages.js b/ext/js/language/languages.js
index 57b5ea90..7759fda5 100755
--- a/ext/js/language/languages.js
+++ b/ext/js/language/languages.js
@@ -29,6 +29,18 @@ export function getLanguageSummaries() {
}
/**
+ * @returns {import('language').LanguageAndReadingNormalizer[]} One `{iso, readingNormalizer}` pair for each entry of `languageDescriptorMap` that defines a `readingNormalizer`.
+ */
+export function getAllLanguageReadingNormalizers() {
+ const results = [];
+ for (const {iso, readingNormalizer} of languageDescriptorMap.values()) {
+ if (typeof readingNormalizer === 'undefined') { continue; } // Entry has no reading normalizer; leave it out of the result
+ results.push({iso, readingNormalizer});
+ }
+ return results;
+}
+
+/**
* @returns {import('language').LanguageAndProcessors[]}
* @throws {Error}
*/
diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js
index f4f3449b..8c55f41c 100644
--- a/ext/js/language/translator.js
+++ b/ext/js/language/translator.js
@@ -19,7 +19,7 @@
import {applyTextReplacement} from '../general/regex-util.js';
import {isCodePointJapanese} from './ja/japanese.js';
import {LanguageTransformer} from './language-transformer.js';
-import {getAllLanguageTextProcessors} from './languages.js';
+import {getAllLanguageTextProcessors, getAllLanguageReadingNormalizers} from './languages.js';
import {MultiLanguageTransformer} from './multi-language-transformer.js';
/**
@@ -42,6 +42,8 @@ export class Translator {
this._numberRegex = /[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?/;
/** @type {import('translation-internal').TextProcessorMap} */
this._textProcessors = new Map();
+ /** @type {import('translation-internal').ReadingNormalizerMap} */
+ this._readingNormalizers = new Map();
}
/**
@@ -52,6 +54,9 @@ export class Translator {
for (const {iso, textPreprocessors = [], textPostprocessors = []} of getAllLanguageTextProcessors()) {
this._textProcessors.set(iso, {textPreprocessors, textPostprocessors});
}
+ for (const {iso, readingNormalizer} of getAllLanguageReadingNormalizers()) {
+ this._readingNormalizers.set(iso, readingNormalizer);
+ }
}
/**
@@ -76,7 +81,7 @@ export class Translator {
switch (mode) {
case 'group':
- dictionaryEntries = this._groupDictionaryEntriesByHeadword(dictionaryEntries, tagAggregator);
+ dictionaryEntries = this._groupDictionaryEntriesByHeadword(language, dictionaryEntries, tagAggregator);
break;
case 'merge':
dictionaryEntries = await this._getRelatedDictionaryEntries(dictionaryEntries, options, tagAggregator);
@@ -629,7 +634,7 @@ export class Translator {
* @returns {Promise<import('translation-internal').TermDictionaryEntry[]>}
*/
async _getRelatedDictionaryEntries(dictionaryEntries, options, tagAggregator) {
- const {mainDictionary, enabledDictionaryMap} = options;
+ const {mainDictionary, enabledDictionaryMap, language} = options;
/** @type {import('translator').SequenceQuery[]} */
const sequenceList = [];
/** @type {import('translation-internal').DictionaryEntryGroup[]} */
@@ -665,15 +670,15 @@ export class Translator {
this._sortTermDictionaryEntriesById(group.dictionaryEntries);
}
if (ungroupedDictionaryEntriesMap.size > 0 || secondarySearchDictionaryMap.size > 0) {
- await this._addSecondaryRelatedDictionaryEntries(groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator);
+ await this._addSecondaryRelatedDictionaryEntries(language, groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator);
}
}
const newDictionaryEntries = [];
for (const group of groupedDictionaryEntries) {
- newDictionaryEntries.push(this._createGroupedDictionaryEntry(group.dictionaryEntries, true, tagAggregator));
+ newDictionaryEntries.push(this._createGroupedDictionaryEntry(language, group.dictionaryEntries, true, tagAggregator));
}
- newDictionaryEntries.push(...this._groupDictionaryEntriesByHeadword(ungroupedDictionaryEntriesMap.values(), tagAggregator));
+ newDictionaryEntries.push(...this._groupDictionaryEntriesByHeadword(language, ungroupedDictionaryEntriesMap.values(), tagAggregator));
return newDictionaryEntries;
}
@@ -700,13 +705,14 @@ export class Translator {
}
/**
+ * @param {string} language
* @param {import('translation-internal').DictionaryEntryGroup[]} groupedDictionaryEntries
* @param {Map<number, import('translation-internal').TermDictionaryEntry>} ungroupedDictionaryEntriesMap
* @param {import('translation').TermEnabledDictionaryMap} enabledDictionaryMap
* @param {import('translation').TermEnabledDictionaryMap} secondarySearchDictionaryMap
* @param {TranslatorTagAggregator} tagAggregator
*/
- async _addSecondaryRelatedDictionaryEntries(groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator) {
+ async _addSecondaryRelatedDictionaryEntries(language, groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator) {
// Prepare grouping info
/** @type {import('dictionary-database').TermExactRequest[]} */
const termList = [];
@@ -714,11 +720,14 @@ export class Translator {
/** @type {Map<string, {groups: import('translation-internal').DictionaryEntryGroup[]}>} */
const targetMap = new Map();
+ const readingNormalizer = this._readingNormalizers.get(language);
+
for (const group of groupedDictionaryEntries) {
const {dictionaryEntries} = group;
for (const dictionaryEntry of dictionaryEntries) {
const {term, reading} = dictionaryEntry.headwords[0];
- const key = this._createMapKey([term, reading]);
+ const normalizedReading = typeof readingNormalizer === 'undefined' ? reading : readingNormalizer(reading);
+ const key = this._createMapKey([term, normalizedReading]);
let target = targetMap.get(key);
if (typeof target === 'undefined') {
target = {
@@ -735,7 +744,8 @@ export class Translator {
// Group unsequenced dictionary entries with sequenced entries that have a matching [term, reading].
for (const [id, dictionaryEntry] of ungroupedDictionaryEntriesMap.entries()) {
const {term, reading} = dictionaryEntry.headwords[0];
- const key = this._createMapKey([term, reading]);
+ const normalizedReading = typeof readingNormalizer === 'undefined' ? reading : readingNormalizer(reading);
+ const key = this._createMapKey([term, normalizedReading]);
const target = targetMap.get(key);
if (typeof target === 'undefined') { continue; }
@@ -769,16 +779,19 @@ export class Translator {
}
/**
+ * @param {string} language
* @param {Iterable<import('translation-internal').TermDictionaryEntry>} dictionaryEntries
* @param {TranslatorTagAggregator} tagAggregator
* @returns {import('translation-internal').TermDictionaryEntry[]}
*/
- _groupDictionaryEntriesByHeadword(dictionaryEntries, tagAggregator) {
+ _groupDictionaryEntriesByHeadword(language, dictionaryEntries, tagAggregator) {
/** @type {Map<string, import('translation-internal').TermDictionaryEntry[]>} */
const groups = new Map();
+ const readingNormalizer = this._readingNormalizers.get(language);
for (const dictionaryEntry of dictionaryEntries) {
const {inflectionRuleChainCandidates, headwords: [{term, reading}]} = dictionaryEntry;
- const key = this._createMapKey([term, reading, ...inflectionRuleChainCandidates]);
+ const normalizedReading = typeof readingNormalizer === 'undefined' ? reading : readingNormalizer(reading);
+ const key = this._createMapKey([term, normalizedReading, ...inflectionRuleChainCandidates]);
let groupDictionaryEntries = groups.get(key);
if (typeof groupDictionaryEntries === 'undefined') {
groupDictionaryEntries = [];
@@ -789,7 +802,7 @@ export class Translator {
const newDictionaryEntries = [];
for (const groupDictionaryEntries of groups.values()) {
- newDictionaryEntries.push(this._createGroupedDictionaryEntry(groupDictionaryEntries, false, tagAggregator));
+ newDictionaryEntries.push(this._createGroupedDictionaryEntry(language, groupDictionaryEntries, false, tagAggregator));
}
return newDictionaryEntries;
}
@@ -1664,18 +1677,19 @@ export class Translator {
}
/**
+ * @param {string} language
* @param {import('translation-internal').TermDictionaryEntry[]} dictionaryEntries
* @param {boolean} checkDuplicateDefinitions
* @param {TranslatorTagAggregator} tagAggregator
* @returns {import('translation-internal').TermDictionaryEntry}
*/
- _createGroupedDictionaryEntry(dictionaryEntries, checkDuplicateDefinitions, tagAggregator) {
+ _createGroupedDictionaryEntry(language, dictionaryEntries, checkDuplicateDefinitions, tagAggregator) {
// Headwords are generated before sorting, so that the order of dictionaryEntries can be maintained
const definitionEntries = [];
/** @type {Map<string, import('dictionary').TermHeadword>} */
const headwords = new Map();
for (const dictionaryEntry of dictionaryEntries) {
- const headwordIndexMap = this._addTermHeadwords(headwords, dictionaryEntry.headwords, tagAggregator);
+ const headwordIndexMap = this._addTermHeadwords(language, headwords, dictionaryEntry.headwords, tagAggregator);
definitionEntries.push({index: definitionEntries.length, dictionaryEntry, headwordIndexMap});
}
@@ -1788,16 +1802,19 @@ export class Translator {
}
/**
+ * @param {string} language
* @param {Map<string, import('dictionary').TermHeadword>} headwordsMap
* @param {import('dictionary').TermHeadword[]} headwords
* @param {TranslatorTagAggregator} tagAggregator
* @returns {number[]}
*/
- _addTermHeadwords(headwordsMap, headwords, tagAggregator) {
+ _addTermHeadwords(language, headwordsMap, headwords, tagAggregator) {
/** @type {number[]} */
const headwordIndexMap = [];
for (const {term, reading, sources, tags, wordClasses} of headwords) {
- const key = this._createMapKey([term, reading]);
+ const readingNormalizer = this._readingNormalizers.get(language);
+ const normalizedReading = typeof readingNormalizer === 'undefined' ? reading : readingNormalizer(reading);
+ const key = this._createMapKey([term, normalizedReading]);
let headword = headwordsMap.get(key);
if (typeof headword === 'undefined') {
headword = this._createTermHeadword(headwordsMap.size, term, reading, [], [], []);
diff --git a/ext/js/language/zh/chinese.js b/ext/js/language/zh/chinese.js
index 086d2f0a..3072d200 100644
--- a/ext/js/language/zh/chinese.js
+++ b/ext/js/language/zh/chinese.js
@@ -60,3 +60,8 @@ export function isStringPartiallyChinese(str) {
}
return false;
}
+
+/** @type {import('language').ReadingNormalizer} Returns `str` NFC-normalized and lowercased, with whitespace, '・' and ':' removed. */
+export function normalizePinyin(str) {
+ return str.normalize('NFC').toLowerCase().replace(/[\s・:]/g, ''); // Order: NFC normalization, then lowercasing, then separator removal
+}