aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorCashew <52880648+cashewnuttynuts@users.noreply.github.com>2024-06-24 18:38:39 +0700
committerGitHub <noreply@github.com>2024-06-24 11:38:39 +0000
commitb584c5440721fa7399564ced57f134fd5333d20c (patch)
tree4ea760c9232b4e8d2ab9c0b60bf07f7ed3bad1ef
parent8f32410f34b2c839105eb508da9b9e63f6a89194 (diff)
add cn reading processors (#1120)
* add cn reading processors * remove meow * fix lint * add cn reading processors * remove meow * fix lint * wip * update names * update test * remove vestigial code
-rw-r--r--ext/js/language/language-descriptors.js3
-rwxr-xr-xext/js/language/languages.js12
-rw-r--r--ext/js/language/translator.js49
-rw-r--r--ext/js/language/zh/chinese.js5
-rw-r--r--test/language/chinese-reading-normalizer.test.js33
-rw-r--r--types/ext/language-descriptors.d.ts3
-rw-r--r--types/ext/language.d.ts7
-rw-r--r--types/ext/translation-internal.d.ts5
8 files changed, 99 insertions, 18 deletions
diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js
index 517c908c..7965ff30 100644
--- a/ext/js/language/language-descriptors.js
+++ b/ext/js/language/language-descriptors.js
@@ -38,7 +38,7 @@ import {removeSerboCroatianAccentMarks} from './sh/serbo-croatian-text-preproces
import {albanianTransforms} from './sq/albanian-transforms.js';
import {normalizeDiacritics} from './vi/viet-text-preprocessors.js';
import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js';
-import {isStringPartiallyChinese} from './zh/chinese.js';
+import {normalizePinyin, isStringPartiallyChinese} from './zh/chinese.js';
const capitalizationPreprocessors = {
decapitalize,
@@ -277,6 +277,7 @@ const languageDescriptors = [
name: 'Chinese',
exampleText: '读',
isTextLookupWorthy: isStringPartiallyChinese,
+ readingNormalizer: normalizePinyin,
},
];
diff --git a/ext/js/language/languages.js b/ext/js/language/languages.js
index 57b5ea90..7759fda5 100755
--- a/ext/js/language/languages.js
+++ b/ext/js/language/languages.js
@@ -29,6 +29,18 @@ export function getLanguageSummaries() {
}
/**
+ * @returns {import('language').LanguageAndReadingNormalizer[]} One `{iso, readingNormalizer}` entry for each descriptor in `languageDescriptorMap` that defines a reading normalizer.
+ */
+export function getAllLanguageReadingNormalizers() {
+ const results = [];
+ for (const {iso, readingNormalizer} of languageDescriptorMap.values()) {
+ if (typeof readingNormalizer === 'undefined') { continue; } // Descriptors without a reading normalizer are left out of the result.
+ results.push({iso, readingNormalizer});
+ }
+ return results;
+}
+
+/**
* @returns {import('language').LanguageAndProcessors[]}
* @throws {Error}
*/
diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js
index f4f3449b..8c55f41c 100644
--- a/ext/js/language/translator.js
+++ b/ext/js/language/translator.js
@@ -19,7 +19,7 @@
import {applyTextReplacement} from '../general/regex-util.js';
import {isCodePointJapanese} from './ja/japanese.js';
import {LanguageTransformer} from './language-transformer.js';
-import {getAllLanguageTextProcessors} from './languages.js';
+import {getAllLanguageTextProcessors, getAllLanguageReadingNormalizers} from './languages.js';
import {MultiLanguageTransformer} from './multi-language-transformer.js';
/**
@@ -42,6 +42,8 @@ export class Translator {
this._numberRegex = /[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?/;
/** @type {import('translation-internal').TextProcessorMap} */
this._textProcessors = new Map();
+ /** @type {import('translation-internal').ReadingNormalizerMap} */
+ this._readingNormalizers = new Map();
}
/**
@@ -52,6 +54,9 @@ export class Translator {
for (const {iso, textPreprocessors = [], textPostprocessors = []} of getAllLanguageTextProcessors()) {
this._textProcessors.set(iso, {textPreprocessors, textPostprocessors});
}
+ for (const {iso, readingNormalizer} of getAllLanguageReadingNormalizers()) {
+ this._readingNormalizers.set(iso, readingNormalizer);
+ }
}
/**
@@ -76,7 +81,7 @@ export class Translator {
switch (mode) {
case 'group':
- dictionaryEntries = this._groupDictionaryEntriesByHeadword(dictionaryEntries, tagAggregator);
+ dictionaryEntries = this._groupDictionaryEntriesByHeadword(language, dictionaryEntries, tagAggregator);
break;
case 'merge':
dictionaryEntries = await this._getRelatedDictionaryEntries(dictionaryEntries, options, tagAggregator);
@@ -629,7 +634,7 @@ export class Translator {
* @returns {Promise<import('translation-internal').TermDictionaryEntry[]>}
*/
async _getRelatedDictionaryEntries(dictionaryEntries, options, tagAggregator) {
- const {mainDictionary, enabledDictionaryMap} = options;
+ const {mainDictionary, enabledDictionaryMap, language} = options;
/** @type {import('translator').SequenceQuery[]} */
const sequenceList = [];
/** @type {import('translation-internal').DictionaryEntryGroup[]} */
@@ -665,15 +670,15 @@ export class Translator {
this._sortTermDictionaryEntriesById(group.dictionaryEntries);
}
if (ungroupedDictionaryEntriesMap.size > 0 || secondarySearchDictionaryMap.size > 0) {
- await this._addSecondaryRelatedDictionaryEntries(groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator);
+ await this._addSecondaryRelatedDictionaryEntries(language, groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator);
}
}
const newDictionaryEntries = [];
for (const group of groupedDictionaryEntries) {
- newDictionaryEntries.push(this._createGroupedDictionaryEntry(group.dictionaryEntries, true, tagAggregator));
+ newDictionaryEntries.push(this._createGroupedDictionaryEntry(language, group.dictionaryEntries, true, tagAggregator));
}
- newDictionaryEntries.push(...this._groupDictionaryEntriesByHeadword(ungroupedDictionaryEntriesMap.values(), tagAggregator));
+ newDictionaryEntries.push(...this._groupDictionaryEntriesByHeadword(language, ungroupedDictionaryEntriesMap.values(), tagAggregator));
return newDictionaryEntries;
}
@@ -700,13 +705,14 @@ export class Translator {
}
/**
+ * @param {string} language
* @param {import('translation-internal').DictionaryEntryGroup[]} groupedDictionaryEntries
* @param {Map<number, import('translation-internal').TermDictionaryEntry>} ungroupedDictionaryEntriesMap
* @param {import('translation').TermEnabledDictionaryMap} enabledDictionaryMap
* @param {import('translation').TermEnabledDictionaryMap} secondarySearchDictionaryMap
* @param {TranslatorTagAggregator} tagAggregator
*/
- async _addSecondaryRelatedDictionaryEntries(groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator) {
+ async _addSecondaryRelatedDictionaryEntries(language, groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator) {
// Prepare grouping info
/** @type {import('dictionary-database').TermExactRequest[]} */
const termList = [];
@@ -714,11 +720,14 @@ export class Translator {
/** @type {Map<string, {groups: import('translation-internal').DictionaryEntryGroup[]}>} */
const targetMap = new Map();
+ const readingNormalizer = this._readingNormalizers.get(language);
+
for (const group of groupedDictionaryEntries) {
const {dictionaryEntries} = group;
for (const dictionaryEntry of dictionaryEntries) {
const {term, reading} = dictionaryEntry.headwords[0];
- const key = this._createMapKey([term, reading]);
+ const normalizedReading = typeof readingNormalizer === 'undefined' ? reading : readingNormalizer(reading);
+ const key = this._createMapKey([term, normalizedReading]);
let target = targetMap.get(key);
if (typeof target === 'undefined') {
target = {
@@ -735,7 +744,8 @@ export class Translator {
// Group unsequenced dictionary entries with sequenced entries that have a matching [term, reading].
for (const [id, dictionaryEntry] of ungroupedDictionaryEntriesMap.entries()) {
const {term, reading} = dictionaryEntry.headwords[0];
- const key = this._createMapKey([term, reading]);
+ const normalizedReading = typeof readingNormalizer === 'undefined' ? reading : readingNormalizer(reading);
+ const key = this._createMapKey([term, normalizedReading]);
const target = targetMap.get(key);
if (typeof target === 'undefined') { continue; }
@@ -769,16 +779,19 @@ export class Translator {
}
/**
+ * @param {string} language
* @param {Iterable<import('translation-internal').TermDictionaryEntry>} dictionaryEntries
* @param {TranslatorTagAggregator} tagAggregator
* @returns {import('translation-internal').TermDictionaryEntry[]}
*/
- _groupDictionaryEntriesByHeadword(dictionaryEntries, tagAggregator) {
+ _groupDictionaryEntriesByHeadword(language, dictionaryEntries, tagAggregator) {
/** @type {Map<string, import('translation-internal').TermDictionaryEntry[]>} */
const groups = new Map();
+ const readingNormalizer = this._readingNormalizers.get(language);
for (const dictionaryEntry of dictionaryEntries) {
const {inflectionRuleChainCandidates, headwords: [{term, reading}]} = dictionaryEntry;
- const key = this._createMapKey([term, reading, ...inflectionRuleChainCandidates]);
+ const normalizedReading = typeof readingNormalizer === 'undefined' ? reading : readingNormalizer(reading);
+ const key = this._createMapKey([term, normalizedReading, ...inflectionRuleChainCandidates]);
let groupDictionaryEntries = groups.get(key);
if (typeof groupDictionaryEntries === 'undefined') {
groupDictionaryEntries = [];
@@ -789,7 +802,7 @@ export class Translator {
const newDictionaryEntries = [];
for (const groupDictionaryEntries of groups.values()) {
- newDictionaryEntries.push(this._createGroupedDictionaryEntry(groupDictionaryEntries, false, tagAggregator));
+ newDictionaryEntries.push(this._createGroupedDictionaryEntry(language, groupDictionaryEntries, false, tagAggregator));
}
return newDictionaryEntries;
}
@@ -1664,18 +1677,19 @@ export class Translator {
}
/**
+ * @param {string} language
* @param {import('translation-internal').TermDictionaryEntry[]} dictionaryEntries
* @param {boolean} checkDuplicateDefinitions
* @param {TranslatorTagAggregator} tagAggregator
* @returns {import('translation-internal').TermDictionaryEntry}
*/
- _createGroupedDictionaryEntry(dictionaryEntries, checkDuplicateDefinitions, tagAggregator) {
+ _createGroupedDictionaryEntry(language, dictionaryEntries, checkDuplicateDefinitions, tagAggregator) {
// Headwords are generated before sorting, so that the order of dictionaryEntries can be maintained
const definitionEntries = [];
/** @type {Map<string, import('dictionary').TermHeadword>} */
const headwords = new Map();
for (const dictionaryEntry of dictionaryEntries) {
- const headwordIndexMap = this._addTermHeadwords(headwords, dictionaryEntry.headwords, tagAggregator);
+ const headwordIndexMap = this._addTermHeadwords(language, headwords, dictionaryEntry.headwords, tagAggregator);
definitionEntries.push({index: definitionEntries.length, dictionaryEntry, headwordIndexMap});
}
@@ -1788,16 +1802,19 @@ export class Translator {
}
/**
+ * @param {string} language
* @param {Map<string, import('dictionary').TermHeadword>} headwordsMap
* @param {import('dictionary').TermHeadword[]} headwords
* @param {TranslatorTagAggregator} tagAggregator
* @returns {number[]}
*/
- _addTermHeadwords(headwordsMap, headwords, tagAggregator) {
+ _addTermHeadwords(language, headwordsMap, headwords, tagAggregator) {
/** @type {number[]} */
const headwordIndexMap = [];
for (const {term, reading, sources, tags, wordClasses} of headwords) {
- const key = this._createMapKey([term, reading]);
+ const readingNormalizer = this._readingNormalizers.get(language);
+ const normalizedReading = typeof readingNormalizer === 'undefined' ? reading : readingNormalizer(reading);
+ const key = this._createMapKey([term, normalizedReading]);
let headword = headwordsMap.get(key);
if (typeof headword === 'undefined') {
headword = this._createTermHeadword(headwordsMap.size, term, reading, [], [], []);
diff --git a/ext/js/language/zh/chinese.js b/ext/js/language/zh/chinese.js
index 086d2f0a..3072d200 100644
--- a/ext/js/language/zh/chinese.js
+++ b/ext/js/language/zh/chinese.js
@@ -60,3 +60,8 @@ export function isStringPartiallyChinese(str) {
}
return false;
}
+
+/** @type {import('language').ReadingNormalizer} */
+export function normalizePinyin(str) {
+ return str.normalize('NFC').toLowerCase().replace(/[\s・:]/g, ''); // Apply Unicode NFC normalization, lowercase, then remove all whitespace, '・' and ':' characters.
+}
diff --git a/test/language/chinese-reading-normalizer.test.js b/test/language/chinese-reading-normalizer.test.js
new file mode 100644
index 00000000..398ef0c9
--- /dev/null
+++ b/test/language/chinese-reading-normalizer.test.js
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2023-2024 Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+import {describe, expect, test} from 'vitest';
+import {normalizePinyin} from '../../ext/js/language/zh/chinese.js';
+
+const tests = [ // Each case is [input reading, expected output of normalizePinyin].
+ ['rìwén', 'rìwén'], // Lowercase input without separators is expected to come back unchanged.
+ ['Rì wén', 'rìwén'], // Uppercase letter and a space.
+ ['Wéi jī Bǎi kē', 'wéijībǎikē'], // Several spaces and uppercase letters.
+ ['wán:zhěng', 'wánzhěng'], // ':' separator.
+ ['fān・yì', 'fānyì'], // '・' separator.
+];
+
+describe('Normalize Pinyin', () => {
+ test.each(tests)('%s should normalize to %s', (a, b) => { // a = input, b = expected output.
+ expect(normalizePinyin(a)).toStrictEqual(b);
+ });
+});
diff --git a/types/ext/language-descriptors.d.ts b/types/ext/language-descriptors.d.ts
index 778445de..42312937 100644
--- a/types/ext/language-descriptors.d.ts
+++ b/types/ext/language-descriptors.d.ts
@@ -15,7 +15,7 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
-import type {TextProcessor, BidirectionalConversionPreprocessor} from './language';
+import type {TextProcessor, ReadingNormalizer, BidirectionalConversionPreprocessor} from './language';
import type {LanguageTransformDescriptor} from './language-transformer';
import type {SafeAny} from './core';
@@ -36,6 +36,7 @@ type LanguageDescriptor<
* If no value is provided, `true` is assumed for all inputs.
*/
isTextLookupWorthy?: IsTextLookupWorthyFunction;
+ readingNormalizer?: ReadingNormalizer;
textPreprocessors?: TTextPreprocessorDescriptor;
textPostprocessors?: TTextPostprocessorDescriptor;
languageTransforms?: LanguageTransformDescriptor;
diff --git a/types/ext/language.d.ts b/types/ext/language.d.ts
index c708f6e7..ea8c0e47 100644
--- a/types/ext/language.d.ts
+++ b/types/ext/language.d.ts
@@ -33,6 +33,8 @@ export type TextProcessor<T = unknown> = {
process: TextProcessorFunction<T>;
};
+export type ReadingNormalizer = (str: string) => string; // Takes a reading string and returns the normalized string.
+
export type BidirectionalPreprocessorOptions = 'off' | 'direct' | 'inverse';
export type BidirectionalConversionPreprocessor = TextProcessor<BidirectionalPreprocessorOptions>;
@@ -43,6 +45,11 @@ export type LanguageAndProcessors = {
textPostprocessors?: TextProcessorWithId<unknown>[];
};
+export type LanguageAndReadingNormalizer = {
+ iso: string; // Language identifier taken from the language descriptor.
+ readingNormalizer: ReadingNormalizer; // Always present here, unlike the optional field on LanguageDescriptor.
+};
+
export type LanguageAndTransforms = {
iso: string;
languageTransforms: LanguageTransformDescriptor;
diff --git a/types/ext/translation-internal.d.ts b/types/ext/translation-internal.d.ts
index 0efbe54f..05821a64 100644
--- a/types/ext/translation-internal.d.ts
+++ b/types/ext/translation-internal.d.ts
@@ -71,4 +71,9 @@ export type TextProcessorMap = Map<
}
>;
+export type ReadingNormalizerMap = Map<
+ string, // Key: the language `iso` value.
+ Language.ReadingNormalizer // Value: the reading normalizer registered for that language.
+>;
+
export type TextCache = Map<string, Map<string, Map<unknown, string>>>;