add cn reading processors (#1120)

* add cn reading processors
* remove meow
* fix lint
* add cn reading processors
* remove meow
* fix lint
* wip
* update names
* update test
* remove vestigial code
author: Cashew <52880648+cashewnuttynuts@users.noreply.github.com> 2024-06-24 18:38:39 +0700
committer: GitHub <noreply@github.com> 2024-06-24 11:38:39 +0000
commit: b584c5440721fa7399564ced57f134fd5333d20c (patch)
tree: 4ea760c9232b4e8d2ab9c0b60bf07f7ed3bad1ef
parent: 8f32410f34b2c839105eb508da9b9e63f6a89194 (diff)
8 files changed, 99 insertions, 18 deletions
diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js
index 517c908c..7965ff30 100644
--- a/ext/js/language/language-descriptors.js
+++ b/ext/js/language/language-descriptors.js
@@ -38,7 +38,7 @@ import {removeSerboCroatianAccentMarks} from './sh/serbo-croatian-text-preproces
 import {albanianTransforms} from './sq/albanian-transforms.js';
 import {normalizeDiacritics} from './vi/viet-text-preprocessors.js';
 import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js';
-import {isStringPartiallyChinese} from './zh/chinese.js';
+import {normalizePinyin, isStringPartiallyChinese} from './zh/chinese.js';
 
 const capitalizationPreprocessors = {
     decapitalize,
@@ -277,6 +277,7 @@ const languageDescriptors = [
         name: 'Chinese',
         exampleText: '读',
         isTextLookupWorthy: isStringPartiallyChinese,
+        readingNormalizer: normalizePinyin,
     },
 ];
 
diff --git a/ext/js/language/languages.js b/ext/js/language/languages.js
index 57b5ea90..7759fda5 100755
--- a/ext/js/language/languages.js
+++ b/ext/js/language/languages.js
@@ -29,6 +29,18 @@ export function getLanguageSummaries() {
 }
 
 /**
+ * @returns {import('language').LanguageAndReadingNormalizer[]}
+ */
+export function getAllLanguageReadingNormalizers() {
+    const results = [];
+    for (const {iso, readingNormalizer} of languageDescriptorMap.values()) {
+        if (typeof readingNormalizer === 'undefined') { continue; }
+        results.push({iso, readingNormalizer});
+    }
+    return results;
+}
+
+/**
  * @returns {import('language').LanguageAndProcessors[]}
  * @throws {Error}
  */
diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js
index f4f3449b..8c55f41c 100644
--- a/ext/js/language/translator.js
+++ b/ext/js/language/translator.js
@@ -19,7 +19,7 @@
 import {applyTextReplacement} from '../general/regex-util.js';
 import {isCodePointJapanese} from './ja/japanese.js';
 import {LanguageTransformer} from './language-transformer.js';
-import {getAllLanguageTextProcessors} from './languages.js';
+import {getAllLanguageTextProcessors, getAllLanguageReadingNormalizers} from './languages.js';
 import {MultiLanguageTransformer} from './multi-language-transformer.js';
 
 /**
@@ -42,6 +42,8 @@ export class Translator {
         this._numberRegex = /[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?/;
         /** @type {import('translation-internal').TextProcessorMap} */
         this._textProcessors = new Map();
+        /** @type {import('translation-internal').ReadingNormalizerMap} */
+        this._readingNormalizers = new Map();
     }
 
     /**
@@ -52,6 +54,9 @@ export class Translator {
         for (const {iso, textPreprocessors = [], textPostprocessors = []} of getAllLanguageTextProcessors()) {
             this._textProcessors.set(iso, {textPreprocessors, textPostprocessors});
         }
+        for (const {iso, readingNormalizer} of getAllLanguageReadingNormalizers()) {
+            this._readingNormalizers.set(iso, readingNormalizer);
+        }
     }
 
     /**
@@ -76,7 +81,7 @@ export class Translator {
 
         switch (mode) {
             case 'group':
-                dictionaryEntries = this._groupDictionaryEntriesByHeadword(dictionaryEntries, tagAggregator);
+                dictionaryEntries = this._groupDictionaryEntriesByHeadword(language, dictionaryEntries, tagAggregator);
                 break;
             case 'merge':
                 dictionaryEntries = await this._getRelatedDictionaryEntries(dictionaryEntries, options, tagAggregator);
@@ -629,7 +634,7 @@ export class Translator {
      * @returns {Promise<import('translation-internal').TermDictionaryEntry[]>}
      */
     async _getRelatedDictionaryEntries(dictionaryEntries, options, tagAggregator) {
-        const {mainDictionary, enabledDictionaryMap} = options;
+        const {mainDictionary, enabledDictionaryMap, language} = options;
         /** @type {import('translator').SequenceQuery[]} */
         const sequenceList = [];
         /** @type {import('translation-internal').DictionaryEntryGroup[]} */
@@ -665,15 +670,15 @@ export class Translator {
                 this._sortTermDictionaryEntriesById(group.dictionaryEntries);
             }
             if (ungroupedDictionaryEntriesMap.size > 0 || secondarySearchDictionaryMap.size > 0) {
-                await this._addSecondaryRelatedDictionaryEntries(groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator);
+                await this._addSecondaryRelatedDictionaryEntries(language, groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator);
             }
         }
 
         const newDictionaryEntries = [];
         for (const group of groupedDictionaryEntries) {
-            newDictionaryEntries.push(this._createGroupedDictionaryEntry(group.dictionaryEntries, true, tagAggregator));
+            newDictionaryEntries.push(this._createGroupedDictionaryEntry(language, group.dictionaryEntries, true, tagAggregator));
         }
-        newDictionaryEntries.push(...this._groupDictionaryEntriesByHeadword(ungroupedDictionaryEntriesMap.values(), tagAggregator));
+        newDictionaryEntries.push(...this._groupDictionaryEntriesByHeadword(language, ungroupedDictionaryEntriesMap.values(), tagAggregator));
         return newDictionaryEntries;
     }
 
@@ -700,13 +705,14 @@ export class Translator {
     }
 
     /**
+     * @param {string} language
      * @param {import('translation-internal').DictionaryEntryGroup[]} groupedDictionaryEntries
      * @param {Map<number, import('translation-internal').TermDictionaryEntry>} ungroupedDictionaryEntriesMap
      * @param {import('translation').TermEnabledDictionaryMap} enabledDictionaryMap
      * @param {import('translation').TermEnabledDictionaryMap} secondarySearchDictionaryMap
      * @param {TranslatorTagAggregator} tagAggregator
      */
-    async _addSecondaryRelatedDictionaryEntries(groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator) {
+    async _addSecondaryRelatedDictionaryEntries(language, groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator) {
         // Prepare grouping info
         /** @type {import('dictionary-database').TermExactRequest[]} */
         const termList = [];
@@ -714,11 +720,14 @@ export class Translator {
         /** @type {Map<string, {groups: import('translation-internal').DictionaryEntryGroup[]}>} */
         const targetMap = new Map();
 
+        const readingNormalizer = this._readingNormalizers.get(language);
+
         for (const group of groupedDictionaryEntries) {
             const {dictionaryEntries} = group;
             for (const dictionaryEntry of dictionaryEntries) {
                 const {term, reading} = dictionaryEntry.headwords[0];
-                const key = this._createMapKey([term, reading]);
+                const normalizedReading = typeof readingNormalizer === 'undefined' ? reading : readingNormalizer(reading);
+                const key = this._createMapKey([term, normalizedReading]);
                 let target = targetMap.get(key);
                 if (typeof target === 'undefined') {
                     target = {
@@ -735,7 +744,8 @@ export class Translator {
         // Group unsequenced dictionary entries with sequenced entries that have a matching [term, reading].
         for (const [id, dictionaryEntry] of ungroupedDictionaryEntriesMap.entries()) {
             const {term, reading} = dictionaryEntry.headwords[0];
-            const key = this._createMapKey([term, reading]);
+            const normalizedReading = typeof readingNormalizer === 'undefined' ? reading : readingNormalizer(reading);
+            const key = this._createMapKey([term, normalizedReading]);
             const target = targetMap.get(key);
             if (typeof target === 'undefined') { continue; }
 
@@ -769,16 +779,19 @@ export class Translator {
     }
 
     /**
+     * @param {string} language
      * @param {Iterable<import('translation-internal').TermDictionaryEntry>} dictionaryEntries
      * @param {TranslatorTagAggregator} tagAggregator
      * @returns {import('translation-internal').TermDictionaryEntry[]}
      */
-    _groupDictionaryEntriesByHeadword(dictionaryEntries, tagAggregator) {
+    _groupDictionaryEntriesByHeadword(language, dictionaryEntries, tagAggregator) {
         /** @type {Map<string, import('translation-internal').TermDictionaryEntry[]>} */
         const groups = new Map();
+        const readingNormalizer = this._readingNormalizers.get(language);
         for (const dictionaryEntry of dictionaryEntries) {
             const {inflectionRuleChainCandidates, headwords: [{term, reading}]} = dictionaryEntry;
-            const key = this._createMapKey([term, reading, ...inflectionRuleChainCandidates]);
+            const normalizedReading = typeof readingNormalizer === 'undefined' ? reading : readingNormalizer(reading);
+            const key = this._createMapKey([term, normalizedReading, ...inflectionRuleChainCandidates]);
             let groupDictionaryEntries = groups.get(key);
             if (typeof groupDictionaryEntries === 'undefined') {
                 groupDictionaryEntries = [];
@@ -789,7 +802,7 @@ export class Translator {
 
         const newDictionaryEntries = [];
         for (const groupDictionaryEntries of groups.values()) {
-            newDictionaryEntries.push(this._createGroupedDictionaryEntry(groupDictionaryEntries, false, tagAggregator));
+            newDictionaryEntries.push(this._createGroupedDictionaryEntry(language, groupDictionaryEntries, false, tagAggregator));
         }
         return newDictionaryEntries;
     }
@@ -1664,18 +1677,19 @@ export class Translator {
     }
 
     /**
+     * @param {string} language
      * @param {import('translation-internal').TermDictionaryEntry[]} dictionaryEntries
      * @param {boolean} checkDuplicateDefinitions
      * @param {TranslatorTagAggregator} tagAggregator
      * @returns {import('translation-internal').TermDictionaryEntry}
      */
-    _createGroupedDictionaryEntry(dictionaryEntries, checkDuplicateDefinitions, tagAggregator) {
+    _createGroupedDictionaryEntry(language, dictionaryEntries, checkDuplicateDefinitions, tagAggregator) {
         // Headwords are generated before sorting, so that the order of dictionaryEntries can be maintained
         const definitionEntries = [];
         /** @type {Map<string, import('dictionary').TermHeadword>} */
         const headwords = new Map();
         for (const dictionaryEntry of dictionaryEntries) {
-            const headwordIndexMap = this._addTermHeadwords(headwords, dictionaryEntry.headwords, tagAggregator);
+            const headwordIndexMap = this._addTermHeadwords(language, headwords, dictionaryEntry.headwords, tagAggregator);
             definitionEntries.push({index: definitionEntries.length, dictionaryEntry, headwordIndexMap});
         }
 
@@ -1788,16 +1802,19 @@ export class Translator {
     }
 
     /**
+     * @param {string} language
      * @param {Map<string, import('dictionary').TermHeadword>} headwordsMap
      * @param {import('dictionary').TermHeadword[]} headwords
      * @param {TranslatorTagAggregator} tagAggregator
      * @returns {number[]}
      */
-    _addTermHeadwords(headwordsMap, headwords, tagAggregator) {
+    _addTermHeadwords(language, headwordsMap, headwords, tagAggregator) {
         /** @type {number[]} */
         const headwordIndexMap = [];
         for (const {term, reading, sources, tags, wordClasses} of headwords) {
-            const key = this._createMapKey([term, reading]);
+            const readingNormalizer = this._readingNormalizers.get(language);
+            const normalizedReading = typeof readingNormalizer === 'undefined' ? reading : readingNormalizer(reading);
+            const key = this._createMapKey([term, normalizedReading]);
             let headword = headwordsMap.get(key);
             if (typeof headword === 'undefined') {
                 headword = this._createTermHeadword(headwordsMap.size, term, reading, [], [], []);
diff --git a/ext/js/language/zh/chinese.js b/ext/js/language/zh/chinese.js
index 086d2f0a..3072d200 100644
--- a/ext/js/language/zh/chinese.js
+++ b/ext/js/language/zh/chinese.js
@@ -60,3 +60,8 @@ export function isStringPartiallyChinese(str) {
     }
     return false;
 }
+
+/** @type {import('language').ReadingNormalizer} */
+export function normalizePinyin(str) {
+    return str.normalize('NFC').toLowerCase().replace(/[\s・:]/g, '');
+}
diff --git a/test/language/chinese-reading-normalizer.test.js b/test/language/chinese-reading-normalizer.test.js
new file mode 100644
index 00000000..398ef0c9
--- /dev/null
+++ b/test/language/chinese-reading-normalizer.test.js
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2023-2024  Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+import {describe, expect, test} from 'vitest';
+import {normalizePinyin} from '../../ext/js/language/zh/chinese.js';
+
+const tests = [
+    ['rìwén', 'rìwén'],
+    ['Rì wén', 'rìwén'],
+    ['Wéi jī Bǎi kē', 'wéijībǎikē'],
+    ['wán:zhěng', 'wánzhěng'],
+    ['fān・yì', 'fānyì'],
+];
+
+describe('Normalize Pinyin', () => {
+    test.each(tests)('%s should normalize to %s', (a, b) => {
+        expect(normalizePinyin(a)).toStrictEqual(b);
+    });
+});
diff --git a/types/ext/language-descriptors.d.ts b/types/ext/language-descriptors.d.ts
index 778445de..42312937 100644
--- a/types/ext/language-descriptors.d.ts
+++ b/types/ext/language-descriptors.d.ts
@@ -15,7 +15,7 @@
  * along with this program.  If not, see <https://www.gnu.org/licenses/>.
  */
 
-import type {TextProcessor, BidirectionalConversionPreprocessor} from './language';
+import type {TextProcessor, ReadingNormalizer, BidirectionalConversionPreprocessor} from './language';
 import type {LanguageTransformDescriptor} from './language-transformer';
 import type {SafeAny} from './core';
 
@@ -36,6 +36,7 @@ type LanguageDescriptor<
      * If no value is provided, `true` is assumed for all inputs.
      */
     isTextLookupWorthy?: IsTextLookupWorthyFunction;
+    readingNormalizer?: ReadingNormalizer;
     textPreprocessors?: TTextPreprocessorDescriptor;
     textPostprocessors?: TTextPostprocessorDescriptor;
     languageTransforms?: LanguageTransformDescriptor;
diff --git a/types/ext/language.d.ts b/types/ext/language.d.ts
index c708f6e7..ea8c0e47 100644
--- a/types/ext/language.d.ts
+++ b/types/ext/language.d.ts
@@ -33,6 +33,8 @@ export type TextProcessor<T = unknown> = {
     process: TextProcessorFunction<T>;
 };
 
+export type ReadingNormalizer = (str: string) => string;
+
 export type BidirectionalPreprocessorOptions = 'off' | 'direct' | 'inverse';
 
 export type BidirectionalConversionPreprocessor = TextProcessor<BidirectionalPreprocessorOptions>;
@@ -43,6 +45,11 @@ export type LanguageAndProcessors = {
     textPostprocessors?: TextProcessorWithId<unknown>[];
 };
 
+export type LanguageAndReadingNormalizer = {
+    iso: string;
+    readingNormalizer: ReadingNormalizer;
+};
+
 export type LanguageAndTransforms = {
     iso: string;
     languageTransforms: LanguageTransformDescriptor;
diff --git a/types/ext/translation-internal.d.ts b/types/ext/translation-internal.d.ts
index 0efbe54f..05821a64 100644
--- a/types/ext/translation-internal.d.ts
+++ b/types/ext/translation-internal.d.ts
@@ -71,4 +71,9 @@ export type TextProcessorMap = Map<
     }
 >;
 
+export type ReadingNormalizerMap = Map<
+    string,
+    Language.ReadingNormalizer
+>;
+
 export type TextCache = Map<string, Map<string, Map<unknown, string>>>;
author	Cashew <52880648+cashewnuttynuts@users.noreply.github.com>	2024-06-24 18:38:39 +0700
committer	GitHub <noreply@github.com>	2024-06-24 11:38:39 +0000
commit	b584c5440721fa7399564ced57f134fd5333d20c (patch)
tree	4ea760c9232b4e8d2ab9c0b60bf07f7ed3bad1ef
parent	8f32410f34b2c839105eb508da9b9e63f6a89194 (diff)