From b584c5440721fa7399564ced57f134fd5333d20c Mon Sep 17 00:00:00 2001 From: Cashew <52880648+cashewnuttynuts@users.noreply.github.com> Date: Mon, 24 Jun 2024 18:38:39 +0700 Subject: add cn reading processors (#1120) * add cn reading processors * remove meow * fix lint * add cn reading processors * remove meow * fix lint * wip * update names * update test * remove vestigial code --- ext/js/language/language-descriptors.js | 3 +- ext/js/language/languages.js | 12 ++++++ ext/js/language/translator.js | 49 ++++++++++++++++-------- ext/js/language/zh/chinese.js | 5 +++ test/language/chinese-reading-normalizer.test.js | 33 ++++++++++++++++ types/ext/language-descriptors.d.ts | 3 +- types/ext/language.d.ts | 7 ++++ types/ext/translation-internal.d.ts | 5 +++ 8 files changed, 99 insertions(+), 18 deletions(-) create mode 100644 test/language/chinese-reading-normalizer.test.js diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js index 517c908c..7965ff30 100644 --- a/ext/js/language/language-descriptors.js +++ b/ext/js/language/language-descriptors.js @@ -38,7 +38,7 @@ import {removeSerboCroatianAccentMarks} from './sh/serbo-croatian-text-preproces import {albanianTransforms} from './sq/albanian-transforms.js'; import {normalizeDiacritics} from './vi/viet-text-preprocessors.js'; import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js'; -import {isStringPartiallyChinese} from './zh/chinese.js'; +import {normalizePinyin, isStringPartiallyChinese} from './zh/chinese.js'; const capitalizationPreprocessors = { decapitalize, @@ -277,6 +277,7 @@ const languageDescriptors = [ name: 'Chinese', exampleText: '读', isTextLookupWorthy: isStringPartiallyChinese, + readingNormalizer: normalizePinyin, }, ]; diff --git a/ext/js/language/languages.js b/ext/js/language/languages.js index 57b5ea90..7759fda5 100755 --- a/ext/js/language/languages.js +++ b/ext/js/language/languages.js @@ -28,6 +28,18 @@ export 
function getLanguageSummaries() { return results; } +/** + * @returns {import('language').LanguageAndReadingNormalizer[]} + */ +export function getAllLanguageReadingNormalizers() { + const results = []; + for (const {iso, readingNormalizer} of languageDescriptorMap.values()) { + if (typeof readingNormalizer === 'undefined') { continue; } + results.push({iso, readingNormalizer}); + } + return results; +} + /** * @returns {import('language').LanguageAndProcessors[]} * @throws {Error} diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js index f4f3449b..8c55f41c 100644 --- a/ext/js/language/translator.js +++ b/ext/js/language/translator.js @@ -19,7 +19,7 @@ import {applyTextReplacement} from '../general/regex-util.js'; import {isCodePointJapanese} from './ja/japanese.js'; import {LanguageTransformer} from './language-transformer.js'; -import {getAllLanguageTextProcessors} from './languages.js'; +import {getAllLanguageTextProcessors, getAllLanguageReadingNormalizers} from './languages.js'; import {MultiLanguageTransformer} from './multi-language-transformer.js'; /** @@ -42,6 +42,8 @@ export class Translator { this._numberRegex = /[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?/; /** @type {import('translation-internal').TextProcessorMap} */ this._textProcessors = new Map(); + /** @type {import('translation-internal').ReadingNormalizerMap} */ + this._readingNormalizers = new Map(); } /** @@ -52,6 +54,9 @@ export class Translator { for (const {iso, textPreprocessors = [], textPostprocessors = []} of getAllLanguageTextProcessors()) { this._textProcessors.set(iso, {textPreprocessors, textPostprocessors}); } + for (const {iso, readingNormalizer} of getAllLanguageReadingNormalizers()) { + this._readingNormalizers.set(iso, readingNormalizer); + } } /** @@ -76,7 +81,7 @@ export class Translator { switch (mode) { case 'group': - dictionaryEntries = this._groupDictionaryEntriesByHeadword(dictionaryEntries, tagAggregator); + dictionaryEntries = 
this._groupDictionaryEntriesByHeadword(language, dictionaryEntries, tagAggregator); break; case 'merge': dictionaryEntries = await this._getRelatedDictionaryEntries(dictionaryEntries, options, tagAggregator); @@ -629,7 +634,7 @@ export class Translator { * @returns {Promise} */ async _getRelatedDictionaryEntries(dictionaryEntries, options, tagAggregator) { - const {mainDictionary, enabledDictionaryMap} = options; + const {mainDictionary, enabledDictionaryMap, language} = options; /** @type {import('translator').SequenceQuery[]} */ const sequenceList = []; /** @type {import('translation-internal').DictionaryEntryGroup[]} */ @@ -665,15 +670,15 @@ export class Translator { this._sortTermDictionaryEntriesById(group.dictionaryEntries); } if (ungroupedDictionaryEntriesMap.size > 0 || secondarySearchDictionaryMap.size > 0) { - await this._addSecondaryRelatedDictionaryEntries(groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator); + await this._addSecondaryRelatedDictionaryEntries(language, groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator); } } const newDictionaryEntries = []; for (const group of groupedDictionaryEntries) { - newDictionaryEntries.push(this._createGroupedDictionaryEntry(group.dictionaryEntries, true, tagAggregator)); + newDictionaryEntries.push(this._createGroupedDictionaryEntry(language, group.dictionaryEntries, true, tagAggregator)); } - newDictionaryEntries.push(...this._groupDictionaryEntriesByHeadword(ungroupedDictionaryEntriesMap.values(), tagAggregator)); + newDictionaryEntries.push(...this._groupDictionaryEntriesByHeadword(language, ungroupedDictionaryEntriesMap.values(), tagAggregator)); return newDictionaryEntries; } @@ -700,13 +705,14 @@ export class Translator { } /** + * @param {string} language * @param {import('translation-internal').DictionaryEntryGroup[]} groupedDictionaryEntries * @param {Map} 
ungroupedDictionaryEntriesMap * @param {import('translation').TermEnabledDictionaryMap} enabledDictionaryMap * @param {import('translation').TermEnabledDictionaryMap} secondarySearchDictionaryMap * @param {TranslatorTagAggregator} tagAggregator */ - async _addSecondaryRelatedDictionaryEntries(groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator) { + async _addSecondaryRelatedDictionaryEntries(language, groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator) { // Prepare grouping info /** @type {import('dictionary-database').TermExactRequest[]} */ const termList = []; @@ -714,11 +720,14 @@ export class Translator { /** @type {Map} */ const targetMap = new Map(); + const readingNormalizer = this._readingNormalizers.get(language); + for (const group of groupedDictionaryEntries) { const {dictionaryEntries} = group; for (const dictionaryEntry of dictionaryEntries) { const {term, reading} = dictionaryEntry.headwords[0]; - const key = this._createMapKey([term, reading]); + const normalizedReading = typeof readingNormalizer === 'undefined' ? reading : readingNormalizer(reading); + const key = this._createMapKey([term, normalizedReading]); let target = targetMap.get(key); if (typeof target === 'undefined') { target = { @@ -735,7 +744,8 @@ export class Translator { // Group unsequenced dictionary entries with sequenced entries that have a matching [term, reading]. for (const [id, dictionaryEntry] of ungroupedDictionaryEntriesMap.entries()) { const {term, reading} = dictionaryEntry.headwords[0]; - const key = this._createMapKey([term, reading]); + const normalizedReading = typeof readingNormalizer === 'undefined' ? 
reading : readingNormalizer(reading); + const key = this._createMapKey([term, normalizedReading]); const target = targetMap.get(key); if (typeof target === 'undefined') { continue; } @@ -769,16 +779,19 @@ export class Translator { } /** + * @param {string} language * @param {Iterable} dictionaryEntries * @param {TranslatorTagAggregator} tagAggregator * @returns {import('translation-internal').TermDictionaryEntry[]} */ - _groupDictionaryEntriesByHeadword(dictionaryEntries, tagAggregator) { + _groupDictionaryEntriesByHeadword(language, dictionaryEntries, tagAggregator) { /** @type {Map} */ const groups = new Map(); + const readingNormalizer = this._readingNormalizers.get(language); for (const dictionaryEntry of dictionaryEntries) { const {inflectionRuleChainCandidates, headwords: [{term, reading}]} = dictionaryEntry; - const key = this._createMapKey([term, reading, ...inflectionRuleChainCandidates]); + const normalizedReading = typeof readingNormalizer === 'undefined' ? reading : readingNormalizer(reading); + const key = this._createMapKey([term, normalizedReading, ...inflectionRuleChainCandidates]); let groupDictionaryEntries = groups.get(key); if (typeof groupDictionaryEntries === 'undefined') { groupDictionaryEntries = []; @@ -789,7 +802,7 @@ export class Translator { const newDictionaryEntries = []; for (const groupDictionaryEntries of groups.values()) { - newDictionaryEntries.push(this._createGroupedDictionaryEntry(groupDictionaryEntries, false, tagAggregator)); + newDictionaryEntries.push(this._createGroupedDictionaryEntry(language, groupDictionaryEntries, false, tagAggregator)); } return newDictionaryEntries; } @@ -1664,18 +1677,19 @@ export class Translator { } /** + * @param {string} language * @param {import('translation-internal').TermDictionaryEntry[]} dictionaryEntries * @param {boolean} checkDuplicateDefinitions * @param {TranslatorTagAggregator} tagAggregator * @returns {import('translation-internal').TermDictionaryEntry} */ - 
_createGroupedDictionaryEntry(dictionaryEntries, checkDuplicateDefinitions, tagAggregator) { + _createGroupedDictionaryEntry(language, dictionaryEntries, checkDuplicateDefinitions, tagAggregator) { // Headwords are generated before sorting, so that the order of dictionaryEntries can be maintained const definitionEntries = []; /** @type {Map} */ const headwords = new Map(); for (const dictionaryEntry of dictionaryEntries) { - const headwordIndexMap = this._addTermHeadwords(headwords, dictionaryEntry.headwords, tagAggregator); + const headwordIndexMap = this._addTermHeadwords(language, headwords, dictionaryEntry.headwords, tagAggregator); definitionEntries.push({index: definitionEntries.length, dictionaryEntry, headwordIndexMap}); } @@ -1788,16 +1802,19 @@ export class Translator { } /** + * @param {string} language * @param {Map} headwordsMap * @param {import('dictionary').TermHeadword[]} headwords * @param {TranslatorTagAggregator} tagAggregator * @returns {number[]} */ - _addTermHeadwords(headwordsMap, headwords, tagAggregator) { + _addTermHeadwords(language, headwordsMap, headwords, tagAggregator) { /** @type {number[]} */ const headwordIndexMap = []; for (const {term, reading, sources, tags, wordClasses} of headwords) { - const key = this._createMapKey([term, reading]); + const readingNormalizer = this._readingNormalizers.get(language); + const normalizedReading = typeof readingNormalizer === 'undefined' ? 
reading : readingNormalizer(reading); + const key = this._createMapKey([term, normalizedReading]); let headword = headwordsMap.get(key); if (typeof headword === 'undefined') { headword = this._createTermHeadword(headwordsMap.size, term, reading, [], [], []); diff --git a/ext/js/language/zh/chinese.js b/ext/js/language/zh/chinese.js index 086d2f0a..3072d200 100644 --- a/ext/js/language/zh/chinese.js +++ b/ext/js/language/zh/chinese.js @@ -60,3 +60,8 @@ export function isStringPartiallyChinese(str) { } return false; } + +/** @type {import('language').ReadingNormalizer} */ +export function normalizePinyin(str) { + return str.normalize('NFC').toLowerCase().replace(/[\s・:]/g, ''); +} diff --git a/test/language/chinese-reading-normalizer.test.js b/test/language/chinese-reading-normalizer.test.js new file mode 100644 index 00000000..398ef0c9 --- /dev/null +++ b/test/language/chinese-reading-normalizer.test.js @@ -0,0 +1,33 @@ +/* + * Copyright (C) 2023-2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +import {describe, expect, test} from 'vitest'; +import {normalizePinyin} from '../../ext/js/language/zh/chinese.js'; + +const tests = [ + ['rìwén', 'rìwén'], + ['Rì wén', 'rìwén'], + ['Wéi jī Bǎi kē', 'wéijībǎikē'], + ['wán:zhěng', 'wánzhěng'], + ['fān・yì', 'fānyì'], +]; + +describe('Normalize Pinyin', () => { + test.each(tests)('%s should normalize to %s', (a, b) => { + expect(normalizePinyin(a)).toStrictEqual(b); + }); +}); diff --git a/types/ext/language-descriptors.d.ts b/types/ext/language-descriptors.d.ts index 778445de..42312937 100644 --- a/types/ext/language-descriptors.d.ts +++ b/types/ext/language-descriptors.d.ts @@ -15,7 +15,7 @@ * along with this program. If not, see <https://www.gnu.org/licenses/>. */ -import type {TextProcessor, BidirectionalConversionPreprocessor} from './language'; +import type {TextProcessor, ReadingNormalizer, BidirectionalConversionPreprocessor} from './language'; import type {LanguageTransformDescriptor} from './language-transformer'; import type {SafeAny} from './core'; @@ -36,6 +36,7 @@ type LanguageDescriptor< * If no value is provided, `true` is assumed for all inputs. 
*/ isTextLookupWorthy?: IsTextLookupWorthyFunction; + readingNormalizer?: ReadingNormalizer; textPreprocessors?: TTextPreprocessorDescriptor; textPostprocessors?: TTextPostprocessorDescriptor; languageTransforms?: LanguageTransformDescriptor; diff --git a/types/ext/language.d.ts b/types/ext/language.d.ts index c708f6e7..ea8c0e47 100644 --- a/types/ext/language.d.ts +++ b/types/ext/language.d.ts @@ -33,6 +33,8 @@ export type TextProcessor = { process: TextProcessorFunction; }; +export type ReadingNormalizer = (str: string) => string; + export type BidirectionalPreprocessorOptions = 'off' | 'direct' | 'inverse'; export type BidirectionalConversionPreprocessor = TextProcessor; @@ -43,6 +45,11 @@ export type LanguageAndProcessors = { textPostprocessors?: TextProcessorWithId[]; }; +export type LanguageAndReadingNormalizer = { + iso: string; + readingNormalizer: ReadingNormalizer; +}; + export type LanguageAndTransforms = { iso: string; languageTransforms: LanguageTransformDescriptor; diff --git a/types/ext/translation-internal.d.ts b/types/ext/translation-internal.d.ts index 0efbe54f..05821a64 100644 --- a/types/ext/translation-internal.d.ts +++ b/types/ext/translation-internal.d.ts @@ -71,4 +71,9 @@ export type TextProcessorMap = Map< } >; +export type ReadingNormalizerMap = Map< + string, + Language.ReadingNormalizer +>; + export type TextCache = Map<string, Map<string, Map<unknown, string>>>; -- cgit v1.2.3