diff options
| -rw-r--r-- | ext/js/language/language-descriptors.js | 3 | ||||
| -rwxr-xr-x | ext/js/language/languages.js | 12 | ||||
| -rw-r--r-- | ext/js/language/translator.js | 49 | ||||
| -rw-r--r-- | ext/js/language/zh/chinese.js | 5 | ||||
| -rw-r--r-- | test/language/chinese-reading-normalizer.test.js | 33 | ||||
| -rw-r--r-- | types/ext/language-descriptors.d.ts | 3 | ||||
| -rw-r--r-- | types/ext/language.d.ts | 7 | ||||
| -rw-r--r-- | types/ext/translation-internal.d.ts | 5 | 
8 files changed, 99 insertions, 18 deletions
| diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js index 517c908c..7965ff30 100644 --- a/ext/js/language/language-descriptors.js +++ b/ext/js/language/language-descriptors.js @@ -38,7 +38,7 @@ import {removeSerboCroatianAccentMarks} from './sh/serbo-croatian-text-preproces  import {albanianTransforms} from './sq/albanian-transforms.js';  import {normalizeDiacritics} from './vi/viet-text-preprocessors.js';  import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js'; -import {isStringPartiallyChinese} from './zh/chinese.js'; +import {normalizePinyin, isStringPartiallyChinese} from './zh/chinese.js';  const capitalizationPreprocessors = {      decapitalize, @@ -277,6 +277,7 @@ const languageDescriptors = [          name: 'Chinese',          exampleText: '读',          isTextLookupWorthy: isStringPartiallyChinese, +        readingNormalizer: normalizePinyin,      },  ]; diff --git a/ext/js/language/languages.js b/ext/js/language/languages.js index 57b5ea90..7759fda5 100755 --- a/ext/js/language/languages.js +++ b/ext/js/language/languages.js @@ -29,6 +29,18 @@ export function getLanguageSummaries() {  }  /** + * @returns {import('language').LanguageAndReadingNormalizer[]} + */ +export function getAllLanguageReadingNormalizers() { +    const results = []; +    for (const {iso, readingNormalizer} of languageDescriptorMap.values()) { +        if (typeof readingNormalizer === 'undefined') { continue; } +        results.push({iso, readingNormalizer}); +    } +    return results; +} + +/**   * @returns {import('language').LanguageAndProcessors[]}   * @throws {Error}   */ diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js index f4f3449b..8c55f41c 100644 --- a/ext/js/language/translator.js +++ b/ext/js/language/translator.js @@ -19,7 +19,7 @@  import {applyTextReplacement} from '../general/regex-util.js';  import {isCodePointJapanese} from './ja/japanese.js';  import {LanguageTransformer} from './language-transformer.js'; -import {getAllLanguageTextProcessors} from './languages.js'; +import {getAllLanguageTextProcessors, getAllLanguageReadingNormalizers} from './languages.js';  import {MultiLanguageTransformer} from './multi-language-transformer.js';  /** @@ -42,6 +42,8 @@ export class Translator {          this._numberRegex = /[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?/;          /** @type {import('translation-internal').TextProcessorMap} */          this._textProcessors = new Map(); +        /** @type {import('translation-internal').ReadingNormalizerMap} */ +        this._readingNormalizers = new Map();      }      /** @@ -52,6 +54,9 @@ export class Translator {          for (const {iso, textPreprocessors = [], textPostprocessors = []} of getAllLanguageTextProcessors()) {              this._textProcessors.set(iso, {textPreprocessors, textPostprocessors});          } +        for (const {iso, readingNormalizer} of getAllLanguageReadingNormalizers()) { +            this._readingNormalizers.set(iso, readingNormalizer); +        }      }      /** @@ -76,7 +81,7 @@ export class Translator {          switch (mode) {              case 'group': -                dictionaryEntries = this._groupDictionaryEntriesByHeadword(dictionaryEntries, tagAggregator); +                dictionaryEntries = this._groupDictionaryEntriesByHeadword(language, dictionaryEntries, tagAggregator);                  break;              case 'merge':                  dictionaryEntries = await this._getRelatedDictionaryEntries(dictionaryEntries, options, tagAggregator); @@ -629,7 +634,7 @@ export class Translator {       * @returns {Promise<import('translation-internal').TermDictionaryEntry[]>}       */      async _getRelatedDictionaryEntries(dictionaryEntries, options, tagAggregator) { -        const {mainDictionary, enabledDictionaryMap} = options; +        const {mainDictionary, enabledDictionaryMap, language} = options;          /** @type {import('translator').SequenceQuery[]} */          const sequenceList = [];          /** @type {import('translation-internal').DictionaryEntryGroup[]} */ @@ -665,15 +670,15 @@ export class Translator {                  this._sortTermDictionaryEntriesById(group.dictionaryEntries);              }              if (ungroupedDictionaryEntriesMap.size > 0 || secondarySearchDictionaryMap.size > 0) { -                await this._addSecondaryRelatedDictionaryEntries(groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator); +                await this._addSecondaryRelatedDictionaryEntries(language, groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator);              }          }          const newDictionaryEntries = [];          for (const group of groupedDictionaryEntries) { -            newDictionaryEntries.push(this._createGroupedDictionaryEntry(group.dictionaryEntries, true, tagAggregator)); +            newDictionaryEntries.push(this._createGroupedDictionaryEntry(language, group.dictionaryEntries, true, tagAggregator));          } -        newDictionaryEntries.push(...this._groupDictionaryEntriesByHeadword(ungroupedDictionaryEntriesMap.values(), tagAggregator)); +        newDictionaryEntries.push(...this._groupDictionaryEntriesByHeadword(language, ungroupedDictionaryEntriesMap.values(), tagAggregator));          return newDictionaryEntries;      } @@ -700,13 +705,14 @@ export class Translator {      }      /** +     * @param {string} language       * @param {import('translation-internal').DictionaryEntryGroup[]} groupedDictionaryEntries       * @param {Map<number, import('translation-internal').TermDictionaryEntry>} ungroupedDictionaryEntriesMap       * @param {import('translation').TermEnabledDictionaryMap} enabledDictionaryMap       * @param {import('translation').TermEnabledDictionaryMap} secondarySearchDictionaryMap       * @param {TranslatorTagAggregator} tagAggregator       */ -    async _addSecondaryRelatedDictionaryEntries(groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator) { +    async _addSecondaryRelatedDictionaryEntries(language, groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator) {          // Prepare grouping info          /** @type {import('dictionary-database').TermExactRequest[]} */          const termList = []; @@ -714,11 +720,14 @@ export class Translator {          /** @type {Map<string, {groups: import('translation-internal').DictionaryEntryGroup[]}>} */          const targetMap = new Map(); +        const readingNormalizer = this._readingNormalizers.get(language); +          for (const group of groupedDictionaryEntries) {              const {dictionaryEntries} = group;              for (const dictionaryEntry of dictionaryEntries) {                  const {term, reading} = dictionaryEntry.headwords[0]; -                const key = this._createMapKey([term, reading]); +                const normalizedReading = typeof readingNormalizer === 'undefined' ? reading : readingNormalizer(reading); +                const key = this._createMapKey([term, normalizedReading]);                  let target = targetMap.get(key);                  if (typeof target === 'undefined') {                      target = { @@ -735,7 +744,8 @@ export class Translator {          // Group unsequenced dictionary entries with sequenced entries that have a matching [term, reading].          for (const [id, dictionaryEntry] of ungroupedDictionaryEntriesMap.entries()) {              const {term, reading} = dictionaryEntry.headwords[0]; -            const key = this._createMapKey([term, reading]); +            const normalizedReading = typeof readingNormalizer === 'undefined' ? reading : readingNormalizer(reading); +            const key = this._createMapKey([term, normalizedReading]);              const target = targetMap.get(key);              if (typeof target === 'undefined') { continue; } @@ -769,16 +779,19 @@ export class Translator {      }      /** +     * @param {string} language       * @param {Iterable<import('translation-internal').TermDictionaryEntry>} dictionaryEntries       * @param {TranslatorTagAggregator} tagAggregator       * @returns {import('translation-internal').TermDictionaryEntry[]}       */ -    _groupDictionaryEntriesByHeadword(dictionaryEntries, tagAggregator) { +    _groupDictionaryEntriesByHeadword(language, dictionaryEntries, tagAggregator) {          /** @type {Map<string, import('translation-internal').TermDictionaryEntry[]>} */          const groups = new Map(); +        const readingNormalizer = this._readingNormalizers.get(language);          for (const dictionaryEntry of dictionaryEntries) {              const {inflectionRuleChainCandidates, headwords: [{term, reading}]} = dictionaryEntry; -            const key = this._createMapKey([term, reading, ...inflectionRuleChainCandidates]); +            const normalizedReading = typeof readingNormalizer === 'undefined' ? reading : readingNormalizer(reading); +            const key = this._createMapKey([term, normalizedReading, ...inflectionRuleChainCandidates]);              let groupDictionaryEntries = groups.get(key);              if (typeof groupDictionaryEntries === 'undefined') {                  groupDictionaryEntries = []; @@ -789,7 +802,7 @@ export class Translator {          const newDictionaryEntries = [];          for (const groupDictionaryEntries of groups.values()) { -            newDictionaryEntries.push(this._createGroupedDictionaryEntry(groupDictionaryEntries, false, tagAggregator)); +            newDictionaryEntries.push(this._createGroupedDictionaryEntry(language, groupDictionaryEntries, false, tagAggregator));          }          return newDictionaryEntries;      } @@ -1664,18 +1677,19 @@ export class Translator {      }      /** +     * @param {string} language       * @param {import('translation-internal').TermDictionaryEntry[]} dictionaryEntries       * @param {boolean} checkDuplicateDefinitions       * @param {TranslatorTagAggregator} tagAggregator       * @returns {import('translation-internal').TermDictionaryEntry}       */ -    _createGroupedDictionaryEntry(dictionaryEntries, checkDuplicateDefinitions, tagAggregator) { +    _createGroupedDictionaryEntry(language, dictionaryEntries, checkDuplicateDefinitions, tagAggregator) {          // Headwords are generated before sorting, so that the order of dictionaryEntries can be maintained          const definitionEntries = [];          /** @type {Map<string, import('dictionary').TermHeadword>} */          const headwords = new Map();          for (const dictionaryEntry of dictionaryEntries) { -            const headwordIndexMap = this._addTermHeadwords(headwords, dictionaryEntry.headwords, tagAggregator); +            const headwordIndexMap = this._addTermHeadwords(language, headwords, dictionaryEntry.headwords, tagAggregator);              definitionEntries.push({index: definitionEntries.length, dictionaryEntry, headwordIndexMap});          } @@ -1788,16 +1802,19 @@ export class Translator {      }      /** +     * @param {string} language       * @param {Map<string, import('dictionary').TermHeadword>} headwordsMap       * @param {import('dictionary').TermHeadword[]} headwords       * @param {TranslatorTagAggregator} tagAggregator       * @returns {number[]}       */ -    _addTermHeadwords(headwordsMap, headwords, tagAggregator) { +    _addTermHeadwords(language, headwordsMap, headwords, tagAggregator) {          /** @type {number[]} */          const headwordIndexMap = [];          for (const {term, reading, sources, tags, wordClasses} of headwords) { -            const key = this._createMapKey([term, reading]); +            const readingNormalizer = this._readingNormalizers.get(language); +            const normalizedReading = typeof readingNormalizer === 'undefined' ? reading : readingNormalizer(reading); +            const key = this._createMapKey([term, normalizedReading]);              let headword = headwordsMap.get(key);              if (typeof headword === 'undefined') {                  headword = this._createTermHeadword(headwordsMap.size, term, reading, [], [], []); diff --git a/ext/js/language/zh/chinese.js b/ext/js/language/zh/chinese.js index 086d2f0a..3072d200 100644 --- a/ext/js/language/zh/chinese.js +++ b/ext/js/language/zh/chinese.js @@ -60,3 +60,8 @@ export function isStringPartiallyChinese(str) {      }      return false;  } + +/** @type {import('language').ReadingNormalizer} */ +export function normalizePinyin(str) { +    return str.normalize('NFC').toLowerCase().replace(/[\s・:]/g, ''); +} diff --git a/test/language/chinese-reading-normalizer.test.js b/test/language/chinese-reading-normalizer.test.js new file mode 100644 index 00000000..398ef0c9 --- /dev/null +++ b/test/language/chinese-reading-normalizer.test.js @@ -0,0 +1,33 @@ +/* + * Copyright (C) 2023-2024  Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <https://www.gnu.org/licenses/>. + */ + +import {describe, expect, test} from 'vitest'; +import {normalizePinyin} from '../../ext/js/language/zh/chinese.js'; + +const tests = [ +    ['rìwén', 'rìwén'], +    ['Rì wén', 'rìwén'], +    ['Wéi jī Bǎi kē', 'wéijībǎikē'], +    ['wán:zhěng', 'wánzhěng'], +    ['fān・yì', 'fānyì'], +]; + +describe('Normalize Pinyin', () => { +    test.each(tests)('%s should normalize to %s', (a, b) => { +        expect(normalizePinyin(a)).toStrictEqual(b); +    }); +}); diff --git a/types/ext/language-descriptors.d.ts b/types/ext/language-descriptors.d.ts index 778445de..42312937 100644 --- a/types/ext/language-descriptors.d.ts +++ b/types/ext/language-descriptors.d.ts @@ -15,7 +15,7 @@   * along with this program.  If not, see <https://www.gnu.org/licenses/>.   */ -import type {TextProcessor, BidirectionalConversionPreprocessor} from './language'; +import type {TextProcessor, ReadingNormalizer, BidirectionalConversionPreprocessor} from './language';  import type {LanguageTransformDescriptor} from './language-transformer';  import type {SafeAny} from './core'; @@ -36,6 +36,7 @@ type LanguageDescriptor<       * If no value is provided, `true` is assumed for all inputs.       */      isTextLookupWorthy?: IsTextLookupWorthyFunction; +    readingNormalizer?: ReadingNormalizer;      textPreprocessors?: TTextPreprocessorDescriptor;      textPostprocessors?: TTextPostprocessorDescriptor;      languageTransforms?: LanguageTransformDescriptor; diff --git a/types/ext/language.d.ts b/types/ext/language.d.ts index c708f6e7..ea8c0e47 100644 --- a/types/ext/language.d.ts +++ b/types/ext/language.d.ts @@ -33,6 +33,8 @@ export type TextProcessor<T = unknown> = {      process: TextProcessorFunction<T>;  }; +export type ReadingNormalizer = (str: string) => string; +  export type BidirectionalPreprocessorOptions = 'off' | 'direct' | 'inverse';  export type BidirectionalConversionPreprocessor = TextProcessor<BidirectionalPreprocessorOptions>; @@ -43,6 +45,11 @@ export type LanguageAndProcessors = {      textPostprocessors?: TextProcessorWithId<unknown>[];  }; +export type LanguageAndReadingNormalizer = { +    iso: string; +    readingNormalizer: ReadingNormalizer; +}; +  export type LanguageAndTransforms = {      iso: string;      languageTransforms: LanguageTransformDescriptor; diff --git a/types/ext/translation-internal.d.ts b/types/ext/translation-internal.d.ts index 0efbe54f..05821a64 100644 --- a/types/ext/translation-internal.d.ts +++ b/types/ext/translation-internal.d.ts @@ -71,4 +71,9 @@ export type TextProcessorMap = Map<      }  >; +export type ReadingNormalizerMap = Map< +    string, +    Language.ReadingNormalizer +>; +  export type TextCache = Map<string, Map<string, Map<unknown, string>>>; |