diff options
Diffstat (limited to 'ext/js/language')
| -rw-r--r-- | ext/js/language/language-descriptors.js | 3 | ||||
| -rwxr-xr-x | ext/js/language/languages.js | 12 | ||||
| -rw-r--r-- | ext/js/language/translator.js | 49 | ||||
| -rw-r--r-- | ext/js/language/zh/chinese.js | 5 | 
4 files changed, 52 insertions, 17 deletions
| diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js index 517c908c..7965ff30 100644 --- a/ext/js/language/language-descriptors.js +++ b/ext/js/language/language-descriptors.js @@ -38,7 +38,7 @@ import {removeSerboCroatianAccentMarks} from './sh/serbo-croatian-text-preproces  import {albanianTransforms} from './sq/albanian-transforms.js';  import {normalizeDiacritics} from './vi/viet-text-preprocessors.js';  import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js'; -import {isStringPartiallyChinese} from './zh/chinese.js'; +import {normalizePinyin, isStringPartiallyChinese} from './zh/chinese.js';  const capitalizationPreprocessors = {      decapitalize, @@ -277,6 +277,7 @@ const languageDescriptors = [          name: 'Chinese',          exampleText: '读',          isTextLookupWorthy: isStringPartiallyChinese, +        readingNormalizer: normalizePinyin,      },  ]; diff --git a/ext/js/language/languages.js b/ext/js/language/languages.js index 57b5ea90..7759fda5 100755 --- a/ext/js/language/languages.js +++ b/ext/js/language/languages.js @@ -29,6 +29,18 @@ export function getLanguageSummaries() {  }  /** + * @returns {import('language').LanguageAndReadingNormalizer[]} + */ +export function getAllLanguageReadingNormalizers() { +    const results = []; +    for (const {iso, readingNormalizer} of languageDescriptorMap.values()) { +        if (typeof readingNormalizer === 'undefined') { continue; } +        results.push({iso, readingNormalizer}); +    } +    return results; +} + +/**   * @returns {import('language').LanguageAndProcessors[]}   * @throws {Error}   */ diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js index f4f3449b..8c55f41c 100644 --- a/ext/js/language/translator.js +++ b/ext/js/language/translator.js @@ -19,7 +19,7 @@  import {applyTextReplacement} from '../general/regex-util.js';  import {isCodePointJapanese} from './ja/japanese.js';  import {LanguageTransformer} from './language-transformer.js'; -import {getAllLanguageTextProcessors} from './languages.js'; +import {getAllLanguageTextProcessors, getAllLanguageReadingNormalizers} from './languages.js';  import {MultiLanguageTransformer} from './multi-language-transformer.js';  /** @@ -42,6 +42,8 @@ export class Translator {          this._numberRegex = /[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?/;          /** @type {import('translation-internal').TextProcessorMap} */          this._textProcessors = new Map(); +        /** @type {import('translation-internal').ReadingNormalizerMap} */ +        this._readingNormalizers = new Map();      }      /** @@ -52,6 +54,9 @@ export class Translator {          for (const {iso, textPreprocessors = [], textPostprocessors = []} of getAllLanguageTextProcessors()) {              this._textProcessors.set(iso, {textPreprocessors, textPostprocessors});          } +        for (const {iso, readingNormalizer} of getAllLanguageReadingNormalizers()) { +            this._readingNormalizers.set(iso, readingNormalizer); +        }      }      /** @@ -76,7 +81,7 @@ export class Translator {          switch (mode) {              case 'group': -                dictionaryEntries = this._groupDictionaryEntriesByHeadword(dictionaryEntries, tagAggregator); +                dictionaryEntries = this._groupDictionaryEntriesByHeadword(language, dictionaryEntries, tagAggregator);                  break;              case 'merge':                  dictionaryEntries = await this._getRelatedDictionaryEntries(dictionaryEntries, options, tagAggregator); @@ -629,7 +634,7 @@ export class Translator {       * @returns {Promise<import('translation-internal').TermDictionaryEntry[]>}       */      async _getRelatedDictionaryEntries(dictionaryEntries, options, tagAggregator) { -        const {mainDictionary, enabledDictionaryMap} = options; +        const {mainDictionary, enabledDictionaryMap, language} = options;          /** @type {import('translator').SequenceQuery[]} */          const sequenceList = [];          /** @type {import('translation-internal').DictionaryEntryGroup[]} */ @@ -665,15 +670,15 @@ export class Translator {                  this._sortTermDictionaryEntriesById(group.dictionaryEntries);              }              if (ungroupedDictionaryEntriesMap.size > 0 || secondarySearchDictionaryMap.size > 0) { -                await this._addSecondaryRelatedDictionaryEntries(groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator); +                await this._addSecondaryRelatedDictionaryEntries(language, groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator);              }          }          const newDictionaryEntries = [];          for (const group of groupedDictionaryEntries) { -            newDictionaryEntries.push(this._createGroupedDictionaryEntry(group.dictionaryEntries, true, tagAggregator)); +            newDictionaryEntries.push(this._createGroupedDictionaryEntry(language, group.dictionaryEntries, true, tagAggregator));          } -        newDictionaryEntries.push(...this._groupDictionaryEntriesByHeadword(ungroupedDictionaryEntriesMap.values(), tagAggregator)); +        newDictionaryEntries.push(...this._groupDictionaryEntriesByHeadword(language, ungroupedDictionaryEntriesMap.values(), tagAggregator));          return newDictionaryEntries;      } @@ -700,13 +705,14 @@ export class Translator {      }      /** +     * @param {string} language       * @param {import('translation-internal').DictionaryEntryGroup[]} groupedDictionaryEntries       * @param {Map<number, import('translation-internal').TermDictionaryEntry>} ungroupedDictionaryEntriesMap       * @param {import('translation').TermEnabledDictionaryMap} enabledDictionaryMap       * @param {import('translation').TermEnabledDictionaryMap} secondarySearchDictionaryMap       * @param {TranslatorTagAggregator} tagAggregator       */ -    async _addSecondaryRelatedDictionaryEntries(groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator) { +    async _addSecondaryRelatedDictionaryEntries(language, groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator) {          // Prepare grouping info          /** @type {import('dictionary-database').TermExactRequest[]} */          const termList = []; @@ -714,11 +720,14 @@ export class Translator {          /** @type {Map<string, {groups: import('translation-internal').DictionaryEntryGroup[]}>} */          const targetMap = new Map(); +        const readingNormalizer = this._readingNormalizers.get(language); +          for (const group of groupedDictionaryEntries) {              const {dictionaryEntries} = group;              for (const dictionaryEntry of dictionaryEntries) {                  const {term, reading} = dictionaryEntry.headwords[0]; -                const key = this._createMapKey([term, reading]); +                const normalizedReading = typeof readingNormalizer === 'undefined' ? reading : readingNormalizer(reading); +                const key = this._createMapKey([term, normalizedReading]);                  let target = targetMap.get(key);                  if (typeof target === 'undefined') {                      target = { @@ -735,7 +744,8 @@ export class Translator {          // Group unsequenced dictionary entries with sequenced entries that have a matching [term, reading].          for (const [id, dictionaryEntry] of ungroupedDictionaryEntriesMap.entries()) {              const {term, reading} = dictionaryEntry.headwords[0]; -            const key = this._createMapKey([term, reading]); +            const normalizedReading = typeof readingNormalizer === 'undefined' ? reading : readingNormalizer(reading); +            const key = this._createMapKey([term, normalizedReading]);              const target = targetMap.get(key);              if (typeof target === 'undefined') { continue; } @@ -769,16 +779,19 @@ export class Translator {      }      /** +     * @param {string} language       * @param {Iterable<import('translation-internal').TermDictionaryEntry>} dictionaryEntries       * @param {TranslatorTagAggregator} tagAggregator       * @returns {import('translation-internal').TermDictionaryEntry[]}       */ -    _groupDictionaryEntriesByHeadword(dictionaryEntries, tagAggregator) { +    _groupDictionaryEntriesByHeadword(language, dictionaryEntries, tagAggregator) {          /** @type {Map<string, import('translation-internal').TermDictionaryEntry[]>} */          const groups = new Map(); +        const readingNormalizer = this._readingNormalizers.get(language);          for (const dictionaryEntry of dictionaryEntries) {              const {inflectionRuleChainCandidates, headwords: [{term, reading}]} = dictionaryEntry; -            const key = this._createMapKey([term, reading, ...inflectionRuleChainCandidates]); +            const normalizedReading = typeof readingNormalizer === 'undefined' ? reading : readingNormalizer(reading); +            const key = this._createMapKey([term, normalizedReading, ...inflectionRuleChainCandidates]);              let groupDictionaryEntries = groups.get(key);              if (typeof groupDictionaryEntries === 'undefined') {                  groupDictionaryEntries = []; @@ -789,7 +802,7 @@ export class Translator {          const newDictionaryEntries = [];          for (const groupDictionaryEntries of groups.values()) { -            newDictionaryEntries.push(this._createGroupedDictionaryEntry(groupDictionaryEntries, false, tagAggregator)); +            newDictionaryEntries.push(this._createGroupedDictionaryEntry(language, groupDictionaryEntries, false, tagAggregator));          }          return newDictionaryEntries;      } @@ -1664,18 +1677,19 @@ export class Translator {      }      /** +     * @param {string} language       * @param {import('translation-internal').TermDictionaryEntry[]} dictionaryEntries       * @param {boolean} checkDuplicateDefinitions       * @param {TranslatorTagAggregator} tagAggregator       * @returns {import('translation-internal').TermDictionaryEntry}       */ -    _createGroupedDictionaryEntry(dictionaryEntries, checkDuplicateDefinitions, tagAggregator) { +    _createGroupedDictionaryEntry(language, dictionaryEntries, checkDuplicateDefinitions, tagAggregator) {          // Headwords are generated before sorting, so that the order of dictionaryEntries can be maintained          const definitionEntries = [];          /** @type {Map<string, import('dictionary').TermHeadword>} */          const headwords = new Map();          for (const dictionaryEntry of dictionaryEntries) { -            const headwordIndexMap = this._addTermHeadwords(headwords, dictionaryEntry.headwords, tagAggregator); +            const headwordIndexMap = this._addTermHeadwords(language, headwords, dictionaryEntry.headwords, tagAggregator);              definitionEntries.push({index: definitionEntries.length, dictionaryEntry, headwordIndexMap});          } @@ -1788,16 +1802,19 @@ export class Translator {      }      /** +     * @param {string} language       * @param {Map<string, import('dictionary').TermHeadword>} headwordsMap       * @param {import('dictionary').TermHeadword[]} headwords       * @param {TranslatorTagAggregator} tagAggregator       * @returns {number[]}       */ -    _addTermHeadwords(headwordsMap, headwords, tagAggregator) { +    _addTermHeadwords(language, headwordsMap, headwords, tagAggregator) {          /** @type {number[]} */          const headwordIndexMap = [];          for (const {term, reading, sources, tags, wordClasses} of headwords) { -            const key = this._createMapKey([term, reading]); +            const readingNormalizer = this._readingNormalizers.get(language); +            const normalizedReading = typeof readingNormalizer === 'undefined' ? reading : readingNormalizer(reading); +            const key = this._createMapKey([term, normalizedReading]);              let headword = headwordsMap.get(key);              if (typeof headword === 'undefined') {                  headword = this._createTermHeadword(headwordsMap.size, term, reading, [], [], []); diff --git a/ext/js/language/zh/chinese.js b/ext/js/language/zh/chinese.js index 086d2f0a..3072d200 100644 --- a/ext/js/language/zh/chinese.js +++ b/ext/js/language/zh/chinese.js @@ -60,3 +60,8 @@ export function isStringPartiallyChinese(str) {      }      return false;  } + +/** @type {import('language').ReadingNormalizer} */ +export function normalizePinyin(str) { +    return str.normalize('NFC').toLowerCase().replace(/[\s・:]/g, ''); +} |