diff options
author | Cashew <52880648+cashewnuttynuts@users.noreply.github.com> | 2024-06-24 18:38:39 +0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-06-24 11:38:39 +0000 |
commit | b584c5440721fa7399564ced57f134fd5333d20c (patch) | |
tree | 4ea760c9232b4e8d2ab9c0b60bf07f7ed3bad1ef /ext/js | |
parent | 8f32410f34b2c839105eb508da9b9e63f6a89194 (diff) |
add cn reading processors (#1120)
* add cn reading processors
* remove meow
* fix lint
* add cn reading processors
* remove meow
* fix lint
* wip
* update names
* update test
* remove vestigial code
Diffstat (limited to 'ext/js')
-rw-r--r-- | ext/js/language/language-descriptors.js | 3 | ||||
-rwxr-xr-x | ext/js/language/languages.js | 12 | ||||
-rw-r--r-- | ext/js/language/translator.js | 49 | ||||
-rw-r--r-- | ext/js/language/zh/chinese.js | 5 |
4 files changed, 52 insertions, 17 deletions
diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js index 517c908c..7965ff30 100644 --- a/ext/js/language/language-descriptors.js +++ b/ext/js/language/language-descriptors.js @@ -38,7 +38,7 @@ import {removeSerboCroatianAccentMarks} from './sh/serbo-croatian-text-preproces import {albanianTransforms} from './sq/albanian-transforms.js'; import {normalizeDiacritics} from './vi/viet-text-preprocessors.js'; import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js'; -import {isStringPartiallyChinese} from './zh/chinese.js'; +import {normalizePinyin, isStringPartiallyChinese} from './zh/chinese.js'; const capitalizationPreprocessors = { decapitalize, @@ -277,6 +277,7 @@ const languageDescriptors = [ name: 'Chinese', exampleText: '读', isTextLookupWorthy: isStringPartiallyChinese, + readingNormalizer: normalizePinyin, }, ]; diff --git a/ext/js/language/languages.js b/ext/js/language/languages.js index 57b5ea90..7759fda5 100755 --- a/ext/js/language/languages.js +++ b/ext/js/language/languages.js @@ -29,6 +29,18 @@ export function getLanguageSummaries() { } /** + * @returns {import('language').LanguageAndReadingNormalizer[]} + */ +export function getAllLanguageReadingNormalizers() { + const results = []; + for (const {iso, readingNormalizer} of languageDescriptorMap.values()) { + if (typeof readingNormalizer === 'undefined') { continue; } + results.push({iso, readingNormalizer}); + } + return results; +} + +/** * @returns {import('language').LanguageAndProcessors[]} * @throws {Error} */ diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js index f4f3449b..8c55f41c 100644 --- a/ext/js/language/translator.js +++ b/ext/js/language/translator.js @@ -19,7 +19,7 @@ import {applyTextReplacement} from '../general/regex-util.js'; import {isCodePointJapanese} from './ja/japanese.js'; import {LanguageTransformer} from './language-transformer.js'; -import {getAllLanguageTextProcessors} from './languages.js'; +import {getAllLanguageTextProcessors, getAllLanguageReadingNormalizers} from './languages.js'; import {MultiLanguageTransformer} from './multi-language-transformer.js'; /** @@ -42,6 +42,8 @@ export class Translator { this._numberRegex = /[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?/; /** @type {import('translation-internal').TextProcessorMap} */ this._textProcessors = new Map(); + /** @type {import('translation-internal').ReadingNormalizerMap} */ + this._readingNormalizers = new Map(); } /** @@ -52,6 +54,9 @@ export class Translator { for (const {iso, textPreprocessors = [], textPostprocessors = []} of getAllLanguageTextProcessors()) { this._textProcessors.set(iso, {textPreprocessors, textPostprocessors}); } + for (const {iso, readingNormalizer} of getAllLanguageReadingNormalizers()) { + this._readingNormalizers.set(iso, readingNormalizer); + } } /** @@ -76,7 +81,7 @@ export class Translator { switch (mode) { case 'group': - dictionaryEntries = this._groupDictionaryEntriesByHeadword(dictionaryEntries, tagAggregator); + dictionaryEntries = this._groupDictionaryEntriesByHeadword(language, dictionaryEntries, tagAggregator); break; case 'merge': dictionaryEntries = await this._getRelatedDictionaryEntries(dictionaryEntries, options, tagAggregator); @@ -629,7 +634,7 @@ export class Translator { * @returns {Promise<import('translation-internal').TermDictionaryEntry[]>} */ async _getRelatedDictionaryEntries(dictionaryEntries, options, tagAggregator) { - const {mainDictionary, enabledDictionaryMap} = options; + const {mainDictionary, enabledDictionaryMap, language} = options; /** @type {import('translator').SequenceQuery[]} */ const sequenceList = []; /** @type {import('translation-internal').DictionaryEntryGroup[]} */ @@ -665,15 +670,15 @@ export class Translator { this._sortTermDictionaryEntriesById(group.dictionaryEntries); } if (ungroupedDictionaryEntriesMap.size > 0 || secondarySearchDictionaryMap.size > 0) { - await this._addSecondaryRelatedDictionaryEntries(groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator); + await this._addSecondaryRelatedDictionaryEntries(language, groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator); } } const newDictionaryEntries = []; for (const group of groupedDictionaryEntries) { - newDictionaryEntries.push(this._createGroupedDictionaryEntry(group.dictionaryEntries, true, tagAggregator)); + newDictionaryEntries.push(this._createGroupedDictionaryEntry(language, group.dictionaryEntries, true, tagAggregator)); } - newDictionaryEntries.push(...this._groupDictionaryEntriesByHeadword(ungroupedDictionaryEntriesMap.values(), tagAggregator)); + newDictionaryEntries.push(...this._groupDictionaryEntriesByHeadword(language, ungroupedDictionaryEntriesMap.values(), tagAggregator)); return newDictionaryEntries; } @@ -700,13 +705,14 @@ export class Translator { } /** + * @param {string} language * @param {import('translation-internal').DictionaryEntryGroup[]} groupedDictionaryEntries * @param {Map<number, import('translation-internal').TermDictionaryEntry>} ungroupedDictionaryEntriesMap * @param {import('translation').TermEnabledDictionaryMap} enabledDictionaryMap * @param {import('translation').TermEnabledDictionaryMap} secondarySearchDictionaryMap * @param {TranslatorTagAggregator} tagAggregator */ - async _addSecondaryRelatedDictionaryEntries(groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator) { + async _addSecondaryRelatedDictionaryEntries(language, groupedDictionaryEntries, ungroupedDictionaryEntriesMap, enabledDictionaryMap, secondarySearchDictionaryMap, tagAggregator) { // Prepare grouping info /** @type {import('dictionary-database').TermExactRequest[]} */ const termList = []; @@ -714,11 +720,14 @@ export class Translator { /** @type {Map<string, {groups: import('translation-internal').DictionaryEntryGroup[]}>} */ const targetMap = new Map(); + const readingNormalizer = this._readingNormalizers.get(language); + for (const group of groupedDictionaryEntries) { const {dictionaryEntries} = group; for (const dictionaryEntry of dictionaryEntries) { const {term, reading} = dictionaryEntry.headwords[0]; - const key = this._createMapKey([term, reading]); + const normalizedReading = typeof readingNormalizer === 'undefined' ? reading : readingNormalizer(reading); + const key = this._createMapKey([term, normalizedReading]); let target = targetMap.get(key); if (typeof target === 'undefined') { target = { @@ -735,7 +744,8 @@ export class Translator { // Group unsequenced dictionary entries with sequenced entries that have a matching [term, reading]. for (const [id, dictionaryEntry] of ungroupedDictionaryEntriesMap.entries()) { const {term, reading} = dictionaryEntry.headwords[0]; - const key = this._createMapKey([term, reading]); + const normalizedReading = typeof readingNormalizer === 'undefined' ? reading : readingNormalizer(reading); + const key = this._createMapKey([term, normalizedReading]); const target = targetMap.get(key); if (typeof target === 'undefined') { continue; } @@ -769,16 +779,19 @@ export class Translator { } /** + * @param {string} language * @param {Iterable<import('translation-internal').TermDictionaryEntry>} dictionaryEntries * @param {TranslatorTagAggregator} tagAggregator * @returns {import('translation-internal').TermDictionaryEntry[]} */ - _groupDictionaryEntriesByHeadword(dictionaryEntries, tagAggregator) { + _groupDictionaryEntriesByHeadword(language, dictionaryEntries, tagAggregator) { /** @type {Map<string, import('translation-internal').TermDictionaryEntry[]>} */ const groups = new Map(); + const readingNormalizer = this._readingNormalizers.get(language); for (const dictionaryEntry of dictionaryEntries) { const {inflectionRuleChainCandidates, headwords: [{term, reading}]} = dictionaryEntry; - const key = this._createMapKey([term, reading, ...inflectionRuleChainCandidates]); + const normalizedReading = typeof readingNormalizer === 'undefined' ? reading : readingNormalizer(reading); + const key = this._createMapKey([term, normalizedReading, ...inflectionRuleChainCandidates]); let groupDictionaryEntries = groups.get(key); if (typeof groupDictionaryEntries === 'undefined') { groupDictionaryEntries = []; @@ -789,7 +802,7 @@ export class Translator { const newDictionaryEntries = []; for (const groupDictionaryEntries of groups.values()) { - newDictionaryEntries.push(this._createGroupedDictionaryEntry(groupDictionaryEntries, false, tagAggregator)); + newDictionaryEntries.push(this._createGroupedDictionaryEntry(language, groupDictionaryEntries, false, tagAggregator)); } return newDictionaryEntries; } @@ -1664,18 +1677,19 @@ export class Translator { } /** + * @param {string} language * @param {import('translation-internal').TermDictionaryEntry[]} dictionaryEntries * @param {boolean} checkDuplicateDefinitions * @param {TranslatorTagAggregator} tagAggregator * @returns {import('translation-internal').TermDictionaryEntry} */ - _createGroupedDictionaryEntry(dictionaryEntries, checkDuplicateDefinitions, tagAggregator) { + _createGroupedDictionaryEntry(language, dictionaryEntries, checkDuplicateDefinitions, tagAggregator) { // Headwords are generated before sorting, so that the order of dictionaryEntries can be maintained const definitionEntries = []; /** @type {Map<string, import('dictionary').TermHeadword>} */ const headwords = new Map(); for (const dictionaryEntry of dictionaryEntries) { - const headwordIndexMap = this._addTermHeadwords(headwords, dictionaryEntry.headwords, tagAggregator); + const headwordIndexMap = this._addTermHeadwords(language, headwords, dictionaryEntry.headwords, tagAggregator); definitionEntries.push({index: definitionEntries.length, dictionaryEntry, headwordIndexMap}); } @@ -1788,16 +1802,19 @@ export class Translator { } /** + * @param {string} language * @param {Map<string, import('dictionary').TermHeadword>} headwordsMap * @param {import('dictionary').TermHeadword[]} headwords * @param {TranslatorTagAggregator} tagAggregator * @returns {number[]} */ - _addTermHeadwords(headwordsMap, headwords, tagAggregator) { + _addTermHeadwords(language, headwordsMap, headwords, tagAggregator) { /** @type {number[]} */ const headwordIndexMap = []; for (const {term, reading, sources, tags, wordClasses} of headwords) { - const key = this._createMapKey([term, reading]); + const readingNormalizer = this._readingNormalizers.get(language); + const normalizedReading = typeof readingNormalizer === 'undefined' ? reading : readingNormalizer(reading); + const key = this._createMapKey([term, normalizedReading]); let headword = headwordsMap.get(key); if (typeof headword === 'undefined') { headword = this._createTermHeadword(headwordsMap.size, term, reading, [], [], []); diff --git a/ext/js/language/zh/chinese.js b/ext/js/language/zh/chinese.js index 086d2f0a..3072d200 100644 --- a/ext/js/language/zh/chinese.js +++ b/ext/js/language/zh/chinese.js @@ -60,3 +60,8 @@ export function isStringPartiallyChinese(str) { } return false; } + +/** @type {import('language').ReadingNormalizer} */ +export function normalizePinyin(str) { + return str.normalize('NFC').toLowerCase().replace(/[\s・:]/g, ''); +} |