From dfc3710108fe2617a98fca0f57f5a2f6bb7d1830 Mon Sep 17 00:00:00 2001
From: Kuuuube <61125188+Kuuuube@users.noreply.github.com>
Date: Wed, 26 Jun 2024 13:05:23 -0400
Subject: Add normalization of combining dakuten and handakuten to ja preprocessors (#1136)

* Add normalization of combining dakuten and handakuten to ja preprocessors
* Fix typo
* Remove redundant variable assignment
* Fix first character processed incorrectly when it is a character that gets combined
* Add test for combining dakuten and handakuten
---
 ext/js/language/language-descriptors.js | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'ext/js/language/language-descriptors.js')

diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js
index 7965ff30..f9fb4f09 100644
--- a/ext/js/language/language-descriptors.js
+++ b/ext/js/language/language-descriptors.js
@@ -26,6 +26,7 @@ import {
 collapseEmphaticSequences,
 convertHalfWidthCharacters,
 convertHiraganaToKatakana,
+ normalizeCombiningCharacters,
 } from './ja/japanese-text-preprocessors.js';
 import {japaneseTransforms} from './ja/japanese-transforms.js';
 import {isStringPartiallyJapanese} from './ja/japanese.js';
@@ -36,9 +37,9 @@ import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js
 import {oldIrishTransforms} from './sga/old-irish-transforms.js';
 import {removeSerboCroatianAccentMarks} from './sh/serbo-croatian-text-preprocessors.js';
 import {albanianTransforms} from './sq/albanian-transforms.js';
-import {normalizeDiacritics} from './vi/viet-text-preprocessors.js';
 import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js';
-import {normalizePinyin, isStringPartiallyChinese} from './zh/chinese.js';
+import {normalizeDiacritics} from './vi/viet-text-preprocessors.js';
+import {isStringPartiallyChinese, normalizePinyin} from './zh/chinese.js';

 const capitalizationPreprocessors = {
 decapitalize,
@@ -155,6 +156,7 @@ const languageDescriptors = [
 textPreprocessors: {
 convertHalfWidthCharacters,
 alphabeticToHiragana,
+ normalizeCombiningCharacters,
 alphanumericWidthVariants,
 convertHiraganaToKatakana,
 collapseEmphaticSequences,
--
cgit v1.2.3