diff options
Diffstat (limited to 'ext')
-rw-r--r-- | ext/js/language/ja/japanese-text-preprocessors.js | 9 | ||||
-rw-r--r-- | ext/js/language/ja/japanese.js | 59 | ||||
-rw-r--r-- | ext/js/language/language-descriptors.js | 6 |
3 files changed, 72 insertions, 2 deletions
diff --git a/ext/js/language/ja/japanese-text-preprocessors.js b/ext/js/language/ja/japanese-text-preprocessors.js index 2d0d23b3..cdd8ce9a 100644 --- a/ext/js/language/ja/japanese-text-preprocessors.js +++ b/ext/js/language/ja/japanese-text-preprocessors.js @@ -24,6 +24,7 @@ import { convertHalfWidthKanaToFullWidth, convertHiraganaToKatakana as convertHiraganaToKatakanaFunction, convertKatakanaToHiragana as convertKatakanaToHiraganaFunction, + normalizeCombiningCharacters as normalizeCombiningCharactersFunction, } from './japanese.js'; /** @type {import('language').TextProcessor<boolean>} */ @@ -90,3 +91,11 @@ export const collapseEmphaticSequences = { return str; }, }; + +/** @type {import('language').TextProcessor<boolean>} */ +export const normalizeCombiningCharacters = { + name: 'Normalize combining characters', + description: 'ド → ド (U+30C8 U+3099 → U+30C9)', + options: basicTextProcessorOptions, + process: (str, setting) => (setting ? normalizeCombiningCharactersFunction(str) : str), +}; diff --git a/ext/js/language/ja/japanese.js b/ext/js/language/ja/japanese.js index 3a009ebb..231d8f62 100644 --- a/ext/js/language/ja/japanese.js +++ b/ext/js/language/ja/japanese.js @@ -560,6 +560,65 @@ export function getKanaDiacriticInfo(character) { return typeof info !== 'undefined' ? 
/**
 * Checks whether a combining dakuten (U+3099) may be merged into the given kana.
 * Only plain (unvoiced) kana whose voiced form is the next code point qualify:
 * か–ち, つ, て, と, は–ほ and the katakana counterparts カ–チ, ツ, テ, ト, ハ–ホ.
 * Kana that are already voiced, semi-voiced or small (っ/ッ) are rejected, because
 * adding 1 to their code point would yield an unrelated kana.
 * @param {number} codePoint
 * @returns {boolean}
 */
function dakutenAllowed(codePoint) {
    // Within カ..ホ the katakana block mirrors the hiragana block at an offset of 0x60
    const hiragana = (codePoint >= 0x30AB && codePoint <= 0x30DB) ? codePoint - 0x60 : codePoint;
    // かがきぎ…ちぢ: plain kana sit on odd code points
    if (hiragana >= 0x304B && hiragana <= 0x3062) { return (hiragana & 1) === 1; }
    // つづてでとど: small っ (0x3063) flips the parity, plain kana sit on even code points
    if (hiragana >= 0x3064 && hiragana <= 0x3068) { return (hiragana & 1) === 0; }
    // はばぱひびぴ…ほ: plain kana occur every third code point
    if (hiragana >= 0x306F && hiragana <= 0x307B) { return (hiragana - 0x306F) % 3 === 0; }
    return false;
}

/**
 * Checks whether a combining handakuten (U+309A) may be merged into the given kana.
 * Only plain は ひ ふ へ ほ / ハ ヒ フ ヘ ホ qualify; their semi-voiced form is two code points later.
 * @param {number} codePoint
 * @returns {boolean}
 */
function handakutenAllowed(codePoint) {
    // Within ハ..ホ the katakana block mirrors the hiragana block at an offset of 0x60
    const hiragana = (codePoint >= 0x30CF && codePoint <= 0x30DB) ? codePoint - 0x60 : codePoint;
    // はばぱひびぴ…ほ: plain kana occur every third code point
    return hiragana >= 0x306F && hiragana <= 0x307B && (hiragana - 0x306F) % 3 === 0;
}

/**
 * Replaces a kana followed by a combining dakuten (U+3099) or combining handakuten (U+309A)
 * with the single precomposed kana, e.g. U+30C8 U+3099 → U+30C9.
 * Pairs whose base character cannot take the mark are copied through unchanged.
 * @param {string} text
 * @returns {string}
 */
export function normalizeCombiningCharacters(text) {
    const dakuten = 0x3099;
    const handakuten = 0x309A;
    /** @type {string[]} */
    const parts = [];
    const length = text.length;
    let i = 0;
    while (i < length) {
        // Look one UTF-16 unit ahead; 0 means "no following character"
        const mark = i + 1 < length ? text.charCodeAt(i + 1) : 0;
        if (mark === dakuten || mark === handakuten) {
            const base = text.charCodeAt(i);
            const combinable = mark === dakuten ? dakutenAllowed(base) : handakutenAllowed(base);
            if (combinable) {
                // Voiced kana directly follow their plain form, semi-voiced kana come one further
                parts.push(String.fromCharCode(base + (mark === dakuten ? 1 : 2)));
                i += 2;
                continue;
            }
        }
        parts.push(text[i]);
        ++i;
    }
    return parts.join('');
}

// Furigana distribution
a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js index 7965ff30..f9fb4f09 100644 --- a/ext/js/language/language-descriptors.js +++ b/ext/js/language/language-descriptors.js @@ -26,6 +26,7 @@ import { collapseEmphaticSequences, convertHalfWidthCharacters, convertHiraganaToKatakana, + normalizeCombiningCharacters, } from './ja/japanese-text-preprocessors.js'; import {japaneseTransforms} from './ja/japanese-transforms.js'; import {isStringPartiallyJapanese} from './ja/japanese.js'; @@ -36,9 +37,9 @@ import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js import {oldIrishTransforms} from './sga/old-irish-transforms.js'; import {removeSerboCroatianAccentMarks} from './sh/serbo-croatian-text-preprocessors.js'; import {albanianTransforms} from './sq/albanian-transforms.js'; -import {normalizeDiacritics} from './vi/viet-text-preprocessors.js'; import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js'; -import {normalizePinyin, isStringPartiallyChinese} from './zh/chinese.js'; +import {normalizeDiacritics} from './vi/viet-text-preprocessors.js'; +import {isStringPartiallyChinese, normalizePinyin} from './zh/chinese.js'; const capitalizationPreprocessors = { decapitalize, @@ -155,6 +156,7 @@ const languageDescriptors = [ textPreprocessors: { convertHalfWidthCharacters, alphabeticToHiragana, + normalizeCombiningCharacters, alphanumericWidthVariants, convertHiraganaToKatakana, collapseEmphaticSequences, |