From dfc3710108fe2617a98fca0f57f5a2f6bb7d1830 Mon Sep 17 00:00:00 2001
From: Kuuuube <61125188+Kuuuube@users.noreply.github.com>
Date: Wed, 26 Jun 2024 13:05:23 -0400
Subject: Add normalization of combining dakuten and handakuten to ja
 preprocessors (#1136)

* Add normalization of combining dakuten and handakuten to ja preprocessors

* Fix typo

* Remove redundant variable assignment

* Fix first character processed incorrectly when it is a character that gets combined

* Add test for combining dakuten and handakuten
---
 ext/js/language/ja/japanese-text-preprocessors.js |   9 +
 ext/js/language/ja/japanese.js                    |  59 +++++++
 ext/js/language/language-descriptors.js           |   6 +-
 test/language/japanese-preprocessors.test.js      | 202 ++++++++++++++++++++++
 types/ext/language-descriptors.d.ts               |   1 +
 5 files changed, 275 insertions(+), 2 deletions(-)
 create mode 100644 test/language/japanese-preprocessors.test.js
diff --git a/ext/js/language/ja/japanese-text-preprocessors.js b/ext/js/language/ja/japanese-text-preprocessors.js
index 2d0d23b3..cdd8ce9a 100644
--- a/ext/js/language/ja/japanese-text-preprocessors.js
+++ b/ext/js/language/ja/japanese-text-preprocessors.js
@@ -24,6 +24,7 @@ import {
     convertHalfWidthKanaToFullWidth,
     convertHiraganaToKatakana as convertHiraganaToKatakanaFunction,
     convertKatakanaToHiragana as convertKatakanaToHiraganaFunction,
+    normalizeCombiningCharacters as normalizeCombiningCharactersFunction,
 } from './japanese.js';
 
 /** @type {import('language').TextProcessor<boolean>} */
@@ -90,3 +91,11 @@ export const collapseEmphaticSequences = {
         return str;
     },
 };
+
+/** @type {import('language').TextProcessor<boolean>} */
+export const normalizeCombiningCharacters = {
+    name: 'Normalize combining characters',
+    description: 'ド → ド (U+30C8 U+3099 → U+30C9)',
+    options: basicTextProcessorOptions,
+    process: (str, setting) => (setting ? normalizeCombiningCharactersFunction(str) : str),
+};
diff --git a/ext/js/language/ja/japanese.js b/ext/js/language/ja/japanese.js
index 3a009ebb..231d8f62 100644
--- a/ext/js/language/ja/japanese.js
+++ b/ext/js/language/ja/japanese.js
@@ -560,6 +560,65 @@ export function getKanaDiacriticInfo(character) {
     return typeof info !== 'undefined' ? {character: info.character, type: info.type} : null;
 }
 
+/**
+ * @param {number} codePoint Code point of the character directly before a combining dakuten (U+3099)
+ * @returns {boolean} `true` when the code point falls inside one of the kana ranges accepted as dakuten bases
+ */
+function dakutenAllowed(codePoint) {
+    // To reduce processing time the ranges are kept contiguous, so they also accept some characters which shouldn't have dakuten (already voiced kana, small tsu) but are highly unlikely to have a combining character attached
+    // かがきぎくぐけげこごさざしじすずせぜそぞただちぢっつづてでとはばぱひびぴふぶぷへべぺほ
+    // カガキギクグケゲコゴサザシジスズセゼソゾタダチヂッツヅテデトハバパヒビピフブプヘベペホ
+    return ((codePoint >= 0x304B && codePoint <= 0x3068) || // か–と
+    (codePoint >= 0x306F && codePoint <= 0x307B) || // は–ほ
+    (codePoint >= 0x30AB && codePoint <= 0x30C8) || // カ–ト
+    (codePoint >= 0x30CF && codePoint <= 0x30DB)); // ハ–ホ
+}
+
+/**
+ * @param {number} codePoint Code point of the character directly before a combining handakuten (U+309A)
+ * @returns {boolean} `true` when the code point falls inside one of the kana ranges accepted as handakuten bases
+ */
+function handakutenAllowed(codePoint) {
+    // To reduce processing time the ranges are kept contiguous, so they also accept some characters which shouldn't have handakuten (already voiced or semi-voiced kana) but are highly unlikely to have a combining character attached
+    // はばぱひびぴふぶぷへべぺほ
+    // ハバパヒビピフブプヘベペホ
+    return ((codePoint >= 0x306F && codePoint <= 0x307B) || // は–ほ
+    (codePoint >= 0x30CF && codePoint <= 0x30DB)); // ハ–ホ
+}
+
+/**
+ * @param {string} text Text which may contain combining dakuten (U+3099) or combining handakuten (U+309A)
+ * @returns {string} Text where each allowed base + combining mark pair is replaced by the character 1 (dakuten) or 2 (handakuten) code points after the base
+ */
+export function normalizeCombiningCharacters(text) {
+    let result = '';
+    let i = text.length - 1;
+    // Scan from the end towards the start. Ignoring index 0 in the loop is intentional, a mark there has no preceding character to combine with
+    while (i > 0) {
+        if (text[i] === '\u3099') {
+            const dakutenCombinee = text[i - 1].codePointAt(0);
+            if (dakutenCombinee && dakutenAllowed(dakutenCombinee)) {
+                result = String.fromCodePoint(dakutenCombinee + 1) + result; // Voiced form sits one code point after its base (か U+304B → が U+304C)
+                i -= 2; // Consume both the mark and its base
+                continue;
+            }
+        } else if (text[i] === '\u309A') {
+            const handakutenCombinee = text[i - 1].codePointAt(0);
+            if (handakutenCombinee && handakutenAllowed(handakutenCombinee)) {
+                result = String.fromCodePoint(handakutenCombinee + 2) + result; // Semi-voiced form sits two code points after its base (は U+306F → ぱ U+3071)
+                i -= 2; // Consume both the mark and its base
+                continue;
+            }
+        }
+        result = text[i] + result; // Everything else, including a mark after a disallowed base, is copied through unchanged
+        i--;
+    }
+    // The loop ends with i === -1 when the first two characters were combined (or text is empty); i === 0 means text[0] still has to be copied
+    if (i === 0) {
+        result = text[0] + result;
+    }
+    return result;
+}
 
 // Furigana distribution
 
diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js
index 7965ff30..f9fb4f09 100644
--- a/ext/js/language/language-descriptors.js
+++ b/ext/js/language/language-descriptors.js
@@ -26,6 +26,7 @@ import {
     collapseEmphaticSequences,
     convertHalfWidthCharacters,
     convertHiraganaToKatakana,
+    normalizeCombiningCharacters,
 } from './ja/japanese-text-preprocessors.js';
 import {japaneseTransforms} from './ja/japanese-transforms.js';
 import {isStringPartiallyJapanese} from './ja/japanese.js';
@@ -36,9 +37,9 @@ import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js
 import {oldIrishTransforms} from './sga/old-irish-transforms.js';
 import {removeSerboCroatianAccentMarks} from './sh/serbo-croatian-text-preprocessors.js';
 import {albanianTransforms} from './sq/albanian-transforms.js';
-import {normalizeDiacritics} from './vi/viet-text-preprocessors.js';
 import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js';
-import {normalizePinyin, isStringPartiallyChinese} from './zh/chinese.js';
+import {normalizeDiacritics} from './vi/viet-text-preprocessors.js';
+import {isStringPartiallyChinese, normalizePinyin} from './zh/chinese.js';
 
 const capitalizationPreprocessors = {
     decapitalize,
@@ -155,6 +156,7 @@ const languageDescriptors = [
         textPreprocessors: {
             convertHalfWidthCharacters,
             alphabeticToHiragana,
+            normalizeCombiningCharacters,
             alphanumericWidthVariants,
             convertHiraganaToKatakana,
             collapseEmphaticSequences,
diff --git a/test/language/japanese-preprocessors.test.js b/test/language/japanese-preprocessors.test.js
new file mode 100644
index 00000000..90313abd
--- /dev/null
+++ b/test/language/japanese-preprocessors.test.js
@@ -0,0 +1,202 @@
+/*
+ * Copyright (C) 2024  Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+import {describe, expect, test} from 'vitest';
+import {normalizeCombiningCharacters} from '../../ext/js/language/ja/japanese-text-preprocessors.js';
+
+const testCasesDakuten = [ // Each entry is [base kana + combining dakuten U+3099, expected single voiced kana]
+    ['か\u3099', 'が'],
+    ['き\u3099', 'ぎ'],
+    ['く\u3099', 'ぐ'],
+    ['け\u3099', 'げ'],
+    ['こ\u3099', 'ご'],
+    ['さ\u3099', 'ざ'],
+    ['し\u3099', 'じ'],
+    ['す\u3099', 'ず'],
+    ['せ\u3099', 'ぜ'],
+    ['そ\u3099', 'ぞ'],
+    ['た\u3099', 'だ'],
+    ['ち\u3099', 'ぢ'],
+    ['つ\u3099', 'づ'],
+    ['て\u3099', 'で'],
+    ['と\u3099', 'ど'],
+    ['は\u3099', 'ば'],
+    ['ひ\u3099', 'び'],
+    ['ふ\u3099', 'ぶ'],
+    ['へ\u3099', 'べ'],
+    ['ほ\u3099', 'ぼ'],
+    ['カ\u3099', 'ガ'],
+    ['キ\u3099', 'ギ'],
+    ['ク\u3099', 'グ'],
+    ['ケ\u3099', 'ゲ'],
+    ['コ\u3099', 'ゴ'],
+    ['サ\u3099', 'ザ'],
+    ['シ\u3099', 'ジ'],
+    ['ス\u3099', 'ズ'],
+    ['セ\u3099', 'ゼ'],
+    ['ソ\u3099', 'ゾ'],
+    ['タ\u3099', 'ダ'],
+    ['チ\u3099', 'ヂ'],
+    ['ツ\u3099', 'ヅ'],
+    ['テ\u3099', 'デ'],
+    ['ト\u3099', 'ド'],
+    ['ハ\u3099', 'バ'],
+    ['ヒ\u3099', 'ビ'],
+    ['フ\u3099', 'ブ'],
+    ['ヘ\u3099', 'ベ'],
+    ['ホ\u3099', 'ボ'],
+];
+
+const testCasesHandakuten = [ // Each entry is [base kana + combining handakuten U+309A, expected single semi-voiced kana]
+    ['は\u309A', 'ぱ'],
+    ['ひ\u309A', 'ぴ'],
+    ['ふ\u309A', 'ぷ'],
+    ['へ\u309A', 'ぺ'],
+    ['ほ\u309A', 'ぽ'],
+    ['ハ\u309A', 'パ'],
+    ['ヒ\u309A', 'ピ'],
+    ['フ\u309A', 'プ'],
+    ['ヘ\u309A', 'ペ'],
+    ['ホ\u309A', 'ポ'],
+];
+
+const testCasesIgnored = [ // Each entry is [input, expected]; every pair expects the input back unchanged
+    ['な\u3099', 'な\u3099'],
+    ['な\u309A', 'な\u309A'],
+    ['に\u3099', 'に\u3099'],
+    ['に\u309A', 'に\u309A'],
+    ['ぬ\u3099', 'ぬ\u3099'],
+    ['ぬ\u309A', 'ぬ\u309A'],
+    ['ね\u3099', 'ね\u3099'],
+    ['ね\u309A', 'ね\u309A'],
+    ['の\u3099', 'の\u3099'],
+    ['の\u309A', 'の\u309A'],
+    ['ま\u3099', 'ま\u3099'],
+    ['ま\u309A', 'ま\u309A'],
+    ['み\u3099', 'み\u3099'],
+    ['み\u309A', 'み\u309A'],
+    ['む\u3099', 'む\u3099'],
+    ['む\u309A', 'む\u309A'],
+    ['め\u3099', 'め\u3099'],
+    ['め\u309A', 'め\u309A'],
+    ['も\u3099', 'も\u3099'],
+    ['も\u309A', 'も\u309A'],
+    ['ゃ\u3099', 'ゃ\u3099'],
+    ['ゃ\u309A', 'ゃ\u309A'],
+    ['や\u3099', 'や\u3099'],
+    ['や\u309A', 'や\u309A'],
+    ['ゅ\u3099', 'ゅ\u3099'],
+    ['ゅ\u309A', 'ゅ\u309A'],
+    ['ゆ\u3099', 'ゆ\u3099'],
+    ['ゆ\u309A', 'ゆ\u309A'],
+    ['ょ\u3099', 'ょ\u3099'],
+    ['ょ\u309A', 'ょ\u309A'],
+    ['よ\u3099', 'よ\u3099'],
+    ['よ\u309A', 'よ\u309A'],
+    ['ら\u3099', 'ら\u3099'],
+    ['ら\u309A', 'ら\u309A'],
+    ['り\u3099', 'り\u3099'],
+    ['り\u309A', 'り\u309A'],
+    ['る\u3099', 'る\u3099'],
+    ['る\u309A', 'る\u309A'],
+    ['れ\u3099', 'れ\u3099'],
+    ['れ\u309A', 'れ\u309A'],
+    ['ろ\u3099', 'ろ\u3099'],
+    ['ろ\u309A', 'ろ\u309A'],
+    ['ゎ\u3099', 'ゎ\u3099'],
+    ['ゎ\u309A', 'ゎ\u309A'],
+    ['わ\u3099', 'わ\u3099'],
+    ['わ\u309A', 'わ\u309A'],
+    ['ゐ\u3099', 'ゐ\u3099'],
+    ['ゐ\u309A', 'ゐ\u309A'],
+    ['ゑ\u3099', 'ゑ\u3099'],
+    ['ゑ\u309A', 'ゑ\u309A'],
+    ['を\u3099', 'を\u3099'],
+    ['を\u309A', 'を\u309A'],
+    ['ん\u3099', 'ん\u3099'],
+    ['ん\u309A', 'ん\u309A'],
+    ['ナ\u3099', 'ナ\u3099'],
+    ['ナ\u309A', 'ナ\u309A'],
+    ['ニ\u3099', 'ニ\u3099'],
+    ['ニ\u309A', 'ニ\u309A'],
+    ['ヌ\u3099', 'ヌ\u3099'],
+    ['ヌ\u309A', 'ヌ\u309A'],
+    ['ネ\u3099', 'ネ\u3099'],
+    ['ネ\u309A', 'ネ\u309A'],
+    ['ノ\u3099', 'ノ\u3099'],
+    ['ノ\u309A', 'ノ\u309A'],
+    ['マ\u3099', 'マ\u3099'],
+    ['マ\u309A', 'マ\u309A'],
+    ['ミ\u3099', 'ミ\u3099'],
+    ['ミ\u309A', 'ミ\u309A'],
+    ['ム\u3099', 'ム\u3099'],
+    ['ム\u309A', 'ム\u309A'],
+    ['メ\u3099', 'メ\u3099'],
+    ['メ\u309A', 'メ\u309A'],
+    ['モ\u3099', 'モ\u3099'],
+    ['モ\u309A', 'モ\u309A'],
+    ['ャ\u3099', 'ャ\u3099'],
+    ['ャ\u309A', 'ャ\u309A'],
+    ['ヤ\u3099', 'ヤ\u3099'],
+    ['ヤ\u309A', 'ヤ\u309A'],
+    ['ュ\u3099', 'ュ\u3099'],
+    ['ュ\u309A', 'ュ\u309A'],
+    ['ユ\u3099', 'ユ\u3099'],
+    ['ユ\u309A', 'ユ\u309A'],
+    ['ョ\u3099', 'ョ\u3099'],
+    ['ョ\u309A', 'ョ\u309A'],
+    ['ヨ\u3099', 'ヨ\u3099'],
+    ['ヨ\u309A', 'ヨ\u309A'],
+    ['ラ\u3099', 'ラ\u3099'],
+    ['ラ\u309A', 'ラ\u309A'],
+    ['リ\u3099', 'リ\u3099'],
+    ['リ\u309A', 'リ\u309A'],
+    ['ル\u3099', 'ル\u3099'],
+    ['ル\u309A', 'ル\u309A'],
+    ['レ\u3099', 'レ\u3099'],
+    ['レ\u309A', 'レ\u309A'],
+    ['ロ\u3099', 'ロ\u3099'],
+    ['ロ\u309A', 'ロ\u309A'],
+    ['ヮ\u3099', 'ヮ\u3099'],
+    ['ヮ\u309A', 'ヮ\u309A'],
+    ['ワ\u3099', 'ワ\u3099'],
+    ['ワ\u309A', 'ワ\u309A'],
+    ['ヰ\u3099', 'ヰ\u3099'],
+    ['ヰ\u309A', 'ヰ\u309A'],
+    ['ヱ\u3099', 'ヱ\u3099'],
+    ['ヱ\u309A', 'ヱ\u309A'],
+    ['ヲ\u3099', 'ヲ\u3099'],
+    ['ヲ\u309A', 'ヲ\u309A'],
+    ['ン\u3099', 'ン\u3099'],
+    ['ン\u309A', 'ン\u309A'],
+];
+
+const textCasesMisc = [ // Each entry is [input, expected]
+    ['', ''], // Empty input
+    ['\u3099ハ', '\u3099ハ'], // Leading combining dakuten has no preceding character, so nothing is combined
+    ['\u309Aハ', '\u309Aハ'], // Leading combining handakuten has no preceding character, so nothing is combined
+    ['さくらし\u3099また\u3099いこん', 'さくらじまだいこん'], // Several combining marks inside a longer string
+    ['いっほ\u309Aん', 'いっぽん'], // Combining handakuten inside a longer string
+];
+
+describe('combining dakuten/handakuten normalization', () => {
+    const {process} = normalizeCombiningCharacters; // Exercise the preprocessor object imported from japanese-text-preprocessors.js
+    const testCases = [...testCasesDakuten, ...testCasesHandakuten, ...testCasesIgnored, ...textCasesMisc]; // One flat table of [input, expected] pairs
+    test.each(testCases)('%s normalizes to %s', (input, expected) => {
+        expect(process(input, true)).toStrictEqual(expected); // Second argument true = run with the setting enabled
+    });
+});
diff --git a/types/ext/language-descriptors.d.ts b/types/ext/language-descriptors.d.ts
index 42312937..d0136d92 100644
--- a/types/ext/language-descriptors.d.ts
+++ b/types/ext/language-descriptors.d.ts
@@ -120,6 +120,7 @@ type AllTextProcessors = {
         pre: {
             convertHalfWidthCharacters: TextProcessor<boolean>;
             alphabeticToHiragana: TextProcessor<boolean>;
+            normalizeCombiningCharacters: TextProcessor<boolean>;
             alphanumericWidthVariants: BidirectionalConversionPreprocessor;
             convertHiraganaToKatakana: BidirectionalConversionPreprocessor;
             collapseEmphaticSequences: TextProcessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>;
-- 
cgit v1.2.3