From d724b403f94b7fd1ecec3f6d2e4f5a1ed805c6ec Mon Sep 17 00:00:00 2001
From: Cashew <52880648+cashewnuttynuts@users.noreply.github.com>
Date: Sat, 22 Jun 2024 03:24:21 +0700
Subject: Add diacritics normalization preprocessors for Vietnamese (#1107)

* add viet diacritics normalization

* move regexp construction outside of function

* fix eslint

* add 'off' option

* fix lint

* fix type
---
 .eslintrc.json                                |  1 +
 ext/js/language/language-descriptors.js       |  6 ++-
 ext/js/language/vi/viet-text-preprocessors.js | 58 ++++++++++++++++++++++++++
 test/language/viet-text-preprocessors.test.js | 60 +++++++++++++++++++++++++++
 types/ext/language-descriptors.d.ts           |  4 +-
 5 files changed, 127 insertions(+), 2 deletions(-)
 create mode 100644 ext/js/language/vi/viet-text-preprocessors.js
 create mode 100644 test/language/viet-text-preprocessors.test.js

diff --git a/.eslintrc.json b/.eslintrc.json
index 9b4f8685..c32e9266 100644
--- a/.eslintrc.json
+++ b/.eslintrc.json
@@ -653,6 +653,7 @@
                 "ext/js/language/sga/old-irish-transforms.js",
                 "ext/js/language/sh/serbo-croatian-text-preprocessors.js",
                 "ext/js/language/sq/albanian-transforms.js",
+                "ext/js/language/vi/viet-text-preprocessors.js",
                 "ext/js/language/text-processors.js",
                 "ext/js/language/translator.js",
                 "ext/js/language/zh/chinese.js",
diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js
index 2a8762e9..517c908c 100644
--- a/ext/js/language/language-descriptors.js
+++ b/ext/js/language/language-descriptors.js
@@ -36,6 +36,7 @@ import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js
 import {oldIrishTransforms} from './sga/old-irish-transforms.js';
 import {removeSerboCroatianAccentMarks} from './sh/serbo-croatian-text-preprocessors.js';
 import {albanianTransforms} from './sq/albanian-transforms.js';
+import {normalizeDiacritics} from './vi/viet-text-preprocessors.js';
 import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js';
 import {isStringPartiallyChinese} from './zh/chinese.js';
 
@@ -261,7 +262,10 @@ const languageDescriptors = [
         iso: 'vi',
         name: 'Vietnamese',
         exampleText: 'đọc',
-        textPreprocessors: capitalizationPreprocessors,
+        textPreprocessors: {
+            ...capitalizationPreprocessors,
+            normalizeDiacritics,
+        },
     },
     {
         iso: 'yue',
diff --git a/ext/js/language/vi/viet-text-preprocessors.js b/ext/js/language/vi/viet-text-preprocessors.js
new file mode 100644
index 00000000..24453f47
--- /dev/null
+++ b/ext/js/language/vi/viet-text-preprocessors.js
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2024  Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+const TONE = '([\u0300\u0309\u0303\u0301\u0323])'; // Tone marks: huyền (grave), hỏi (hook above), ngã (tilde), sắc (acute), nặng (dot below)
+const COMBINING_BREVE = '\u0306'; // Ă
+const COMBINING_CIRCUMFLEX_ACCENT = '\u0302'; // Â, Ê, Ô
+const COMBINING_HORN = '\u031B'; // Ơ, Ư
+const DIACRITICS = `${COMBINING_BREVE}${COMBINING_CIRCUMFLEX_ACCENT}${COMBINING_HORN}`;
+// Patterns are global so String.prototype.replace rewrites every match in the text (as re.sub does in the Python original), not only the first.
+// eslint-disable-next-line no-misleading-character-class
+const re1 = new RegExp(`${TONE}([aeiouy${DIACRITICS}]+)`, 'gi');
+const re2 = new RegExp(`(?<=[${DIACRITICS}])(.)${TONE}`, 'gi');
+const re3 = new RegExp(`(?<=[ae])([iouy])${TONE}`, 'gi');
+const re4 = new RegExp(`(?<=[oy])([iuy])${TONE}`, 'gi');
+const re5 = new RegExp(`(?<!q)(u)([aeiou])${TONE}`, 'gi');
+const re6 = new RegExp(`(?<!g)(i)([aeiouy])${TONE}`, 'gi');
+const re7 = new RegExp(`(?<!q)([ou])([aeoy])${TONE}(?!\\w)`, 'gi');
+
+/**
+ * Text preprocessor that normalizes where Vietnamese tone marks sit, in the old or the new style.
+ * Adapted from https://github.com/enricobarzetti/viet_text_tools/blob/master/viet_text_tools/__init__.py
+ * @type {import('language').TextProcessor<'old'|'new'|'off'>}
+ */
+export const normalizeDiacritics = {
+    name: 'Normalize Diacritics',
+    description: 'Normalize diacritics and their placements (in either the old style or new style). NFC normalization is used.',
+    options: ['old', 'new', 'off'],
+    process: (str, setting) => {
+        if (setting === 'off') { return str; }
+
+        // Decompose first so that tone marks are separate combining characters the patterns can move.
+        const newStyle = str.normalize('NFD')
+            .replace(re1, '$2$1') // Put the tone on the second vowel
+            .replace(re2, '$2$1') // Put the tone on the vowel with a diacritic
+            // For vowels that are not oa, oe, uy put the tone on the penultimate vowel
+            .replace(re3, '$2$1')
+            .replace(re4, '$2$1')
+            .replace(re5, '$1$3$2')
+            .replace(re6, '$1$3$2');
+        // The old style applies one extra rule (re7) on top of the new-style result.
+        const placed = setting === 'old' ? newStyle.replace(re7, '$1$3$2') : newStyle;
+        return placed.normalize('NFC');
+    },
+};
diff --git a/test/language/viet-text-preprocessors.test.js b/test/language/viet-text-preprocessors.test.js
new file mode 100644
index 00000000..56593c63
--- /dev/null
+++ b/test/language/viet-text-preprocessors.test.js
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2023-2024  Yomitan Authors
+ * Copyright (C) 2020-2022  Yomichan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+import {normalizeDiacritics} from '../../ext/js/language/vi/viet-text-preprocessors.js';
+import {describe, expect, test} from 'vitest';
+
+const testCasesOldStyle = [ // [input, expected] pairs checked with the 'old' option
+    ['hoạ', 'họa'],
+    ['choàng', 'choàng'],
+    ['thuỷ', 'thủy'],
+    ['oà', 'òa'],
+    ['toà', 'tòa'],
+    ['toàn', 'toàn'],
+    ['tòan', 'toàn'],
+];
+
+const testCasesNewStyle = [ // [input, expected] pairs checked with the 'new' option
+    ['ngòăng', 'ngoằng'],
+    ['họa', 'hoạ'],
+    ['chòang', 'choàng'],
+    ['giừơng', 'giường'],
+    ['baỷ', 'bảy'],
+    ['cuả', 'của'],
+    ['òa', 'oà'],
+    ['toàn', 'toàn'],
+];
+
+// Runs each normalization style offered by normalizeDiacritics against its table of cases.
+describe('diacritics normalization', () => {
+    const {options, process} = normalizeDiacritics;
+
+    for (const option of options) {
+        // 'off' returns its input unchanged, so there is nothing to compare. Use `continue`
+        // rather than `return` so that any option listed after 'off' is still exercised.
+        if (option === 'off') { continue; }
+
+        // Each style has its own table of [input, expected] pairs.
+        const testCases = option === 'new' ? testCasesNewStyle : testCasesOldStyle;
+        describe(`${option} style`, () => {
+            test.each(testCases)('%s normalizes to %s', (input, expected) => {
+                expect(process(input, option)).toStrictEqual(expected);
+            });
+        });
+    }
+});
diff --git a/types/ext/language-descriptors.d.ts b/types/ext/language-descriptors.d.ts
index 856e55b3..778445de 100644
--- a/types/ext/language-descriptors.d.ts
+++ b/types/ext/language-descriptors.d.ts
@@ -174,7 +174,9 @@ type AllTextProcessors = {
         pre: CapitalizationPreprocessors;
     };
     vi: {
-        pre: CapitalizationPreprocessors;
+        pre: CapitalizationPreprocessors & {
+            normalizeDiacritics: TextProcessor<'old' | 'new' | 'off'>;
+        };
     };
     yue: Record<string, never>;
     zh: Record<string, never>;
-- 
cgit v1.2.3