3 files changed, 20 insertions, 60 deletions
diff --git a/ext/js/language/la/latin-text-preprocessors.js b/ext/js/language/la/latin-text-preprocessors.js
deleted file mode 100644
index ea6aae82..00000000
--- a/ext/js/language/la/latin-text-preprocessors.js
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (C) 2024  Yomitan Authors
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.  If not, see <https://www.gnu.org/licenses/>.
- */
-
-import {basicTextPreprocessorOptions} from '../text-preprocessors.js';
-
-/** @type {Record<string, string>} */
-const diacriticMap = {
-    ā: 'a',
-    ē: 'e',
-    ī: 'i',
-    ō: 'o',
-    ū: 'u',
-    ȳ: 'y',
-    Ā: 'A',
-    Ē: 'E',
-    Ī: 'I',
-    Ō: 'O',
-    Ū: 'U',
-    Ȳ: 'Y',
-    á: 'a',
-    é: 'e',
-    í: 'i',
-    ó: 'o',
-    ú: 'u',
-    ý: 'y',
-    Á: 'A',
-    É: 'E',
-    Í: 'I',
-    Ó: 'O',
-    Ú: 'U',
-    Ý: 'Y'
-};
-
-/** @type {import('language').TextPreprocessor<boolean>} */
-export const removeLatinDiacritics = {
-    name: 'Remove diacritics',
-    description: 'āēīōūȳ → aeiouy, áéíóúý → aeiouy',
-    options: basicTextPreprocessorOptions,
-    process: (str, setting) => {
-        return setting ? str.replace(/[āēīōūȳáéíóúýĀĒĪŌŪȲÁÉÍÓÚÝ]/g, (match) => diacriticMap[match] || match) : str;
-    }
-};
diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js
index b4af2f8a..b5d7573b 100644
--- a/ext/js/language/language-descriptors.js
+++ b/ext/js/language/language-descriptors.js
@@ -21,10 +21,9 @@ import {englishTransforms} from './en/english-transforms.js';
 import {collapseEmphaticSequences, convertAlphabeticCharacters, convertHalfWidthCharacters, convertHiraganaToKatakana, convertNumericCharacters} from './ja/japanese-text-preprocessors.js';
 import {japaneseTransforms} from './ja/japanese-transforms.js';
 import {isStringPartiallyJapanese} from './ja/japanese.js';
-import {removeLatinDiacritics} from './la/latin-text-preprocessors.js';
 import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js';
 import {albanianTransforms} from './sq/albanian-transforms.js';
-import {capitalizeFirstLetter, decapitalize} from './text-preprocessors.js';
+import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-preprocessors.js';
 
 const capitalizationPreprocessors = {
     decapitalize,
@@ -87,7 +86,10 @@ const languageDescriptors = [
         iso: 'grc',
         name: 'Ancient Greek',
         exampleText: 'γράφω',
-        textPreprocessors: capitalizationPreprocessors
+        textPreprocessors: {
+            ...capitalizationPreprocessors,
+            removeAlphabeticDiacritics
+        }
     },
     {
         iso: 'hu',
@@ -113,7 +115,7 @@ const languageDescriptors = [
         exampleText: 'legere',
         textPreprocessors: {
             ...capitalizationPreprocessors,
-            removeLatinDiacritics
+            removeAlphabeticDiacritics
         }
     },
     {
diff --git a/ext/js/language/text-preprocessors.js b/ext/js/language/text-preprocessors.js
index 12b3d1b6..e33fccda 100755
--- a/ext/js/language/text-preprocessors.js
+++ b/ext/js/language/text-preprocessors.js
@@ -33,3 +33,17 @@ export const capitalizeFirstLetter = {
     options: basicTextPreprocessorOptions,
     process: (str, setting) => (setting ? str.charAt(0).toUpperCase() + str.slice(1) : str)
 };
+
+/**
+ * When the setting is truthy, NFD-normalizes the string and deletes code points U+0300-U+036F; otherwise returns it unchanged.
+ * WARNING: This should NOT be used with languages that use Han characters, as NFD can also rewrite them undesirably:
+ *            - '\u9038'.normalize('NFD') => '\u9038' (逸)
+ *            - '\ufa67'.normalize('NFD') => '\u9038' (逸 => 逸)
+ * @type {import('language').TextPreprocessor<boolean>}
+ */
+export const removeAlphabeticDiacritics = {
+    name: 'Remove Alphabetic Diacritics',
+    description: 'ἄήé -> αηe',
+    options: basicTextPreprocessorOptions,
+    process: (str, setting) => (setting ? str.normalize('NFD').replace(/[\u0300-\u036f]/g, '') : str)
+};