10 files changed, 376 insertions, 57 deletions
diff --git a/.eslintrc.json b/.eslintrc.json
index d3509d85..bc1f2940 100644
--- a/.eslintrc.json
+++ b/.eslintrc.json
@@ -638,12 +638,16 @@
                 "ext/js/general/object-property-accessor.js",
                 "ext/js/general/regex-util.js",
                 "ext/js/general/text-source-map.js",
+                "ext/js/language/ar/arabic-text-preprocessors.js",
+                "ext/js/language/de/german-text-preprocessors.js",
                 "ext/js/language/ja/japanese-text-preprocessors.js",
                 "ext/js/language/ja/japanese-wanakana.js",
                 "ext/js/language/ja/japanese.js",
+                "ext/js/language/la/latin-text-preprocessors.js",
                 "ext/js/language/language-descriptors.js",
                 "ext/js/language/language-transformer.js",
                 "ext/js/language/languages.js",
+                "ext/js/language/ru/russian-text-preprocessors.js",
                 "ext/js/language/text-preprocessors.js",
                 "ext/js/language/translator.js",
                 "ext/js/media/audio-downloader.js",
diff --git a/ext/js/language/ar/arabic-text-preprocessors.js b/ext/js/language/ar/arabic-text-preprocessors.js
new file mode 100644
index 00000000..f0118564
--- /dev/null
+++ b/ext/js/language/ar/arabic-text-preprocessors.js
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2024  Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+import {basicTextPreprocessorOptions} from '../text-preprocessors.js';
+
+// Strips the Arabic tashkil marks U+064B–U+0652 (the three tanwin, fatha, damma, kasra, shadda and sukun)
+// when the setting is true, so that vocalised text also yields a variant without these marks.
+/** @type {import('language').TextPreprocessor<boolean>} */
+export const removeArabicScriptDiacritics = {
+    name: 'Remove diacritics',
+    description: 'وَلَدَ ⬅️ ولد',
+    options: basicTextPreprocessorOptions,
+    process: (text, setting) => (setting ? text.replace(/[\u064B-\u0652]/g, '') : text)
+};
diff --git a/ext/js/language/de/german-text-preprocessors.js b/ext/js/language/de/german-text-preprocessors.js
new file mode 100644
index 00000000..e829bf81
--- /dev/null
+++ b/ext/js/language/de/german-text-preprocessors.js
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2024  Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+
+// Rewrites ß/ẞ as ss/SS ('direct') or ss/SS as ß/ẞ ('inverse'); any other setting, including 'off', returns the text unchanged.
+/** @type {import('language').BidirectionalConversionPreprocessor} */
+export const eszettPreprocessor = {
+    name: 'Convert "ß" to "ss"',
+    description: 'ß → ss, ẞ → SS and vice versa',
+    options: ['off', 'direct', 'inverse'],
+    process: (str, setting) => {
+        if (setting === 'direct') {
+            return str.replace(/ẞ/g, 'SS').replace(/ß/g, 'ss');
+        }
+        if (setting === 'inverse') {
+            return str.replace(/SS/g, 'ẞ').replace(/ss/g, 'ß');
+        }
+        return str;
+    }
+};
diff --git a/ext/js/language/ja/japanese-text-preprocessors.js b/ext/js/language/ja/japanese-text-preprocessors.js
index ab4138c3..06f944c1 100644
--- a/ext/js/language/ja/japanese-text-preprocessors.js
+++ b/ext/js/language/ja/japanese-text-preprocessors.js
@@ -30,7 +30,6 @@ export const convertHalfWidthCharacters = {
     name: 'Convert half width characters to full width',
     description: 'ﾖﾐﾁｬﾝ → ヨミチャン',
     options: basicTextPreprocessorOptions,
-    /** @type {import('language').TextPreprocessorFunction<boolean>} */
     process: (str, setting, sourceMap) => (setting ? convertHalfWidthKanaToFullWidth(str, sourceMap) : str)
 };
 
@@ -39,7 +38,6 @@ export const convertNumericCharacters = {
     name: 'Convert numeric characters to full width',
     description: '1234 → １２３４',
     options: basicTextPreprocessorOptions,
-    /** @type {import('language').TextPreprocessorFunction<boolean>} */
     process: (str, setting) => (setting ? convertNumericToFullWidth(str) : str)
 };
 
@@ -48,26 +46,24 @@ export const convertAlphabeticCharacters = {
     name: 'Convert alphabetic characters to hiragana',
     description: 'yomichan → よみちゃん',
     options: basicTextPreprocessorOptions,
-    /** @type {import('language').TextPreprocessorFunction<boolean>} */
     process: (str, setting, sourceMap) => (setting ? convertAlphabeticToKana(str, sourceMap) : str)
 };
 
-/** @type {import('language').TextPreprocessor<boolean>} */
+/** @type {import('language').BidirectionalConversionPreprocessor} */
 export const convertHiraganaToKatakana = {
     name: 'Convert hiragana to katakana',
-    description: 'よみちゃん → ヨミチャン',
-    options: basicTextPreprocessorOptions,
-    /** @type {import('language').TextPreprocessorFunction<boolean>} */
-    process: (str, setting) => (setting ? convertHiraganaToKatakanaFunction(str) : str)
-};
-
-/** @type {import('language').TextPreprocessor<boolean>} */
-export const convertKatakanaToHiragana = {
-    name: 'Convert katakana to hiragana',
-    description: 'ヨミチャン → よみちゃん',
-    options: basicTextPreprocessorOptions,
-    /** @type {import('language').TextPreprocessorFunction<boolean>} */
-    process: (str, setting) => (setting ? convertKatakanaToHiraganaFunction(str) : str)
+    description: 'よみちゃん → ヨミチャン and vice versa',
+    options: ['off', 'direct', 'inverse'],
+    process: (str, setting) => {
+        switch (setting) {
+            case 'off':
+                return str;
+            case 'direct':
+                return convertHiraganaToKatakanaFunction(str);
+            case 'inverse':
+                return convertKatakanaToHiraganaFunction(str);
+        }
+    }
 };
 
 /** @type {import('language').TextPreprocessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>} */
@@ -75,7 +71,6 @@ export const collapseEmphaticSequences = {
     name: 'Collapse emphatic character sequences',
     description: 'すっっごーーい → すっごーい / すごい',
     options: [[false, false], [true, false], [true, true]],
-    /** @type {import('language').TextPreprocessorFunction<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>} */
     process: (str, setting, sourceMap) => {
         const [collapseEmphatic, collapseEmphaticFull] = setting;
         if (collapseEmphatic) {
diff --git a/ext/js/language/la/latin-text-preprocessors.js b/ext/js/language/la/latin-text-preprocessors.js
new file mode 100644
index 00000000..ea6aae82
--- /dev/null
+++ b/ext/js/language/la/latin-text-preprocessors.js
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2024  Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+import {basicTextPreprocessorOptions} from '../text-preprocessors.js';
+
+// Lookup from each precomposed vowel carrying a macron or an acute accent to its unaccented letter.
+/** @type {Record<string, string>} */
+const diacriticMap = {
+    ā: 'a',
+    ē: 'e',
+    ī: 'i',
+    ō: 'o',
+    ū: 'u',
+    ȳ: 'y',
+    á: 'a',
+    é: 'e',
+    í: 'i',
+    ó: 'o',
+    ú: 'u',
+    ý: 'y',
+    Ā: 'A',
+    Ē: 'E',
+    Ī: 'I',
+    Ō: 'O',
+    Ū: 'U',
+    Ȳ: 'Y',
+    Á: 'A',
+    É: 'E',
+    Í: 'I',
+    Ó: 'O',
+    Ú: 'U',
+    Ý: 'Y'
+};
+
+// When the setting is true, swaps every character matched by the class below for its diacriticMap entry.
+/** @type {import('language').TextPreprocessor<boolean>} */
+export const removeLatinDiacritics = {
+    name: 'Remove diacritics',
+    description: 'āēīōūȳ → aeiouy, áéíóúý → aeiouy',
+    options: basicTextPreprocessorOptions,
+    process: (text, enabled) => (enabled ? text.replace(/[āēīōūȳáéíóúýĀĒĪŌŪȲÁÉÍÓÚÝ]/g, (char) => diacriticMap[char] || char) : text)
+};
diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js
index ee65a011..beb1417e 100644
--- a/ext/js/language/language-descriptors.js
+++ b/ext/js/language/language-descriptors.js
@@ -15,18 +15,99 @@
  * along with this program.  If not, see <https://www.gnu.org/licenses/>.
  */
 
-import {collapseEmphaticSequences, convertAlphabeticCharacters, convertHalfWidthCharacters, convertHiraganaToKatakana, convertKatakanaToHiragana, convertNumericCharacters} from './ja/japanese-text-preprocessors.js';
+import {removeArabicScriptDiacritics} from './ar/arabic-text-preprocessors.js';
+import {eszettPreprocessor} from './de/german-text-preprocessors.js';
+import {collapseEmphaticSequences, convertAlphabeticCharacters, convertHalfWidthCharacters, convertHiraganaToKatakana, convertNumericCharacters} from './ja/japanese-text-preprocessors.js';
+import {removeLatinDiacritics} from './la/latin-text-preprocessors.js';
+import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js';
 import {capitalizeFirstLetter, decapitalize} from './text-preprocessors.js';
 
+const capitalizationPreprocessors = {
+    decapitalize,
+    capitalizeFirstLetter
+};
+
 /** @type {import('language-descriptors').LanguageDescriptorAny[]} */
 const languageDescriptors = [
     {
+        iso: 'ar',
+        name: 'Arabic',
+        exampleText: 'قَرَأَ',
+        textPreprocessors: {
+            removeArabicScriptDiacritics
+        }
+    },
+    {
+        iso: 'de',
+        name: 'German',
+        exampleText: 'gelesen',
+        textPreprocessors: {
+            ...capitalizationPreprocessors,
+            eszettPreprocessor
+        }
+    },
+    {
+        iso: 'el',
+        name: 'Greek',
+        exampleText: 'διαβάζω',
+        textPreprocessors: capitalizationPreprocessors
+    },
+    {
         iso: 'en',
         name: 'English',
         exampleText: 'read',
+        textPreprocessors: capitalizationPreprocessors
+    },
+    {
+        iso: 'es',
+        name: 'Spanish',
+        exampleText: 'acabar de',
+        textPreprocessors: capitalizationPreprocessors
+    },
+    {
+        iso: 'fa',
+        name: 'Persian',
+        exampleText: 'خواندن',
         textPreprocessors: {
-            capitalizeFirstLetter,
-            decapitalize
+            removeArabicScriptDiacritics
+        }
+    },
+    {
+        iso: 'fr',
+        name: 'French',
+        exampleText: 'lire',
+        textPreprocessors: capitalizationPreprocessors
+    },
+    {
+        iso: 'grc',
+        name: 'Ancient Greek',
+        exampleText: 'γράφω',
+        textPreprocessors: capitalizationPreprocessors
+    },
+    {
+        iso: 'hu',
+        name: 'Hungarian',
+        exampleText: 'olvasni',
+        textPreprocessors: capitalizationPreprocessors
+    },
+    {
+        iso: 'id',
+        name: 'Indonesian',
+        exampleText: 'membaca',
+        textPreprocessors: capitalizationPreprocessors
+    },
+    {
+        iso: 'it',
+        name: 'Italian',
+        exampleText: 'leggere',
+        textPreprocessors: capitalizationPreprocessors
+    },
+    {
+        iso: 'la',
+        name: 'Latin',
+        exampleText: 'legere',
+        textPreprocessors: {
+            removeLatinDiacritics
         }
     },
     {
@@ -38,9 +119,78 @@ const languageDescriptors = [
             convertNumericCharacters,
             convertAlphabeticCharacters,
             convertHiraganaToKatakana,
-            convertKatakanaToHiragana,
             collapseEmphaticSequences
         }
+    },
+    {
+        iso: 'km',
+        name: 'Khmer',
+        exampleText: 'អាន',
+        textPreprocessors: {}
+    },
+    {
+        iso: 'pl',
+        name: 'Polish',
+        exampleText: 'czytacie',
+        textPreprocessors: capitalizationPreprocessors
+    },
+    {
+        iso: 'pt',
+        name: 'Portuguese',
+        exampleText: 'ler',
+        textPreprocessors: capitalizationPreprocessors
+    },
+    {
+        iso: 'ro',
+        name: 'Romanian',
+        exampleText: 'citit',
+        textPreprocessors: capitalizationPreprocessors
+    },
+    {
+        iso: 'ru',
+        name: 'Russian',
+        exampleText: 'читать',
+        textPreprocessors: {
+            ...capitalizationPreprocessors,
+            yoToE,
+            removeRussianDiacritics
+        }
+    },
+    {
+        iso: 'sh',
+        name: 'Serbo-Croatian',
+        exampleText: 'čitaše',
+        textPreprocessors: capitalizationPreprocessors
+    },
+    {
+        iso: 'sq',
+        name: 'Albanian',
+        exampleText: 'ndihmojme',
+        textPreprocessors: capitalizationPreprocessors
+    },
+    {
+        iso: 'sv',
+        name: 'Swedish',
+        exampleText: 'läsa',
+        textPreprocessors: capitalizationPreprocessors
+    },
+    {
+        iso: 'th',
+        name: 'Thai',
+        exampleText: 'อ่าน',
+        textPreprocessors: {}
+    },
+    {
+        iso: 'vi',
+        name: 'Vietnamese',
+        exampleText: 'đọc',
+        textPreprocessors: capitalizationPreprocessors
+    },
+    {
+        iso: 'zh',
+        name: 'Chinese',
+        exampleText: '读',
+        textPreprocessors: {}
     }
 ];
 
diff --git a/ext/js/language/ru/russian-text-preprocessors.js b/ext/js/language/ru/russian-text-preprocessors.js
new file mode 100644
index 00000000..fc4472e9
--- /dev/null
+++ b/ext/js/language/ru/russian-text-preprocessors.js
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2024  Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+import {basicTextPreprocessorOptions} from '../text-preprocessors.js';
+
+// When the setting is true, deletes every U+0301 (combining acute accent) from the text;
+// otherwise the text is returned as given.
+/** @type {import('language').TextPreprocessor<boolean>} */
+export const removeRussianDiacritics = {
+    name: 'Remove diacritics',
+    description: 'A\u0301 → A, a\u0301 → a',
+    options: basicTextPreprocessorOptions,
+    process: (text, enabled) => (enabled ? text.replace(/\u0301/g, '') : text)
+};
+
+// When the setting is true, replaces each Ё with Е and each ё with е;
+// otherwise the text is returned as given.
+/** @type {import('language').TextPreprocessor<boolean>} */
+export const yoToE = {
+    name: 'Yo to E',
+    description: 'ё → е, Ё → Е',
+    options: basicTextPreprocessorOptions,
+    process: (text, enabled) => (enabled ? text.replace(/Ё/g, 'Е').replace(/ё/g, 'е') : text)
+};
diff --git a/ext/settings.html b/ext/settings.html
index 999ecc37..441e26df 100644
--- a/ext/settings.html
+++ b/ext/settings.html
@@ -1493,35 +1493,6 @@
         <div class="heading-container">
             <div class="heading-container-icon"><span class="icon" data-icon="translation"></span></div>
             <div class="heading-container-left"><h2 id="translation"><a href="#!translation">Translation</a></h2></div>
-            <div class="heading-container-right"><a tabindex="0" class="more-toggle more-only heading-link-light" data-parent-distance="3">Info&hellip;</a></div>
-        </div>
-        <div class="heading-description more" hidden>
-            <p>
-                The following options are used during the translation process to create alternate versions of the input text to search for.
-                This can be helpful when the input text doesn't exactly match the term or expression found in the database.
-            </p>
-            <p>
-                The conversion options below are listed in the order that the conversions are applied to the input text.
-                Most of the conversions have three possible values:
-            </p>
-            <ul>
-                <li>
-                    <strong>Disabled</strong> -
-                    This conversion will never be applied to the input text.
-                </li>
-                <li>
-                    <strong>Enabled</strong> -
-                    This conversion will always be applied to the input text.
-                </li>
-                <li>
-                    <strong>Use both variants</strong> -
-                    The translator will check the database for two variations: the raw input text and the converted input text.
-                    When multiple options use variants, the translator will search for combinations of the converted text.
-                </li>
-            </ul>
-            <p>
-                <a tabindex="0" class="more-toggle" data-parent-distance="3">Less&hellip;</a>
-            </p>
         </div>
     </div>
     <div class="settings-group advanced-only">
diff --git a/types/ext/language-descriptors.d.ts b/types/ext/language-descriptors.d.ts
index 00a95883..319a3ca5 100644
--- a/types/ext/language-descriptors.d.ts
+++ b/types/ext/language-descriptors.d.ts
@@ -15,7 +15,7 @@
  * along with this program.  If not, see <https://www.gnu.org/licenses/>.
  */
 
-import type {TextPreprocessor} from './language';
+import type {TextPreprocessor, BidirectionalConversionPreprocessor} from './language';
 import type {SafeAny} from './core';
 
 type LanguageDescriptor<TIso extends string, TTextPreprocessorDescriptor extends TextPreprocessorDescriptor> = {
@@ -35,21 +35,55 @@ type LanguageDescriptorObjectMap = {
 
 export type LanguageDescriptorAny = LanguageDescriptorObjectMap[keyof LanguageDescriptorObjectMap];
 
+type CapitalizationPreprocessors = {
+    capitalizeFirstLetter: TextPreprocessor<boolean>;
+    decapitalize: TextPreprocessor<boolean>;
+};
+
 /**
  * This is a mapping of the iso tag to all of the preprocessors for that language.
  * Any new language should be added to this object.
  */
 type AllTextPreprocessors = {
-    en: {
-        capitalizeFirstLetter: TextPreprocessor<boolean>;
-        decapitalize: TextPreprocessor<boolean>;
+    ar: {
+        removeArabicScriptDiacritics: TextPreprocessor<boolean>;
+    };
+    de: CapitalizationPreprocessors & {
+        eszettPreprocessor: BidirectionalConversionPreprocessor;
+    };
+    el: CapitalizationPreprocessors;
+    en: CapitalizationPreprocessors;
+    es: CapitalizationPreprocessors;
+    fa: {
+        removeArabicScriptDiacritics: TextPreprocessor<boolean>;
+    };
+    fr: CapitalizationPreprocessors;
+    grc: CapitalizationPreprocessors;
+    hu: CapitalizationPreprocessors;
+    id: CapitalizationPreprocessors;
+    it: CapitalizationPreprocessors;
+    la: {
+        removeLatinDiacritics: TextPreprocessor<boolean>;
     };
     ja: {
         convertHalfWidthCharacters: TextPreprocessor<boolean>;
         convertNumericCharacters: TextPreprocessor<boolean>;
         convertAlphabeticCharacters: TextPreprocessor<boolean>;
-        convertHiraganaToKatakana: TextPreprocessor<boolean>;
-        convertKatakanaToHiragana: TextPreprocessor<boolean>;
+        convertHiraganaToKatakana: BidirectionalConversionPreprocessor;
         collapseEmphaticSequences: TextPreprocessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>;
     };
+    km: Record<string, never>;
+    pl: CapitalizationPreprocessors;
+    pt: CapitalizationPreprocessors;
+    ro: CapitalizationPreprocessors;
+    ru: CapitalizationPreprocessors & {
+        yoToE: TextPreprocessor<boolean>;
+        removeRussianDiacritics: TextPreprocessor<boolean>;
+    };
+    sh: CapitalizationPreprocessors;
+    sq: CapitalizationPreprocessors;
+    sv: CapitalizationPreprocessors;
+    th: Record<string, never>;
+    vi: CapitalizationPreprocessors;
+    zh: Record<string, never>;
 };
diff --git a/types/ext/language.d.ts b/types/ext/language.d.ts
index efbb16c6..8e5a5c70 100644
--- a/types/ext/language.d.ts
+++ b/types/ext/language.d.ts
@@ -21,6 +21,11 @@ export type TextPreprocessorOptions<T = unknown> = T[];
 
 export type TextPreprocessorFunction<T = unknown> = (str: string, setting: T, sourceMap: TextSourceMap) => string;
 
+/**
+ * Text preprocessors are used during the translation process to create alternate versions of the input text to search for.
+ * This is helpful when the input text doesn't exactly match the term or expression found in the database.
+ * When a language has multiple preprocessors, the translator will generate variants of the text by applying all combinations of the preprocessors.
+ */
 export type TextPreprocessor<T = unknown> = {
     name: string;
     description: string;
@@ -28,6 +33,10 @@ export type TextPreprocessor<T = unknown> = {
     process: TextPreprocessorFunction<T>;
 };
 
+export type BidirectionalPreprocessorOptions = 'off' | 'direct' | 'inverse';
+
+export type BidirectionalConversionPreprocessor = TextPreprocessor<BidirectionalPreprocessorOptions>;
+
 export type LanguageAndPreprocessors = {
     iso: string;
     textPreprocessors: TextPreprocessorWithId<unknown>[];