aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.eslintrc.json4
-rw-r--r--ext/js/language/ar/arabic-text-preprocessors.js28
-rw-r--r--ext/js/language/de/german-text-preprocessors.js34
-rw-r--r--ext/js/language/ja/japanese-text-preprocessors.js31
-rw-r--r--ext/js/language/la/latin-text-preprocessors.js56
-rw-r--r--ext/js/language/language-descriptors.js158
-rw-r--r--ext/js/language/ru/russian-text-preprocessors.js38
-rw-r--r--ext/settings.html29
-rw-r--r--types/ext/language-descriptors.d.ts46
-rw-r--r--types/ext/language.d.ts9
10 files changed, 376 insertions, 57 deletions
diff --git a/.eslintrc.json b/.eslintrc.json
index d3509d85..bc1f2940 100644
--- a/.eslintrc.json
+++ b/.eslintrc.json
@@ -638,12 +638,16 @@
"ext/js/general/object-property-accessor.js",
"ext/js/general/regex-util.js",
"ext/js/general/text-source-map.js",
+ "ext/js/language/ar/arabic-text-preprocessors.js",
+ "ext/js/language/de/german-text-preprocessors.js",
"ext/js/language/ja/japanese-text-preprocessors.js",
"ext/js/language/ja/japanese-wanakana.js",
"ext/js/language/ja/japanese.js",
+ "ext/js/language/la/latin-text-preprocessors.js",
"ext/js/language/language-descriptors.js",
"ext/js/language/language-transformer.js",
"ext/js/language/languages.js",
+ "ext/js/language/ru/russian-text-preprocessors.js",
"ext/js/language/text-preprocessors.js",
"ext/js/language/translator.js",
"ext/js/media/audio-downloader.js",
diff --git a/ext/js/language/ar/arabic-text-preprocessors.js b/ext/js/language/ar/arabic-text-preprocessors.js
new file mode 100644
index 00000000..f0118564
--- /dev/null
+++ b/ext/js/language/ar/arabic-text-preprocessors.js
@@ -0,0 +1,28 @@
+/*
+ * Copyright (C) 2024 Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+import {basicTextPreprocessorOptions} from '../text-preprocessors.js';
+
+/**
+ * Text preprocessor that strips Arabic-script vowel and pronunciation marks, so that a
+ * vocalised spelling yields the same string as its unvocalised spelling.
+ *
+ * When the setting is true, every code point in the contiguous range U+064B–U+0652 is
+ * removed: fathatan, dammatan, kasratan, fatha, damma, kasra, shadda and sukun.
+ * When the setting is false, the text is returned unchanged.
+ * @type {import('language').TextPreprocessor<boolean>}
+ */
+export const removeArabicScriptDiacritics = {
+    name: 'Remove diacritics',
+    description: 'وَلَدَ ⬅️ ولد',
+    options: basicTextPreprocessorOptions,
+    process: (text, setting) => {
+        // U+064E–U+0650 alone covers only fatha, damma and kasra; the wider range also
+        // removes the tanwin marks, shadda and sukun, which would otherwise be left behind.
+        return setting ? text.replace(/[\u064B-\u0652]/g, '') : text;
+    }
+};
diff --git a/ext/js/language/de/german-text-preprocessors.js b/ext/js/language/de/german-text-preprocessors.js
new file mode 100644
index 00000000..e829bf81
--- /dev/null
+++ b/ext/js/language/de/german-text-preprocessors.js
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2024 Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+
+/**
+ * Text preprocessor that converts between the eszett and double-s spellings.
+ *
+ * Behaviour per setting:
+ * - 'off': the text is returned as-is.
+ * - 'direct': every "ẞ" becomes "SS", then every "ß" becomes "ss".
+ * - 'inverse': every "SS" becomes "ẞ", then every "ss" becomes "ß".
+ * @type {import('language').BidirectionalConversionPreprocessor}
+ */
+export const eszettPreprocessor = {
+    name: 'Convert "ß" to "ss"',
+    description: 'ß → ss, ẞ → SS and vice versa',
+    options: ['off', 'direct', 'inverse'],
+    process: (text, mode) => {
+        switch (mode) {
+            case 'direct': {
+                const upperCaseExpanded = text.replace(/ẞ/g, 'SS');
+                return upperCaseExpanded.replace(/ß/g, 'ss');
+            }
+            case 'inverse': {
+                const upperCaseContracted = text.replace(/SS/g, 'ẞ');
+                return upperCaseContracted.replace(/ss/g, 'ß');
+            }
+            case 'off':
+                return text;
+        }
+    }
+};
diff --git a/ext/js/language/ja/japanese-text-preprocessors.js b/ext/js/language/ja/japanese-text-preprocessors.js
index ab4138c3..06f944c1 100644
--- a/ext/js/language/ja/japanese-text-preprocessors.js
+++ b/ext/js/language/ja/japanese-text-preprocessors.js
@@ -30,7 +30,6 @@ export const convertHalfWidthCharacters = {
name: 'Convert half width characters to full width',
description: 'ヨミチャン → ヨミチャン',
options: basicTextPreprocessorOptions,
- /** @type {import('language').TextPreprocessorFunction<boolean>} */
process: (str, setting, sourceMap) => (setting ? convertHalfWidthKanaToFullWidth(str, sourceMap) : str)
};
@@ -39,7 +38,6 @@ export const convertNumericCharacters = {
name: 'Convert numeric characters to full width',
description: '1234 → 1234',
options: basicTextPreprocessorOptions,
- /** @type {import('language').TextPreprocessorFunction<boolean>} */
process: (str, setting) => (setting ? convertNumericToFullWidth(str) : str)
};
@@ -48,26 +46,24 @@ export const convertAlphabeticCharacters = {
name: 'Convert alphabetic characters to hiragana',
description: 'yomichan → よみちゃん',
options: basicTextPreprocessorOptions,
- /** @type {import('language').TextPreprocessorFunction<boolean>} */
process: (str, setting, sourceMap) => (setting ? convertAlphabeticToKana(str, sourceMap) : str)
};
-/** @type {import('language').TextPreprocessor<boolean>} */
+/** @type {import('language').BidirectionalConversionPreprocessor} */
export const convertHiraganaToKatakana = {
name: 'Convert hiragana to katakana',
- description: 'よみちゃん → ヨミチャン',
- options: basicTextPreprocessorOptions,
- /** @type {import('language').TextPreprocessorFunction<boolean>} */
- process: (str, setting) => (setting ? convertHiraganaToKatakanaFunction(str) : str)
-};
-
-/** @type {import('language').TextPreprocessor<boolean>} */
-export const convertKatakanaToHiragana = {
- name: 'Convert katakana to hiragana',
- description: 'ヨミチャン → よみちゃん',
- options: basicTextPreprocessorOptions,
- /** @type {import('language').TextPreprocessorFunction<boolean>} */
- process: (str, setting) => (setting ? convertKatakanaToHiraganaFunction(str) : str)
+ description: 'よみちゃん → ヨミチャン and vice versa',
+ options: ['off', 'direct', 'inverse'],
+ process: (str, setting) => {
+ switch (setting) {
+ case 'off':
+ return str;
+ case 'direct':
+ return convertHiraganaToKatakanaFunction(str);
+ case 'inverse':
+ return convertKatakanaToHiraganaFunction(str);
+ }
+ }
};
/** @type {import('language').TextPreprocessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>} */
@@ -75,7 +71,6 @@ export const collapseEmphaticSequences = {
name: 'Collapse emphatic character sequences',
description: 'すっっごーーい → すっごーい / すごい',
options: [[false, false], [true, false], [true, true]],
- /** @type {import('language').TextPreprocessorFunction<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>} */
process: (str, setting, sourceMap) => {
const [collapseEmphatic, collapseEmphaticFull] = setting;
if (collapseEmphatic) {
diff --git a/ext/js/language/la/latin-text-preprocessors.js b/ext/js/language/la/latin-text-preprocessors.js
new file mode 100644
index 00000000..ea6aae82
--- /dev/null
+++ b/ext/js/language/la/latin-text-preprocessors.js
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2024 Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+import {basicTextPreprocessorOptions} from '../text-preprocessors.js';
+
+/**
+ * Lookup table from a vowel carrying a macron or an acute accent to its plain base
+ * letter, for both lower-case and upper-case forms.
+ * @type {Record<string, string>}
+ */
+const diacriticMap = {
+    // Macron, lower case
+    ā: 'a', ē: 'e', ī: 'i', ō: 'o', ū: 'u', ȳ: 'y',
+    // Macron, upper case
+    Ā: 'A', Ē: 'E', Ī: 'I', Ō: 'O', Ū: 'U', Ȳ: 'Y',
+    // Acute, lower case
+    á: 'a', é: 'e', í: 'i', ó: 'o', ú: 'u', ý: 'y',
+    // Acute, upper case
+    Á: 'A', É: 'E', Í: 'I', Ó: 'O', Ú: 'U', Ý: 'Y'
+};
+
+/**
+ * Text preprocessor that replaces macron and acute-accented vowels with their plain
+ * base letters, using `diacriticMap` for the substitution.
+ *
+ * When the setting is false the text is returned unchanged. A matched character that
+ * has no entry in `diacriticMap` is kept as it is.
+ * @type {import('language').TextPreprocessor<boolean>}
+ */
+export const removeLatinDiacritics = {
+    name: 'Remove diacritics',
+    description: 'āēīōūȳ → aeiouy, áéíóúý → aeiouy',
+    options: basicTextPreprocessorOptions,
+    process: (text, enabled) => {
+        if (!enabled) {
+            return text;
+        }
+        return text.replace(/[āēīōūȳáéíóúýĀĒĪŌŪȲÁÉÍÓÚÝ]/g, (character) => diacriticMap[character] || character);
+    }
+};
diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js
index ee65a011..beb1417e 100644
--- a/ext/js/language/language-descriptors.js
+++ b/ext/js/language/language-descriptors.js
@@ -15,18 +15,99 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
-import {collapseEmphaticSequences, convertAlphabeticCharacters, convertHalfWidthCharacters, convertHiraganaToKatakana, convertKatakanaToHiragana, convertNumericCharacters} from './ja/japanese-text-preprocessors.js';
+import {removeArabicScriptDiacritics} from './ar/arabic-text-preprocessors.js';
+import {eszettPreprocessor} from './de/german-text-preprocessors.js';
+import {collapseEmphaticSequences, convertAlphabeticCharacters, convertHalfWidthCharacters, convertHiraganaToKatakana, convertNumericCharacters} from './ja/japanese-text-preprocessors.js';
+import {removeLatinDiacritics} from './la/latin-text-preprocessors.js';
+import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js';
import {capitalizeFirstLetter, decapitalize} from './text-preprocessors.js';
+/**
+ * The `decapitalize` and `capitalizeFirstLetter` preprocessors bundled into one object,
+ * so that language descriptors can assign it directly as `textPreprocessors` or spread
+ * it alongside language-specific preprocessors.
+ * NOTE(review): key order here differs from the order previously used for English
+ * (`capitalizeFirstLetter`, then `decapitalize`) - confirm that consumers do not depend on it.
+ */
+const capitalizationPreprocessors = {
+ decapitalize,
+ capitalizeFirstLetter
+};
+
/** @type {import('language-descriptors').LanguageDescriptorAny[]} */
const languageDescriptors = [
{
+ iso: 'ar',
+ name: 'Arabic',
+ exampleText: 'قَرَأَ',
+ textPreprocessors: {
+ removeArabicScriptDiacritics
+ }
+ },
+ {
+ iso: 'de',
+ name: 'German',
+ exampleText: 'gelesen',
+ textPreprocessors: {
+ ...capitalizationPreprocessors,
+ eszettPreprocessor
+ }
+ },
+ {
+ iso: 'el',
+ name: 'Greek',
+ exampleText: 'διαβάζω',
+ textPreprocessors: capitalizationPreprocessors
+ },
+ {
iso: 'en',
name: 'English',
exampleText: 'read',
+ textPreprocessors: capitalizationPreprocessors
+ },
+ {
+ iso: 'es',
+ name: 'Spanish',
+ exampleText: 'acabar de',
+ textPreprocessors: capitalizationPreprocessors
+ },
+ {
+ iso: 'fa',
+ name: 'Persian',
+ exampleText: 'خواندن',
textPreprocessors: {
- capitalizeFirstLetter,
- decapitalize
+ removeArabicScriptDiacritics
+ }
+ },
+ {
+ iso: 'fr',
+ name: 'French',
+ exampleText: 'lire',
+ textPreprocessors: capitalizationPreprocessors
+ },
+ {
+ iso: 'grc',
+ name: 'Ancient Greek',
+ exampleText: 'γράφω',
+ textPreprocessors: capitalizationPreprocessors
+ },
+ {
+ iso: 'hu',
+ name: 'Hungarian',
+ exampleText: 'olvasni',
+ textPreprocessors: capitalizationPreprocessors
+ },
+ {
+ iso: 'id',
+ name: 'Indonesian',
+ exampleText: 'membaca',
+ textPreprocessors: capitalizationPreprocessors
+ },
+ {
+ iso: 'it',
+ name: 'Italian',
+ exampleText: 'leggere',
+ textPreprocessors: capitalizationPreprocessors
+ },
+ {
+ iso: 'la',
+ name: 'Latin',
+ exampleText: 'legere',
+ textPreprocessors: {
+ removeLatinDiacritics
}
},
{
@@ -38,9 +119,78 @@ const languageDescriptors = [
convertNumericCharacters,
convertAlphabeticCharacters,
convertHiraganaToKatakana,
- convertKatakanaToHiragana,
collapseEmphaticSequences
}
+ },
+ {
+ iso: 'km',
+ name: 'Khmer',
+ exampleText: 'អាន',
+ textPreprocessors: {}
+ },
+ {
+ iso: 'pl',
+ name: 'Polish',
+ exampleText: 'czytacie',
+ textPreprocessors: capitalizationPreprocessors
+ },
+ {
+ iso: 'pt',
+ name: 'Portuguese',
+ exampleText: 'ler',
+ textPreprocessors: capitalizationPreprocessors
+ },
+ {
+ iso: 'ro',
+ name: 'Romanian',
+ exampleText: 'citit',
+ textPreprocessors: capitalizationPreprocessors
+ },
+ {
+ iso: 'ru',
+ name: 'Russian',
+ exampleText: 'читать',
+ textPreprocessors: {
+ ...capitalizationPreprocessors,
+ yoToE,
+ removeRussianDiacritics
+ }
+ },
+ {
+ iso: 'sh',
+ name: 'Serbo-Croatian',
+ exampleText: 'čitaše',
+ textPreprocessors: capitalizationPreprocessors
+ },
+ {
+ iso: 'sq',
+ name: 'Albanian',
+ exampleText: 'ndihmojme',
+ textPreprocessors: capitalizationPreprocessors
+ },
+ {
+ iso: 'sv',
+ name: 'Swedish',
+ exampleText: 'läsa',
+ textPreprocessors: capitalizationPreprocessors
+ },
+ {
+ iso: 'th',
+ name: 'Thai',
+ exampleText: 'อ่าน',
+ textPreprocessors: {}
+ },
+ {
+ iso: 'vi',
+ name: 'Vietnamese',
+ exampleText: 'đọc',
+ textPreprocessors: capitalizationPreprocessors
+ },
+ {
+ iso: 'zh',
+ name: 'Chinese',
+ exampleText: '读',
+ textPreprocessors: {}
}
];
diff --git a/ext/js/language/ru/russian-text-preprocessors.js b/ext/js/language/ru/russian-text-preprocessors.js
new file mode 100644
index 00000000..fc4472e9
--- /dev/null
+++ b/ext/js/language/ru/russian-text-preprocessors.js
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2024 Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+import {basicTextPreprocessorOptions} from '../text-preprocessors.js';
+
+/**
+ * Text preprocessor that deletes every combining acute accent (U+0301) from the text.
+ *
+ * When the setting is false the text is returned unchanged.
+ * @type {import('language').TextPreprocessor<boolean>}
+ */
+export const removeRussianDiacritics = {
+    name: 'Remove diacritics',
+    description: 'A\u0301 → A, a\u0301 → a',
+    options: basicTextPreprocessorOptions,
+    process: (text, enabled) => {
+        if (!enabled) {
+            return text;
+        }
+        return text.replace(/\u0301/g, '');
+    }
+};
+
+/**
+ * Text preprocessor that replaces the letter "ё" with "е" and "Ё" with "Е".
+ *
+ * When the setting is false the text is returned unchanged.
+ * @type {import('language').TextPreprocessor<boolean>}
+ */
+export const yoToE = {
+    name: 'Yo to E',
+    description: 'ё → е, Ё → Е',
+    options: basicTextPreprocessorOptions,
+    process: (text, enabled) => {
+        if (!enabled) {
+            return text;
+        }
+        const lowerCaseReplaced = text.replace(/ё/g, 'е');
+        return lowerCaseReplaced.replace(/Ё/g, 'Е');
+    }
+};
diff --git a/ext/settings.html b/ext/settings.html
index 999ecc37..441e26df 100644
--- a/ext/settings.html
+++ b/ext/settings.html
@@ -1493,35 +1493,6 @@
<div class="heading-container">
<div class="heading-container-icon"><span class="icon" data-icon="translation"></span></div>
<div class="heading-container-left"><h2 id="translation"><a href="#!translation">Translation</a></h2></div>
- <div class="heading-container-right"><a tabindex="0" class="more-toggle more-only heading-link-light" data-parent-distance="3">Info&hellip;</a></div>
- </div>
- <div class="heading-description more" hidden>
- <p>
- The following options are used during the translation process to create alternate versions of the input text to search for.
- This can be helpful when the input text doesn't exactly match the term or expression found in the database.
- </p>
- <p>
- The conversion options below are listed in the order that the conversions are applied to the input text.
- Most of the conversions have three possible values:
- </p>
- <ul>
- <li>
- <strong>Disabled</strong> -
- This conversion will never be applied to the input text.
- </li>
- <li>
- <strong>Enabled</strong> -
- This conversion will always be applied to the input text.
- </li>
- <li>
- <strong>Use both variants</strong> -
- The translator will check the database for two variations: the raw input text and the converted input text.
- When multiple options use variants, the translator will search for combinations of the converted text.
- </li>
- </ul>
- <p>
- <a tabindex="0" class="more-toggle" data-parent-distance="3">Less&hellip;</a>
- </p>
</div>
</div>
<div class="settings-group advanced-only">
diff --git a/types/ext/language-descriptors.d.ts b/types/ext/language-descriptors.d.ts
index 00a95883..319a3ca5 100644
--- a/types/ext/language-descriptors.d.ts
+++ b/types/ext/language-descriptors.d.ts
@@ -15,7 +15,7 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
-import type {TextPreprocessor} from './language';
+import type {TextPreprocessor, BidirectionalConversionPreprocessor} from './language';
import type {SafeAny} from './core';
type LanguageDescriptor<TIso extends string, TTextPreprocessorDescriptor extends TextPreprocessorDescriptor> = {
@@ -35,21 +35,55 @@ type LanguageDescriptorObjectMap = {
export type LanguageDescriptorAny = LanguageDescriptorObjectMap[keyof LanguageDescriptorObjectMap];
+/**
+ * The pair of capitalization preprocessors shared by several languages.
+ * Languages in `AllTextPreprocessors` use this type directly or intersect it with
+ * their own language-specific preprocessors.
+ */
+type CapitalizationPreprocessors = {
+ capitalizeFirstLetter: TextPreprocessor<boolean>;
+ decapitalize: TextPreprocessor<boolean>;
+};
+
/**
* This is a mapping of the iso tag to all of the preprocessors for that language.
* Any new language should be added to this object.
*/
type AllTextPreprocessors = {
- en: {
- capitalizeFirstLetter: TextPreprocessor<boolean>;
- decapitalize: TextPreprocessor<boolean>;
+ ar: {
+ removeArabicScriptDiacritics: TextPreprocessor<boolean>;
+ };
+ de: CapitalizationPreprocessors & {
+ eszettPreprocessor: BidirectionalConversionPreprocessor;
+ };
+ el: CapitalizationPreprocessors;
+ en: CapitalizationPreprocessors;
+ es: CapitalizationPreprocessors;
+ fa: {
+ removeArabicScriptDiacritics: TextPreprocessor<boolean>;
+ };
+ fr: CapitalizationPreprocessors;
+ grc: CapitalizationPreprocessors;
+ hu: CapitalizationPreprocessors;
+ id: CapitalizationPreprocessors;
+ it: CapitalizationPreprocessors;
+ la: {
+ removeLatinDiacritics: TextPreprocessor<boolean>;
};
ja: {
convertHalfWidthCharacters: TextPreprocessor<boolean>;
convertNumericCharacters: TextPreprocessor<boolean>;
convertAlphabeticCharacters: TextPreprocessor<boolean>;
- convertHiraganaToKatakana: TextPreprocessor<boolean>;
- convertKatakanaToHiragana: TextPreprocessor<boolean>;
+ convertHiraganaToKatakana: BidirectionalConversionPreprocessor;
collapseEmphaticSequences: TextPreprocessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>;
};
+ km: Record<string, never>;
+ pl: CapitalizationPreprocessors;
+ pt: CapitalizationPreprocessors;
+ ro: CapitalizationPreprocessors;
+ ru: CapitalizationPreprocessors & {
+ yoToE: TextPreprocessor<boolean>;
+ removeRussianDiacritics: TextPreprocessor<boolean>;
+ };
+ sh: CapitalizationPreprocessors;
+ sq: CapitalizationPreprocessors;
+ sv: CapitalizationPreprocessors;
+ th: Record<string, never>;
+ vi: CapitalizationPreprocessors;
+ zh: Record<string, never>;
};
diff --git a/types/ext/language.d.ts b/types/ext/language.d.ts
index efbb16c6..8e5a5c70 100644
--- a/types/ext/language.d.ts
+++ b/types/ext/language.d.ts
@@ -21,6 +21,11 @@ export type TextPreprocessorOptions<T = unknown> = T[];
export type TextPreprocessorFunction<T = unknown> = (str: string, setting: T, sourceMap: TextSourceMap) => string;
+/**
+ * Text preprocessors are used during the translation process to create alternate versions of the input text to search for.
+ * This is helpful when the input text doesn't exactly match the term or expression found in the database.
+ * When a language has multiple preprocessors, the translator will generate variants of the text by applying all combinations of the preprocessors.
+ */
export type TextPreprocessor<T = unknown> = {
name: string;
description: string;
@@ -28,6 +33,10 @@ export type TextPreprocessor<T = unknown> = {
process: TextPreprocessorFunction<T>;
};
+/**
+ * Setting values accepted by a two-way conversion preprocessor:
+ * 'off' leaves the text unchanged, 'direct' applies the conversion, and
+ * 'inverse' applies the conversion in the opposite direction.
+ */
+export type BidirectionalPreprocessorOptions = 'off' | 'direct' | 'inverse';
+
+/** A text preprocessor whose setting is one of the `BidirectionalPreprocessorOptions` values. */
+export type BidirectionalConversionPreprocessor = TextPreprocessor<BidirectionalPreprocessorOptions>;
+
export type LanguageAndPreprocessors = {
iso: string;
textPreprocessors: TextPreprocessorWithId<unknown>[];