diff options
-rw-r--r-- | .eslintrc.json | 4 | ||||
-rw-r--r-- | ext/js/language/ar/arabic-text-preprocessors.js | 28 | ||||
-rw-r--r-- | ext/js/language/de/german-text-preprocessors.js | 34 | ||||
-rw-r--r-- | ext/js/language/ja/japanese-text-preprocessors.js | 31 | ||||
-rw-r--r-- | ext/js/language/la/latin-text-preprocessors.js | 56 | ||||
-rw-r--r-- | ext/js/language/language-descriptors.js | 158 | ||||
-rw-r--r-- | ext/js/language/ru/russian-text-preprocessors.js | 38 | ||||
-rw-r--r-- | ext/settings.html | 29 | ||||
-rw-r--r-- | types/ext/language-descriptors.d.ts | 46 | ||||
-rw-r--r-- | types/ext/language.d.ts | 9 |
10 files changed, 376 insertions, 57 deletions
diff --git a/.eslintrc.json b/.eslintrc.json index d3509d85..bc1f2940 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -638,12 +638,16 @@ "ext/js/general/object-property-accessor.js", "ext/js/general/regex-util.js", "ext/js/general/text-source-map.js", + "ext/js/language/ar/arabic-text-preprocessors.js", + "ext/js/language/de/german-text-preprocessors.js", "ext/js/language/ja/japanese-text-preprocessors.js", "ext/js/language/ja/japanese-wanakana.js", "ext/js/language/ja/japanese.js", + "ext/js/language/la/latin-text-preprocessors.js", "ext/js/language/language-descriptors.js", "ext/js/language/language-transformer.js", "ext/js/language/languages.js", + "ext/js/language/ru/russian-text-preprocessors.js", "ext/js/language/text-preprocessors.js", "ext/js/language/translator.js", "ext/js/media/audio-downloader.js", diff --git a/ext/js/language/ar/arabic-text-preprocessors.js b/ext/js/language/ar/arabic-text-preprocessors.js new file mode 100644 index 00000000..f0118564 --- /dev/null +++ b/ext/js/language/ar/arabic-text-preprocessors.js @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +import {basicTextPreprocessorOptions} from '../text-preprocessors.js'; + +/** @type {import('language').TextPreprocessor<boolean>} */ +export const removeArabicScriptDiacritics = { + name: 'Remove diacritics', + description: 'وَلَدَ ⬅️ ولد', + options: basicTextPreprocessorOptions, + process: (text, setting) => { + return setting ? text.replace(/[\u064E-\u0650]/g, '') : text; + } +}; diff --git a/ext/js/language/de/german-text-preprocessors.js b/ext/js/language/de/german-text-preprocessors.js new file mode 100644 index 00000000..e829bf81 --- /dev/null +++ b/ext/js/language/de/german-text-preprocessors.js @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + + +/** @type {import('language').BidirectionalConversionPreprocessor} */ +export const eszettPreprocessor = { + name: 'Convert "ß" to "ss"', + description: 'ß → ss, ẞ → SS and vice versa', + options: ['off', 'direct', 'inverse'], + process: (str, setting) => { + switch (setting) { + case 'off': + return str; + case 'direct': + return str.replace(/ẞ/g, 'SS').replace(/ß/g, 'ss'); + case 'inverse': + return str.replace(/SS/g, 'ẞ').replace(/ss/g, 'ß'); + } + } +}; diff --git a/ext/js/language/ja/japanese-text-preprocessors.js b/ext/js/language/ja/japanese-text-preprocessors.js index ab4138c3..06f944c1 100644 --- a/ext/js/language/ja/japanese-text-preprocessors.js +++ b/ext/js/language/ja/japanese-text-preprocessors.js @@ -30,7 +30,6 @@ export const convertHalfWidthCharacters = { name: 'Convert half width characters to full width', description: 'ヨミチャン → ヨミチャン', options: basicTextPreprocessorOptions, - /** @type {import('language').TextPreprocessorFunction<boolean>} */ process: (str, setting, sourceMap) => (setting ? convertHalfWidthKanaToFullWidth(str, sourceMap) : str) }; @@ -39,7 +38,6 @@ export const convertNumericCharacters = { name: 'Convert numeric characters to full width', description: '1234 → 1234', options: basicTextPreprocessorOptions, - /** @type {import('language').TextPreprocessorFunction<boolean>} */ process: (str, setting) => (setting ? convertNumericToFullWidth(str) : str) }; @@ -48,26 +46,24 @@ export const convertAlphabeticCharacters = { name: 'Convert alphabetic characters to hiragana', description: 'yomichan → よみちゃん', options: basicTextPreprocessorOptions, - /** @type {import('language').TextPreprocessorFunction<boolean>} */ process: (str, setting, sourceMap) => (setting ? convertAlphabeticToKana(str, sourceMap) : str) }; -/** @type {import('language').TextPreprocessor<boolean>} */ +/** @type {import('language').BidirectionalConversionPreprocessor} */ export const convertHiraganaToKatakana = { name: 'Convert hiragana to katakana', - description: 'よみちゃん → ヨミチャン', - options: basicTextPreprocessorOptions, - /** @type {import('language').TextPreprocessorFunction<boolean>} */ - process: (str, setting) => (setting ? convertHiraganaToKatakanaFunction(str) : str) -}; - -/** @type {import('language').TextPreprocessor<boolean>} */ -export const convertKatakanaToHiragana = { - name: 'Convert katakana to hiragana', - description: 'ヨミチャン → よみちゃん', - options: basicTextPreprocessorOptions, - /** @type {import('language').TextPreprocessorFunction<boolean>} */ - process: (str, setting) => (setting ? convertKatakanaToHiraganaFunction(str) : str) + description: 'よみちゃん → ヨミチャン and vice versa', + options: ['off', 'direct', 'inverse'], + process: (str, setting) => { + switch (setting) { + case 'off': + return str; + case 'direct': + return convertHiraganaToKatakanaFunction(str); + case 'inverse': + return convertKatakanaToHiraganaFunction(str); + } + } }; /** @type {import('language').TextPreprocessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>} */ @@ -75,7 +71,6 @@ export const collapseEmphaticSequences = { name: 'Collapse emphatic character sequences', description: 'すっっごーーい → すっごーい / すごい', options: [[false, false], [true, false], [true, true]], - /** @type {import('language').TextPreprocessorFunction<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>} */ process: (str, setting, sourceMap) => { const [collapseEmphatic, collapseEmphaticFull] = setting; if (collapseEmphatic) { diff --git a/ext/js/language/la/latin-text-preprocessors.js b/ext/js/language/la/latin-text-preprocessors.js new file mode 100644 index 00000000..ea6aae82 --- /dev/null +++ b/ext/js/language/la/latin-text-preprocessors.js @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +import {basicTextPreprocessorOptions} from '../text-preprocessors.js'; + +/** @type {Record<string, string>} */ +const diacriticMap = { + ā: 'a', + ē: 'e', + ī: 'i', + ō: 'o', + ū: 'u', + ȳ: 'y', + Ā: 'A', + Ē: 'E', + Ī: 'I', + Ō: 'O', + Ū: 'U', + Ȳ: 'Y', + á: 'a', + é: 'e', + í: 'i', + ó: 'o', + ú: 'u', + ý: 'y', + Á: 'A', + É: 'E', + Í: 'I', + Ó: 'O', + Ú: 'U', + Ý: 'Y' +}; + +/** @type {import('language').TextPreprocessor<boolean>} */ +export const removeLatinDiacritics = { + name: 'Remove diacritics', + description: 'āēīōūȳ → aeiouy, áéíóúý → aeiouy', + options: basicTextPreprocessorOptions, + process: (str, setting) => { + return setting ? str.replace(/[āēīōūȳáéíóúýĀĒĪŌŪȲÁÉÍÓÚÝ]/g, (match) => diacriticMap[match] || match) : str; + } +}; diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js index ee65a011..beb1417e 100644 --- a/ext/js/language/language-descriptors.js +++ b/ext/js/language/language-descriptors.js @@ -15,18 +15,99 @@ * along with this program. If not, see <https://www.gnu.org/licenses/>. */ -import {collapseEmphaticSequences, convertAlphabeticCharacters, convertHalfWidthCharacters, convertHiraganaToKatakana, convertKatakanaToHiragana, convertNumericCharacters} from './ja/japanese-text-preprocessors.js'; +import {removeArabicScriptDiacritics} from './ar/arabic-text-preprocessors.js'; +import {eszettPreprocessor} from './de/german-text-preprocessors.js'; +import {collapseEmphaticSequences, convertAlphabeticCharacters, convertHalfWidthCharacters, convertHiraganaToKatakana, convertNumericCharacters} from './ja/japanese-text-preprocessors.js'; +import {removeLatinDiacritics} from './la/latin-text-preprocessors.js'; +import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js'; import {capitalizeFirstLetter, decapitalize} from './text-preprocessors.js'; +const capitalizationPreprocessors = { + decapitalize, + capitalizeFirstLetter +}; + /** @type {import('language-descriptors').LanguageDescriptorAny[]} */ const languageDescriptors = [ { + iso: 'ar', + name: 'Arabic', + exampleText: 'قَرَأَ', + textPreprocessors: { + removeArabicScriptDiacritics + } + }, + { + iso: 'de', + name: 'German', + exampleText: 'gelesen', + textPreprocessors: { + ...capitalizationPreprocessors, + eszettPreprocessor + } + }, + { + iso: 'el', + name: 'Greek', + exampleText: 'διαβάζω', + textPreprocessors: capitalizationPreprocessors + }, + { iso: 'en', name: 'English', exampleText: 'read', + textPreprocessors: capitalizationPreprocessors + }, + { + iso: 'es', + name: 'Spanish', + exampleText: 'acabar de', + textPreprocessors: capitalizationPreprocessors + }, + { + iso: 'fa', + name: 'Persian', + exampleText: 'خواندن', textPreprocessors: { - capitalizeFirstLetter, - decapitalize + removeArabicScriptDiacritics + } + }, + { + iso: 'fr', + name: 'French', + exampleText: 'lire', + textPreprocessors: capitalizationPreprocessors + }, + { + iso: 'grc', + name: 'Ancient Greek', + exampleText: 'γράφω', + textPreprocessors: capitalizationPreprocessors + }, + { + iso: 'hu', + name: 'Hungarian', + exampleText: 'olvasni', + textPreprocessors: capitalizationPreprocessors + }, + { + iso: 'id', + name: 'Indonesian', + exampleText: 'membaca', + textPreprocessors: capitalizationPreprocessors + }, + { + iso: 'it', + name: 'Italian', + exampleText: 'leggere', + textPreprocessors: capitalizationPreprocessors + }, + { + iso: 'la', + name: 'Latin', + exampleText: 'legere', + textPreprocessors: { + removeLatinDiacritics } }, { @@ -38,9 +119,78 @@ const languageDescriptors = [ convertNumericCharacters, convertAlphabeticCharacters, convertHiraganaToKatakana, - convertKatakanaToHiragana, collapseEmphaticSequences } + }, + { + iso: 'km', + name: 'Khmer', + exampleText: 'អាន', + textPreprocessors: {} + }, + { + iso: 'pl', + name: 'Polish', + exampleText: 'czytacie', + textPreprocessors: capitalizationPreprocessors + }, + { + iso: 'pt', + name: 'Portuguese', + exampleText: 'ler', + textPreprocessors: capitalizationPreprocessors + }, + { + iso: 'ro', + name: 'Romanian', + exampleText: 'citit', + textPreprocessors: capitalizationPreprocessors + }, + { + iso: 'ru', + name: 'Russian', + exampleText: 'читать', + textPreprocessors: { + ...capitalizationPreprocessors, + yoToE, + removeRussianDiacritics + } + }, + { + iso: 'sh', + name: 'Serbo-Croatian', + exampleText: 'čitaše', + textPreprocessors: capitalizationPreprocessors + }, + { + iso: 'sq', + name: 'Albanian', + exampleText: 'ndihmojme', + textPreprocessors: capitalizationPreprocessors + }, + { + iso: 'sv', + name: 'Swedish', + exampleText: 'läsa', + textPreprocessors: capitalizationPreprocessors + }, + { + iso: 'th', + name: 'Thai', + exampleText: 'อ่าน', + textPreprocessors: {} + }, + { + iso: 'vi', + name: 'Vietnamese', + exampleText: 'đọc', + textPreprocessors: capitalizationPreprocessors + }, + { + iso: 'zh', + name: 'Chinese', + exampleText: '读', + textPreprocessors: {} } ]; diff --git a/ext/js/language/ru/russian-text-preprocessors.js b/ext/js/language/ru/russian-text-preprocessors.js new file mode 100644 index 00000000..fc4472e9 --- /dev/null +++ b/ext/js/language/ru/russian-text-preprocessors.js @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +import {basicTextPreprocessorOptions} from '../text-preprocessors.js'; + +/** @type {import('language').TextPreprocessor<boolean>} */ +export const removeRussianDiacritics = { + name: 'Remove diacritics', + description: 'A\u0301 → A, a\u0301 → a', + options: basicTextPreprocessorOptions, + process: (str, setting) => { + return setting ? str.replace(/\u0301/g, '') : str; + } +}; + +/** @type {import('language').TextPreprocessor<boolean>} */ +export const yoToE = { + name: 'Yo to E', + description: 'ё → е, Ё → Е', + options: basicTextPreprocessorOptions, + process: (str, setting) => { + return setting ? str.replace(/ё/g, 'е').replace(/Ё/g, 'Е') : str; + } +}; diff --git a/ext/settings.html b/ext/settings.html index 999ecc37..441e26df 100644 --- a/ext/settings.html +++ b/ext/settings.html @@ -1493,35 +1493,6 @@ <div class="heading-container"> <div class="heading-container-icon"><span class="icon" data-icon="translation"></span></div> <div class="heading-container-left"><h2 id="translation"><a href="#!translation">Translation</a></h2></div> - <div class="heading-container-right"><a tabindex="0" class="more-toggle more-only heading-link-light" data-parent-distance="3">Info…</a></div> - </div> - <div class="heading-description more" hidden> - <p> - The following options are used during the translation process to create alternate versions of the input text to search for. - This can be helpful when the input text doesn't exactly match the term or expression found in the database. - </p> - <p> - The conversion options below are listed in the order that the conversions are applied to the input text. - Most of the conversions have three possible values: - </p> - <ul> - <li> - <strong>Disabled</strong> - - This conversion will never be applied to the input text. - </li> - <li> - <strong>Enabled</strong> - - This conversion will always be applied to the input text. - </li> - <li> - <strong>Use both variants</strong> - - The translator will check the database for two variations: the raw input text and the converted input text. - When multiple options use variants, the translator will search for combinations of the converted text. - </li> - </ul> - <p> - <a tabindex="0" class="more-toggle" data-parent-distance="3">Less…</a> - </p> </div> </div> <div class="settings-group advanced-only"> diff --git a/types/ext/language-descriptors.d.ts b/types/ext/language-descriptors.d.ts index 00a95883..319a3ca5 100644 --- a/types/ext/language-descriptors.d.ts +++ b/types/ext/language-descriptors.d.ts @@ -15,7 +15,7 @@ * along with this program. If not, see <https://www.gnu.org/licenses/>. */ -import type {TextPreprocessor} from './language'; +import type {TextPreprocessor, BidirectionalConversionPreprocessor} from './language'; import type {SafeAny} from './core'; type LanguageDescriptor<TIso extends string, TTextPreprocessorDescriptor extends TextPreprocessorDescriptor> = { @@ -35,21 +35,55 @@ type LanguageDescriptorObjectMap = { export type LanguageDescriptorAny = LanguageDescriptorObjectMap[keyof LanguageDescriptorObjectMap]; +type CapitalizationPreprocessors = { + capitalizeFirstLetter: TextPreprocessor<boolean>; + decapitalize: TextPreprocessor<boolean>; +}; + /** * This is a mapping of the iso tag to all of the preprocessors for that language. * Any new language should be added to this object. */ type AllTextPreprocessors = { - en: { - capitalizeFirstLetter: TextPreprocessor<boolean>; - decapitalize: TextPreprocessor<boolean>; + ar: { + removeArabicScriptDiacritics: TextPreprocessor<boolean>; + }; + de: CapitalizationPreprocessors & { + eszettPreprocessor: BidirectionalConversionPreprocessor; + }; + el: CapitalizationPreprocessors; + en: CapitalizationPreprocessors; + es: CapitalizationPreprocessors; + fa: { + removeArabicScriptDiacritics: TextPreprocessor<boolean>; + }; + fr: CapitalizationPreprocessors; + grc: CapitalizationPreprocessors; + hu: CapitalizationPreprocessors; + id: CapitalizationPreprocessors; + it: CapitalizationPreprocessors; + la: { + removeLatinDiacritics: TextPreprocessor<boolean>; }; ja: { convertHalfWidthCharacters: TextPreprocessor<boolean>; convertNumericCharacters: TextPreprocessor<boolean>; convertAlphabeticCharacters: TextPreprocessor<boolean>; - convertHiraganaToKatakana: TextPreprocessor<boolean>; - convertKatakanaToHiragana: TextPreprocessor<boolean>; + convertHiraganaToKatakana: BidirectionalConversionPreprocessor; collapseEmphaticSequences: TextPreprocessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>; }; + km: Record<string, never>; + pl: CapitalizationPreprocessors; + pt: CapitalizationPreprocessors; + ro: CapitalizationPreprocessors; + ru: CapitalizationPreprocessors & { + yoToE: TextPreprocessor<boolean>; + removeRussianDiacritics: TextPreprocessor<boolean>; + }; + sh: CapitalizationPreprocessors; + sq: CapitalizationPreprocessors; + sv: CapitalizationPreprocessors; + th: Record<string, never>; + vi: CapitalizationPreprocessors; + zh: Record<string, never>; }; diff --git a/types/ext/language.d.ts b/types/ext/language.d.ts index efbb16c6..8e5a5c70 100644 --- a/types/ext/language.d.ts +++ b/types/ext/language.d.ts @@ -21,6 +21,11 @@ export type TextPreprocessorOptions<T = unknown> = T[]; export type TextPreprocessorFunction<T = unknown> = (str: string, setting: T, sourceMap: TextSourceMap) => string; +/** + * Text preprocessors are used during the translation process to create alternate versions of the input text to search for. + * This is helpful when the input text doesn't exactly match the term or expression found in the database. + * When a language has multiple preprocessors, the translator will generate variants of the text by applying all combinations of the preprocessors. + */ export type TextPreprocessor<T = unknown> = { name: string; description: string; @@ -28,6 +33,10 @@ export type TextPreprocessor<T = unknown> = { process: TextPreprocessorFunction<T>; }; +export type BidirectionalPreprocessorOptions = 'off' | 'direct' | 'inverse'; + +export type BidirectionalConversionPreprocessor = TextPreprocessor<BidirectionalPreprocessorOptions>; + export type LanguageAndPreprocessors = { iso: string; textPreprocessors: TextPreprocessorWithId<unknown>[]; |