diff options
| -rw-r--r-- | .eslintrc.json | 4 | ||||
| -rw-r--r-- | ext/js/language/ar/arabic-text-preprocessors.js | 28 | ||||
| -rw-r--r-- | ext/js/language/de/german-text-preprocessors.js | 34 | ||||
| -rw-r--r-- | ext/js/language/ja/japanese-text-preprocessors.js | 31 | ||||
| -rw-r--r-- | ext/js/language/la/latin-text-preprocessors.js | 56 | ||||
| -rw-r--r-- | ext/js/language/language-descriptors.js | 158 | ||||
| -rw-r--r-- | ext/js/language/ru/russian-text-preprocessors.js | 38 | ||||
| -rw-r--r-- | ext/settings.html | 29 | ||||
| -rw-r--r-- | types/ext/language-descriptors.d.ts | 46 | ||||
| -rw-r--r-- | types/ext/language.d.ts | 9 | 
10 files changed, 376 insertions, 57 deletions
| diff --git a/.eslintrc.json b/.eslintrc.json index d3509d85..bc1f2940 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -638,12 +638,16 @@                  "ext/js/general/object-property-accessor.js",                  "ext/js/general/regex-util.js",                  "ext/js/general/text-source-map.js", +                "ext/js/language/ar/arabic-text-preprocessors.js", +                "ext/js/language/de/german-text-preprocessors.js",                  "ext/js/language/ja/japanese-text-preprocessors.js",                  "ext/js/language/ja/japanese-wanakana.js",                  "ext/js/language/ja/japanese.js", +                "ext/js/language/la/latin-text-preprocessors.js",                  "ext/js/language/language-descriptors.js",                  "ext/js/language/language-transformer.js",                  "ext/js/language/languages.js", +                "ext/js/language/ru/russian-text-preprocessors.js",                  "ext/js/language/text-preprocessors.js",                  "ext/js/language/translator.js",                  "ext/js/media/audio-downloader.js", diff --git a/ext/js/language/ar/arabic-text-preprocessors.js b/ext/js/language/ar/arabic-text-preprocessors.js new file mode 100644 index 00000000..f0118564 --- /dev/null +++ b/ext/js/language/ar/arabic-text-preprocessors.js @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2024  Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <https://www.gnu.org/licenses/>. + */ + +import {basicTextPreprocessorOptions} from '../text-preprocessors.js'; + +/** @type {import('language').TextPreprocessor<boolean>} */ +export const removeArabicScriptDiacritics = { +    name: 'Remove diacritics', +    description: 'وَلَدَ ⬅️ ولد', +    options: basicTextPreprocessorOptions, +    process: (text, setting) => { +        return setting ? text.replace(/[\u064E-\u0650]/g, '') : text; +    } +}; diff --git a/ext/js/language/de/german-text-preprocessors.js b/ext/js/language/de/german-text-preprocessors.js new file mode 100644 index 00000000..e829bf81 --- /dev/null +++ b/ext/js/language/de/german-text-preprocessors.js @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2024  Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <https://www.gnu.org/licenses/>. + */ + + +/** @type {import('language').BidirectionalConversionPreprocessor} */ +export const eszettPreprocessor = { +    name: 'Convert "ß" to "ss"', +    description: 'ß → ss, ẞ → SS and vice versa', +    options: ['off', 'direct', 'inverse'], +    process: (str, setting) => { +        switch (setting) { +            case 'off': +                return str; +            case 'direct': +                return str.replace(/ẞ/g, 'SS').replace(/ß/g, 'ss'); +            case 'inverse': +                return str.replace(/SS/g, 'ẞ').replace(/ss/g, 'ß'); +        } +    } +}; diff --git a/ext/js/language/ja/japanese-text-preprocessors.js b/ext/js/language/ja/japanese-text-preprocessors.js index ab4138c3..06f944c1 100644 --- a/ext/js/language/ja/japanese-text-preprocessors.js +++ b/ext/js/language/ja/japanese-text-preprocessors.js @@ -30,7 +30,6 @@ export const convertHalfWidthCharacters = {      name: 'Convert half width characters to full width',      description: 'ヨミチャン → ヨミチャン',      options: basicTextPreprocessorOptions, -    /** @type {import('language').TextPreprocessorFunction<boolean>} */      process: (str, setting, sourceMap) => (setting ? convertHalfWidthKanaToFullWidth(str, sourceMap) : str)  }; @@ -39,7 +38,6 @@ export const convertNumericCharacters = {      name: 'Convert numeric characters to full width',      description: '1234 → 1234',      options: basicTextPreprocessorOptions, -    /** @type {import('language').TextPreprocessorFunction<boolean>} */      process: (str, setting) => (setting ? convertNumericToFullWidth(str) : str)  }; @@ -48,26 +46,24 @@ export const convertAlphabeticCharacters = {      name: 'Convert alphabetic characters to hiragana',      description: 'yomichan → よみちゃん',      options: basicTextPreprocessorOptions, -    /** @type {import('language').TextPreprocessorFunction<boolean>} */      process: (str, setting, sourceMap) => (setting ? convertAlphabeticToKana(str, sourceMap) : str)  }; -/** @type {import('language').TextPreprocessor<boolean>} */ +/** @type {import('language').BidirectionalConversionPreprocessor} */  export const convertHiraganaToKatakana = {      name: 'Convert hiragana to katakana', -    description: 'よみちゃん → ヨミチャン', -    options: basicTextPreprocessorOptions, -    /** @type {import('language').TextPreprocessorFunction<boolean>} */ -    process: (str, setting) => (setting ? convertHiraganaToKatakanaFunction(str) : str) -}; - -/** @type {import('language').TextPreprocessor<boolean>} */ -export const convertKatakanaToHiragana = { -    name: 'Convert katakana to hiragana', -    description: 'ヨミチャン → よみちゃん', -    options: basicTextPreprocessorOptions, -    /** @type {import('language').TextPreprocessorFunction<boolean>} */ -    process: (str, setting) => (setting ? convertKatakanaToHiraganaFunction(str) : str) +    description: 'よみちゃん → ヨミチャン and vice versa', +    options: ['off', 'direct', 'inverse'], +    process: (str, setting) => { +        switch (setting) { +            case 'off': +                return str; +            case 'direct': +                return convertHiraganaToKatakanaFunction(str); +            case 'inverse': +                return convertKatakanaToHiraganaFunction(str); +        } +    }  };  /** @type {import('language').TextPreprocessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>} */ @@ -75,7 +71,6 @@ export const collapseEmphaticSequences = {      name: 'Collapse emphatic character sequences',      description: 'すっっごーーい → すっごーい / すごい',      options: [[false, false], [true, false], [true, true]], -    /** @type {import('language').TextPreprocessorFunction<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>} */      process: (str, setting, sourceMap) => {          const [collapseEmphatic, collapseEmphaticFull] = setting;          if (collapseEmphatic) { diff --git a/ext/js/language/la/latin-text-preprocessors.js b/ext/js/language/la/latin-text-preprocessors.js new file mode 100644 index 00000000..ea6aae82 --- /dev/null +++ b/ext/js/language/la/latin-text-preprocessors.js @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2024  Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <https://www.gnu.org/licenses/>. + */ + +import {basicTextPreprocessorOptions} from '../text-preprocessors.js'; + +/** @type {Record<string, string>} */ +const diacriticMap = { +    ā: 'a', +    ē: 'e', +    ī: 'i', +    ō: 'o', +    ū: 'u', +    ȳ: 'y', +    Ā: 'A', +    Ē: 'E', +    Ī: 'I', +    Ō: 'O', +    Ū: 'U', +    Ȳ: 'Y', +    á: 'a', +    é: 'e', +    í: 'i', +    ó: 'o', +    ú: 'u', +    ý: 'y', +    Á: 'A', +    É: 'E', +    Í: 'I', +    Ó: 'O', +    Ú: 'U', +    Ý: 'Y' +}; + +/** @type {import('language').TextPreprocessor<boolean>} */ +export const removeLatinDiacritics = { +    name: 'Remove diacritics', +    description: 'āēīōūȳ → aeiouy, áéíóúý → aeiouy', +    options: basicTextPreprocessorOptions, +    process: (str, setting) => { +        return setting ? str.replace(/[āēīōūȳáéíóúýĀĒĪŌŪȲÁÉÍÓÚÝ]/g, (match) => diacriticMap[match] || match) : str; +    } +}; diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js index ee65a011..beb1417e 100644 --- a/ext/js/language/language-descriptors.js +++ b/ext/js/language/language-descriptors.js @@ -15,18 +15,99 @@   * along with this program.  If not, see <https://www.gnu.org/licenses/>.   */ -import {collapseEmphaticSequences, convertAlphabeticCharacters, convertHalfWidthCharacters, convertHiraganaToKatakana, convertKatakanaToHiragana, convertNumericCharacters} from './ja/japanese-text-preprocessors.js'; +import {removeArabicScriptDiacritics} from './ar/arabic-text-preprocessors.js'; +import {eszettPreprocessor} from './de/german-text-preprocessors.js'; +import {collapseEmphaticSequences, convertAlphabeticCharacters, convertHalfWidthCharacters, convertHiraganaToKatakana, convertNumericCharacters} from './ja/japanese-text-preprocessors.js'; +import {removeLatinDiacritics} from './la/latin-text-preprocessors.js'; +import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js';  import {capitalizeFirstLetter, decapitalize} from './text-preprocessors.js'; +const capitalizationPreprocessors = { +    decapitalize, +    capitalizeFirstLetter +}; +  /** @type {import('language-descriptors').LanguageDescriptorAny[]} */  const languageDescriptors = [      { +        iso: 'ar', +        name: 'Arabic', +        exampleText: 'قَرَأَ', +        textPreprocessors: { +            removeArabicScriptDiacritics +        } +    }, +    { +        iso: 'de', +        name: 'German', +        exampleText: 'gelesen', +        textPreprocessors: { +            ...capitalizationPreprocessors, +            eszettPreprocessor +        } +    }, +    { +        iso: 'el', +        name: 'Greek', +        exampleText: 'διαβάζω', +        textPreprocessors: capitalizationPreprocessors +    }, +    {          iso: 'en',          name: 'English',          exampleText: 'read', +        textPreprocessors: capitalizationPreprocessors +    }, +    { +        iso: 'es', +        name: 'Spanish', +        exampleText: 'acabar de', +        textPreprocessors: capitalizationPreprocessors +    }, +    { +        iso: 'fa', +        name: 'Persian', +        exampleText: 'خواندن',          textPreprocessors: { -            capitalizeFirstLetter, -            decapitalize +            removeArabicScriptDiacritics +        } +    }, +    { +        iso: 'fr', +        name: 'French', +        exampleText: 'lire', +        textPreprocessors: capitalizationPreprocessors +    }, +    { +        iso: 'grc', +        name: 'Ancient Greek', +        exampleText: 'γράφω', +        textPreprocessors: capitalizationPreprocessors +    }, +    { +        iso: 'hu', +        name: 'Hungarian', +        exampleText: 'olvasni', +        textPreprocessors: capitalizationPreprocessors +    }, +    { +        iso: 'id', +        name: 'Indonesian', +        exampleText: 'membaca', +        textPreprocessors: capitalizationPreprocessors +    }, +    { +        iso: 'it', +        name: 'Italian', +        exampleText: 'leggere', +        textPreprocessors: capitalizationPreprocessors +    }, +    { +        iso: 'la', +        name: 'Latin', +        exampleText: 'legere', +        textPreprocessors: { +            removeLatinDiacritics          }      },      { @@ -38,9 +119,78 @@ const languageDescriptors = [              convertNumericCharacters,              convertAlphabeticCharacters,              convertHiraganaToKatakana, -            convertKatakanaToHiragana,              collapseEmphaticSequences          } +    }, +    { +        iso: 'km', +        name: 'Khmer', +        exampleText: 'អាន', +        textPreprocessors: {} +    }, +    { +        iso: 'pl', +        name: 'Polish', +        exampleText: 'czytacie', +        textPreprocessors: capitalizationPreprocessors +    }, +    { +        iso: 'pt', +        name: 'Portuguese', +        exampleText: 'ler', +        textPreprocessors: capitalizationPreprocessors +    }, +    { +        iso: 'ro', +        name: 'Romanian', +        exampleText: 'citit', +        textPreprocessors: capitalizationPreprocessors +    }, +    { +        iso: 'ru', +        name: 'Russian', +        exampleText: 'читать', +        textPreprocessors: { +            ...capitalizationPreprocessors, +            yoToE, +            removeRussianDiacritics +        } +    }, +    { +        iso: 'sh', +        name: 'Serbo-Croatian', +        exampleText: 'čitaše', +        textPreprocessors: capitalizationPreprocessors +    }, +    { +        iso: 'sq', +        name: 'Albanian', +        exampleText: 'ndihmojme', +        textPreprocessors: capitalizationPreprocessors +    }, +    { +        iso: 'sv', +        name: 'Swedish', +        exampleText: 'läsa', +        textPreprocessors: capitalizationPreprocessors +    }, +    { +        iso: 'th', +        name: 'Thai', +        exampleText: 'อ่าน', +        textPreprocessors: {} +    }, +    { +        iso: 'vi', +        name: 'Vietnamese', +        exampleText: 'đọc', +        textPreprocessors: capitalizationPreprocessors +    }, +    { +        iso: 'zh', +        name: 'Chinese', +        exampleText: '读', +        textPreprocessors: {}      }  ]; diff --git a/ext/js/language/ru/russian-text-preprocessors.js b/ext/js/language/ru/russian-text-preprocessors.js new file mode 100644 index 00000000..fc4472e9 --- /dev/null +++ b/ext/js/language/ru/russian-text-preprocessors.js @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2024  Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <https://www.gnu.org/licenses/>. + */ + +import {basicTextPreprocessorOptions} from '../text-preprocessors.js'; + +/** @type {import('language').TextPreprocessor<boolean>} */ +export const removeRussianDiacritics = { +    name: 'Remove diacritics', +    description: 'A\u0301 → A, a\u0301 → a', +    options: basicTextPreprocessorOptions, +    process: (str, setting) => { +        return setting ? str.replace(/\u0301/g, '') : str; +    } +}; + +/** @type {import('language').TextPreprocessor<boolean>} */ +export const yoToE = { +    name: 'Yo to E', +    description: 'ё → е, Ё → Е', +    options: basicTextPreprocessorOptions, +    process: (str, setting) => { +        return setting ? str.replace(/ё/g, 'е').replace(/Ё/g, 'Е') : str; +    } +}; diff --git a/ext/settings.html b/ext/settings.html index 999ecc37..441e26df 100644 --- a/ext/settings.html +++ b/ext/settings.html @@ -1493,35 +1493,6 @@          <div class="heading-container">              <div class="heading-container-icon"><span class="icon" data-icon="translation"></span></div>              <div class="heading-container-left"><h2 id="translation"><a href="#!translation">Translation</a></h2></div> -            <div class="heading-container-right"><a tabindex="0" class="more-toggle more-only heading-link-light" data-parent-distance="3">Info…</a></div> -        </div> -        <div class="heading-description more" hidden> -            <p> -                The following options are used during the translation process to create alternate versions of the input text to search for. -                This can be helpful when the input text doesn't exactly match the term or expression found in the database. -            </p> -            <p> -                The conversion options below are listed in the order that the conversions are applied to the input text. -                Most of the conversions have three possible values: -            </p> -            <ul> -                <li> -                    <strong>Disabled</strong> - -                    This conversion will never be applied to the input text. -                </li> -                <li> -                    <strong>Enabled</strong> - -                    This conversion will always be applied to the input text. -                </li> -                <li> -                    <strong>Use both variants</strong> - -                    The translator will check the database for two variations: the raw input text and the converted input text. -                    When multiple options use variants, the translator will search for combinations of the converted text. -                </li> -            </ul> -            <p> -                <a tabindex="0" class="more-toggle" data-parent-distance="3">Less…</a> -            </p>          </div>      </div>      <div class="settings-group advanced-only"> diff --git a/types/ext/language-descriptors.d.ts b/types/ext/language-descriptors.d.ts index 00a95883..319a3ca5 100644 --- a/types/ext/language-descriptors.d.ts +++ b/types/ext/language-descriptors.d.ts @@ -15,7 +15,7 @@   * along with this program.  If not, see <https://www.gnu.org/licenses/>.   */ -import type {TextPreprocessor} from './language'; +import type {TextPreprocessor, BidirectionalConversionPreprocessor} from './language';  import type {SafeAny} from './core';  type LanguageDescriptor<TIso extends string, TTextPreprocessorDescriptor extends TextPreprocessorDescriptor> = { @@ -35,21 +35,55 @@ type LanguageDescriptorObjectMap = {  export type LanguageDescriptorAny = LanguageDescriptorObjectMap[keyof LanguageDescriptorObjectMap]; +type CapitalizationPreprocessors = { +    capitalizeFirstLetter: TextPreprocessor<boolean>; +    decapitalize: TextPreprocessor<boolean>; +}; +  /**   * This is a mapping of the iso tag to all of the preprocessors for that language.   * Any new language should be added to this object.   */  type AllTextPreprocessors = { -    en: { -        capitalizeFirstLetter: TextPreprocessor<boolean>; -        decapitalize: TextPreprocessor<boolean>; +    ar: { +        removeArabicScriptDiacritics: TextPreprocessor<boolean>; +    }; +    de: CapitalizationPreprocessors & { +        eszettPreprocessor: BidirectionalConversionPreprocessor; +    }; +    el: CapitalizationPreprocessors; +    en: CapitalizationPreprocessors; +    es: CapitalizationPreprocessors; +    fa: { +        removeArabicScriptDiacritics: TextPreprocessor<boolean>; +    }; +    fr: CapitalizationPreprocessors; +    grc: CapitalizationPreprocessors; +    hu: CapitalizationPreprocessors; +    id: CapitalizationPreprocessors; +    it: CapitalizationPreprocessors; +    la: { +        removeLatinDiacritics: TextPreprocessor<boolean>;      };      ja: {          convertHalfWidthCharacters: TextPreprocessor<boolean>;          convertNumericCharacters: TextPreprocessor<boolean>;          convertAlphabeticCharacters: TextPreprocessor<boolean>; -        convertHiraganaToKatakana: TextPreprocessor<boolean>; -        convertKatakanaToHiragana: TextPreprocessor<boolean>; +        convertHiraganaToKatakana: BidirectionalConversionPreprocessor;          collapseEmphaticSequences: TextPreprocessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>;      }; +    km: Record<string, never>; +    pl: CapitalizationPreprocessors; +    pt: CapitalizationPreprocessors; +    ro: CapitalizationPreprocessors; +    ru: CapitalizationPreprocessors & { +        yoToE: TextPreprocessor<boolean>; +        removeRussianDiacritics: TextPreprocessor<boolean>; +    }; +    sh: CapitalizationPreprocessors; +    sq: CapitalizationPreprocessors; +    sv: CapitalizationPreprocessors; +    th: Record<string, never>; +    vi: CapitalizationPreprocessors; +    zh: Record<string, never>;  }; diff --git a/types/ext/language.d.ts b/types/ext/language.d.ts index efbb16c6..8e5a5c70 100644 --- a/types/ext/language.d.ts +++ b/types/ext/language.d.ts @@ -21,6 +21,11 @@ export type TextPreprocessorOptions<T = unknown> = T[];  export type TextPreprocessorFunction<T = unknown> = (str: string, setting: T, sourceMap: TextSourceMap) => string; +/** + * Text preprocessors are used during the translation process to create alternate versions of the input text to search for. + * This is helpful when the input text doesn't exactly match the term or expression found in the database. + * When a language has multiple preprocessors, the translator will generate variants of the text by applying all combinations of the preprocessors. + */  export type TextPreprocessor<T = unknown> = {      name: string;      description: string; @@ -28,6 +33,10 @@ export type TextPreprocessor<T = unknown> = {      process: TextPreprocessorFunction<T>;  }; +export type BidirectionalPreprocessorOptions = 'off' | 'direct' | 'inverse'; + +export type BidirectionalConversionPreprocessor = TextPreprocessor<BidirectionalPreprocessorOptions>; +  export type LanguageAndPreprocessors = {      iso: string;      textPreprocessors: TextPreprocessorWithId<unknown>[]; |