diff options
| -rw-r--r-- | .eslintrc.json | 1 | ||||
| -rw-r--r-- | ext/js/language/la/latin-text-preprocessors.js | 56 | ||||
| -rw-r--r-- | ext/js/language/language-descriptors.js | 10 | ||||
| -rwxr-xr-x | ext/js/language/text-preprocessors.js | 14 | ||||
| -rw-r--r-- | types/ext/language-descriptors.d.ts | 6 | 
5 files changed, 24 insertions, 63 deletions
| diff --git a/.eslintrc.json b/.eslintrc.json index faac16a8..e44a326e 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -647,7 +647,6 @@                  "ext/js/language/ja/japanese-transforms.js",                  "ext/js/language/ja/japanese-wanakana.js",                  "ext/js/language/ja/japanese.js", -                "ext/js/language/la/latin-text-preprocessors.js",                  "ext/js/language/language-descriptors.js",                  "ext/js/language/language-transformer.js",                  "ext/js/language/language-transforms.js", diff --git a/ext/js/language/la/latin-text-preprocessors.js b/ext/js/language/la/latin-text-preprocessors.js deleted file mode 100644 index ea6aae82..00000000 --- a/ext/js/language/la/latin-text-preprocessors.js +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (C) 2024  Yomitan Authors - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program.  If not, see <https://www.gnu.org/licenses/>. - */ - -import {basicTextPreprocessorOptions} from '../text-preprocessors.js'; - -/** @type {Record<string, string>} */ -const diacriticMap = { -    ā: 'a', -    ē: 'e', -    ī: 'i', -    ō: 'o', -    ū: 'u', -    ȳ: 'y', -    Ā: 'A', -    Ē: 'E', -    Ī: 'I', -    Ō: 'O', -    Ū: 'U', -    Ȳ: 'Y', -    á: 'a', -    é: 'e', -    í: 'i', -    ó: 'o', -    ú: 'u', -    ý: 'y', -    Á: 'A', -    É: 'E', -    Í: 'I', -    Ó: 'O', -    Ú: 'U', -    Ý: 'Y' -}; - -/** @type {import('language').TextPreprocessor<boolean>} */ -export const removeLatinDiacritics = { -    name: 'Remove diacritics', -    description: 'āēīōūȳ → aeiouy, áéíóúý → aeiouy', -    options: basicTextPreprocessorOptions, -    process: (str, setting) => { -        return setting ? str.replace(/[āēīōūȳáéíóúýĀĒĪŌŪȲÁÉÍÓÚÝ]/g, (match) => diacriticMap[match] || match) : str; -    } -}; diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js index b4af2f8a..b5d7573b 100644 --- a/ext/js/language/language-descriptors.js +++ b/ext/js/language/language-descriptors.js @@ -21,10 +21,9 @@ import {englishTransforms} from './en/english-transforms.js';  import {collapseEmphaticSequences, convertAlphabeticCharacters, convertHalfWidthCharacters, convertHiraganaToKatakana, convertNumericCharacters} from './ja/japanese-text-preprocessors.js';  import {japaneseTransforms} from './ja/japanese-transforms.js';  import {isStringPartiallyJapanese} from './ja/japanese.js'; -import {removeLatinDiacritics} from './la/latin-text-preprocessors.js';  import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js';  import {albanianTransforms} from './sq/albanian-transforms.js'; -import {capitalizeFirstLetter, decapitalize} from './text-preprocessors.js'; +import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-preprocessors.js';  const capitalizationPreprocessors = {      decapitalize, @@ -87,7 +86,10 @@ const languageDescriptors = [          iso: 'grc',          name: 'Ancient Greek',          exampleText: 'γράφω', -        textPreprocessors: capitalizationPreprocessors +        textPreprocessors: { +            ...capitalizationPreprocessors, +            removeAlphabeticDiacritics +        }      },      {          iso: 'hu', @@ -113,7 +115,7 @@ const languageDescriptors = [          exampleText: 'legere',          textPreprocessors: {              ...capitalizationPreprocessors, -            removeLatinDiacritics +            removeAlphabeticDiacritics          }      },      { diff --git a/ext/js/language/text-preprocessors.js b/ext/js/language/text-preprocessors.js index 12b3d1b6..e33fccda 100755 --- a/ext/js/language/text-preprocessors.js +++ b/ext/js/language/text-preprocessors.js @@ -33,3 +33,17 @@ export const capitalizeFirstLetter = {      options: basicTextPreprocessorOptions,      process: (str, setting) => (setting ? str.charAt(0).toUpperCase() + str.slice(1) : str)  }; + +/** + * WARNING: This should NOT be used with languages that use Han characters, + *          as it can result in undesirable normalization: + *            - '\u9038'.normalize('NFD') => '\u9038' (逸) + *            - '\ufa67'.normalize('NFD') => '\u9038' (逸 => 逸) + * @type {import('language').TextPreprocessor<boolean>} + */ +export const removeAlphabeticDiacritics = { +    name: 'Remove Alphabetic Diacritics', +    description: 'ἄήé -> αηe', +    options: basicTextPreprocessorOptions, +    process: (str, setting) => (setting ? str.normalize('NFD').replace(/[\u0300-\u036f]/g, '') : str) +}; diff --git a/types/ext/language-descriptors.d.ts b/types/ext/language-descriptors.d.ts index dae586a2..41a1eec8 100644 --- a/types/ext/language-descriptors.d.ts +++ b/types/ext/language-descriptors.d.ts @@ -69,12 +69,14 @@ type AllTextPreprocessors = {          removeArabicScriptDiacritics: TextPreprocessor<boolean>;      };      fr: CapitalizationPreprocessors; -    grc: CapitalizationPreprocessors; +    grc: CapitalizationPreprocessors & { +        removeAlphabeticDiacritics: TextPreprocessor<boolean>; +    };      hu: CapitalizationPreprocessors;      id: CapitalizationPreprocessors;      it: CapitalizationPreprocessors;      la: CapitalizationPreprocessors & { -        removeLatinDiacritics: TextPreprocessor<boolean>; +        removeAlphabeticDiacritics: TextPreprocessor<boolean>;      };      ja: {          convertHalfWidthCharacters: TextPreprocessor<boolean>; |