From 0663774b02faeb108d4b18d8f8a7e6e93e277313 Mon Sep 17 00:00:00 2001 From: Matttttt <18152455+martholomew@users.noreply.github.com> Date: Mon, 8 Apr 2024 19:54:04 +0100 Subject: Simplify diacritic removal; modify Latin & Greek preprocessors (#724) * Simplified diacritic removal and added preprocessors to LA and GRC * linted * Clarified the name of removeAlphabeticDiacritics * Add comment to removeAlphabeticDiacritics Signed-off-by: Darius Jahandarie * Change to NFD Signed-off-by: Matttttt <18152455+martholomew@users.noreply.github.com> * Remove trailing spaces in comment Signed-off-by: Darius Jahandarie * Remove latin preprocessors .eslintrc.json Signed-off-by: Matttttt <18152455+martholomew@users.noreply.github.com> * fix tests --------- Signed-off-by: Darius Jahandarie Signed-off-by: Matttttt <18152455+martholomew@users.noreply.github.com> Co-authored-by: martholomew Co-authored-by: Darius Jahandarie Co-authored-by: Stefan Vukovic --- .eslintrc.json | 1 - ext/js/language/la/latin-text-preprocessors.js | 56 -------------------------- ext/js/language/language-descriptors.js | 10 +++-- ext/js/language/text-preprocessors.js | 14 +++++++ types/ext/language-descriptors.d.ts | 6 ++- 5 files changed, 24 insertions(+), 63 deletions(-) delete mode 100644 ext/js/language/la/latin-text-preprocessors.js diff --git a/.eslintrc.json b/.eslintrc.json index faac16a8..e44a326e 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -647,7 +647,6 @@ "ext/js/language/ja/japanese-transforms.js", "ext/js/language/ja/japanese-wanakana.js", "ext/js/language/ja/japanese.js", - "ext/js/language/la/latin-text-preprocessors.js", "ext/js/language/language-descriptors.js", "ext/js/language/language-transformer.js", "ext/js/language/language-transforms.js", diff --git a/ext/js/language/la/latin-text-preprocessors.js b/ext/js/language/la/latin-text-preprocessors.js deleted file mode 100644 index ea6aae82..00000000 --- a/ext/js/language/la/latin-text-preprocessors.js +++ /dev/null @@ -1,56 +0,0 @@ 
-/* - * Copyright (C) 2024 Yomitan Authors - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <https://www.gnu.org/licenses/>. - */ - -import {basicTextPreprocessorOptions} from '../text-preprocessors.js'; - -/** @type {Record<string, string>} */ -const diacriticMap = { - ā: 'a', - ē: 'e', - ī: 'i', - ō: 'o', - ū: 'u', - ȳ: 'y', - Ā: 'A', - Ē: 'E', - Ī: 'I', - Ō: 'O', - Ū: 'U', - Ȳ: 'Y', - á: 'a', - é: 'e', - í: 'i', - ó: 'o', - ú: 'u', - ý: 'y', - Á: 'A', - É: 'E', - Í: 'I', - Ó: 'O', - Ú: 'U', - Ý: 'Y' -}; - -/** @type {import('language').TextPreprocessor} */ -export const removeLatinDiacritics = { - name: 'Remove diacritics', - description: 'āēīōūȳ → aeiouy, áéíóúý → aeiouy', - options: basicTextPreprocessorOptions, - process: (str, setting) => { - return setting ? 
str.replace(/[āēīōūȳáéíóúýĀĒĪŌŪȲÁÉÍÓÚÝ]/g, (match) => diacriticMap[match] || match) : str; - } -}; diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js index b4af2f8a..b5d7573b 100644 --- a/ext/js/language/language-descriptors.js +++ b/ext/js/language/language-descriptors.js @@ -21,10 +21,9 @@ import {englishTransforms} from './en/english-transforms.js'; import {collapseEmphaticSequences, convertAlphabeticCharacters, convertHalfWidthCharacters, convertHiraganaToKatakana, convertNumericCharacters} from './ja/japanese-text-preprocessors.js'; import {japaneseTransforms} from './ja/japanese-transforms.js'; import {isStringPartiallyJapanese} from './ja/japanese.js'; -import {removeLatinDiacritics} from './la/latin-text-preprocessors.js'; import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js'; import {albanianTransforms} from './sq/albanian-transforms.js'; -import {capitalizeFirstLetter, decapitalize} from './text-preprocessors.js'; +import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-preprocessors.js'; const capitalizationPreprocessors = { decapitalize, @@ -87,7 +86,10 @@ const languageDescriptors = [ iso: 'grc', name: 'Ancient Greek', exampleText: 'γράφω', - textPreprocessors: capitalizationPreprocessors + textPreprocessors: { + ...capitalizationPreprocessors, + removeAlphabeticDiacritics + } }, { iso: 'hu', @@ -113,7 +115,7 @@ const languageDescriptors = [ exampleText: 'legere', textPreprocessors: { ...capitalizationPreprocessors, - removeLatinDiacritics + removeAlphabeticDiacritics } }, { diff --git a/ext/js/language/text-preprocessors.js b/ext/js/language/text-preprocessors.js index 12b3d1b6..e33fccda 100755 --- a/ext/js/language/text-preprocessors.js +++ b/ext/js/language/text-preprocessors.js @@ -33,3 +33,17 @@ export const capitalizeFirstLetter = { options: basicTextPreprocessorOptions, process: (str, setting) => (setting ? 
str.charAt(0).toUpperCase() + str.slice(1) : str) }; + +/** + * WARNING: This should NOT be used with languages that use Han characters, + * as it can result in undesirable normalization: + * - '\u9038'.normalize('NFD') => '\u9038' (逸) + * - '\ufa67'.normalize('NFD') => '\u9038' (逸 => 逸) + * @type {import('language').TextPreprocessor} + */ +export const removeAlphabeticDiacritics = { + name: 'Remove Alphabetic Diacritics', + description: 'ἄήé -> αηe', + options: basicTextPreprocessorOptions, + process: (str, setting) => (setting ? str.normalize('NFD').replace(/[\u0300-\u036f]/g, '') : str) +}; diff --git a/types/ext/language-descriptors.d.ts b/types/ext/language-descriptors.d.ts index dae586a2..41a1eec8 100644 --- a/types/ext/language-descriptors.d.ts +++ b/types/ext/language-descriptors.d.ts @@ -69,12 +69,14 @@ type AllTextPreprocessors = { removeArabicScriptDiacritics: TextPreprocessor; }; fr: CapitalizationPreprocessors; - grc: CapitalizationPreprocessors; + grc: CapitalizationPreprocessors & { + removeAlphabeticDiacritics: TextPreprocessor; + }; hu: CapitalizationPreprocessors; id: CapitalizationPreprocessors; it: CapitalizationPreprocessors; la: CapitalizationPreprocessors & { - removeLatinDiacritics: TextPreprocessor; + removeAlphabeticDiacritics: TextPreprocessor; }; ja: { convertHalfWidthCharacters: TextPreprocessor; -- cgit v1.2.3