aboutsummaryrefslogtreecommitdiff
path: root/ext/js
diff options
context:
space:
mode:
author: Matttttt <18152455+martholomew@users.noreply.github.com> 2024-04-08 19:54:04 +0100
committer: GitHub <noreply@github.com> 2024-04-08 18:54:04 +0000
commit: 0663774b02faeb108d4b18d8f8a7e6e93e277313 (patch)
tree: 8b9cddef6a6987e273c58bca79959c20d943dcfb /ext/js
parent: 2c5af215ee533a18b4da39bad6b696701dd07978 (diff)
Simplify diacritic removal; modify Latin & Greek preprocessors (#724)
* Simplified diacritic removal and added preprocessors to LA and GRC * linted * Clarified the name of removeAlphabeticDiacritics * Add comment to removeAlphabeticDiacritics Signed-off-by: Darius Jahandarie <djahandarie@gmail.com> * Change to NFD Signed-off-by: Matttttt <18152455+martholomew@users.noreply.github.com> * Remove trailing spaces in comment Signed-off-by: Darius Jahandarie <djahandarie@gmail.com> * Remove latin preprocessors .eslintrc.json Signed-off-by: Matttttt <18152455+martholomew@users.noreply.github.com> * fix tests --------- Signed-off-by: Darius Jahandarie <djahandarie@gmail.com> Signed-off-by: Matttttt <18152455+martholomew@users.noreply.github.com> Co-authored-by: martholomew <martholomew@users.noreply.github.com> Co-authored-by: Darius Jahandarie <djahandarie@gmail.com> Co-authored-by: Stefan Vukovic <stefanvukovic44@gmail.com>
Diffstat (limited to 'ext/js')
-rw-r--r-- ext/js/language/la/latin-text-preprocessors.js 56
-rw-r--r-- ext/js/language/language-descriptors.js 10
-rwxr-xr-x ext/js/language/text-preprocessors.js 14
3 files changed, 20 insertions, 60 deletions
diff --git a/ext/js/language/la/latin-text-preprocessors.js b/ext/js/language/la/latin-text-preprocessors.js
deleted file mode 100644
index ea6aae82..00000000
--- a/ext/js/language/la/latin-text-preprocessors.js
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (C) 2024 Yomitan Authors
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <https://www.gnu.org/licenses/>.
- */
-
-import {basicTextPreprocessorOptions} from '../text-preprocessors.js';
-
-/** @type {Record<string, string>} */
-const diacriticMap = {
- ā: 'a',
- ē: 'e',
- ī: 'i',
- ō: 'o',
- ū: 'u',
- ȳ: 'y',
- Ā: 'A',
- Ē: 'E',
- Ī: 'I',
- Ō: 'O',
- Ū: 'U',
- Ȳ: 'Y',
- á: 'a',
- é: 'e',
- í: 'i',
- ó: 'o',
- ú: 'u',
- ý: 'y',
- Á: 'A',
- É: 'E',
- Í: 'I',
- Ó: 'O',
- Ú: 'U',
- Ý: 'Y'
-};
-
-/** @type {import('language').TextPreprocessor<boolean>} */
-export const removeLatinDiacritics = {
- name: 'Remove diacritics',
- description: 'āēīōūȳ → aeiouy, áéíóúý → aeiouy',
- options: basicTextPreprocessorOptions,
- process: (str, setting) => {
- return setting ? str.replace(/[āēīōūȳáéíóúýĀĒĪŌŪȲÁÉÍÓÚÝ]/g, (match) => diacriticMap[match] || match) : str;
- }
-};
diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js
index b4af2f8a..b5d7573b 100644
--- a/ext/js/language/language-descriptors.js
+++ b/ext/js/language/language-descriptors.js
@@ -21,10 +21,9 @@ import {englishTransforms} from './en/english-transforms.js';
import {collapseEmphaticSequences, convertAlphabeticCharacters, convertHalfWidthCharacters, convertHiraganaToKatakana, convertNumericCharacters} from './ja/japanese-text-preprocessors.js';
import {japaneseTransforms} from './ja/japanese-transforms.js';
import {isStringPartiallyJapanese} from './ja/japanese.js';
-import {removeLatinDiacritics} from './la/latin-text-preprocessors.js';
import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js';
import {albanianTransforms} from './sq/albanian-transforms.js';
-import {capitalizeFirstLetter, decapitalize} from './text-preprocessors.js';
+import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-preprocessors.js';
const capitalizationPreprocessors = {
decapitalize,
@@ -87,7 +86,10 @@ const languageDescriptors = [
iso: 'grc',
name: 'Ancient Greek',
exampleText: 'γράφω',
- textPreprocessors: capitalizationPreprocessors
+ textPreprocessors: {
+ ...capitalizationPreprocessors,
+ removeAlphabeticDiacritics
+ }
},
{
iso: 'hu',
@@ -113,7 +115,7 @@ const languageDescriptors = [
exampleText: 'legere',
textPreprocessors: {
...capitalizationPreprocessors,
- removeLatinDiacritics
+ removeAlphabeticDiacritics
}
},
{
diff --git a/ext/js/language/text-preprocessors.js b/ext/js/language/text-preprocessors.js
index 12b3d1b6..e33fccda 100755
--- a/ext/js/language/text-preprocessors.js
+++ b/ext/js/language/text-preprocessors.js
@@ -33,3 +33,17 @@ export const capitalizeFirstLetter = {
options: basicTextPreprocessorOptions,
process: (str, setting) => (setting ? str.charAt(0).toUpperCase() + str.slice(1) : str)
};
+
+/**
+ * WARNING: This should NOT be used with languages that use Han characters,
+ * as it can result in undesirable normalization:
+ * - '\u9038'.normalize('NFD') => '\u9038' (逸)
+ * - '\ufa67'.normalize('NFD') => '\u9038' (逸 => 逸)
+ * @type {import('language').TextPreprocessor<boolean>}
+ */
+export const removeAlphabeticDiacritics = {
+ name: 'Remove Alphabetic Diacritics',
+ description: 'ἄήé -> αηe',
+ options: basicTextPreprocessorOptions,
+ process: (str, setting) => (setting ? str.normalize('NFD').replace(/[\u0300-\u036f]/g, '') : str)
+};