summaryrefslogtreecommitdiff
path: root/ext/js/language/text-preprocessors.js
diff options
context:
space:
mode:
authorMatttttt <18152455+martholomew@users.noreply.github.com>2024-04-08 19:54:04 +0100
committerGitHub <noreply@github.com>2024-04-08 18:54:04 +0000
commit0663774b02faeb108d4b18d8f8a7e6e93e277313 (patch)
tree8b9cddef6a6987e273c58bca79959c20d943dcfb /ext/js/language/text-preprocessors.js
parent2c5af215ee533a18b4da39bad6b696701dd07978 (diff)
Simplify diacritic removal; modify Latin & Greek preprocessors (#724)
* Simplified diacritic removal and added preprocessors to LA and GRC * Linted * Clarified the name of removeAlphabeticDiacritics * Add comment to removeAlphabeticDiacritics Signed-off-by: Darius Jahandarie <djahandarie@gmail.com> * Change to NFD Signed-off-by: Matttttt <18152455+martholomew@users.noreply.github.com> * Remove trailing spaces in comment Signed-off-by: Darius Jahandarie <djahandarie@gmail.com> * Remove latin preprocessors .eslintrc.json Signed-off-by: Matttttt <18152455+martholomew@users.noreply.github.com> * Fix tests --------- Signed-off-by: Darius Jahandarie <djahandarie@gmail.com> Signed-off-by: Matttttt <18152455+martholomew@users.noreply.github.com> Co-authored-by: martholomew <martholomew@users.noreply.github.com> Co-authored-by: Darius Jahandarie <djahandarie@gmail.com> Co-authored-by: Stefan Vukovic <stefanvukovic44@gmail.com>
Diffstat (limited to 'ext/js/language/text-preprocessors.js')
-rwxr-xr-xext/js/language/text-preprocessors.js14
1 files changed, 14 insertions, 0 deletions
diff --git a/ext/js/language/text-preprocessors.js b/ext/js/language/text-preprocessors.js
index 12b3d1b6..e33fccda 100755
--- a/ext/js/language/text-preprocessors.js
+++ b/ext/js/language/text-preprocessors.js
@@ -33,3 +33,17 @@ export const capitalizeFirstLetter = {
options: basicTextPreprocessorOptions,
process: (str, setting) => (setting ? str.charAt(0).toUpperCase() + str.slice(1) : str)
};
+
+/**
+ * When enabled, NFD-normalizes the text and deletes combining marks U+0300-U+036F.
+ * WARNING: Do NOT use with Han-character languages; NFD also rewrites ideographs:
+ * - '\u9038'.normalize('NFD') => '\u9038' (逸 is left unchanged)
+ * - '\ufa67'.normalize('NFD') => '\u9038' (compatibility form of 逸 is replaced)
+ * @type {import('language').TextPreprocessor<boolean>}
+ */
+export const removeAlphabeticDiacritics = {
+ name: 'Remove Alphabetic Diacritics',
+ description: 'ἄήé -> αηe',
+ options: basicTextPreprocessorOptions,
+ process: (str, setting) => (setting ? str.normalize('NFD').replace(/[\u0300-\u036f]/g, '') : str)
+};