diff options
author | StefanVukovic99 <stefanvukovic44@gmail.com> | 2024-06-03 23:11:34 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-06-03 21:11:34 +0000 |
commit | 3a2a740b6517d18de726a44b75b34155fe8f1259 (patch) | |
tree | 9ff8a0a7709abffe4a4cbea1e1f9da0445b3177d | |
parent | 7955fc85ac089d856b44bdea78eccd26ffbd690c (diff) |
[sh] preprocess accent marks (#1024)
-rw-r--r-- | .eslintrc.json | 1 |
-rw-r--r-- | ext/js/language/language-descriptors.js | 6 |
-rw-r--r-- | ext/js/language/sh/serbo-croatian-text-preprocessors.js | 31 |
-rw-r--r-- | types/ext/language-descriptors.d.ts | 4 |
4 files changed, 40 insertions, 2 deletions
diff --git a/.eslintrc.json b/.eslintrc.json index 5a81064d..3f6b467b 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -660,6 +660,7 @@ "ext/js/language/multi-language-transformer.js", "ext/js/language/ru/russian-text-preprocessors.js", "ext/js/language/sga/old-irish-transforms.js", + "ext/js/language/sh/serbo-croatian-text-preprocessors.js", "ext/js/language/sq/albanian-transforms.js", "ext/js/language/text-processors.js", "ext/js/language/translator.js", diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js index defd73a8..98d3f6c8 100644 --- a/ext/js/language/language-descriptors.js +++ b/ext/js/language/language-descriptors.js @@ -34,6 +34,7 @@ import {koreanTransforms} from './ko/korean-transforms.js'; import {latinTransforms} from './la/latin-transforms.js'; import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js'; import {oldIrishTransforms} from './sga/old-irish-transforms.js'; +import {removeSerboCroatianAccentMarks} from './sh/serbo-croatian-text-preprocessors.js'; import {albanianTransforms} from './sq/albanian-transforms.js'; import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js'; import {isStringPartiallyChinese} from './zh/chinese.js'; @@ -224,7 +225,10 @@ const languageDescriptors = [ iso: 'sh', name: 'Serbo-Croatian', exampleText: 'čitaše', - textPreprocessors: capitalizationPreprocessors, + textPreprocessors: { + ...capitalizationPreprocessors, + removeSerboCroatianAccentMarks, + }, }, { iso: 'sq', diff --git a/ext/js/language/sh/serbo-croatian-text-preprocessors.js b/ext/js/language/sh/serbo-croatian-text-preprocessors.js new file mode 100644 index 00000000..7b1b69a1 --- /dev/null +++ b/ext/js/language/sh/serbo-croatian-text-preprocessors.js @@ -0,0 +1,31 @@ +/* + * Copyright (C) 2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as 
published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +import {basicTextProcessorOptions} from '../text-processors.js'; + +/** @type {import('language').TextProcessor<boolean>} */ +export const removeSerboCroatianAccentMarks = { + name: 'Remove diacritics', + description: 'A\u0301 → A, a\u0301 → a', + options: basicTextProcessorOptions, + process: (str, setting) => ( + setting ? + str.normalize('NFD').replace(/[aeiourAEIOUR][\u0300-\u036f]/g, (match) => match[0]) : + str + ), + +}; diff --git a/types/ext/language-descriptors.d.ts b/types/ext/language-descriptors.d.ts index 69ccec44..270d753b 100644 --- a/types/ext/language-descriptors.d.ts +++ b/types/ext/language-descriptors.d.ts @@ -157,7 +157,9 @@ type AllTextProcessors = { }; }; sh: { - pre: CapitalizationPreprocessors; + pre: CapitalizationPreprocessors & { + removeSerboCroatianAccentMarks: TextProcessor<boolean>; + }; }; sq: { pre: CapitalizationPreprocessors; |