/*
 * Copyright (C) 2024  Yomitan Authors
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

/*
 * Provenance: ext/js/language/vi/viet-text-preprocessors.js, created by commit
 * d724b403f94b7fd1ecec3f6d2e4f5a1ed805c6ec (Sat, 22 Jun 2024 03:24:21 +0700),
 * "Add diacritics normalization preprocessors for Vietnamese (#1107)".
 *
 * The same commit wires this preprocessor into ext/js/language/language-descriptors.js:
 *
 *     import {normalizeDiacritics} from './vi/viet-text-preprocessors.js';
 *     ...
 *     {
 *         iso: 'vi',
 *         name: 'Vietnamese',
 *         exampleText: 'đọc',
 *         textPreprocessors: {
 *             ...capitalizationPreprocessors,
 *             normalizeDiacritics,
 *         },
 *     },
 */

// Capture group matching exactly one combining tone mark:
// grave (huyền), hook above (hỏi), tilde (ngã), acute (sắc), dot below (nặng).
const TONE = '([\u0300\u0309\u0303\u0301\u0323])';
const COMBINING_BREVE = '\u0306'; // as in Ă
const COMBINING_CIRCUMFLEX_ACCENT = '\u0302'; // as in Â
const COMBINING_HORN = '\u031B'; // as in Ơ
// Vowel-quality marks (as opposed to tone marks); interpolated into character classes below.
const DIACRITICS = `${COMBINING_BREVE}${COMBINING_CIRCUMFLEX_ACCENT}${COMBINING_HORN}`;

// All rules operate on NFD text, i.e. base letter followed by its combining marks.
// Every rule carries the `g` flag: String.prototype.replace only rewrites the FIRST match of a
// non-global regex, which would leave every syllable after the first one un-normalized.
// (`replace` resets `lastIndex` itself, so sharing these global regexes between calls is safe.)

// Tone mark followed by a run of vowels / vowel-quality marks.
// eslint-disable-next-line no-misleading-character-class
const re1 = new RegExp(`${TONE}([aeiouy${DIACRITICS}]+)`, 'gi');
// Any character + tone, directly after a vowel-quality mark.
const re2 = new RegExp(`(?<=[${DIACRITICS}])(.)${TONE}`, 'gi');
// i/o/u/y + tone, directly after a or e.
const re3 = new RegExp(`(?<=[ae])([iouy])${TONE}`, 'gi');
// i/u/y + tone, directly after o or y.
const re4 = new RegExp(`(?<=[oy])([iuy])${TONE}`, 'gi');
// NOTE(review): in the copy of this file under review, the text from the lookbehind of `re5`
// up to the end of the JSDoc `@type` tag below had been stripped, leaving `re5`..`re7`
// undefined. The three patterns (and the `@type` tag) are reconstructed to mirror the
// `normalize_diacritics` routine of the Python `viet_text_tools` package, which this code
// appears to port, and to fit the `'$1$3$2'` replacement templates used below.
// Confirm against the upstream file.
// u + vowel + tone, unless the u belongs to "qu".
const re5 = new RegExp(`(?<!q)(u)([aeiou])${TONE}`, 'gi');
// i + vowel + tone, unless the i belongs to "gi".
const re6 = new RegExp(`(?<!g)(i)([aeiouy])${TONE}`, 'gi');
// Old style only: word-final o/u + a/e/o/y + tone, unless the u belongs to "qu".
const re7 = new RegExp(`(?<!q)([ou])([aeoy])${TONE}(?!\\w)`, 'gi');

/**
 * Text preprocessor that normalizes the placement of Vietnamese tone marks.
 *
 * `process(str, setting)` behaves as follows:
 * - `'off'`: returns `str` unchanged (no Unicode normalization is applied either).
 * - `'new'`: decomposes `str` to NFD, applies rules `re1`..`re6` in order to every match,
 *   and returns the result recomposed to NFC (e.g. `hòa` -> `hoà`).
 * - `'old'`: same as `'new'`, then additionally applies `re7`, which moves the tone back onto
 *   the first vowel of a word-final pair (e.g. `hoà` -> `hòa`).
 *
 * The order of the replacements matters: later rules rely on the positions produced by
 * earlier ones.
 * @type {import('language').TextProcessor<'old'|'new'|'off'>}
 */
export const normalizeDiacritics = {
    name: 'Normalize Diacritics',
    description: 'Normalize diacritics and their placements (in either the old style or new style). NFC normalization is used.',
    options: ['old', 'new', 'off'],
    process: (str, setting) => {
        if (setting === 'off') { return str; }

        // Work on decomposed text so tone marks are standalone code points that can be moved.
        let result = str.normalize('NFD');
        // Put the tone on the second vowel
        result = result.replace(re1, '$2$1');
        // Put the tone on the vowel with a diacritic
        result = result.replace(re2, '$2$1');
        // For vowels that are not oa, oe, uy put the tone on the penultimate vowel
        result = result.replace(re3, '$2$1');
        result = result.replace(re4, '$2$1');
        result = result.replace(re5, '$1$3$2');
        result = result.replace(re6, '$1$3$2');

        // Old style: put the tone on the first vowel of a word-final oa/oe/uy-type pair
        if (setting === 'old') { result = result.replace(re7, '$1$3$2'); }
        return result.normalize('NFC');
    },
};