diff options
| author | Cashew <52880648+cashewnuttynuts@users.noreply.github.com> | 2024-06-22 03:24:21 +0700 | 
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-06-21 20:24:21 +0000 | 
| commit | d724b403f94b7fd1ecec3f6d2e4f5a1ed805c6ec (patch) | |
| tree | 1f696742b75b4f8377fe3b952863af141ff6494e | |
| parent | 1c609d972ae76f8779190d7a3621f77a664a6dec (diff) | |
Add diacritics normalization preprocessors for Vietnamese (#1107)
* add viet diacritics normalization
* move regexp construction outside of function
* fix eslint
* add 'off' option
* fix lint
* fix type
| -rw-r--r-- | .eslintrc.json | 1 | ||||
| -rw-r--r-- | ext/js/language/language-descriptors.js | 6 | ||||
| -rw-r--r-- | ext/js/language/vi/viet-text-preprocessors.js | 58 | ||||
| -rw-r--r-- | test/language/viet-text-preprocessors.test.js | 60 | ||||
| -rw-r--r-- | types/ext/language-descriptors.d.ts | 4 | 
5 files changed, 127 insertions, 2 deletions
```diff
diff --git a/.eslintrc.json b/.eslintrc.json
index 9b4f8685..c32e9266 100644
--- a/.eslintrc.json
+++ b/.eslintrc.json
@@ -653,6 +653,7 @@
                 "ext/js/language/sga/old-irish-transforms.js",
                 "ext/js/language/sh/serbo-croatian-text-preprocessors.js",
                 "ext/js/language/sq/albanian-transforms.js",
+                "ext/js/language/vi/viet-text-preprocessors.js",
                 "ext/js/language/text-processors.js",
                 "ext/js/language/translator.js",
                 "ext/js/language/zh/chinese.js",
diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js
index 2a8762e9..517c908c 100644
--- a/ext/js/language/language-descriptors.js
+++ b/ext/js/language/language-descriptors.js
@@ -36,6 +36,7 @@ import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js
 import {oldIrishTransforms} from './sga/old-irish-transforms.js';
 import {removeSerboCroatianAccentMarks} from './sh/serbo-croatian-text-preprocessors.js';
 import {albanianTransforms} from './sq/albanian-transforms.js';
+import {normalizeDiacritics} from './vi/viet-text-preprocessors.js';
 import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js';
 import {isStringPartiallyChinese} from './zh/chinese.js';
 
@@ -261,7 +262,10 @@ const languageDescriptors = [
         iso: 'vi',
         name: 'Vietnamese',
         exampleText: 'đọc',
-        textPreprocessors: capitalizationPreprocessors,
+        textPreprocessors: {
+            ...capitalizationPreprocessors,
+            normalizeDiacritics,
+        },
     },
     {
         iso: 'yue',
diff --git a/ext/js/language/vi/viet-text-preprocessors.js b/ext/js/language/vi/viet-text-preprocessors.js
new file mode 100644
index 00000000..24453f47
--- /dev/null
+++ b/ext/js/language/vi/viet-text-preprocessors.js
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2024  Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+const TONE = '([\u0300\u0309\u0303\u0301\u0323])'; // Huyền, hỏi, ngã, sắc, nặng
+const COMBINING_BREVE = '\u0306'; // Ă
+const COMBINING_CIRCUMFLEX_ACCENT = '\u0302'; // Â
+const COMBINING_HORN = '\u031B'; // Ơ
+const DIACRITICS = `${COMBINING_BREVE}${COMBINING_CIRCUMFLEX_ACCENT}${COMBINING_HORN}`;
+
+// eslint-disable-next-line no-misleading-character-class
+const re1 = new RegExp(`${TONE}([aeiouy${DIACRITICS}]+)`, 'i');
+const re2 = new RegExp(`(?<=[${DIACRITICS}])(.)${TONE}`, 'i');
+const re3 = new RegExp(`(?<=[ae])([iouy])${TONE}`, 'i');
+const re4 = new RegExp(`(?<=[oy])([iuy])${TONE}`, 'i');
+const re5 = new RegExp(`(?<!q)(u)([aeiou])${TONE}`, 'i');
+const re6 = new RegExp(`(?<!g)(i)([aeiouy])${TONE}`, 'i');
+const re7 = new RegExp(`(?<!q)([ou])([aeoy])${TONE}(?!\\w)`, 'i');
+
+/**
+ * This function is adapted from https://github.com/enricobarzetti/viet_text_tools/blob/master/viet_text_tools/__init__.py
+ * @type {import('language').TextProcessor<'old'|'new'|'off'>}
+ */
+export const normalizeDiacritics = {
+    name: 'Normalize Diacritics',
+    description: 'Normalize diacritics and their placements (in either the old style or new style). NFC normalization is used.',
+    options: ['old', 'new', 'off'],
+    process: (str, setting) => {
+        if (setting === 'off') { return str; }
+
+        let result = str.normalize('NFD');
+        // Put the tone on the second vowel
+        result = result.replace(re1, '$2$1');
+        // Put the tone on the vowel with a diacritic
+        result = result.replace(re2, '$2$1');
+        // For vowels that are not oa, oe, uy put the tone on the penultimate vowel
+        result = result.replace(re3, '$2$1');
+        result = result.replace(re4, '$2$1');
+        result = result.replace(re5, '$1$3$2');
+        result = result.replace(re6, '$1$3$2');
+
+        if (setting === 'old') { result = result.replace(re7, '$1$3$2'); }
+        return result.normalize('NFC');
+    },
+};
diff --git a/test/language/viet-text-preprocessors.test.js b/test/language/viet-text-preprocessors.test.js
new file mode 100644
index 00000000..56593c63
--- /dev/null
+++ b/test/language/viet-text-preprocessors.test.js
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2023-2024  Yomitan Authors
+ * Copyright (C) 2020-2022  Yomichan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+import {normalizeDiacritics} from '../../ext/js/language/vi/viet-text-preprocessors.js';
+import {describe, expect, test} from 'vitest';
+
+const testCasesOldStyle = [
+    ['hoạ', 'họa'],
+    ['choàng', 'choàng'],
+    ['thuỷ', 'thủy'],
+    ['oà', 'òa'],
+    ['toà', 'tòa'],
+    ['toàn', 'toàn'],
+    ['tòan', 'toàn'],
+];
+
+const testCasesNewStyle = [
+    ['ngòăng', 'ngoằng'],
+    ['họa', 'hoạ'],
+    ['chòang', 'choàng'],
+    ['giừơng', 'giường'],
+    ['baỷ', 'bảy'],
+    ['cuả', 'của'],
+    ['òa', 'oà'],
+    ['toàn', 'toàn'],
+];
+
+describe('diacritics normalization', () => {
+    const {options, process} = normalizeDiacritics;
+    for (const option of options) {
+        if (option === 'off') { return; }
+
+        describe(`${option} style`, () => {
+            if (option === 'new') {
+                test.each(testCasesNewStyle)('%s normalizes to %s', (input, expected) => {
+                    expect(process(input, option)).toStrictEqual(expected);
+                });
+            } else {
+                test.each(testCasesOldStyle)('%s normalizes to %s', (input, expected) => {
+                    expect(process(input, option)).toStrictEqual(expected);
+                });
+            }
+        });
+    }
+});
diff --git a/types/ext/language-descriptors.d.ts b/types/ext/language-descriptors.d.ts
index 856e55b3..778445de 100644
--- a/types/ext/language-descriptors.d.ts
+++ b/types/ext/language-descriptors.d.ts
@@ -174,7 +174,9 @@ type AllTextProcessors = {
         pre: CapitalizationPreprocessors;
     };
     vi: {
-        pre: CapitalizationPreprocessors;
+        pre: CapitalizationPreprocessors & {
+            normalizeDiacritics: TextProcessor<'old' | 'new' | 'off'>;
+        };
     };
     yue: Record<string, never>;
     zh: Record<string, never>;
```