summaryrefslogtreecommitdiff
path: root/ext
diff options
context:
space:
mode:
author Kuuuube <61125188+Kuuuube@users.noreply.github.com> 2024-06-26 13:05:23 -0400
committer GitHub <noreply@github.com> 2024-06-26 17:05:23 +0000
commit dfc3710108fe2617a98fca0f57f5a2f6bb7d1830 (patch)
tree ad402a4028c1b4f678262776c5a8b84d25e7dcf9 /ext
parent 2a92a0b98c2bb08c2adaca24ff8af3322874ef59 (diff)
Add normalization of combining dakuten and handakuten to ja preprocessors (#1136)
* Add normalization of combining dakuten and handakuten to ja preprocessors
* Fix typo
* Remove redundant variable assignment
* Fix first character processed incorrectly when it is a character that gets combined
* Add test for combining dakuten and handakuten
Diffstat (limited to 'ext')
-rw-r--r-- ext/js/language/ja/japanese-text-preprocessors.js | 9
-rw-r--r-- ext/js/language/ja/japanese.js | 59
-rw-r--r-- ext/js/language/language-descriptors.js | 6
3 files changed, 72 insertions, 2 deletions
diff --git a/ext/js/language/ja/japanese-text-preprocessors.js b/ext/js/language/ja/japanese-text-preprocessors.js
index 2d0d23b3..cdd8ce9a 100644
--- a/ext/js/language/ja/japanese-text-preprocessors.js
+++ b/ext/js/language/ja/japanese-text-preprocessors.js
@@ -24,6 +24,7 @@ import {
convertHalfWidthKanaToFullWidth,
convertHiraganaToKatakana as convertHiraganaToKatakanaFunction,
convertKatakanaToHiragana as convertKatakanaToHiraganaFunction,
+ normalizeCombiningCharacters as normalizeCombiningCharactersFunction,
} from './japanese.js';
/** @type {import('language').TextProcessor<boolean>} */
@@ -90,3 +91,11 @@ export const collapseEmphaticSequences = {
return str;
},
};
+
+/**
+ * Text preprocessor entry that, when its boolean setting is on, passes the
+ * input through the `normalizeCombiningCharacters` function imported from
+ * './japanese.js' (aliased here as `normalizeCombiningCharactersFunction`);
+ * when the setting is off the input string is returned unchanged.
+ * @type {import('language').TextProcessor<boolean>}
+ */
+export const normalizeCombiningCharacters = {
+ name: 'Normalize combining characters',
+ description: 'ド → ド (U+30C8 U+3099 → U+30C9)',
+ options: basicTextProcessorOptions, // shared option list defined outside this hunk
+ process: (str, setting) => (setting ? normalizeCombiningCharactersFunction(str) : str),
+};
diff --git a/ext/js/language/ja/japanese.js b/ext/js/language/ja/japanese.js
index 3a009ebb..231d8f62 100644
--- a/ext/js/language/ja/japanese.js
+++ b/ext/js/language/ja/japanese.js
@@ -560,6 +560,65 @@ export function getKanaDiacriticInfo(character) {
return typeof info !== 'undefined' ? {character: info.character, type: info.type} : null;
}
+/**
+ * Range check used by `normalizeCombiningCharacters` to decide whether the
+ * character before a combining dakuten (U+3099) may absorb it. The caller
+ * replaces an accepted character with `codePoint + 1`.
+ *
+ * The check is four contiguous ranges, not an exact list of base kana, so
+ * it also accepts code points inside those ranges that are already voiced
+ * or are the small tsu. For those, `codePoint + 1` is a different kana
+ * (for example U+304C + 1 is U+304D). The comment inside the function
+ * records this as a deliberate speed trade-off.
+ * @param {number} codePoint Code point of the character preceding the combining mark.
+ * @returns {boolean} `true` when `codePoint` falls in one of the accepted ranges.
+ */
+function dakutenAllowed(codePoint) {
+ // To reduce processing time, the ranges below are contiguous: they include some characters which shouldn't take a dakuten but are highly unlikely to have a combining character attached
+ // かがきぎくぐけげこごさざしじすずせぜそぞただちぢっつづてでとはばぱひびぴふぶぷへべぺほ
+ // カガキギクグケゲコゴサザシジスズセゼソゾタダチヂッツヅテデトハバパヒビピフブプヘベペホ
+ return ((codePoint >= 0x304B && codePoint <= 0x3068) || // hiragana か (U+304B) .. と (U+3068)
+ (codePoint >= 0x306F && codePoint <= 0x307B) || // hiragana は (U+306F) .. ほ (U+307B)
+ (codePoint >= 0x30AB && codePoint <= 0x30C8) || // katakana カ (U+30AB) .. ト (U+30C8)
+ (codePoint >= 0x30CF && codePoint <= 0x30DB)); // katakana ハ (U+30CF) .. ホ (U+30DB)
+}
+
+/**
+ * Range check used by `normalizeCombiningCharacters` to decide whether the
+ * character before a combining handakuten (U+309A) may absorb it. The
+ * caller replaces an accepted character with `codePoint + 2`.
+ *
+ * The check is two contiguous ranges, not an exact list of base kana, so
+ * it also accepts the voiced and semi-voiced forms that sit between the
+ * base kana. For those, `codePoint + 2` is a different kana (for example
+ * U+3070 + 2 is U+3072). The comment inside the function records this as
+ * a deliberate speed trade-off.
+ * @param {number} codePoint Code point of the character preceding the combining mark.
+ * @returns {boolean} `true` when `codePoint` falls in one of the accepted ranges.
+ */
+function handakutenAllowed(codePoint) {
+ // To reduce processing time, the ranges below are contiguous: they include some characters which shouldn't take a handakuten but are highly unlikely to have a combining character attached
+ // はばぱひびぴふぶぷへべぺほ
+ // ハバパヒビピフブプヘベペホ
+ return ((codePoint >= 0x306F && codePoint <= 0x307B) || // hiragana は (U+306F) .. ほ (U+307B)
+ (codePoint >= 0x30CF && codePoint <= 0x30DB)); // katakana ハ (U+30CF) .. ホ (U+30DB)
+}
+
+/**
+ * Folds a combining dakuten (U+3099) or combining handakuten (U+309A)
+ * into the character that precedes it.
+ *
+ * The string is walked from its last UTF-16 code unit towards the first
+ * and the result is built by prepending. When the current unit is U+3099
+ * and the previous unit passes `dakutenAllowed`, the pair is replaced by
+ * the character at `previous + 1`. When the current unit is U+309A and
+ * the previous unit passes `handakutenAllowed`, the pair is replaced by
+ * the character at `previous + 2`. Every other unit, including a
+ * combining mark whose predecessor is not accepted, is copied unchanged.
+ * @param {string} text Input string; an empty string yields an empty string.
+ * @returns {string} The text with accepted kana + combining mark pairs merged into one character.
+ */
+export function normalizeCombiningCharacters(text) {
+ let result = '';
+ let i = text.length - 1; // index of the last UTF-16 code unit (-1 for an empty string)
+ // The loop stops before index 0 on purpose: the first character has nothing before it to combine with
+ while (i > 0) {
+ if (text[i] === '\u3099') { // combining dakuten
+ const dakutenCombinee = text[i - 1].codePointAt(0);
+ if (dakutenCombinee && dakutenAllowed(dakutenCombinee)) { // NOTE(review): the truthiness check presumably narrows codePointAt's `number | undefined` type; confirm
+ result = String.fromCodePoint(dakutenCombinee + 1) + result; // replacement is the next code point after the base
+ i -= 2; // both the mark and its base have been consumed
+ continue;
+ }
+ } else if (text[i] === '\u309A') { // combining handakuten
+ const handakutenCombinee = text[i - 1].codePointAt(0);
+ if (handakutenCombinee && handakutenAllowed(handakutenCombinee)) { // NOTE(review): same truthiness guard as the dakuten branch
+ result = String.fromCodePoint(handakutenCombinee + 2) + result; // replacement is two code points after the base
+ i -= 2; // both the mark and its base have been consumed
+ continue;
+ }
+ }
+ result = text[i] + result; // no merge happened: copy this code unit as is
+ i--;
+ }
+ // i === 0 here means index 0 has not been emitted yet; i === -1 when the first two characters were combined (or the text was empty)
+ if (i === 0) {
+ result = text[0] + result;
+ }
+ return result;
+}
// Furigana distribution
diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js
index 7965ff30..f9fb4f09 100644
--- a/ext/js/language/language-descriptors.js
+++ b/ext/js/language/language-descriptors.js
@@ -26,6 +26,7 @@ import {
collapseEmphaticSequences,
convertHalfWidthCharacters,
convertHiraganaToKatakana,
+ normalizeCombiningCharacters,
} from './ja/japanese-text-preprocessors.js';
import {japaneseTransforms} from './ja/japanese-transforms.js';
import {isStringPartiallyJapanese} from './ja/japanese.js';
@@ -36,9 +37,9 @@ import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js
import {oldIrishTransforms} from './sga/old-irish-transforms.js';
import {removeSerboCroatianAccentMarks} from './sh/serbo-croatian-text-preprocessors.js';
import {albanianTransforms} from './sq/albanian-transforms.js';
-import {normalizeDiacritics} from './vi/viet-text-preprocessors.js';
import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js';
-import {normalizePinyin, isStringPartiallyChinese} from './zh/chinese.js';
+import {normalizeDiacritics} from './vi/viet-text-preprocessors.js';
+import {isStringPartiallyChinese, normalizePinyin} from './zh/chinese.js';
const capitalizationPreprocessors = {
decapitalize,
@@ -155,6 +156,7 @@ const languageDescriptors = [
textPreprocessors: {
convertHalfWidthCharacters,
alphabeticToHiragana,
+ normalizeCombiningCharacters,
alphanumericWidthVariants,
convertHiraganaToKatakana,
collapseEmphaticSequences,