diff options
author | StefanVukovic99 <stefanvukovic44@gmail.com> | 2024-05-22 22:45:39 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-05-22 20:45:39 +0000 |
commit | d19b898792bffed8ab2d5724472e5b65a5f5b146 (patch) | |
tree | b3e0d5111d748dfcc5d74d9dbf68e79193fa6a7f /ext/js/language | |
parent | 125cde3d98c18b08e71e075b4a9776fc7bd4b4a0 (diff) |
[ja] add preprocessor for width of alphabetic characters (#964)
* add japanese text preprocessor for variants in width of alphabetic characters
* try combining with numeric to improve performance
* Update ext/js/language/ja/japanese.js
Co-authored-by: Kuuuube <61125188+Kuuuube@users.noreply.github.com>
Signed-off-by: StefanVukovic99 <stefanvukovic44@gmail.com>
* Update ext/js/language/ja/japanese.js
Co-authored-by: Kuuuube <61125188+Kuuuube@users.noreply.github.com>
Signed-off-by: StefanVukovic99 <stefanvukovic44@gmail.com>
* fix tests
---------
Signed-off-by: StefanVukovic99 <stefanvukovic44@gmail.com>
Co-authored-by: Kuuuube <61125188+Kuuuube@users.noreply.github.com>
Diffstat (limited to 'ext/js/language')
-rw-r--r-- | ext/js/language/ja/japanese-text-preprocessors.js | 31 | ||||
-rw-r--r-- | ext/js/language/ja/japanese.js | 32 | ||||
-rw-r--r-- | ext/js/language/language-descriptors.js | 12 |
3 files changed, 58 insertions, 17 deletions
diff --git a/ext/js/language/ja/japanese-text-preprocessors.js b/ext/js/language/ja/japanese-text-preprocessors.js index b3d50817..32e45c83 100644 --- a/ext/js/language/ja/japanese-text-preprocessors.js +++ b/ext/js/language/ja/japanese-text-preprocessors.js @@ -19,10 +19,11 @@ import {basicTextProcessorOptions} from '../text-processors.js'; import {convertAlphabeticToKana} from './japanese-wanakana.js'; import { collapseEmphaticSequences as collapseEmphaticSequencesFunction, + convertAlphanumericToFullWidth, + convertFullWidthAlphanumericToNormal, convertHalfWidthKanaToFullWidth, convertHiraganaToKatakana as convertHiraganaToKatakanaFunction, - convertKatakanaToHiragana as convertKatakanaToHiraganaFunction, - convertNumericToFullWidth + convertKatakanaToHiragana as convertKatakanaToHiraganaFunction } from './japanese.js'; /** @type {import('language').TextProcessor<boolean>} */ @@ -33,16 +34,9 @@ export const convertHalfWidthCharacters = { process: (str, setting) => (setting ? convertHalfWidthKanaToFullWidth(str) : str) }; -/** @type {import('language').TextProcessor<boolean>} */ -export const convertNumericCharacters = { - name: 'Convert numeric characters to full width', - description: '1234 → １２３４', - options: basicTextProcessorOptions, - process: (str, setting) => (setting ? 
convertNumericToFullWidth(str) : str) -}; /** @type {import('language').TextProcessor<boolean>} */ -export const convertAlphabeticCharacters = { +export const alphabeticToHiragana = { name: 'Convert alphabetic characters to hiragana', description: 'yomichan → よみちゃん', options: basicTextProcessorOptions, @@ -50,6 +44,23 @@ export const convertAlphabeticCharacters = { }; /** @type {import('language').BidirectionalConversionPreprocessor} */ +export const alphanumericWidthVariants = { + name: 'Convert between alphabetic width variants', + description: 'ｙｏｍｉｔａｎ → yomitan and vice versa', + options: ['off', 'direct', 'inverse'], + process: (str, setting) => { + switch (setting) { + case 'off': + return str; + case 'direct': + return convertFullWidthAlphanumericToNormal(str); + case 'inverse': + return convertAlphanumericToFullWidth(str); + } + } +}; + +/** @type {import('language').BidirectionalConversionPreprocessor} */ export const convertHiraganaToKatakana = { name: 'Convert hiragana to katakana', description: 'よみちゃん → ヨミチャン and vice versa', diff --git a/ext/js/language/ja/japanese.js b/ext/js/language/ja/japanese.js index 3507e5df..2200e077 100644 --- a/ext/js/language/ja/japanese.js +++ b/ext/js/language/ja/japanese.js @@ -15,6 +15,7 @@ * along with this program. If not, see <https://www.gnu.org/licenses/>. 
*/ + const HIRAGANA_SMALL_TSU_CODE_POINT = 0x3063; const KATAKANA_SMALL_TSU_CODE_POINT = 0x30c3; const KATAKANA_SMALL_KA_CODE_POINT = 0x30f5; @@ -523,16 +524,39 @@ export function convertHiraganaToKatakana(text) { * @param {string} text * @returns {string} */ -export function convertNumericToFullWidth(text) { +export function convertAlphanumericToFullWidth(text) { let result = ''; for (const char of text) { let c = /** @type {number} */ (char.codePointAt(0)); if (c >= 0x30 && c <= 0x39) { // ['0', '9'] c += 0xff10 - 0x30; // 0xff10 = '0' full width - result += String.fromCodePoint(c); - } else { - result += char; + } else if (c >= 0x41 && c <= 0x5a) { // ['A', 'Z'] + c += 0xff21 - 0x41; // 0xff21 = 'A' full width + } else if (c >= 0x61 && c <= 0x7a) { // ['a', 'z'] + c += 0xff41 - 0x61; // 0xff41 = 'a' full width + } + result += String.fromCodePoint(c); + } + return result; +} + +/** + * @param {string} text + * @returns {string} + */ +export function convertFullWidthAlphanumericToNormal(text) { + let result = ''; + const length = text.length; + for (let i = 0; i < length; i++) { + let c = /** @type {number} */ (text[i].codePointAt(0)); + if (c >= 0xff10 && c <= 0xff19) { // ['０', '９'] + c -= 0xff10 - 0x30; // 0x30 = '0' + } else if (c >= 0xff21 && c <= 0xff3a) { // ['Ａ', 'Ｚ'] + c -= 0xff21 - 0x41; // 0x41 = 'A' + } else if (c >= 0xff41 && c <= 0xff5a) { // ['ａ', 'ｚ'] + c -= 0xff41 - 0x61; // 0x61 = 'a' } + result += String.fromCodePoint(c); } return result; } diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js index 726842f1..baf53f81 100644 --- a/ext/js/language/language-descriptors.js +++ b/ext/js/language/language-descriptors.js @@ -19,7 +19,13 @@ import {removeArabicScriptDiacritics} from './ar/arabic-text-preprocessors.js'; import {eszettPreprocessor} from './de/german-text-preprocessors.js'; import {germanTransforms} from './de/german-transforms.js'; import {englishTransforms} from './en/english-transforms.js'; 
-import {collapseEmphaticSequences, convertAlphabeticCharacters, convertHalfWidthCharacters, convertHiraganaToKatakana, convertNumericCharacters} from './ja/japanese-text-preprocessors.js'; +import { + alphabeticToHiragana, + alphanumericWidthVariants, + collapseEmphaticSequences, + convertHalfWidthCharacters, + convertHiraganaToKatakana +} from './ja/japanese-text-preprocessors.js'; import {japaneseTransforms} from './ja/japanese-transforms.js'; import {isStringPartiallyJapanese} from './ja/japanese.js'; import {disassembleHangul, reassembleHangul} from './ko/korean-text-processors.js'; @@ -143,8 +149,8 @@ const languageDescriptors = [ isTextLookupWorthy: isStringPartiallyJapanese, textPreprocessors: { convertHalfWidthCharacters, - convertNumericCharacters, - convertAlphabeticCharacters, + alphabeticToHiragana, + alphanumericWidthVariants, convertHiraganaToKatakana, collapseEmphaticSequences }, |