aboutsummaryrefslogtreecommitdiff
path: root/ext
diff options
context:
space:
mode:
author StefanVukovic99 <stefanvukovic44@gmail.com> 2024-05-22 22:45:39 +0200
committer GitHub <noreply@github.com> 2024-05-22 20:45:39 +0000
commit d19b898792bffed8ab2d5724472e5b65a5f5b146 (patch)
tree b3e0d5111d748dfcc5d74d9dbf68e79193fa6a7f /ext
parent 125cde3d98c18b08e71e075b4a9776fc7bd4b4a0 (diff)
[ja] add preprocessor for width of alphabetic characters (#964)
* add japanese text preprocessor for variants in width of alphabetic characters
* try combining with numeric to improve performance
* Update ext/js/language/ja/japanese.js
  Co-authored-by: Kuuuube <61125188+Kuuuube@users.noreply.github.com>
  Signed-off-by: StefanVukovic99 <stefanvukovic44@gmail.com>
* Update ext/js/language/ja/japanese.js
  Co-authored-by: Kuuuube <61125188+Kuuuube@users.noreply.github.com>
  Signed-off-by: StefanVukovic99 <stefanvukovic44@gmail.com>
* fix tests

---------

Signed-off-by: StefanVukovic99 <stefanvukovic44@gmail.com>
Co-authored-by: Kuuuube <61125188+Kuuuube@users.noreply.github.com>
Diffstat (limited to 'ext')
-rw-r--r-- ext/js/language/ja/japanese-text-preprocessors.js | 31
-rw-r--r-- ext/js/language/ja/japanese.js | 32
-rw-r--r-- ext/js/language/language-descriptors.js | 12
3 files changed, 58 insertions, 17 deletions
diff --git a/ext/js/language/ja/japanese-text-preprocessors.js b/ext/js/language/ja/japanese-text-preprocessors.js
index b3d50817..32e45c83 100644
--- a/ext/js/language/ja/japanese-text-preprocessors.js
+++ b/ext/js/language/ja/japanese-text-preprocessors.js
@@ -19,10 +19,11 @@ import {basicTextProcessorOptions} from '../text-processors.js';
import {convertAlphabeticToKana} from './japanese-wanakana.js';
import {
collapseEmphaticSequences as collapseEmphaticSequencesFunction,
+ convertAlphanumericToFullWidth,
+ convertFullWidthAlphanumericToNormal,
convertHalfWidthKanaToFullWidth,
convertHiraganaToKatakana as convertHiraganaToKatakanaFunction,
- convertKatakanaToHiragana as convertKatakanaToHiraganaFunction,
- convertNumericToFullWidth
+ convertKatakanaToHiragana as convertKatakanaToHiraganaFunction
} from './japanese.js';
/** @type {import('language').TextProcessor<boolean>} */
@@ -33,16 +34,9 @@ export const convertHalfWidthCharacters = {
process: (str, setting) => (setting ? convertHalfWidthKanaToFullWidth(str) : str)
};
-/** @type {import('language').TextProcessor<boolean>} */
-export const convertNumericCharacters = {
- name: 'Convert numeric characters to full width',
- description: '1234 → １２３４',
- options: basicTextProcessorOptions,
- process: (str, setting) => (setting ? convertNumericToFullWidth(str) : str)
-};
/** @type {import('language').TextProcessor<boolean>} */
-export const convertAlphabeticCharacters = {
+export const alphabeticToHiragana = {
name: 'Convert alphabetic characters to hiragana',
description: 'yomichan → よみちゃん',
options: basicTextProcessorOptions,
@@ -50,6 +44,23 @@ export const convertAlphabeticCharacters = {
};
/** @type {import('language').BidirectionalConversionPreprocessor} */
+export const alphanumericWidthVariants = {
+ name: 'Convert between alphabetic width variants',
+ description: 'ｙｏｍｉｔａｎ → yomitan and vice versa',
+ options: ['off', 'direct', 'inverse'],
+ process: (str, setting) => {
+ switch (setting) {
+ case 'off':
+ return str;
+ case 'direct':
+ return convertFullWidthAlphanumericToNormal(str);
+ case 'inverse':
+ return convertAlphanumericToFullWidth(str);
+ }
+ }
+};
+
+/** @type {import('language').BidirectionalConversionPreprocessor} */
export const convertHiraganaToKatakana = {
name: 'Convert hiragana to katakana',
description: 'よみちゃん → ヨミチャン and vice versa',
diff --git a/ext/js/language/ja/japanese.js b/ext/js/language/ja/japanese.js
index 3507e5df..2200e077 100644
--- a/ext/js/language/ja/japanese.js
+++ b/ext/js/language/ja/japanese.js
@@ -15,6 +15,7 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
+
const HIRAGANA_SMALL_TSU_CODE_POINT = 0x3063;
const KATAKANA_SMALL_TSU_CODE_POINT = 0x30c3;
const KATAKANA_SMALL_KA_CODE_POINT = 0x30f5;
@@ -523,16 +524,39 @@ export function convertHiraganaToKatakana(text) {
* @param {string} text
* @returns {string}
*/
-export function convertNumericToFullWidth(text) {
+export function convertAlphanumericToFullWidth(text) {
let result = '';
for (const char of text) {
let c = /** @type {number} */ (char.codePointAt(0));
if (c >= 0x30 && c <= 0x39) { // ['0', '9']
c += 0xff10 - 0x30; // 0xff10 = '0' full width
- result += String.fromCodePoint(c);
- } else {
- result += char;
+ } else if (c >= 0x41 && c <= 0x5a) { // ['A', 'Z']
+ c += 0xff21 - 0x41; // 0xff21 = 'A' full width
+ } else if (c >= 0x61 && c <= 0x7a) { // ['a', 'z']
+ c += 0xff41 - 0x61; // 0xff41 = 'a' full width
+ }
+ result += String.fromCodePoint(c);
+ }
+ return result;
+}
+
+/**
+ * @param {string} text
+ * @returns {string}
+ */
+export function convertFullWidthAlphanumericToNormal(text) {
+ let result = '';
+ const length = text.length;
+ for (let i = 0; i < length; i++) {
+ let c = /** @type {number} */ (text[i].codePointAt(0));
+ if (c >= 0xff10 && c <= 0xff19) { // ['０', '９']
+ c -= 0xff10 - 0x30; // 0x30 = '0'
+ } else if (c >= 0xff21 && c <= 0xff3a) { // ['Ａ', 'Ｚ']
+ c -= 0xff21 - 0x41; // 0x41 = 'A'
+ } else if (c >= 0xff41 && c <= 0xff5a) { // ['ａ', 'ｚ']
+ c -= 0xff41 - 0x61; // 0x61 = 'a'
}
+ result += String.fromCodePoint(c);
}
return result;
}
diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js
index 726842f1..baf53f81 100644
--- a/ext/js/language/language-descriptors.js
+++ b/ext/js/language/language-descriptors.js
@@ -19,7 +19,13 @@ import {removeArabicScriptDiacritics} from './ar/arabic-text-preprocessors.js';
import {eszettPreprocessor} from './de/german-text-preprocessors.js';
import {germanTransforms} from './de/german-transforms.js';
import {englishTransforms} from './en/english-transforms.js';
-import {collapseEmphaticSequences, convertAlphabeticCharacters, convertHalfWidthCharacters, convertHiraganaToKatakana, convertNumericCharacters} from './ja/japanese-text-preprocessors.js';
+import {
+ alphabeticToHiragana,
+ alphanumericWidthVariants,
+ collapseEmphaticSequences,
+ convertHalfWidthCharacters,
+ convertHiraganaToKatakana
+} from './ja/japanese-text-preprocessors.js';
import {japaneseTransforms} from './ja/japanese-transforms.js';
import {isStringPartiallyJapanese} from './ja/japanese.js';
import {disassembleHangul, reassembleHangul} from './ko/korean-text-processors.js';
@@ -143,8 +149,8 @@ const languageDescriptors = [
isTextLookupWorthy: isStringPartiallyJapanese,
textPreprocessors: {
convertHalfWidthCharacters,
- convertNumericCharacters,
- convertAlphabeticCharacters,
+ alphabeticToHiragana,
+ alphanumericWidthVariants,
convertHiraganaToKatakana,
collapseEmphaticSequences
},