summaryrefslogtreecommitdiff
path: root/ext/mixed/js
diff options
context:
space:
mode:
authortoasted-nutbread <toasted-nutbread@users.noreply.github.com>2020-01-22 21:41:32 -0500
committertoasted-nutbread <toasted-nutbread@users.noreply.github.com>2020-01-24 20:15:25 -0500
commit9b509d50a94110f92ac52db2ff9566d1104e33c6 (patch)
tree6ea6d8ce607e6c0f1b0982a397dd0f5534ae904a /ext/mixed/js
parent3c17388ff85daf477a1cae21103f705bce96bfcd (diff)
Add character range definitions
Diffstat (limited to 'ext/mixed/js')
-rw-r--r--ext/mixed/js/japanese.js35
1 files changed, 35 insertions, 0 deletions
diff --git a/ext/mixed/js/japanese.js b/ext/mixed/js/japanese.js
index 8dd5651c..956d246a 100644
--- a/ext/mixed/js/japanese.js
+++ b/ext/mixed/js/japanese.js
@@ -76,6 +76,41 @@ const JP_HALFWIDTH_KATAKANA_MAPPING = new Map([
['ン', 'ン--']
]);
+const JP_HIRAGANA_RANGE = [0x3040, 0x309f];
+const JP_KATAKANA_RANGE = [0x30a0, 0x30ff];
+const JP_KANA_RANGES = [JP_HIRAGANA_RANGE, JP_KATAKANA_RANGE];
+
+const JP_CJK_COMMON_RANGE = [0x4e00, 0x9fff];
+const JP_CJK_RARE_RANGE = [0x3400, 0x4dbf];
+const JP_CJK_RANGES = [JP_CJK_COMMON_RANGE, JP_CJK_RARE_RANGE];
+
+const JP_ITERATION_MARK_CHAR_CODE = 0x3005;
+
+// Japanese character ranges, roughly ordered in order of expected frequency
+const JP_JAPANESE_RANGES = [
+ JP_HIRAGANA_RANGE,
+ JP_KATAKANA_RANGE,
+
+ JP_CJK_COMMON_RANGE,
+ JP_CJK_RARE_RANGE,
+
+ [0xff66, 0xff9f], // Halfwidth katakana
+
+ [0x30fb, 0x30fc], // Katakana punctuation
+ [0xff61, 0xff65], // Kana punctuation
+ [0x3000, 0x303f], // CJK punctuation
+
+ [0xff10, 0xff19], // Fullwidth numbers
+ [0xff21, 0xff3a], // Fullwidth upper case Latin letters
+ [0xff41, 0xff5a], // Fullwidth lower case Latin letters
+
+ [0xff01, 0xff0f], // Fullwidth punctuation 1
+ [0xff1a, 0xff1f], // Fullwidth punctuation 2
+ [0xff3b, 0xff3f], // Fullwidth punctuation 3
+ [0xff5b, 0xff60], // Fullwidth punctuation 4
+ [0xffe0, 0xffee], // Currency markers
+];
+
function jpIsKanji(c) {
const code = c.charCodeAt(0);