about summary refs log tree commit diff
diff options
context:
space:
mode:
author lonkaars <loek@pipeframe.xyz> 2023-07-02 19:21:39 +0200
committer lonkaars <loek@pipeframe.xyz> 2023-07-02 19:21:39 +0200
commit f4963b89ee542592e9ae95ca29d74ddc57841c3f (patch)
tree f9ae82ae9549330d12a30ffee8960f2577fff9aa
parent ce9e0788317b25e5d297ed38d9fed0754a341288 (diff)
implement Japanese class
-rw-r--r-- api/japanese.ts 153
-rw-r--r-- api/word.ts 1
-rw-r--r-- deno.lock 1
-rw-r--r-- test/api-japanese.ts 24
-rw-r--r-- test/halfwidth2fullwidth.ts 5
-rw-r--r-- util/array.ts 14
-rw-r--r-- util/japanese.ts 133
-rw-r--r-- util/number.ts 11
-rw-r--r-- util/string.ts 48
9 files changed, 376 insertions, 14 deletions
diff --git a/api/japanese.ts b/api/japanese.ts
new file mode 100644
index 0000000..0396821
--- /dev/null
+++ b/api/japanese.ts
@@ -0,0 +1,153 @@
+import { escape } from "https://deno.land/std@0.192.0/html/entities.ts";
+
+import "../util/string.ts";
+import "../util/japanese.ts";
+import "../util/array.ts";
+
+/** @interface JapaneseToken */
+interface JapaneseToken {
+ /** @prop token writing (kanji/katakana/hiragana) */
+ writing: string;
+ /** @prop token reading (katakana/hiragana) */
+ reading: string;
+ /** @prop normalized token reading (always hiragana) */
+ normalized: string;
+ /** @prop show reading when parsed by formatter */
+ ruby: boolean;
+};
+
+/** @class Japanese string with reading and output formatters */
+export default class Japanese {
+ public writing: string;
+ public reading: string;
+ private normalized: string;
+
+ private formatters = {
+ "HTML": tokens => tokens.reduce((out, token) => {
+ if (token.ruby) out += `<ruby>${escape(token.writing)}<rt>${escape(token.reading)}</rt></ruby>`;
+ else out += token.writing;
+ return out;
+ }, ""),
+ "parenthesis": tokens => tokens.reduce((out, token) => {
+ if (token.ruby) out += `${token.writing}(${token.reading}) `;
+ else out += token.writing;
+ return out;
+ }, ""),
+ "refold-tools": tokens => tokens.reduce((out, token) => {
+ if (token.ruby) out += `[${token.writing}](${token.reading})`;
+ else out += token.writing;
+ return out;
+ }, ""),
+ } satisfies Record<string, (tokens: Array<JapaneseToken>) => string>;
+
+ constructor(writing: string, reading: string) {
+ this.writing = writing;
+ this.reading = reading;
+ this.normalized = reading.normalizeKana();
+ }
+
+ /** @summary format this as text with furigana */
+ public furigana(format: keyof typeof this.formatters = "HTML"): string {
+ return this.formatters[format](this.tokenize());
+ }
+
+ /**
+ * @summary attempt to match kana in this.reading to sections of the same
+ * script in this.writing
+ */
+ private tokenize(): Array<JapaneseToken> {
+ var tokens: Array<JapaneseToken> = [];
+
+ // split this.writing into tokens with different scripts
+ var token: JapaneseToken = {
+ writing: "",
+ reading: "",
+ normalized: "",
+ ruby: true,
+ };
+ var kana: boolean = this.writing[0].kanaOnly();
+ for (var char of this.writing) {
+ if (char.kanaOnly() != kana) {
+ tokens.push({ ...token });
+ token.writing = "";
+ }
+ token.writing += char;
+ kana = char.kanaOnly();
+ }
+ tokens.push(token);
+
+ // find kana-only tokens and normalize them
+ tokens = tokens.map(token => {
+ if (!token.writing.kanaOnly()) return token;
+ token.normalized = token.writing.normalizeKana();
+ token.reading = token.writing;
+ token.ruby = false;
+ return token;
+ });
+
+ // don't try to spread reading across kanji if there is only one kanji/kana string
+ if (tokens.length == 1) {
+ tokens[0].reading = this.reading;
+ tokens[0].normalized = this.reading.normalizeKana();
+ tokens[0].ruby = !this.writing.kanaOnly();
+ return tokens;
+ }
+
+ // each entry is one candidate arrangement: the start index in this.normalized of every anchor token, in token order
+ var possibilities: Array<Array<number>> = [];
+ // find all possible arrangements (in-order) of anchor indices in this.reading
+ var match = (tokenIndex: number = 0, searchStart: number = -1, path: Array<number> = []): void => {
+ // this arrangement is a possibility because the last token fit
+ if (tokenIndex == tokens.length) {
+ possibilities.push(path);
+ return;
+ }
+ // skip tokens with no known reading yet; kana-only tokens (non-empty normalized) are the 'anchors'
+ if (tokens[tokenIndex].normalized.length == 0) return match(tokenIndex + 1, searchStart, path);
+
+ // try all positions where current (anchor) token fits in this.reading
+ while ((searchStart = this.normalized.indexOf(tokens[tokenIndex].normalized, searchStart + 1)) != -1) {
+ match(tokenIndex + 1, searchStart, [...path, searchStart]);
+ }
+ };
+ match();
+
+ // create index slices from possibilities
+ var slices = possibilities
+ .map(match => { // convert start index of anchor to start and stop index (based on anchor length)
+ var out = [0];
+ let matchIndex = 0;
+ for (let tokenIndex = 0; tokenIndex < tokens.length; tokenIndex++) {
+ if (tokens[tokenIndex].normalized.length == 0) continue;
+ out.push(match[matchIndex], match[matchIndex] + tokens[tokenIndex].writing.length);
+ matchIndex++;
+ }
+ if (out.peek() != this.reading.length) out.push(this.reading.length);
+ return out;
+ })
+ .filter(slice => slice.length == tokens.length + 1)
+ .filter(slice => slice.isUniq()) // slice can't contain sections with 0 length
+ .filter(slice => slice.peek() == this.reading.length); // slice should match entire reading
+
+ // fall back to one ruby token spanning the whole writing if there is no valid way to split reading across kanji
+ if (slices.length == 0) {
+ return [{
+ writing: this.writing,
+ reading: this.reading,
+ normalized: this.normalized,
+ ruby: true,
+ }];
+ }
+
+ var slice = slices[0]; // TODO: pick most "balanced" out of these instead
+
+ for (let i = 0; i < tokens.length; i++) {
+ // slice[i+1] is safe because slice.length == tokens.length + 1
+ tokens[i].reading = this.reading.substring(slice[i], slice[i+1]);
+ tokens[i].normalized = tokens[i].reading.normalizeKana();
+ }
+
+ return tokens;
+ }
+}
+
diff --git a/api/word.ts b/api/word.ts
index 63dce10..7eba936 100644
--- a/api/word.ts
+++ b/api/word.ts
@@ -5,6 +5,7 @@ import { ParseToken } from "../language/types.ts";
export default class Word extends APIBase {
public writing = "TODO";
public reading = "TODO";
+ public conjugated = "TODO";
constructor() {
super();
diff --git a/deno.lock b/deno.lock
index 9705540..b57c7aa 100644
--- a/deno.lock
+++ b/deno.lock
@@ -61,6 +61,7 @@
"https://deno.land/std@0.192.0/async/retry.ts": "6521c061a5ab24e8b1ae624bdc581c4243d1d574f99dc7f5a2a195c2241fb1b8",
"https://deno.land/std@0.192.0/async/tee.ts": "47e42d35f622650b02234d43803d0383a89eb4387e1b83b5a40106d18ae36757",
"https://deno.land/std@0.192.0/fmt/colors.ts": "d67e3cd9f472535241a8e410d33423980bec45047e343577554d3356e1f0ef4e",
+ "https://deno.land/std@0.192.0/html/entities.ts": "1c9fa4d76e36a9bdbe370a65f1612771f3cc2cf802d217b4e633850e2fa25c16",
"https://deno.land/std@0.192.0/http/server.ts": "1b23463b5b36e4eebc495417f6af47a6f7d52e3294827a1226d2a1aab23d9d20",
"https://deno.land/std@0.192.0/testing/_diff.ts": "1a3c044aedf77647d6cac86b798c6417603361b66b54c53331b312caeb447aea",
"https://deno.land/std@0.192.0/testing/_format.ts": "a69126e8a469009adf4cf2a50af889aca364c349797e63174884a52ff75cf4c7",
diff --git a/test/api-japanese.ts b/test/api-japanese.ts
new file mode 100644
index 0000000..86f2ac1
--- /dev/null
+++ b/test/api-japanese.ts
@@ -0,0 +1,24 @@
+import Japanese from "../api/japanese.ts";
+
+// https://japanese.stackexchange.com/questions/69521/reading-per-kanji-irregular-readings
+// wow
+
+var cases = [
+ ["繰り返す", "くりかえす"], // [繰](く)り[返](かえ)す
+ ["漢字テスト", "かんじてすと"], // [漢字](かんじ)テスト
+ ["凛々しく", "りりしく"], // [凛々](りり)しく
+ ["字のテスト", "じのテスト"], // [字](じ)のテスト
+ ["文字", "っ"], // [文字](っ)
+ ["文字りす", "りりりす"], // [文字](りり)りす
+ ["気を引き締める", "きをひきしめる"], // [気](き)を[引](ひ)き[締](し)める
+ ["文字り漢字", "りりりり"], // ?????
+ ["大口魚", "たら"], // [大口魚](たら)
+] satisfies Array<[string, string]>;
+
+for (var args of cases) {
+ var test = new Japanese(...args as [string, string]);
+ // console.log(test.reading);
+ // console.log(test["tokenize"]());
+ console.log(JSON.stringify(args) + " -> " + test.furigana("parenthesis"));
+}
+
diff --git a/test/halfwidth2fullwidth.ts b/test/halfwidth2fullwidth.ts
new file mode 100644
index 0000000..4855f7f
--- /dev/null
+++ b/test/halfwidth2fullwidth.ts
@@ -0,0 +1,5 @@
+import "../util/japanese.ts";
+
+console.log("これが オレのウィンターバケーション スタイル!".widenKatakana().katakanaToHiragana());
+
+
diff --git a/util/array.ts b/util/array.ts
index f032935..c5a26c6 100644
--- a/util/array.ts
+++ b/util/array.ts
@@ -1,3 +1,5 @@
+import "./set.ts";
+
declare global {
interface Array<T> {
/** @summary check if any of the elements of `arr2` are included in `this` */
@@ -8,6 +10,10 @@ declare global {
set(): Set<T>;
/** @summary clear array */
clear(): void;
+ /** @summary filter duplicates from array */
+ filterDuplicates(): Array<T>;
+ /** @summary `true` if the array doesn't contain duplicate items */
+ isUniq(): boolean;
}
}
@@ -27,3 +33,11 @@ Array.prototype.clear = function() {
while (this.length > 0) this.pop();
}
+Array.prototype.filterDuplicates = function() {
+ return this.set().arr(); // TODO: optimize this
+}
+
+Array.prototype.isUniq = function() {
+ return this.length == this.filterDuplicates().length;
+}
+
diff --git a/util/japanese.ts b/util/japanese.ts
index 2017280..d398b60 100644
--- a/util/japanese.ts
+++ b/util/japanese.ts
@@ -1,4 +1,5 @@
import { UnicodeRange } from "./string.ts";
+import "./number.ts";
declare global {
interface String {
@@ -57,6 +58,15 @@ declare global {
* `strict` to true
*/
japaneseOnly(strict?: boolean): boolean
+
+ /** @summary convert any half-width katakana to full-width */
+ widenKatakana(): string;
+
+ /** @summary convert any full-width katakana to hiragana */
+ katakanaToHiragana(): string;
+
+ /** @summary convert any kana (full and half-width) to full-width hiragana */
+ normalizeKana(): string;
}
}
@@ -85,7 +95,7 @@ function stringOnly(input: string, check: (key: string, val: number) => StringOn
String.prototype.hiraganaOnly = function(strict = false) {
return stringOnly(this as string, (key, val) => {
- if (key == UnicodeRange.JapaneseHiragana)
+ if (key == UnicodeRange.JapaneseFWHiragana)
return StringOnlyReturnValue.TallyAdd; // count hiragana characters
else if (!strict && key.startsWith("any-"))
return StringOnlyReturnValue.TallyIgnore; // allow any- (ascii whitespace and punctuation)
@@ -97,7 +107,7 @@ String.prototype.hiraganaOnly = function(strict = false) {
String.prototype.katakanaOnly = function(strict = false) {
return stringOnly(this as string, (key, val) => {
- if (key == UnicodeRange.JapaneseKatakana)
+ if ([UnicodeRange.JapaneseHWKatakana, UnicodeRange.JapaneseFWKatakana].includes(key as UnicodeRange))
return StringOnlyReturnValue.TallyAdd; // count katakana characters
else if (!strict && key.startsWith("any-"))
return StringOnlyReturnValue.TallyIgnore; // allow any- (ascii whitespace and punctuation)
@@ -121,7 +131,7 @@ String.prototype.kanjiOnly = function(strict = false) {
String.prototype.kanaOnly = function(strict = false) {
return stringOnly(this as string, (key, val) => {
- if (key == UnicodeRange.JapaneseHiragana || key == UnicodeRange.JapaneseKatakana)
+ if ([UnicodeRange.JapaneseHWKatakana, UnicodeRange.JapaneseFWKatakana, UnicodeRange.JapaneseFWHiragana].includes(key as UnicodeRange))
return StringOnlyReturnValue.TallyAdd; // count kana characters
else if (!strict && key.startsWith("any-"))
return StringOnlyReturnValue.TallyIgnore; // allow any- (ascii whitespace and punctuation)
@@ -143,3 +153,120 @@ String.prototype.japaneseOnly = function(strict = false) {
});
}
+String.prototype.widenKatakana = function() {
+ const map: { [key: string]: string } = {
+ "ァ": "ァ",
+ "ア": "ア",
+ "ィ": "ィ",
+ "イ": "イ",
+ "ゥ": "ゥ",
+ "ウ": "ウ",
+ "ェ": "ェ",
+ "エ": "エ",
+ "ォ": "ォ",
+ "オ": "オ",
+ "ガ": "ガ",
+ "カ": "カ",
+ "ギ": "ギ",
+ "キ": "キ",
+ "グ": "グ",
+ "ク": "ク",
+ "ゲ": "ゲ",
+ "ケ": "ケ",
+ "ゴ": "ゴ",
+ "コ": "コ",
+ "ザ": "ザ",
+ "サ": "サ",
+ "ジ": "ジ",
+ "シ": "シ",
+ "ズ": "ズ",
+ "ス": "ス",
+ "ゼ": "ゼ",
+ "セ": "セ",
+ "ゾ": "ゾ",
+ "ソ": "ソ",
+ "ダ": "ダ",
+ "タ": "タ",
+ "ヂ": "ヂ",
+ "チ": "チ",
+ "ヅ": "ヅ",
+ "ッ": "ッ",
+ "ツ": "ツ",
+ "デ": "デ",
+ "テ": "テ",
+ "ド": "ド",
+ "ト": "ト",
+ "ナ": "ナ",
+ "ニ": "ニ",
+ "ヌ": "ヌ",
+ "ネ": "ネ",
+ "ノ": "ノ",
+ "バ": "バ",
+ "パ": "パ",
+ "ハ": "ハ",
+ "ビ": "ビ",
+ "ピ": "ピ",
+ "ヒ": "ヒ",
+ "ブ": "ブ",
+ "プ": "プ",
+ "フ": "フ",
+ "ベ": "ベ",
+ "ペ": "ペ",
+ "ヘ": "ヘ",
+ "ボ": "ボ",
+ "ポ": "ポ",
+ "ホ": "ホ",
+ "マ": "マ",
+ "ミ": "ミ",
+ "ム": "ム",
+ "メ": "メ",
+ "モ": "モ",
+ "ャ": "ャ",
+ "ヤ": "ヤ",
+ "ュ": "ュ",
+ "ユ": "ユ",
+ "ョ": "ョ",
+ "ヨ": "ヨ",
+ "ラ": "ラ",
+ "リ": "リ",
+ "ル": "ル",
+ "レ": "レ",
+ "ロ": "ロ",
+ "ワ": "ワ",
+ "ヲ": "ヲ",
+ "ン": "ン",
+ "ヴ": "ヴ",
+ "ヷ": "ヷ",
+ "イ゙": "イ゙",
+ "エ゙": "エ゙",
+ "ヺ": "ヺ",
+ "ー": "ー",
+ };
+
+ var out = "";
+ outer:
+ for (let i = 0; i < this.length; i++) {
+ for (var key in map) {
+ if (!this.substring(i).startsWith(key)) continue;
+ out += map[key];
+ i += key.length - 1;
+ continue outer;
+ }
+
+ out += this[i];
+ }
+ return out;
+}
+
+String.prototype.katakanaToHiragana = function() {
+ return this.map(char => {
+ var code = char.codePointAt(0)!;
+ if (0x30a1 <= code && code <= 0x30f6) return (code + (0x3041 - 0x30a1)).toChar();
+ return char;
+ })
+}
+
+String.prototype.normalizeKana = function() {
+ return this.widenKatakana().katakanaToHiragana();
+}
+
diff --git a/util/number.ts b/util/number.ts
new file mode 100644
index 0000000..c28864f
--- /dev/null
+++ b/util/number.ts
@@ -0,0 +1,11 @@
+declare global {
+ interface Number {
+ /** @summary convert number to character by charCode */
+ toChar(): string;
+ }
+}
+
+Number.prototype.toChar = function() {
+ return String.fromCharCode(this as number);
+}
+
diff --git a/util/string.ts b/util/string.ts
index 397dcd6..327b884 100644
--- a/util/string.ts
+++ b/util/string.ts
@@ -26,6 +26,12 @@ declare global {
* @argument fallback return this value if parsing fails
*/
json(fallback?: any): any;
+
+ /**
+ * @summary map each character of a string to another character using
+ * `mapFn`
+ */
+ map(mapFn: (char: string) => string): string;
}
}
@@ -34,11 +40,15 @@ export enum UnicodeRange {
Whitespace = "any-whitespace",
Punctuation = "any-punctuation",
Unknown = "any-unknown",
- JapanesePunctuation = "jp-punctuation",
- JapaneseHiragana = "jp-hiragana",
- JapaneseKatakana = "jp-katakana",
- JapaneseFWLatinHWKatakana = "jp-full-width-latin-half-width-katakana",
+ JapaneseFWPunctuation = "jp-full-width-punctuation",
+ JapaneseHWPunctuation = "jp-half-width-punctuation",
+ JapaneseFWHiragana = "jp-full-width-hiragana",
+ JapaneseFWKatakana = "jp-full-width-katakana",
+ JapaneseFWLatin = "jp-full-width-latin",
+ JapaneseHWKatakana = "jp-half-width-katakana",
JapaneseKanji = "jp-kanji",
+ JapaneseKanjiRadicals = "jp-kanji-radicals",
+ JapaneseAuxiliary = "jp-aux",
}
type RangeTally = Record<UnicodeRange, number>;
@@ -52,12 +62,22 @@ String.prototype.range = function() {
if (0x2e == code) return UnicodeRange.Punctuation; // full stop
if (0x3f == code) return UnicodeRange.Punctuation; // question mark
+ // https://stackoverflow.com/a/53807563
if (0x0000 <= code && code <= 0x007f) return UnicodeRange.BasicLatin;
- if (0x3000 <= code && code <= 0x303f) return UnicodeRange.JapanesePunctuation;
- if (0x3040 <= code && code <= 0x309f) return UnicodeRange.JapaneseHiragana;
- if (0x30a0 <= code && code <= 0x30ff) return UnicodeRange.JapaneseKatakana;
- if (0xff00 <= code && code <= 0xffef) return UnicodeRange.JapaneseFWLatinHWKatakana;
- if (0x4e00 <= code && code <= 0x9faf) return UnicodeRange.JapaneseKanji;
+ if (0x2e80 <= code && code <= 0x2fd5) return UnicodeRange.JapaneseKanjiRadicals;
+ if (0x3000 <= code && code <= 0x303f) return UnicodeRange.JapaneseFWPunctuation;
+ if (0xff5f <= code && code <= 0xff60) return UnicodeRange.JapaneseFWPunctuation;
+ if (0x3041 <= code && code <= 0x3096) return UnicodeRange.JapaneseFWHiragana;
+ if (0x30a1 <= code && code <= 0x30ff) return UnicodeRange.JapaneseFWKatakana;
+ if (0x3400 <= code && code <= 0x4db5) return UnicodeRange.JapaneseKanji;
+ if (0x4e00 <= code && code <= 0x9fcb) return UnicodeRange.JapaneseKanji;
+ if (0xf900 <= code && code <= 0xfa6a) return UnicodeRange.JapaneseKanji;
+ if (0xff61 <= code && code <= 0xff65) return UnicodeRange.JapaneseHWPunctuation;
+ if (0xff66 <= code && code <= 0xff9f) return UnicodeRange.JapaneseHWKatakana;
+ if (0x31f0 <= code && code <= 0x31ff) return UnicodeRange.JapaneseAuxiliary;
+ if (0x3220 <= code && code <= 0x3243) return UnicodeRange.JapaneseAuxiliary;
+ if (0x3280 <= code && code <= 0x337f) return UnicodeRange.JapaneseAuxiliary;
+ if (0xff01 <= code && code <= 0xff5e) return UnicodeRange.JapaneseFWLatin;
return UnicodeRange.Unknown;
}
@@ -71,11 +91,11 @@ String.prototype.parseTags = function() {
return parseTags(this as string);
}
-String.prototype.removeAll = function(searchValue: string | RegExp) {
+String.prototype.removeAll = function(searchValue) {
return this.replaceAll(searchValue, "");
}
-String.prototype.json = function(fallback?: any) {
+String.prototype.json = function(fallback) {
if (fallback) {
try {
return JSON.parse(this as string);
@@ -87,3 +107,9 @@ String.prototype.json = function(fallback?: any) {
}
}
+String.prototype.map = function(mapFn) {
+ var out = "";
+ for (var char of this) out += mapFn(char);
+ return out;
+}
+