implement Japanese class

author: lonkaars <loek@pipeframe.xyz> 2023-07-02 19:21:39 +0200
committer: lonkaars <loek@pipeframe.xyz> 2023-07-02 19:21:39 +0200
commit: f4963b89ee542592e9ae95ca29d74ddc57841c3f (patch)
tree: f9ae82ae9549330d12a30ffee8960f2577fff9aa
parent: ce9e0788317b25e5d297ed38d9fed0754a341288 (diff)
9 files changed, 376 insertions, 14 deletions
diff --git a/api/japanese.ts b/api/japanese.ts
new file mode 100644
index 0000000..0396821
--- /dev/null
+++ b/api/japanese.ts
@@ -0,0 +1,153 @@
+import { escape } from "https://deno.land/std@0.192.0/html/entities.ts";
+
+import "../util/string.ts";
+import "../util/japanese.ts";
+import "../util/array.ts";
+
+/** @interface JapaneseToken run of same-script characters from a writing, paired with its reading */
+interface JapaneseToken {
+	/** @prop token writing (kanji/katakana/hiragana) */
+	writing: string;
+	/** @prop token reading (katakana/hiragana) */
+	reading: string;
+	/** @prop normalized token reading (always hiragana); stays empty until a reading is known for the token */
+	normalized: string;
+	/** @prop show reading when parsed by formatter (formatters print only the writing when false) */
+	ruby: boolean;
+};
+
+/** @class Japanese string with reading and output formatters */
+export default class Japanese {
+	public writing: string; // text as written (may mix kanji and kana)
+	public reading: string; // kana reading of the whole writing, as passed to the constructor
+	private normalized: string; // this.reading after normalizeKana(), used for anchor matching
+
+	private formatters = { // output formats selectable through furigana()
+		"HTML": tokens => tokens.reduce((out, token) => {
+			if (token.ruby) out += `<ruby>${escape(token.writing)}<rt>${escape(token.reading)}</rt></ruby>`;
+			else out += escape(token.writing); // plain tokens land in the same markup, so escape them too
+			return out;
+		}, ""),
+		"parenthesis": tokens => tokens.reduce((out, token) => {
+			if (token.ruby) out += `${token.writing}(${token.reading}) `;
+			else out += token.writing;
+			return out;
+		}, ""),
+		"refold-tools": tokens => tokens.reduce((out, token) => {
+			if (token.ruby) out += `[${token.writing}](${token.reading})`;
+			else out += token.writing;
+			return out;
+		}, ""),
+	} satisfies Record<string, (tokens: Array<JapaneseToken>) => string>;
+
+	constructor(writing: string, reading: string) {
+		this.writing = writing;
+		this.reading = reading;
+		this.normalized = reading.normalizeKana();
+	}
+
+	/** @summary format this as text with furigana, `format` picks one of this.formatters (default "HTML") */
+	public furigana(format: keyof typeof this.formatters = "HTML"): string {
+		return this.formatters[format](this.tokenize());
+	}
+
+	/**
+	 * @summary split this.writing into same-script tokens and spread this.reading
+	 * over them, using the kana tokens as anchors; falls back to one ruby token
+	 */
+	private tokenize(): Array<JapaneseToken> {
+		var tokens: Array<JapaneseToken> = [];
+
+		// split this.writing into tokens with different scripts
+		var token: JapaneseToken = {
+			writing: "",
+			reading: "",
+			normalized: "",
+			ruby: true,
+		};
+		var kana: boolean = this.writing.length > 0 && this.writing[0].kanaOnly(); // guard: "" has no first character
+		for (var char of this.writing) {
+			if (char.kanaOnly() != kana) {
+				tokens.push({ ...token });
+				token.writing = "";
+			}
+			token.writing += char;
+			kana = char.kanaOnly();
+		}
+		tokens.push(token);
+
+		// find kana-only tokens and normalize them
+		tokens = tokens.map(token => {
+			if (!token.writing.kanaOnly()) return token;
+			token.normalized = token.writing.normalizeKana();
+			token.reading = token.writing;
+			token.ruby = false;
+			return token;
+		});
+
+		// don't try to spread reading across kanji if there is only one kanji/kana string
+		if (tokens.length == 1) {
+			tokens[0].reading = this.reading;
+			tokens[0].normalized = this.reading.normalizeKana();
+			tokens[0].ruby = !this.writing.kanaOnly();
+			return tokens;
+		}
+
+		// list of indices where anchor token could be in reading
+		var possibilities: Array<Array<number>> = [];
+		// find all possible arrangements (in-order) of anchor indices in this.reading
+		var match = (tokenIndex: number = 0, searchStart: number = -1, path: Array<number> = []): void => {
+			// this arrangement is a possibility because the last token fit
+			if (tokenIndex == tokens.length) {
+				possibilities.push(path);
+				return;
+			}
+			// skip until next 'anchor' token
+			if (tokens[tokenIndex].normalized.length == 0) return match(tokenIndex + 1, searchStart, path);
+
+			// try all positions where current (anchor) token fits in this.reading
+			while ((searchStart = this.normalized.indexOf(tokens[tokenIndex].normalized, searchStart + 1)) != -1) {
+				match(tokenIndex + 1, searchStart + tokens[tokenIndex].normalized.length, [...path, searchStart]); // next anchor must start past this one's end
+			}
+		};
+		match();
+
+		// create index slices from possibilities
+		var slices = possibilities
+			.map(match => { // convert start index of anchor to start and stop index (based on anchor length)
+				var out: Array<number> = tokens[0].normalized.length == 0 ? [0] : []; // a leading anchor supplies the first boundary itself
+				let matchIndex = 0;
+				for (let tokenIndex = 0; tokenIndex < tokens.length; tokenIndex++) {
+					if (tokens[tokenIndex].normalized.length == 0) continue;
+					out.push(match[matchIndex], match[matchIndex] + tokens[tokenIndex].normalized.length); // width as matched in this.normalized
+					matchIndex++;
+				}
+				if (out.peek() != this.reading.length) out.push(this.reading.length);
+				return out;
+			})
+			.filter(slice => slice.length == tokens.length + 1)
+			.filter(slice => slice.isUniq()) // slice can't contain sections with 0 length
+			.filter(slice => slice[0] == 0 && slice.peek() == this.reading.length); // slice should match entire reading, first to last character
+
+		// cop-out if there is no valid way to split reading across kanji
+		if (slices.length == 0) {
+			return [{
+				writing: this.writing,
+				reading: this.reading,
+				normalized: this.normalized,
+				ruby: true,
+			}];
+		}
+
+		var slice = slices[0]; // TODO: pick most "balanced" out of these instead
+
+		for (let i = 0; i < tokens.length; i++) {
+			// slice[i+1] is safe because slice.length == tokens.length + 1
+			tokens[i].reading = this.reading.substring(slice[i], slice[i+1]);
+			tokens[i].normalized = tokens[i].reading.normalizeKana();
+		}
+
+		return tokens;
+	}
+}
+
diff --git a/api/word.ts b/api/word.ts
index 63dce10..7eba936 100644
--- a/api/word.ts
+++ b/api/word.ts
@@ -5,6 +5,7 @@ import { ParseToken } from "../language/types.ts";
 export default class Word extends APIBase {
   public writing = "TODO";
   public reading = "TODO";
+	public conjugated = "TODO";
 
   constructor() {
     super();
diff --git a/deno.lock b/deno.lock
index 9705540..b57c7aa 100644
--- a/deno.lock
+++ b/deno.lock
@@ -61,6 +61,7 @@
     "https://deno.land/std@0.192.0/async/retry.ts": "6521c061a5ab24e8b1ae624bdc581c4243d1d574f99dc7f5a2a195c2241fb1b8",
     "https://deno.land/std@0.192.0/async/tee.ts": "47e42d35f622650b02234d43803d0383a89eb4387e1b83b5a40106d18ae36757",
     "https://deno.land/std@0.192.0/fmt/colors.ts": "d67e3cd9f472535241a8e410d33423980bec45047e343577554d3356e1f0ef4e",
+    "https://deno.land/std@0.192.0/html/entities.ts": "1c9fa4d76e36a9bdbe370a65f1612771f3cc2cf802d217b4e633850e2fa25c16",
     "https://deno.land/std@0.192.0/http/server.ts": "1b23463b5b36e4eebc495417f6af47a6f7d52e3294827a1226d2a1aab23d9d20",
     "https://deno.land/std@0.192.0/testing/_diff.ts": "1a3c044aedf77647d6cac86b798c6417603361b66b54c53331b312caeb447aea",
     "https://deno.land/std@0.192.0/testing/_format.ts": "a69126e8a469009adf4cf2a50af889aca364c349797e63174884a52ff75cf4c7",
diff --git a/test/api-japanese.ts b/test/api-japanese.ts
new file mode 100644
index 0000000..86f2ac1
--- /dev/null
+++ b/test/api-japanese.ts
@@ -0,0 +1,24 @@
+import Japanese from "../api/japanese.ts";
+
+// https://japanese.stackexchange.com/questions/69521/reading-per-kanji-irregular-readings
+// wow
+
+const cases = [
+	["繰り返す", "くりかえす"], // [繰](く)り[返](かえ)す
+	["漢字テスト", "かんじてすと"], // [漢字](かんじ)テスト
+	["凛々しく", "りりしく"], // [凛々](りり)しく
+	["字のテスト", "じのテスト"], // [字](じ)のテスト
+	["文字", "っ"], // [文字](っ)
+	["文字りす", "りりりす"], // [文字](りり)りす
+	["気を引き締める", "きをひきしめる"], // [気](き)を[引](ひ)き[締](し)める
+	["文字り漢字", "りりりり"], // ambiguous: the anchor り fits at more than one position in the reading
+	["大口魚", "たら"], // [大口魚](たら)
+] satisfies Array<[string, string]>;
+
+for (const pair of cases) {
+	const japanese = new Japanese(...pair as [string, string]);
+	// console.log(japanese.reading);
+	// console.log(japanese["tokenize"]());
+	console.log(`${JSON.stringify(pair)} -> ${japanese.furigana("parenthesis")}`);
+}
+
diff --git a/test/halfwidth2fullwidth.ts b/test/halfwidth2fullwidth.ts
new file mode 100644
index 0000000..4855f7f
--- /dev/null
+++ b/test/halfwidth2fullwidth.ts
@@ -0,0 +1,5 @@
+import "../util/japanese.ts";
+
+console.log("これが ｵﾚのｳｨﾝﾀｰﾊﾞｹｰｼｮﾝ ｽﾀｲﾙ！".widenKatakana().katakanaToHiragana()); // mixed sample: the half-width katakana should come out as full-width hiragana
+
+
diff --git a/util/array.ts b/util/array.ts
index f032935..c5a26c6 100644
--- a/util/array.ts
+++ b/util/array.ts
@@ -1,3 +1,5 @@
+import "./set.ts";
+
 declare global {
 	interface Array<T> {
 		/** @summary check if any of the elements of `arr2` are included in `this` */
@@ -8,6 +10,10 @@ declare global {
 		set(): Set<T>;
 		/** @summary clear array */
 		clear(): void;
+		/** @summary filter duplicates from array */
+		filterDuplicates(): Array<T>;
+		/** @summary `true` if the array doesn't contain duplicate items */
+		isUniq(): boolean;
 	}
 }
 
@@ -27,3 +33,11 @@ Array.prototype.clear = function() {
 	while (this.length > 0) this.pop();
 }
 
+Array.prototype.filterDuplicates = function() {
+	return this.set().arr(); // round-trips through a Set (arr() presumably turns it back into an array, see ./set.ts); TODO: optimize this
+}
+
+Array.prototype.isUniq = function() {
+	return this.length == this.filterDuplicates().length; // dropping duplicates shortens the copy, so equal lengths mean no item repeats
+}
+
diff --git a/util/japanese.ts b/util/japanese.ts
index 2017280..d398b60 100644
--- a/util/japanese.ts
+++ b/util/japanese.ts
@@ -1,4 +1,5 @@
 import { UnicodeRange } from "./string.ts";
+import "./number.ts";
 
 declare global {
 	interface String {
@@ -57,6 +58,15 @@ declare global {
 		 * `strict` to true
 		 */
 		japaneseOnly(strict?: boolean): boolean
+
+		/** @summary convert any half-width katakana to full-width */
+		widenKatakana(): string;
+
+		/** @summary convert any full-width katakana to hiragana */
+		katakanaToHiragana(): string;
+
+		/** @summary convert any kana (full and half-width) to full-width hiragana */
+		normalizeKana(): string;
 	}
 }
 
@@ -85,7 +95,7 @@ function stringOnly(input: string, check: (key: string, val: number) => StringOn
 
 String.prototype.hiraganaOnly = function(strict = false) {
 	return stringOnly(this as string, (key, val) => {
-		if (key == UnicodeRange.JapaneseHiragana)
+		if (key == UnicodeRange.JapaneseFWHiragana)
 			return StringOnlyReturnValue.TallyAdd; // count hiragana characters
 		else if (!strict && key.startsWith("any-"))
 			return StringOnlyReturnValue.TallyIgnore; // allow any- (ascii whitespace and punctuation)
@@ -97,7 +107,7 @@ String.prototype.hiraganaOnly = function(strict = false) {
 
 String.prototype.katakanaOnly = function(strict = false) {
 	return stringOnly(this as string, (key, val) => {
-		if (key == UnicodeRange.JapaneseKatakana)
+		if ([UnicodeRange.JapaneseHWKatakana, UnicodeRange.JapaneseFWKatakana].includes(key as UnicodeRange))
 			return StringOnlyReturnValue.TallyAdd; // count katakana characters
 		else if (!strict && key.startsWith("any-"))
 			return StringOnlyReturnValue.TallyIgnore; // allow any- (ascii whitespace and punctuation)
@@ -121,7 +131,7 @@ String.prototype.kanjiOnly = function(strict = false) {
 
 String.prototype.kanaOnly = function(strict = false) {
 	return stringOnly(this as string, (key, val) => {
-		if (key == UnicodeRange.JapaneseHiragana || key == UnicodeRange.JapaneseKatakana)
+		if ([UnicodeRange.JapaneseHWKatakana, UnicodeRange.JapaneseFWKatakana, UnicodeRange.JapaneseFWHiragana].includes(key as UnicodeRange))
 			return StringOnlyReturnValue.TallyAdd; // count kana characters
 		else if (!strict && key.startsWith("any-"))
 			return StringOnlyReturnValue.TallyIgnore; // allow any- (ascii whitespace and punctuation)
@@ -143,3 +153,120 @@ String.prototype.japaneseOnly = function(strict = false) {
 	});
 }
 
+String.prototype.widenKatakana = function() { // half-width katakana -> full-width; every other character passes through
+	const map: { [key: string]: string } = { // two-character keys are a base kana plus a voicing mark (ﾞ or ﾟ)
+		"ｧ": "ァ",
+		"ｱ": "ア",
+		"ｨ": "ィ",
+		"ｲ": "イ",
+		"ｩ": "ゥ",
+		"ｳ": "ウ",
+		"ｪ": "ェ",
+		"ｴ": "エ",
+		"ｫ": "ォ",
+		"ｵ": "オ",
+		"ｶﾞ": "ガ",
+		"ｶ": "カ",
+		"ｷﾞ": "ギ",
+		"ｷ": "キ",
+		"ｸﾞ": "グ",
+		"ｸ": "ク",
+		"ｹﾞ": "ゲ",
+		"ｹ": "ケ",
+		"ｺﾞ": "ゴ",
+		"ｺ": "コ",
+		"ｻﾞ": "ザ",
+		"ｻ": "サ",
+		"ｼﾞ": "ジ",
+		"ｼ": "シ",
+		"ｽﾞ": "ズ",
+		"ｽ": "ス",
+		"ｾﾞ": "ゼ",
+		"ｾ": "セ",
+		"ｿﾞ": "ゾ",
+		"ｿ": "ソ",
+		"ﾀﾞ": "ダ",
+		"ﾀ": "タ",
+		"ﾁﾞ": "ヂ",
+		"ﾁ": "チ",
+		"ﾂﾞ": "ヅ",
+		"ｯ": "ッ",
+		"ﾂ": "ツ",
+		"ﾃﾞ": "デ",
+		"ﾃ": "テ",
+		"ﾄﾞ": "ド",
+		"ﾄ": "ト",
+		"ﾅ": "ナ",
+		"ﾆ": "ニ",
+		"ﾇ": "ヌ",
+		"ﾈ": "ネ",
+		"ﾉ": "ノ",
+		"ﾊﾞ": "バ",
+		"ﾊﾟ": "パ",
+		"ﾊ": "ハ",
+		"ﾋﾞ": "ビ",
+		"ﾋﾟ": "ピ",
+		"ﾋ": "ヒ",
+		"ﾌﾞ": "ブ",
+		"ﾌﾟ": "プ",
+		"ﾌ": "フ",
+		"ﾍﾞ": "ベ",
+		"ﾍﾟ": "ペ",
+		"ﾍ": "ヘ",
+		"ﾎﾞ": "ボ",
+		"ﾎﾟ": "ポ",
+		"ﾎ": "ホ",
+		"ﾏ": "マ",
+		"ﾐ": "ミ",
+		"ﾑ": "ム",
+		"ﾒ": "メ",
+		"ﾓ": "モ",
+		"ｬ": "ャ",
+		"ﾔ": "ヤ",
+		"ｭ": "ュ",
+		"ﾕ": "ユ",
+		"ｮ": "ョ",
+		"ﾖ": "ヨ",
+		"ﾗ": "ラ",
+		"ﾘ": "リ",
+		"ﾙ": "ル",
+		"ﾚ": "レ",
+		"ﾛ": "ロ",
+		"ﾜ": "ワ",
+		"ｦ": "ヲ",
+		"ﾝ": "ン",
+		"ｳﾞ": "ヴ",
+		"ﾜﾞ": "ヷ",
+		"ｲﾞ": "イ゙",
+		"ｴﾞ": "エ゙",
+		"ｦﾞ": "ヺ",
+		"ｰ": "ー",
+	};
+
+	var out = "";
+	// try the two-character (kana + voicing mark) key before the single kana, so
+	// the result does not depend on the order of the table above
+	for (let i = 0; i < this.length; i++) {
+		const pair = this.substring(i, i + 2);
+		if (pair.length == 2 && map[pair] !== undefined) {
+			out += map[pair];
+			i++; // the voicing mark was consumed together with its base kana
+			continue;
+		}
+		out += map[this[i]] ?? this[i]; // unmapped characters are copied unchanged
+	}
+	return out;
+}
+
+String.prototype.katakanaToHiragana = function() {
+	const shift = 0x3041 - 0x30a1; // offset from katakana ァ (U+30A1) to hiragana ぁ (U+3041)
+	return this.map(char => {
+		const code = char.codePointAt(0)!;
+		return (code < 0x30a1 || code > 0x30f6) ? char : (code + shift).toChar();
+	});
+}
+
+String.prototype.normalizeKana = function() {
+	return this.widenKatakana().katakanaToHiragana(); // widen first: katakanaToHiragana only shifts the full-width range U+30A1..U+30F6
+}
+
diff --git a/util/number.ts b/util/number.ts
new file mode 100644
index 0000000..c28864f
--- /dev/null
+++ b/util/number.ts
@@ -0,0 +1,11 @@
+declare global {
+	interface Number {
+		/** @summary convert number to character by charCode */
+		toChar(): string;
+	}
+}
+
+Number.prototype.toChar = function() {
+	return String.fromCharCode(this.valueOf()); // valueOf() yields the receiver's primitive number, read as a UTF-16 code unit
+}
+
diff --git a/util/string.ts b/util/string.ts
index 397dcd6..327b884 100644
--- a/util/string.ts
+++ b/util/string.ts
@@ -26,6 +26,12 @@ declare global {
 		 * @argument fallback  return this value if parsing fails
 		 */
 		json(fallback?: any): any;
+
+		/**
+		 * @summary map each character of a string to another character using
+		 * `mapFn`
+		 */
+		map(mapFn: (char: string) => string): string;
 	}
 }
 
@@ -34,11 +40,15 @@ export enum UnicodeRange {
 	Whitespace = "any-whitespace",
 	Punctuation = "any-punctuation",
 	Unknown = "any-unknown",
-	JapanesePunctuation = "jp-punctuation",
-	JapaneseHiragana = "jp-hiragana",
-	JapaneseKatakana = "jp-katakana",
-	JapaneseFWLatinHWKatakana = "jp-full-width-latin-half-width-katakana",
+	JapaneseFWPunctuation = "jp-full-width-punctuation",
+	JapaneseHWPunctuation = "jp-half-width-punctuation",
+	JapaneseFWHiragana = "jp-full-width-hiragana",
+	JapaneseFWKatakana = "jp-full-width-katakana",
+	JapaneseFWLatin = "jp-full-width-latin",
+	JapaneseHWKatakana = "jp-half-width-katakana",
 	JapaneseKanji = "jp-kanji",
+	JapaneseKanjiRadicals = "jp-kanji-radicals",
+	JapaneseAuxiliary = "jp-aux",
 }
 
 type RangeTally = Record<UnicodeRange, number>;
@@ -52,12 +62,22 @@ String.prototype.range = function() {
 	if (0x2e == code) return UnicodeRange.Punctuation; // full stop
 	if (0x3f == code) return UnicodeRange.Punctuation; // question mark
 
+	// https://stackoverflow.com/a/53807563
 	if (0x0000 <= code && code <= 0x007f) return UnicodeRange.BasicLatin;
-	if (0x3000 <= code && code <= 0x303f) return UnicodeRange.JapanesePunctuation;
-	if (0x3040 <= code && code <= 0x309f) return UnicodeRange.JapaneseHiragana;
-	if (0x30a0 <= code && code <= 0x30ff) return UnicodeRange.JapaneseKatakana;
-	if (0xff00 <= code && code <= 0xffef) return UnicodeRange.JapaneseFWLatinHWKatakana;
-	if (0x4e00 <= code && code <= 0x9faf) return UnicodeRange.JapaneseKanji;
+	if (0x2e80 <= code && code <= 0x2fd5) return UnicodeRange.JapaneseKanjiRadicals;
+	if (0x3000 <= code && code <= 0x303f) return UnicodeRange.JapaneseFWPunctuation;
+	if (0xff5f <= code && code <= 0xff60) return UnicodeRange.JapaneseFWPunctuation;
+	if (0x3041 <= code && code <= 0x3096) return UnicodeRange.JapaneseFWHiragana;
+	if (0x30a1 <= code && code <= 0x30ff) return UnicodeRange.JapaneseFWKatakana;
+	if (0x3400 <= code && code <= 0x4db5) return UnicodeRange.JapaneseKanji;
+	if (0x4e00 <= code && code <= 0x9fcb) return UnicodeRange.JapaneseKanji;
+	if (0xf900 <= code && code <= 0xfa6a) return UnicodeRange.JapaneseKanji;
+	if (0xff61 <= code && code <= 0xff65) return UnicodeRange.JapaneseHWPunctuation;
+	if (0xff66 <= code && code <= 0xff9f) return UnicodeRange.JapaneseHWKatakana;
+	if (0x31f0 <= code && code <= 0x31ff) return UnicodeRange.JapaneseAuxiliary;
+	if (0x3220 <= code && code <= 0x3243) return UnicodeRange.JapaneseAuxiliary;
+	if (0x3280 <= code && code <= 0x337f) return UnicodeRange.JapaneseAuxiliary;
+	if (0xff01 <= code && code <= 0xff5e) return UnicodeRange.JapaneseFWLatin;
 	return UnicodeRange.Unknown;
 }
 
@@ -71,11 +91,11 @@ String.prototype.parseTags = function() {
 	return parseTags(this as string);
 }
 
-String.prototype.removeAll = function(searchValue: string | RegExp) {
+String.prototype.removeAll = function(searchValue) {
 	return this.replaceAll(searchValue, "");
 }
 
-String.prototype.json = function(fallback?: any) {
+String.prototype.json = function(fallback) {
 	if (fallback) {
 		try {
 			return JSON.parse(this as string);
@@ -87,3 +107,9 @@ String.prototype.json = function(fallback?: any) {
 	}
 }
 
+String.prototype.map = function(mapFn) {
+	// spreading iterates by code point (same as for...of), so surrogate pairs reach mapFn whole
+	const pieces = [...this].map(char => mapFn(char));
+	return pieces.join("");
+}
+
author	lonkaars <loek@pipeframe.xyz>	2023-07-02 19:21:39 +0200
committer	lonkaars <loek@pipeframe.xyz>	2023-07-02 19:21:39 +0200
commit	f4963b89ee542592e9ae95ca29d74ddc57841c3f (patch)
tree	f9ae82ae9549330d12a30ffee8960f2577fff9aa
parent	ce9e0788317b25e5d297ed38d9fed0754a341288 (diff)