implement Japanese class

author: lonkaars <loek@pipeframe.xyz> 2023-07-02 19:21:39 +0200
committer: lonkaars <loek@pipeframe.xyz> 2023-07-02 19:21:39 +0200
commit: f4963b89ee542592e9ae95ca29d74ddc57841c3f (patch)
tree: f9ae82ae9549330d12a30ffee8960f2577fff9aa /api/japanese.ts
parent: ce9e0788317b25e5d297ed38d9fed0754a341288 (diff)
1 files changed, 153 insertions, 0 deletions
diff --git a/api/japanese.ts b/api/japanese.ts
new file mode 100644
index 0000000..0396821
--- /dev/null
+++ b/api/japanese.ts
@@ -0,0 +1,153 @@
+import { escape } from "https://deno.land/std@0.192.0/html/entities.ts";
+
+import "../util/string.ts";
+import "../util/japanese.ts";
+import "../util/array.ts";
+
+/** @interface JapaneseToken section of a word's writing paired with its reading */
+interface JapaneseToken {
+	/** @prop token writing (kanji/katakana/hiragana) */
+	writing: string;
+	/** @prop token reading (katakana/hiragana) */
+	reading: string;
+	/** @prop normalized token reading (always hiragana); left empty for tokens that are not kana-only */
+	normalized: string;
+	/** @prop show reading when parsed by formatter (false for kana-only tokens) */
+	ruby: boolean;
+};
+
+/**
+ * @class Japanese string with reading and output formatters
+ * @summary pairs a written form with its kana reading and renders the pair
+ * as furigana by spreading the reading across the non-kana sections
+ */
+export default class Japanese {
+	public writing: string;
+	public reading: string;
+	private normalized: string;
+
+	/** @prop output formatters; each joins a token list into one string */
+	private formatters = {
+		// writing is escaped in both branches so no raw markup reaches the output
+		"HTML": tokens => tokens
+			.map(token => token.ruby ? `<ruby>${escape(token.writing)}<rt>${escape(token.reading)}</rt></ruby>` : escape(token.writing))
+			.join(""),
+		"parenthesis": tokens => tokens.map(token => token.ruby ? `${token.writing}(${token.reading}) ` : token.writing).join(""),
+		"refold-tools": tokens => tokens.map(token => token.ruby ? `[${token.writing}](${token.reading})` : token.writing).join(""),
+	} satisfies Record<string, (tokens: Array<JapaneseToken>) => string>;
+
+	/**
+	 * @param writing text as written (kanji/katakana/hiragana)
+	 * @param reading kana reading of the whole writing
+	 */
+	constructor(writing: string, reading: string) {
+		this.writing = writing;
+		this.reading = reading;
+		this.normalized = reading.normalizeKana();
+	}
+
+	/**
+	 * @summary format this as text with furigana
+	 * @param format name of the output formatter to use
+	 * @returns this.writing annotated with this.reading
+	 */
+	public furigana(format: keyof typeof this.formatters = "HTML"): string {
+		return this.formatters[format](this.tokenize());
+	}
+
+	/**
+	 * @summary attempt to match kana in this.reading to sections of the same
+	 * script in this.writing
+	 * @returns tokens in writing order; one token covering the whole word when
+	 * the reading can't be split, or no tokens when this.writing is empty
+	 */
+	private tokenize(): Array<JapaneseToken> {
+		// nothing to annotate (also guards the first-character lookup below)
+		if (this.writing.length == 0) return [];
+
+		// split this.writing into tokens with different scripts
+		const tokens: Array<JapaneseToken> = [];
+		let writing = "";
+		let kana = this.writing[0].kanaOnly();
+		for (const char of this.writing) {
+			if (char.kanaOnly() != kana) {
+				tokens.push({ writing, reading: "", normalized: "", ruby: true });
+				writing = "";
+			}
+			writing += char;
+			kana = char.kanaOnly();
+		}
+		tokens.push({ writing, reading: "", normalized: "", ruby: true });
+
+		// kana-only tokens are their own reading and serve as anchors below
+		for (const token of tokens) {
+			if (!token.writing.kanaOnly()) continue;
+			token.normalized = token.writing.normalizeKana();
+			token.reading = token.writing;
+			token.ruby = false;
+		}
+
+		// don't try to spread reading across kanji if there is only one kanji/kana string
+		if (tokens.length == 1) {
+			tokens[0].reading = this.reading;
+			tokens[0].normalized = this.reading.normalizeKana();
+			tokens[0].ruby = !this.writing.kanaOnly();
+			return tokens;
+		}
+
+		// list of indices where anchor token could be in reading
+		const possibilities: Array<Array<number>> = [];
+		// find all possible arrangements (in-order) of anchor indices in this.reading
+		const match = (tokenIndex: number = 0, searchStart: number = -1, path: Array<number> = []): void => {
+			// this arrangement is a possibility because the last token fit
+			if (tokenIndex == tokens.length) {
+				possibilities.push(path);
+				return;
+			}
+			// skip until next 'anchor' token
+			const anchor = tokens[tokenIndex].normalized;
+			if (anchor.length == 0) return match(tokenIndex + 1, searchStart, path);
+
+			// try all positions where current (anchor) token fits in this.reading
+			while ((searchStart = this.normalized.indexOf(anchor, searchStart + 1)) != -1) {
+				match(tokenIndex + 1, searchStart, [...path, searchStart]);
+			}
+		};
+		match();
+
+		// create index slices from possibilities
+		const slices = possibilities
+			.map(starts => { // convert start index of anchor to start and stop index (based on anchor length)
+				// only a leading non-anchor token needs the implicit 0; a leading anchor adds its own start
+				const out: Array<number> = tokens[0].normalized.length == 0 ? [0] : [];
+				let matchIndex = 0;
+				for (const token of tokens) {
+					if (token.normalized.length == 0) continue;
+					out.push(starts[matchIndex], starts[matchIndex] + token.writing.length); // assumes normalizeKana keeps length - TODO confirm
+					matchIndex++;
+				}
+				if (out.peek() != this.reading.length) out.push(this.reading.length);
+				return out;
+			})
+			.filter(slice => slice.length == tokens.length + 1)
+			.filter(slice => slice[0] == 0) // a leading anchor has to sit at the very start of the reading
+			.filter(slice => slice.every((edge, i) => i == 0 || slice[i - 1] < edge)) // no empty or overlapping sections
+			.filter(slice => slice.peek() == this.reading.length); // slice should match entire reading
+
+		// cop-out if there is no valid way to split reading across kanji
+		if (slices.length == 0) {
+			return [{ writing: this.writing, reading: this.reading, normalized: this.normalized, ruby: true }];
+		}
+
+		const slice = slices[0]; // TODO: pick most "balanced" out of these instead
+
+		for (let i = 0; i < tokens.length; i++) {
+			// slice[i+1] is safe because slice.length == tokens.length + 1
+			tokens[i].reading = this.reading.substring(slice[i], slice[i+1]);
+			tokens[i].normalized = tokens[i].reading.normalizeKana();
+		}
+
+		return tokens;
+	}
+}
+
author	lonkaars <loek@pipeframe.xyz>	2023-07-02 19:21:39 +0200
committer	lonkaars <loek@pipeframe.xyz>	2023-07-02 19:21:39 +0200
commit	f4963b89ee542592e9ae95ca29d74ddc57841c3f (patch)
tree	f9ae82ae9549330d12a30ffee8960f2577fff9aa /api/japanese.ts
parent	ce9e0788317b25e5d297ed38d9fed0754a341288 (diff)