path: root/api/japanese.ts
import { escape } from "https://deno.land/std@0.192.0/html/entities.ts";

import "../util/string.ts";
import "../util/japanese.ts";
import "../util/array.ts";
import { Wrap } from "../util/wrap.ts";

const formatters = {
	// HTML ruby markup; all emitted text is HTML-escaped
	"HTML": tokens => tokens.reduce((out, token) => {
		if (token.ruby) {
			out += (escape(token.writing) +
			        escape(token.reading).wrap(Wrap.HTML.rubyText)).wrap(Wrap.HTML.ruby);
		} else out += escape(token.writing);
		return out;
	}, ""),
	// plain text with the reading in parentheses after each ruby token
	"parenthesis": tokens => tokens.reduce((out, token) => {
		if (token.ruby) out += token.writing + token.reading.wrap(Wrap.parenthesis) + " ";
		else out += token.writing;
		return out;
	}, ""),
	// refold-tools style: bracketed writing followed by parenthesized reading
	"refold-tools": tokens => tokens.reduce((out, token) => {
		if (token.ruby) out += token.writing.wrap(Wrap.bracket) + token.reading.wrap(Wrap.parenthesis);
		else out += token.writing;
		return out;
	}, ""),
} satisfies { [name: string]: (tokens: Array<JapaneseToken>) => string };
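
// Example (sketch, assuming the Wrap helpers in util/wrap.ts wrap in <ruby>/<rt>,
// parentheses and brackets respectively): for 振り仮名 with reading ふりがな,
// tokenize() below yields 振 → ふ, り, 仮名 → がな, and the formatters produce roughly:
//   HTML:         <ruby>振<rt>ふ</rt></ruby>り<ruby>仮名<rt>がな</rt></ruby>
//   parenthesis:  振(ふ) り仮名(がな)
//   refold-tools: [振](ふ)り[仮名](がな)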

export type JapaneseFormatter = keyof typeof formatters;

/** @interface a single token (piece) of writing paired with its reading */
interface JapaneseToken {
	/** @prop token writing (kanji/katakana/hiragana) */
	writing: string;
	/** @prop token reading (katakana/hiragana) */
	reading: string;
	/** @prop normalized token reading (always hiragana) */
	normalized: string;
	/** @prop show reading when parsed by formatter */
	ruby: boolean;
};
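// e.g. (sketch) 仮名 read as がな becomes
// { writing: "仮名", reading: "がな", normalized: "がな", ruby: true }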

/** @class Japanese string with reading and output formatters */
export default class Japanese {
	public writing: string;
	public reading: string;
	private normalized: string;

	constructor(writing: string, reading: string) {
		this.writing = writing;
		this.reading = reading;
		this.normalized = reading.normalizeKana();
	}

	/** @summary format this as text with furigana */
	public furigana(format: JapaneseFormatter = "HTML"): string {
		return formatters[format](this.tokenize());
	}

	/**
	 * @summary attempt to match kana in this.reading to sections of the same
	 * script in this.writing
	 */
	private tokenize(): Array<JapaneseToken> {
		var tokens: Array<JapaneseToken> = [];

		// split this.writing into tokens with different scripts
		var token: JapaneseToken = {
			writing: "",
			reading: "",
			normalized: "",
			ruby: true,
		};
		var kana: boolean = this.writing[0].kanaOnly();
		for (var char of this.writing) {
			if (char.kanaOnly() != kana) {
				tokens.push({ ...token });
				token.writing = "";
			}
			token.writing += char;
			kana = char.kanaOnly();
		}
		tokens.push(token);
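		// e.g. (sketch) 振り仮名 changes script kanji → kana → kanji, so it splits
		// into the three tokens 振 / り / 仮名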

		// find kana-only tokens and normalize them
		tokens = tokens.map(token => {
			if (!token.writing.kanaOnly()) return token;
			token.normalized = token.writing.normalizeKana();
			token.reading = token.writing;
			token.ruby = false;
			return token;
		});

		// don't try to spread reading across kanji if there is only one kanji/kana string
		if (tokens.length == 1) {
			tokens[0].reading = this.reading;
			tokens[0].normalized = this.reading.normalizeKana();
			tokens[0].ruby = !this.writing.kanaOnly();
			return tokens;
		}
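		// (e.g. 勉強 / べんきょう: the writing contains no kana, so the single token
		// above simply receives the entire reading)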

		// candidate start indices for each anchor token within this.reading
		var possibilities: Array<Array<number>> = [];
		// recursively enumerate every in-order arrangement of anchor indices in this.reading
		var match = (tokenIndex: number = 0, searchStart: number = -1, path: Array<number> = []): void => {
			// this arrangement is a possibility because the last token fit
			if (tokenIndex == tokens.length) {
				possibilities.push(path);
				return;
			}
			// skip until the next 'anchor' token (a kana-only token, i.e. one with no kanji)
			if (tokens[tokenIndex].normalized.length == 0) return match(tokenIndex + 1, searchStart, path);

			// try all positions where current (anchor) token fits in this.reading
			while ((searchStart = this.normalized.indexOf(tokens[tokenIndex].normalized, searchStart + 1)) != -1) {
				match(tokenIndex + 1, searchStart, [...path, searchStart]);
			}
		};
		match();
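		// e.g. (sketch) for 振り仮名 / ふりがな the only anchor is り, which occurs
		// once in the reading, so possibilities ends up as [[1]]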

		// create index slices from possibilities
		var slices = possibilities
			.map(match => { // convert start index of anchor to start and stop index (based on anchor length)
				var out = [];
				let matchIndex = 0;
				if (tokens[0].ruby) out.push(0);
				for (let tokenIndex = 0; tokenIndex < tokens.length; tokenIndex++) {
					if (tokens[tokenIndex].normalized.length == 0) continue;
					out.push(match[matchIndex], match[matchIndex] + tokens[tokenIndex].writing.length);
					matchIndex++;
				}
				if (tokens.peek().ruby) out.push(this.reading.length);
				// if (out.peek() != this.reading.length) out.push(this.reading.length);
				return out;
			})
			.filter(slice => slice.length == tokens.length + 1)
			.filter(slice => slice.isUniq()) // slice can't contain sections with 0 length
			.filter(slice => slice.peek() == this.reading.length); // slice should match entire reading
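		// e.g. (sketch) the single possibility [1] maps to the slice [0, 1, 2, 4]
		// (reading indices 0-1 → 振, 1-2 → り, 2-4 → 仮名), which passes all three filters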

		// cop-out if there is no valid way to split reading across kanji
		if (slices.length == 0) {
			return [{
				writing: this.writing,
				reading: this.reading,
				normalized: this.normalized,
				ruby: true,
			}];
		}

		var slice = slices[0]; // TODO: pick most "balanced" out of these instead

		for (let i = 0; i < tokens.length; i++) {
			// slice[i+1] is safe because slice.length == tokens.length + 1
			tokens[i].reading = this.reading.substring(slice[i], slice[i+1]);
			tokens[i].normalized = tokens[i].reading.normalizeKana();
		}

		return tokens;
	}
}
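
// Example usage (sketch; exact output depends on the Wrap helpers in util/wrap.ts):
// running this module directly prints each formatter's result for one sample word.
if (import.meta.main) {
	const word = new Japanese("振り仮名", "ふりがな");
	console.log(word.furigana());              // HTML ruby markup (default)
	console.log(word.furigana("parenthesis")); // roughly 振(ふ) り仮名(がな)
	console.log(word.furigana("refold-tools"));
}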