1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
|
import { escape } from "https://deno.land/std@0.192.0/html/entities.ts";
import "../util/string.ts";
import "../util/japanese.ts";
import "../util/array.ts";
import { Wrap } from "../util/wrap.ts";
/** output formatters: each turns a token stream into a furigana string */
const formatters = {
	"HTML": tokens => tokens
		.map(token => token.ruby
			? (escape(token.writing) +
				escape(token.reading).wrap(Wrap.HTML.rubyText)).wrap(Wrap.HTML.ruby)
			: token.writing)
		.join(""),
	"parenthesis": tokens => tokens
		.map(token => token.ruby
			? token.writing + token.reading.wrap(Wrap.parenthesis) + " "
			: token.writing)
		.join(""),
	"refold-tools": tokens => tokens
		.map(token => token.ruby
			? token.writing.wrap(Wrap.bracket) + token.reading.wrap(Wrap.parenthesis)
			: token.writing)
		.join(""),
} satisfies { [name: string]: (tokens: Array<JapaneseToken>) => string };
export type JapaneseFormatter = keyof typeof formatters;
/** A run of same-script characters from a writing, paired with its reading. */
interface JapaneseToken {
	/** token writing (kanji/katakana/hiragana) */
	writing: string;
	/** token reading (katakana/hiragana) */
	reading: string;
	/** normalized token reading (always hiragana) */
	normalized: string;
	/** show reading when parsed by formatter */
	ruby: boolean;
};
/**
 * Japanese string that pairs a writing (kanji/kana mix) with its kana
 * reading, and can render the pair as furigana via the output formatters.
 */
export default class Japanese {
	public writing: string;
	public reading: string;
	/** reading with kana folded to hiragana; used for matching tokens */
	private normalized: string;

	constructor(writing: string, reading: string) {
		this.writing = writing;
		this.reading = reading;
		this.normalized = reading.normalizeKana();
	}

	/**
	 * @summary format this as text with furigana
	 * @param format name of the output formatter (defaults to "HTML")
	 * @returns writing with readings attached per the chosen format
	 */
	public furigana(format: JapaneseFormatter = "HTML"): string {
		return formatters[format](this.tokenize());
	}

	/**
	 * @summary attempt to match kana in this.reading to sections of the same
	 * script in this.writing
	 */
	private tokenize(): Array<JapaneseToken> {
		// guard: indexing this.writing[0] below would throw on an empty string
		if (this.writing.length === 0) return [];
		let tokens: Array<JapaneseToken> = [];
		// split this.writing into tokens with different scripts
		const token: JapaneseToken = {
			writing: "",
			reading: "",
			normalized: "",
			ruby: true,
		};
		let kanji: boolean = this.writing[0].kanjiOnly(true);
		for (const char of this.writing) {
			const kanjiNow = char.kanjiOnly(true);
			if (kanjiNow !== kanji) {
				// script boundary: flush current token and start a new one
				tokens.push({ ...token });
				token.writing = "";
			}
			token.writing += char;
			kanji = kanjiNow;
		}
		tokens.push(token);
		// find kana-only tokens and normalize them (their reading is themselves)
		tokens = tokens.map(token => {
			if (!token.writing.kanaOnly()) return token;
			token.normalized = token.writing.normalizeKana();
			token.reading = token.writing;
			token.ruby = false;
			return token;
		});
		// don't try to spread reading across kanji if there is only one kanji/kana string
		if (tokens.length === 1) {
			tokens[0].reading = this.reading;
			tokens[0].normalized = this.reading.normalizeKana();
			tokens[0].ruby = !this.writing.kanaOnly();
			return tokens;
		}
		// list of indices where anchor token could be in reading
		const possibilities: Array<Array<number>> = [];
		// find all possible arrangements (in-order) of anchor indices in this.reading
		const match = (tokenIndex: number = 0, searchStart: number = -1, path: Array<number> = []): void => {
			// this arrangement is a possibility because the last token fit
			if (tokenIndex === tokens.length) {
				possibilities.push(path);
				return;
			}
			// skip until next 'anchor' token (token with no-kanji characters only)
			if (tokens[tokenIndex].normalized.length === 0) return match(tokenIndex + 1, searchStart, path);
			// try all positions where current (anchor) token fits in this.reading
			while ((searchStart = this.normalized.indexOf(tokens[tokenIndex].normalized, searchStart + 1)) !== -1) {
				match(tokenIndex + 1, searchStart, [...path, searchStart]);
			}
		};
		match();
		// create index slices from possibilities
		const slices = possibilities
			.map(match => { // convert start index of anchor to start and stop index (based on anchor length)
				const out = [];
				let matchIndex = 0;
				if (tokens[0].ruby) out.push(0);
				for (let tokenIndex = 0; tokenIndex < tokens.length; tokenIndex++) {
					if (tokens[tokenIndex].normalized.length === 0) continue;
					out.push(match[matchIndex], match[matchIndex] + tokens[tokenIndex].writing.length);
					matchIndex++;
				}
				if (tokens.peek().ruby) out.push(this.reading.length);
				return out;
			})
			.filter(slice => slice.length === tokens.length + 1)
			.filter(slice => slice.isUniq()) // slice can't contain sections with 0 length
			.filter(slice => slice.peek() === this.reading.length); // slice should match entire reading
		// cop-out if there is no valid way to split reading across kanji
		if (slices.length === 0) {
			return [{
				writing: this.writing,
				reading: this.reading,
				normalized: this.normalized,
				ruby: true,
			}];
		}
		const slice = slices[0]; // TODO: pick most "balanced" out of these instead
		for (let i = 0; i < tokens.length; i++) {
			// slice[i+1] is safe because slice.length == tokens.length + 1
			tokens[i].reading = this.reading.substring(slice[i], slice[i + 1]);
			tokens[i].normalized = tokens[i].reading.normalizeKana();
		}
		return tokens;
	}
}
|