about | summary | refs | log | tree | commit | diff
path: root/api/japanese.ts
diff options
context:
space:
mode:
author lonkaars <loek@pipeframe.xyz> 2023-07-02 19:21:39 +0200
committer lonkaars <loek@pipeframe.xyz> 2023-07-02 19:21:39 +0200
commit f4963b89ee542592e9ae95ca29d74ddc57841c3f (patch)
tree f9ae82ae9549330d12a30ffee8960f2577fff9aa /api/japanese.ts
parent ce9e0788317b25e5d297ed38d9fed0754a341288 (diff)
implement Japanese class
Diffstat (limited to 'api/japanese.ts')
-rw-r--r-- api/japanese.ts 153
1 files changed, 153 insertions, 0 deletions
diff --git a/api/japanese.ts b/api/japanese.ts
new file mode 100644
index 0000000..0396821
--- /dev/null
+++ b/api/japanese.ts
@@ -0,0 +1,153 @@
+import { escape } from "https://deno.land/std@0.192.0/html/entities.ts";
+
+import "../util/string.ts";
+import "../util/japanese.ts";
+import "../util/array.ts";
+
+/**
+ * @interface JapaneseToken
+ * a piece of a Japanese string (its writing) paired with the reading that
+ * belongs to that piece
+ */
+interface JapaneseToken {
+  /** @prop token writing (kanji/katakana/hiragana) */
+  writing: string;
+  /** @prop token reading (katakana/hiragana) */
+  reading: string;
+  /** @prop normalized token reading (always hiragana) */
+  normalized: string;
+  /** @prop show reading when parsed by formatter */
+  ruby: boolean;
+}
+
+/** @class Japanese string with reading and output formatters */
+export default class Japanese {
+  /** @prop written form as passed to the constructor */
+  public writing: string;
+  /** @prop reading of the whole string as passed to the constructor */
+  public reading: string;
+  /** @prop constructor-time result of reading.normalizeKana() */
+  private normalized: string;
+
+  /** @prop output formatters, keyed by the format name accepted by furigana() */
+  private formatters = {
+    "HTML": tokens => tokens.reduce((out, token) => {
+      if (token.ruby) return out + `<ruby>${escape(token.writing)}<rt>${escape(token.reading)}</rt></ruby>`;
+      // plain text is escaped too, otherwise non-ruby tokens could inject markup
+      return out + escape(token.writing);
+    }, ""),
+    "parenthesis": tokens => tokens.reduce((out, token) => {
+      if (token.ruby) return out + `${token.writing}(${token.reading}) `;
+      return out + token.writing;
+    }, ""),
+    "refold-tools": tokens => tokens.reduce((out, token) => {
+      if (token.ruby) return out + `[${token.writing}](${token.reading})`;
+      return out + token.writing;
+    }, ""),
+  } satisfies Record<string, (tokens: Array<JapaneseToken>) => string>;
+
+  /**
+   * @summary store writing and reading, and cache the normalized reading
+   * @param writing written form of the string
+   * @param reading reading of the whole string
+   */
+  constructor(writing: string, reading: string) {
+    this.writing = writing;
+    this.reading = reading;
+    this.normalized = reading.normalizeKana();
+  }
+
+  /**
+   * @summary format this as text with furigana
+   * @param format name of one of this.formatters (defaults to "HTML")
+   * @returns the tokenized string rendered by the chosen formatter
+   */
+  public furigana(format: keyof typeof this.formatters = "HTML"): string {
+    return this.formatters[format](this.tokenize());
+  }
+
+  /**
+   * @summary attempt to match kana in this.reading to sections of the same
+   * script in this.writing
+   * @returns tokens in writing order; an empty array when this.writing is
+   * empty, or a single token spanning the whole string when the reading can
+   * not be split across the tokens
+   */
+  private tokenize(): Array<JapaneseToken> {
+    // nothing to split or annotate
+    if (this.writing.length === 0) return [];
+
+    // split this.writing into tokens with different scripts
+    const tokens: Array<JapaneseToken> = [];
+    let run = "";
+    let runIsKana = false;
+    const flushRun = (): void => {
+      tokens.push({ writing: run, reading: "", normalized: "", ruby: true });
+      run = "";
+    };
+    for (const char of this.writing) {
+      const charIsKana = char.kanaOnly();
+      // a script change closes the current run (never before the first char)
+      if (run.length > 0 && charIsKana !== runIsKana) flushRun();
+      run += char;
+      runIsKana = charIsKana;
+    }
+    flushRun();
+
+    // kana-only tokens read as themselves; they become the 'anchor' tokens
+    for (const token of tokens) {
+      if (!token.writing.kanaOnly()) continue;
+      token.normalized = token.writing.normalizeKana();
+      token.reading = token.writing;
+      token.ruby = false;
+    }
+
+    // don't try to spread reading across kanji if there is only one kanji/kana string
+    if (tokens.length === 1) {
+      tokens[0].reading = this.reading;
+      tokens[0].normalized = this.reading.normalizeKana();
+      tokens[0].ruby = !this.writing.kanaOnly();
+      return tokens;
+    }
+
+    // list of indices where anchor token could be in reading
+    const possibilities: Array<Array<number>> = [];
+    // find all possible arrangements (in-order) of anchor indices in this.reading
+    const match = (tokenIndex: number = 0, searchStart: number = -1, path: Array<number> = []): void => {
+      // this arrangement is a possibility because the last token fit
+      if (tokenIndex === tokens.length) {
+        possibilities.push(path);
+        return;
+      }
+      const anchor = tokens[tokenIndex].normalized;
+      // skip until next 'anchor' token
+      if (anchor.length === 0) return match(tokenIndex + 1, searchStart, path);
+
+      // try all positions where current (anchor) token fits in this.reading
+      while ((searchStart = this.normalized.indexOf(anchor, searchStart + 1)) !== -1) {
+        match(tokenIndex + 1, searchStart, [...path, searchStart]);
+      }
+    };
+    match();
+
+    // create index slices from possibilities
+    // NOTE(review): indices are found in this.normalized but applied to
+    // this.reading, and anchor extent uses writing.length; this assumes
+    // normalizeKana() preserves string length -- confirm
+    const slices = possibilities
+      .map(path => { // convert start index of anchor to start and stop index (based on anchor length)
+        const bounds = [0];
+        let pathIndex = 0;
+        for (let tokenIndex = 0; tokenIndex < tokens.length; tokenIndex++) {
+          if (tokens[tokenIndex].normalized.length === 0) continue;
+          const start = path[pathIndex++];
+          // a leading anchor matched at 0 shares its start with the initial
+          // boundary; pushing it again would create a zero-length section and
+          // reject every writing that begins with kana
+          if (tokenIndex !== 0 || start !== 0) bounds.push(start);
+          bounds.push(start + tokens[tokenIndex].writing.length);
+        }
+        if (bounds.peek() !== this.reading.length) bounds.push(this.reading.length);
+        return bounds;
+      })
+      .filter(bounds => bounds.length === tokens.length + 1)
+      .filter(bounds => bounds.isUniq()) // slice can't contain sections with 0 length
+      .filter(bounds => bounds.peek() === this.reading.length); // slice should match entire reading
+
+    // cop-out if there is no valid way to split reading across kanji
+    if (slices.length === 0) {
+      return [{
+        writing: this.writing,
+        reading: this.reading,
+        normalized: this.normalized,
+        ruby: true,
+      }];
+    }
+
+    const slice = slices[0]; // TODO: pick most "balanced" out of these instead
+
+    for (let i = 0; i < tokens.length; i++) {
+      // slice[i+1] is safe because slice.length == tokens.length + 1
+      tokens[i].reading = this.reading.substring(slice[i], slice[i + 1]);
+      tokens[i].normalized = tokens[i].reading.normalizeKana();
+    }
+
+    return tokens;
+  }
+}
+