import { escape } from "https://deno.land/std@0.192.0/html/entities.ts";
import "../util/string.ts";
import "../util/japanese.ts";
import "../util/array.ts";
/** Token fields read by the formatters; the file's token interface is structurally compatible. */
type FormattableToken = { writing: string; reading: string; ruby: boolean };

/**
 * Output formatters, keyed by name. Each one turns an ordered list of tokens
 * into a single string; tokens with `ruby` set get their reading attached,
 * all other tokens contribute only their writing.
 */
const formatters = {
  // HTML ruby annotation. Every piece of text is escaped, including tokens
  // without a reading, so token content can never inject markup. The <rp>
  // elements are the parenthesis fallback for renderers without ruby support.
  "HTML": tokens => tokens.map(token => {
    const writing = escape(token.writing);
    if (!token.ruby) return writing;
    return `<ruby>${writing}<rp>(</rp><rt>${escape(token.reading)}</rt><rp>)</rp></ruby>`;
  }).join(""),
  // plain text: writing(reading), followed by a space after every annotated token
  "parenthesis": tokens => tokens.map(token => {
    return token.ruby ? `${token.writing}(${token.reading}) ` : token.writing;
  }).join(""),
  // [writing](reading) notation
  "refold-tools": tokens => tokens.map(token => {
    return token.ruby ? `[${token.writing}](${token.reading})` : token.writing;
  }).join(""),
} satisfies Record<string, (tokens: ReadonlyArray<FormattableToken>) => string>;
/** Name of an available output formatter, i.e. one of the keys of `formatters`. */
export type JapaneseFormatter = keyof typeof formatters;
/**
 * One run of a written Japanese string together with the part of the reading
 * assigned to it. Tokens are produced by `Japanese.tokenize()` and consumed by
 * the formatters.
 */
interface JapaneseToken {
/** token writing (kanji/katakana/hiragana) */
writing: string;
/** token reading (katakana/hiragana) */
reading: string;
/** normalized token reading as returned by `normalizeKana()` (always hiragana) */
normalized: string;
/** whether formatters should show the reading next to the writing */
ruby: boolean;
};
/** @class Japanese string with reading and output formatters */
/**
 * Japanese string paired with its kana reading, plus furigana output formatters.
 *
 * Kana runs inside `writing` are used as anchors to work out which part of
 * `reading` belongs to each non-kana (e.g. kanji) run.
 */
export default class Japanese {
  /** written form as passed to the constructor */
  public writing: string;
  /** kana reading of the whole written form */
  public reading: string;
  /** `reading` passed through `normalizeKana()`; anchors are searched in this string */
  private normalized: string;

  /**
   * @param writing written form (may mix kanji and kana)
   * @param reading reading of the whole written form
   */
  constructor(writing: string, reading: string) {
    this.writing = writing;
    this.reading = reading;
    this.normalized = reading.normalizeKana();
  }

  /**
   * Format this as text with furigana.
   * @param format name of the formatter to use
   * @returns the tokens of this string rendered by that formatter
   */
  public furigana(format: JapaneseFormatter = "HTML"): string {
    return formatters[format](this.tokenize());
  }

  /**
   * Attempt to match kana in this.reading to sections of the same script in
   * this.writing.
   * @returns one token per script run with its share of the reading, or a
   * single token covering everything when the reading cannot be split; an
   * empty writing yields no tokens
   */
  private tokenize(): JapaneseToken[] {
    // nothing written, nothing to annotate (indexing into "" used to throw here)
    if (this.writing.length === 0) return [];

    const tokens = this.splitByScript();

    // don't try to spread reading across kanji if there is only one kanji/kana string
    if (tokens.length === 1) {
      const only = tokens[0];
      only.reading = this.reading;
      only.normalized = this.reading.normalizeKana();
      only.ruby = !this.writing.kanaOnly();
      return tokens;
    }

    for (const starts of this.anchorArrangements(tokens)) {
      const bounds = this.toBoundaries(tokens, starts);
      if (bounds === undefined) continue;
      // TODO: pick most "balanced" candidate instead of the first valid one
      tokens.forEach((token, index) => {
        // bounds[index + 1] exists because bounds.length === tokens.length + 1
        token.reading = this.reading.substring(bounds[index], bounds[index + 1]);
        token.normalized = token.reading.normalizeKana();
      });
      return tokens;
    }

    // cop-out if there is no valid way to split reading across kanji
    return [{
      writing: this.writing,
      reading: this.reading,
      normalized: this.normalized,
      ruby: true,
    }];
  }

  /** Split this.writing into consecutive runs of kana-only / non-kana characters. */
  private splitByScript(): JapaneseToken[] {
    const runs: string[] = [];
    let run = "";
    let runIsKana: boolean | undefined;
    for (const char of this.writing) {
      const isKana = char.kanaOnly();
      if (runIsKana !== undefined && isKana !== runIsKana) {
        runs.push(run);
        run = "";
      }
      run += char;
      runIsKana = isKana;
    }
    if (run.length > 0) runs.push(run);

    return runs.map(writing => {
      // kana runs read as themselves and act as anchors; other runs start
      // without a reading (empty `normalized` marks them as non-anchors)
      const isKana = writing.kanaOnly();
      return {
        writing,
        reading: isKana ? writing : "",
        normalized: isKana ? writing.normalizeKana() : "",
        ruby: !isKana,
      };
    });
  }

  /**
   * Find all in-order arrangements of the anchor tokens inside this.normalized.
   * @returns one array per arrangement, holding the start index of every
   * anchor token in token order
   */
  private anchorArrangements(tokens: JapaneseToken[]): number[][] {
    const arrangements: number[][] = [];
    const search = (tokenIndex: number, from: number, path: number[]): void => {
      // this arrangement is a possibility because the last token fit
      if (tokenIndex === tokens.length) {
        arrangements.push(path);
        return;
      }
      const anchor = tokens[tokenIndex].normalized;
      // skip until next anchor token
      if (anchor.length === 0) {
        search(tokenIndex + 1, from, path);
        return;
      }
      // try all positions where current (anchor) token fits in the reading
      let at = this.normalized.indexOf(anchor, from);
      while (at !== -1) {
        search(tokenIndex + 1, at + 1, [...path, at]);
        at = this.normalized.indexOf(anchor, at + 1);
      }
    };
    search(0, 0, []);
    return arrangements;
  }

  /**
   * Convert anchor start indices into reading boundaries, one more than there
   * are tokens, so that token i reads reading[bounds[i] .. bounds[i + 1]).
   * @returns the boundaries, or undefined when the arrangement would leave a
   * token with an empty or overlapping reading or not cover the whole reading
   */
  private toBoundaries(tokens: JapaneseToken[], starts: number[]): number[] | undefined {
    const bounds = [0];
    let next = 0;
    tokens.forEach((token, index) => {
      if (token.normalized.length === 0) return;
      const start = starts[next++];
      // A leading kana run begins where the reading begins, and that boundary
      // is already present. Any other start for it is still pushed, which
      // makes the list too long and gets the arrangement rejected below.
      if (index > 0 || start !== 0) bounds.push(start);
      // the matched text is token.normalized, so that is the span it covers
      bounds.push(start + token.normalized.length);
    });
    // a trailing non-kana run takes whatever is left of the reading
    if (bounds[bounds.length - 1] !== this.reading.length) bounds.push(this.reading.length);

    if (bounds.length !== tokens.length + 1) return undefined;
    // strictly increasing: no empty sections and no anchor overlapping the previous one
    const increasing = bounds.every((bound, i) => i === 0 || bound > bounds[i - 1]);
    return increasing ? bounds : undefined;
  }
}
// NOTE(review): these two lines look like leftover scratch code. The Japanese
// class in this file declares no static `formatters` member, so
// `typeof Japanese.formatters` presumably fails to type-check, and
// `JapaneseFormatter` already names the formatter key union. `gert` is never
// read in this file. Confirm nothing imports `test` before removing both lines.
export type test = keyof typeof Japanese.formatters;
var gert: test = "HTML";