import { escape } from "https://deno.land/std@0.192.0/html/entities.ts";

import "../util/string.ts";
import "../util/japanese.ts";
import "../util/array.ts";
import { Wrap } from "../util/wrap.ts";

/**
 * Output formatters, keyed by name. Each one folds a token list into a single
 * string; tokens with `ruby` set get their reading attached, all other tokens
 * contribute only their writing.
 */
const formatters = {
  // writing + reading wrapped with Wrap.HTML.rubyText, all wrapped with
  // Wrap.HTML.ruby; every piece of text is HTML-escaped
  "HTML": tokens => tokens.reduce((out, token) => {
    if (token.ruby) {
      out += (escape(token.writing) + escape(token.reading).wrap(Wrap.HTML.rubyText)).wrap(Wrap.HTML.ruby);
    } else {
      // escape here too so the output is consistently safe to embed in HTML
      out += escape(token.writing);
    }
    return out;
  }, ""),
  // writing followed by the reading wrapped with Wrap.parenthesis and a space
  "parenthesis": tokens => tokens.reduce((out, token) => {
    if (token.ruby) out += token.writing + token.reading.wrap(Wrap.parenthesis) + " ";
    else out += token.writing;
    return out;
  }, ""),
  // writing wrapped with Wrap.bracket followed by the reading wrapped with
  // Wrap.parenthesis
  "refold-tools": tokens => tokens.reduce((out, token) => {
    if (token.ruby) out += token.writing.wrap(Wrap.bracket) + token.reading.wrap(Wrap.parenthesis);
    else out += token.writing;
    return out;
  }, ""),
} satisfies { [name: string]: (tokens: Array<JapaneseToken>) => string };

/** Name of one of the available furigana output formatters */
export type JapaneseFormatter = keyof typeof formatters;

/** A run of same-script characters from a writing, paired with its reading */
interface JapaneseToken {
  /** @prop token writing (kanji/katakana/hiragana) */
  writing: string;
  /** @prop token reading (katakana/hiragana) */
  reading: string;
  /** @prop normalized token reading (always hiragana) */
  normalized: string;
  /** @prop show reading when parsed by formatter */
  ruby: boolean;
}

/** @class Japanese string with reading and output formatters */
export default class Japanese {
  public writing: string;
  public reading: string;
  private normalized: string;

  /**
   * @param writing text as written (may mix kanji and kana)
   * @param reading kana reading of the complete writing
   */
  constructor(writing: string, reading: string) {
    this.writing = writing;
    this.reading = reading;
    this.normalized = reading.normalizeKana();
  }

  /**
   * @summary format this as text with furigana
   * @param format name of the formatter to use (defaults to "HTML")
   * @returns the tokenized writing rendered by the chosen formatter
   */
  public furigana(format: JapaneseFormatter = "HTML"): string {
    return formatters[format](this.tokenize());
  }

  /**
   * @summary attempt to match kana in this.reading to sections of the same
   * script in this.writing
   *
   * Kana-only runs in the writing act as anchors: they are searched for in the
   * normalized reading, and the reading in between anchors is assigned to the
   * tokens in between. If no consistent assignment exists, a single token
   * holding the complete writing and reading is returned instead.
   *
   * @returns tokens covering this.writing in order
   */
  private tokenize(): Array<JapaneseToken> {
    const tokens: Array<JapaneseToken> = [];

    // split this.writing into tokens with different scripts
    const token: JapaneseToken = {
      writing: "",
      reading: "",
      normalized: "",
      ruby: true,
    };
    // undefined until the first character is seen; this avoids indexing
    // this.writing[0], which throws on an empty string and yields a lone
    // surrogate (instead of the full character) for astral code points
    let kanji: boolean | undefined = undefined;
    for (const char of this.writing) {
      const kanjiNow = char.kanjiOnly(true);
      if (kanji !== undefined && kanjiNow !== kanji) {
        tokens.push({ ...token });
        token.writing = "";
      }
      token.writing += char;
      kanji = kanjiNow;
    }
    tokens.push(token);

    // find kana-only tokens and normalize them; these read exactly as written
    // and are the anchors used below
    for (const current of tokens) {
      if (!current.writing.kanaOnly()) continue;
      current.normalized = current.writing.normalizeKana();
      current.reading = current.writing;
      current.ruby = false;
    }

    // don't try to spread reading across kanji if there is only one kanji/kana string
    if (tokens.length === 1) {
      tokens[0].reading = this.reading;
      tokens[0].normalized = this.reading.normalizeKana();
      tokens[0].ruby = !this.writing.kanaOnly();
      return tokens;
    }

    // list of indices where anchor token could be in reading
    const possibilities: Array<Array<number>> = [];

    // find all possible arrangements (in-order) of anchor indices in this.reading
    const match = (tokenIndex: number = 0, searchStart: number = -1, path: Array<number> = []): void => {
      // this arrangement is a possibility because the last token fit
      if (tokenIndex === tokens.length) {
        possibilities.push(path);
        return;
      }

      // skip until next 'anchor' token (token with a non-empty normalized reading)
      if (tokens[tokenIndex].normalized.length === 0)
        return match(tokenIndex + 1, searchStart, path);

      // try all positions where current (anchor) token fits in this.reading
      while ((searchStart = this.normalized.indexOf(tokens[tokenIndex].normalized, searchStart + 1)) !== -1) {
        match(tokenIndex + 1, searchStart, [...path, searchStart]);
      }
    };
    match();

    // create index slices from possibilities
    const slices = possibilities
      .map(starts => {
        // convert start index of anchor to start and stop index (based on anchor length)
        const out: Array<number> = [];
        let matchIndex = 0;
        if (tokens[0].ruby) out.push(0);
        for (const current of tokens) {
          if (current.normalized.length === 0) continue;
          out.push(starts[matchIndex], starts[matchIndex] + current.writing.length);
          matchIndex++;
        }
        if (tokens.peek().ruby) out.push(this.reading.length);
        return out;
      })
      // exactly one boundary between every pair of adjacent tokens
      .filter(slice => slice.length === tokens.length + 1)
      // slice should start at the beginning of the reading; a leading anchor
      // matched further in would silently drop the start of the reading
      .filter(slice => slice[0] === 0)
      // boundaries must strictly increase: no section may be empty, and an
      // anchor may not be matched inside the span of the previous anchor
      // (substring() would swap the reversed indices and yield a wrong reading)
      .filter(slice => slice.every((index, i) => i === 0 || index > slice[i - 1]))
      // slice should match entire reading
      .filter(slice => slice.peek() === this.reading.length);

    // cop-out if there is no valid way to split reading across kanji
    if (slices.length === 0) {
      return [{
        writing: this.writing,
        reading: this.reading,
        normalized: this.normalized,
        ruby: true,
      }];
    }

    const slice = slices[0]; // TODO: pick most "balanced" out of these instead
    for (let i = 0; i < tokens.length; i++) {
      // slice[i+1] is safe because slice.length == tokens.length + 1
      tokens[i].reading = this.reading.substring(slice[i], slice[i + 1]);
      tokens[i].normalized = tokens[i].reading.normalizeKana();
    }

    return tokens;
  }
}