diff options
author | lonkaars <loek@pipeframe.xyz> | 2023-07-02 19:21:39 +0200 |
---|---|---|
committer | lonkaars <loek@pipeframe.xyz> | 2023-07-02 19:21:39 +0200 |
commit | f4963b89ee542592e9ae95ca29d74ddc57841c3f (patch) | |
tree | f9ae82ae9549330d12a30ffee8960f2577fff9aa /api/japanese.ts | |
parent | ce9e0788317b25e5d297ed38d9fed0754a341288 (diff) |
implement Japanese class
Diffstat (limited to 'api/japanese.ts')
-rw-r--r-- | api/japanese.ts | 153 |
1 files changed, 153 insertions, 0 deletions
// api/japanese.ts (new file, mode 100644)
import { escape } from "https://deno.land/std@0.192.0/html/entities.ts";

// Side-effect imports: the code below calls kanaOnly() and normalizeKana() on
// strings and peek() on arrays, none of which are built-ins, so they are
// presumably installed as prototype extensions by these modules.
import "../util/string.ts";
import "../util/japanese.ts";
import "../util/array.ts";

/** A run of same-script characters taken from a word, paired with its reading. */
interface JapaneseToken {
	/** @prop token writing (kanji/katakana/hiragana) */
	writing: string;
	/** @prop token reading (katakana/hiragana) */
	reading: string;
	/** @prop normalized token reading (always hiragana) */
	normalized: string;
	/** @prop show reading when parsed by formatter */
	ruby: boolean;
}

/** @class Japanese string with reading and output formatters */
export default class Japanese {
	public writing: string;
	public reading: string;
	private normalized: string;

	/**
	 * Output formatters, keyed by format name. Each one folds the token list
	 * into a single string, annotating only tokens that have `ruby` set.
	 */
	private formatters = {
		"HTML": tokens => tokens.reduce((out, token) => {
			if (token.ruby) out += `<ruby>${escape(token.writing)}<rt>${escape(token.reading)}</rt></ruby>`;
			// plain tokens end up in the same HTML output, so they need escaping too
			else out += escape(token.writing);
			return out;
		}, ""),
		"parenthesis": tokens => tokens.reduce((out, token) => {
			if (token.ruby) out += `${token.writing}(${token.reading}) `;
			else out += token.writing;
			return out;
		}, ""),
		"refold-tools": tokens => tokens.reduce((out, token) => {
			if (token.ruby) out += `[${token.writing}](${token.reading})`;
			else out += token.writing;
			return out;
		}, ""),
	} satisfies Record<string, (tokens: Array<JapaneseToken>) => string>;

	/**
	 * @param writing the word as written (may mix kanji and kana)
	 * @param reading the reading of the whole word in kana
	 */
	constructor(writing: string, reading: string) {
		this.writing = writing;
		this.reading = reading;
		this.normalized = reading.normalizeKana();
	}

	/**
	 * @summary format this as text with furigana
	 * @param format name of the output format, defaults to "HTML"
	 * @returns the writing with readings attached in the chosen format
	 */
	public furigana(format: keyof typeof this.formatters = "HTML"): string {
		return this.formatters[format](this.tokenize());
	}

	/**
	 * @summary attempt to match kana in this.reading to sections of the same
	 * script in this.writing
	 *
	 * Kana runs in the writing are used as anchors that are located inside the
	 * reading; the reading between anchors is assigned to the non-kana runs.
	 * When no consistent split exists, a single token covering the whole word
	 * is returned instead.
	 *
	 * @returns tokens in writing order; an empty array for an empty writing
	 */
	private tokenize(): Array<JapaneseToken> {
		// nothing to classify or annotate (also avoids indexing an empty string)
		if (this.writing.length === 0) return [];

		const tokens: Array<JapaneseToken> = [];

		// split this.writing into tokens with different scripts
		const current: JapaneseToken = {
			writing: "",
			reading: "",
			normalized: "",
			ruby: true,
		};
		// script of the previous character; undefined until the first character
		// has been seen. Taken from the iterated character (a whole code point)
		// rather than from this.writing[0] (a single UTF-16 code unit).
		let kana: boolean | undefined;
		for (const char of this.writing) {
			const charIsKana = char.kanaOnly();
			if (kana !== undefined && charIsKana !== kana) {
				tokens.push({ ...current });
				current.writing = "";
			}
			current.writing += char;
			kana = charIsKana;
		}
		tokens.push(current);

		// find kana-only tokens and normalize them; these become the anchors
		for (const token of tokens) {
			if (!token.writing.kanaOnly()) continue;
			token.normalized = token.writing.normalizeKana();
			token.reading = token.writing;
			token.ruby = false;
		}

		// don't try to spread reading across kanji if there is only one kanji/kana string
		if (tokens.length === 1) {
			tokens[0].reading = this.reading;
			tokens[0].normalized = this.reading.normalizeKana();
			tokens[0].ruby = !this.writing.kanaOnly();
			return tokens;
		}

		// list of indices where anchor token could be in reading
		const possibilities: Array<Array<number>> = [];
		// find all possible arrangements (in-order) of anchor indices in this.reading
		const match = (tokenIndex: number = 0, searchStart: number = -1, path: Array<number> = []): void => {
			// this arrangement is a possibility because the last token fit
			if (tokenIndex === tokens.length) {
				possibilities.push(path);
				return;
			}
			// skip until next 'anchor' token
			if (tokens[tokenIndex].normalized.length === 0) return match(tokenIndex + 1, searchStart, path);

			// try all positions where current (anchor) token fits in this.reading
			while ((searchStart = this.normalized.indexOf(tokens[tokenIndex].normalized, searchStart + 1)) !== -1) {
				match(tokenIndex + 1, searchStart, [...path, searchStart]);
			}
		};
		match();

		// create index slices from possibilities
		// NOTE(review): anchor positions are found in this.normalized but are
		// applied to this.reading, and the anchor length is taken from
		// token.writing; this assumes normalizeKana() preserves string length
		// -- confirm against ../util/japanese.ts
		const slices = possibilities
			.map(starts => { // convert start index of anchor to start and stop index (based on anchor length)
				const out = [0];
				let matchIndex = 0;
				for (const token of tokens) {
					if (token.normalized.length === 0) continue;
					const start = starts[matchIndex];
					matchIndex++;
					// an anchor that begins exactly on the previous boundary (a
					// word starting with kana) must not add that boundary twice,
					// otherwise the slice is one entry too long and is rejected
					if (out.peek() !== start) out.push(start);
					out.push(start + token.writing.length);
				}
				if (out.peek() !== this.reading.length) out.push(this.reading.length);
				return out;
			})
			.filter(slice => slice.length === tokens.length + 1)
			// boundaries must be strictly ascending: no section may have zero
			// length, and an anchor may not start inside the previous one
			.filter(slice => slice.every((boundary, i) => i === 0 || slice[i - 1] < boundary))
			.filter(slice => slice.peek() === this.reading.length); // slice should match entire reading

		// cop-out if there is no valid way to split reading across kanji
		if (slices.length === 0) {
			return [{
				writing: this.writing,
				reading: this.reading,
				normalized: this.normalized,
				ruby: true,
			}];
		}

		const slice = slices[0]; // TODO: pick most "balanced" out of these instead

		for (let i = 0; i < tokens.length; i++) {
			// slice[i+1] is safe because slice.length == tokens.length + 1
			tokens[i].reading = this.reading.substring(slice[i], slice[i + 1]);
			tokens[i].normalized = tokens[i].reading.normalizeKana();
		}

		return tokens;
	}
}