diff options
-rw-r--r-- | api/japanese.ts | 153 | ||||
-rw-r--r-- | api/word.ts | 1 | ||||
-rw-r--r-- | deno.lock | 1 | ||||
-rw-r--r-- | test/api-japanese.ts | 24 | ||||
-rw-r--r-- | test/halfwidth2fullwidth.ts | 5 | ||||
-rw-r--r-- | util/array.ts | 14 | ||||
-rw-r--r-- | util/japanese.ts | 133 | ||||
-rw-r--r-- | util/number.ts | 11 | ||||
-rw-r--r-- | util/string.ts | 48 |
9 files changed, 376 insertions, 14 deletions
diff --git a/api/japanese.ts b/api/japanese.ts new file mode 100644 index 0000000..0396821 --- /dev/null +++ b/api/japanese.ts @@ -0,0 +1,153 @@ +import { escape } from "https://deno.land/std@0.192.0/html/entities.ts"; + +import "../util/string.ts"; +import "../util/japanese.ts"; +import "../util/array.ts"; + +/** @interface Piece */ +interface JapaneseToken { + /** @prop token writing (kanji/katakana/hiragana) */ + writing: string; + /** @prop token reading (katakana/hiragana) */ + reading: string; + /** @prop normalized token reading (always hiragana) */ + normalized: string; + /** @prop show reading when parsed by formatter */ + ruby: boolean; +}; + +/** @class Japanese string with reading and output formatters */ +export default class Japanese { + public writing: string; + public reading: string; + private normalized: string; + + private formatters = { + "HTML": tokens => tokens.reduce((out, token) => { + if (token.ruby) out += `<ruby>${escape(token.writing)}<rt>${escape(token.reading)}</rt></ruby>`; + else out += token.writing; + return out; + }, ""), + "parenthesis": tokens => tokens.reduce((out, token) => { + if (token.ruby) out += `${token.writing}(${token.reading}) `; + else out += token.writing; + return out; + }, ""), + "refold-tools": tokens => tokens.reduce((out, token) => { + if (token.ruby) out += `[${token.writing}](${token.reading})`; + else out += token.writing; + return out; + }, ""), + } satisfies Record<string, (tokens: Array<JapaneseToken>) => string>; + + constructor(writing: string, reading: string) { + this.writing = writing; + this.reading = reading; + this.normalized = reading.normalizeKana(); + } + + /** @summary format this as text with furigana */ + public furigana(format: keyof typeof this.formatters = "HTML"): string { + return this.formatters[format](this.tokenize()); + } + + /** + * @summary attempt to match kana in this.reading to sections of the same + * script in this.writing + */ + private tokenize(): Array<JapaneseToken> { + var tokens: Array<JapaneseToken> = []; + + // split this.writing into tokens with different scripts + var token: JapaneseToken = { + writing: "", + reading: "", + normalized: "", + ruby: true, + }; + var kana: boolean = this.writing[0].kanaOnly(); + for (var char of this.writing) { + if (char.kanaOnly() != kana) { + tokens.push({ ...token }); + token.writing = ""; + } + token.writing += char; + kana = char.kanaOnly(); + } + tokens.push(token); + + // find kana-only tokens and normalize them + tokens = tokens.map(token => { + if (!token.writing.kanaOnly()) return token; + token.normalized = token.writing.normalizeKana(); + token.reading = token.writing; + token.ruby = false; + return token; + }); + + // don't try to spread reading across kanji if there is only one kanji/kana string + if (tokens.length == 1) { + tokens[0].reading = this.reading; + tokens[0].normalized = this.reading.normalizeKana(); + tokens[0].ruby = !this.writing.kanaOnly(); + return tokens; + } + + // list of indices where anchor token could be in reading + var possibilities: Array<Array<number>> = []; + // find all possible arrangements (in-order) of anchor indices in this.reading + var match = (tokenIndex: number = 0, searchStart: number = -1, path: Array<number> = []): void => { + // this arrangement is a possibility because the last token fit + if (tokenIndex == tokens.length) { + possibilities.push(path); + return; + } + // skip until next 'anchor' token + if (tokens[tokenIndex].normalized.length == 0) return match(tokenIndex + 1, searchStart, path); + + // try all positions where current (anchor) token fits in this.reading + while ((searchStart = this.normalized.indexOf(tokens[tokenIndex].normalized, searchStart + 1)) != -1) { + match(tokenIndex + 1, searchStart, [...path, searchStart]); + } + }; + match(); + + // create index slices from possibilities + var slices = possibilities + .map(match => { // convert start index of anchor to start and stop index (based on anchor length) + var out = [0]; + let matchIndex = 0; + for (let tokenIndex = 0; tokenIndex < tokens.length; tokenIndex++) { + if (tokens[tokenIndex].normalized.length == 0) continue; + out.push(match[matchIndex], match[matchIndex] + tokens[tokenIndex].writing.length); + matchIndex++; + } + if (out.peek() != this.reading.length) out.push(this.reading.length); + return out; + }) + .filter(slice => slice.length == tokens.length + 1) + .filter(slice => slice.isUniq()) // slice can't contain sections with 0 length + .filter(slice => slice.peek() == this.reading.length); // slice should match entire reading + + // cop-out if there is no valid way to split reading across kanji + if (slices.length == 0) { + return [{ + writing: this.writing, + reading: this.reading, + normalized: this.normalized, + ruby: true, + }]; + } + + var slice = slices[0]; // TODO: pick most "balanced" out of these instead + + for (let i = 0; i < tokens.length; i++) { + // slice[i+1] is safe because slice.length == tokens.length + 1 + tokens[i].reading = this.reading.substring(slice[i], slice[i+1]); + tokens[i].normalized = tokens[i].reading.normalizeKana(); + } + + return tokens; + } +} + diff --git a/api/word.ts b/api/word.ts index 63dce10..7eba936 100644 --- a/api/word.ts +++ b/api/word.ts @@ -5,6 +5,7 @@ import { ParseToken } from "../language/types.ts"; export default class Word extends APIBase { public writing = "TODO"; public reading = "TODO"; + public conjugated = "TODO"; constructor() { super(); @@ -61,6 +61,7 @@ "https://deno.land/std@0.192.0/async/retry.ts": "6521c061a5ab24e8b1ae624bdc581c4243d1d574f99dc7f5a2a195c2241fb1b8", "https://deno.land/std@0.192.0/async/tee.ts": "47e42d35f622650b02234d43803d0383a89eb4387e1b83b5a40106d18ae36757", "https://deno.land/std@0.192.0/fmt/colors.ts": "d67e3cd9f472535241a8e410d33423980bec45047e343577554d3356e1f0ef4e", + "https://deno.land/std@0.192.0/html/entities.ts": "1c9fa4d76e36a9bdbe370a65f1612771f3cc2cf802d217b4e633850e2fa25c16", "https://deno.land/std@0.192.0/http/server.ts": "1b23463b5b36e4eebc495417f6af47a6f7d52e3294827a1226d2a1aab23d9d20", "https://deno.land/std@0.192.0/testing/_diff.ts": "1a3c044aedf77647d6cac86b798c6417603361b66b54c53331b312caeb447aea", "https://deno.land/std@0.192.0/testing/_format.ts": "a69126e8a469009adf4cf2a50af889aca364c349797e63174884a52ff75cf4c7", diff --git a/test/api-japanese.ts b/test/api-japanese.ts new file mode 100644 index 0000000..86f2ac1 --- /dev/null +++ b/test/api-japanese.ts @@ -0,0 +1,24 @@ +import Japanese from "../api/japanese.ts"; + +// https://japanese.stackexchange.com/questions/69521/reading-per-kanji-irregular-readings +// wow + +var cases = [ + ["繰り返す", "くりかえす"], // [繰](く)り[返](かえ)す + ["漢字テスト", "かんじてすと"], // [漢字](かんじ)テスト + ["凛々しく", "りりしく"], // [凛々](りり)しく + ["字のテスト", "じのテスト"], // [字](じ)のテスト + ["文字", "っ"], // [漢字](っ) + ["文字りす", "りりりす"], // [文字](りり)りす + ["気を引き締める", "きをひきしめる"], // [気](き)を[引](ひ)き[締](し)める + ["文字り漢字", "りりりり"], // ????? + ["大口魚", "たら"], // [大口魚](たら) +] satisfies Array<[string, string]>; + +for (var args of cases) { + var test = new Japanese(...args as [string, string]); + // console.log(test.reading); + // console.log(test["tokenize"]()); + console.log(JSON.stringify(args) + " -> " + test.furigana("parenthesis")); +} + diff --git a/test/halfwidth2fullwidth.ts b/test/halfwidth2fullwidth.ts new file mode 100644 index 0000000..4855f7f --- /dev/null +++ b/test/halfwidth2fullwidth.ts @@ -0,0 +1,5 @@ +import "../util/japanese.ts"; + +console.log("これが オレのウィンターバケーション スタイル!".widenKatakana().katakanaToHiragana()); + + diff --git a/util/array.ts b/util/array.ts index f032935..c5a26c6 100644 --- a/util/array.ts +++ b/util/array.ts @@ -1,3 +1,5 @@ +import "./set.ts"; + declare global { interface Array<T> { /** @summary check if any of the elements of `arr2` are included in `this` */ @@ -8,6 +10,10 @@ declare global { set(): Set<T>; /** @summary clear array */ clear(): void; + /** @summary filter duplicates from array */ + filterDuplicates(): Array<T>; + /** @summary `true` if the array doesn't contain duplicate items */ + isUniq(): boolean; } } @@ -27,3 +33,11 @@ Array.prototype.clear = function() { while (this.length > 0) this.pop(); } +Array.prototype.filterDuplicates = function() { + return this.set().arr(); // TODO: optimize this +} + +Array.prototype.isUniq = function() { + return this.length == this.filterDuplicates().length; +} + diff --git a/util/japanese.ts b/util/japanese.ts index 2017280..d398b60 100644 --- a/util/japanese.ts +++ b/util/japanese.ts @@ -1,4 +1,5 @@ import { UnicodeRange } from "./string.ts"; +import "./number.ts"; declare global { interface String { @@ -57,6 +58,15 @@ declare global { * `strict` to true */ japaneseOnly(strict?: boolean): boolean + + /** @summary convert any half-width katakana to full-width */ + widenKatakana(): string; + + /** @summary convert any full-width katakana to hiragana */ + katakanaToHiragana(): string; + + /** @summary convert any kana (full and half-width) to full-width hiragana */ + normalizeKana(): string; } } @@ -85,7 +95,7 @@ function stringOnly(input: string, check: (key: string, val: number) => StringOn String.prototype.hiraganaOnly = function(strict = false) { return stringOnly(this as string, (key, val) => { - if (key == UnicodeRange.JapaneseHiragana) + if (key == UnicodeRange.JapaneseFWHiragana) return StringOnlyReturnValue.TallyAdd; // count hiragana characters else if (!strict && key.startsWith("any-")) return StringOnlyReturnValue.TallyIgnore; // allow any- (ascii whitespace and punctuation) @@ -97,7 +107,7 @@ String.prototype.hiraganaOnly = function(strict = false) { String.prototype.katakanaOnly = function(strict = false) { return stringOnly(this as string, (key, val) => { - if (key == UnicodeRange.JapaneseKatakana) + if ([UnicodeRange.JapaneseHWKatakana, UnicodeRange.JapaneseFWKatakana].includes(key as UnicodeRange)) return StringOnlyReturnValue.TallyAdd; // count katakana characters else if (!strict && key.startsWith("any-")) return StringOnlyReturnValue.TallyIgnore; // allow any- (ascii whitespace and punctuation) @@ -121,7 +131,7 @@ String.prototype.kanjiOnly = function(strict = false) { String.prototype.kanaOnly = function(strict = false) { return stringOnly(this as string, (key, val) => { - if (key == UnicodeRange.JapaneseHiragana || key == UnicodeRange.JapaneseKatakana) + if ([UnicodeRange.JapaneseHWKatakana, UnicodeRange.JapaneseFWKatakana, UnicodeRange.JapaneseFWHiragana].includes(key as UnicodeRange)) return StringOnlyReturnValue.TallyAdd; // count kana characters else if (!strict && key.startsWith("any-")) return StringOnlyReturnValue.TallyIgnore; // allow any- (ascii whitespace and punctuation) @@ -143,3 +153,120 @@ String.prototype.japaneseOnly = function(strict = false) { }); } +String.prototype.widenKatakana = function() { + const map: { [key: string]: string } = { + "ァ": "ァ", + "ア": "ア", + "ィ": "ィ", + "イ": "イ", + "ゥ": "ゥ", + "ウ": "ウ", + "ェ": "ェ", + "エ": "エ", + "ォ": "ォ", + "オ": "オ", + "ガ": "ガ", + "カ": "カ", + "ギ": "ギ", + "キ": "キ", + "グ": "グ", + "ク": "ク", + "ゲ": "ゲ", + "ケ": "ケ", + "ゴ": "ゴ", + "コ": "コ", + "ザ": "ザ", + "サ": "サ", + "ジ": "ジ", + "シ": "シ", + "ズ": "ズ", + "ス": "ス", + "ゼ": "ゼ", + "セ": "セ", + "ゾ": "ゾ", + "ソ": "ソ", + "ダ": "ダ", + "タ": "タ", + "ヂ": "ヂ", + "チ": "チ", + "ヅ": "ヅ", + "ッ": "ッ", + "ツ": "ツ", + "デ": "デ", + "テ": "テ", + "ド": "ド", + "ト": "ト", + "ナ": "ナ", + "ニ": "ニ", + "ヌ": "ヌ", + "ネ": "ネ", + "ノ": "ノ", + "バ": "バ", + "パ": "パ", + "ハ": "ハ", + "ビ": "ビ", + "ピ": "ピ", + "ヒ": "ヒ", + "ブ": "ブ", + "プ": "プ", + "フ": "フ", + "ベ": "ベ", + "ペ": "ペ", + "ヘ": "ヘ", + "ボ": "ボ", + "ポ": "ポ", + "ホ": "ホ", + "マ": "マ", + "ミ": "ミ", + "ム": "ム", + "メ": "メ", + "モ": "モ", + "ャ": "ャ", + "ヤ": "ヤ", + "ュ": "ュ", + "ユ": "ユ", + "ョ": "ョ", + "ヨ": "ヨ", + "ラ": "ラ", + "リ": "リ", + "ル": "ル", + "レ": "レ", + "ロ": "ロ", + "ワ": "ワ", + "ヲ": "ヲ", + "ン": "ン", + "ヴ": "ヴ", + "ヷ": "ヷ", + "イ゙": "イ゙", + "エ゙": "エ゙", + "ヺ": "ヺ", + "ー": "ー", + }; + + var out = ""; + outer: + for (let i = 0; i < this.length; i++) { + for (var key in map) { + if (!this.substring(i).startsWith(key)) continue; + out += map[key]; + i += key.length - 1; + continue outer; + } + + out += this[i]; + } + return out; +} + +String.prototype.katakanaToHiragana = function() { + return this.map(char => { + var code = char.codePointAt(0)!; + if (0x30a1 <= code && code <= 0x30f6) return (code + (0x3041 - 0x30a1)).toChar(); + return char; + }) +} + +String.prototype.normalizeKana = function() { + return this.widenKatakana().katakanaToHiragana(); +} + diff --git a/util/number.ts b/util/number.ts new file mode 100644 index 0000000..c28864f --- /dev/null +++ b/util/number.ts @@ -0,0 +1,11 @@ +declare global { + interface Number { + /** @summary convert number to character by charCode */ + toChar(): string; + } +} + +Number.prototype.toChar = function() { + return String.fromCharCode(this as number); +} + diff --git a/util/string.ts b/util/string.ts index 397dcd6..327b884 100644 --- a/util/string.ts +++ b/util/string.ts @@ -26,6 +26,12 @@ declare global { * @argument fallback return this value if parsing fails */ json(fallback?: any): any; + + /** + * @summary map each character of a string to another character using + * `mapFn` + */ + map(mapFn: (char: string) => string): string; } } @@ -34,11 +40,15 @@ export enum UnicodeRange { Whitespace = "any-whitespace", Punctuation = "any-punctuation", Unknown = "any-unknown", - JapanesePunctuation = "jp-punctuation", - JapaneseHiragana = "jp-hiragana", - JapaneseKatakana = "jp-katakana", - JapaneseFWLatinHWKatakana = "jp-full-width-latin-half-width-katakana", + JapaneseFWPunctuation = "jp-full-width-punctuation", + JapaneseHWPunctuation = "jp-half-width-punctuation", + JapaneseFWHiragana = "jp-full-width-hiragana", + JapaneseFWKatakana = "jp-full-width-katakana", + JapaneseFWLatin = "jp-full-width-latin", + JapaneseHWKatakana = "jp-half-width-katakana", JapaneseKanji = "jp-kanji", + JapaneseKanjiRadicals = "jp-kanji-radicals", + JapaneseAuxiliary = "jp-aux", } type RangeTally = Record<UnicodeRange, number>; @@ -52,12 +62,22 @@ String.prototype.range = function() { if (0x2e == code) return UnicodeRange.Punctuation; // full stop if (0x3f == code) return UnicodeRange.Punctuation; // question mark + // https://stackoverflow.com/a/53807563 if (0x0000 <= code && code <= 0x007f) return UnicodeRange.BasicLatin; - if (0x3000 <= code && code <= 0x303f) return UnicodeRange.JapanesePunctuation; - if (0x3040 <= code && code <= 0x309f) return UnicodeRange.JapaneseHiragana; - if (0x30a0 <= code && code <= 0x30ff) return UnicodeRange.JapaneseKatakana; - if (0xff00 <= code && code <= 0xffef) return UnicodeRange.JapaneseFWLatinHWKatakana; - if (0x4e00 <= code && code <= 0x9faf) return UnicodeRange.JapaneseKanji; + if (0x2e80 <= code && code <= 0x2fd5) return UnicodeRange.JapaneseKanjiRadicals; + if (0x3000 <= code && code <= 0x303f) return UnicodeRange.JapaneseFWPunctuation; + if (0xff5f <= code && code <= 0xff60) return UnicodeRange.JapaneseFWPunctuation; + if (0x3041 <= code && code <= 0x3096) return UnicodeRange.JapaneseFWHiragana; + if (0x30a1 <= code && code <= 0x30ff) return UnicodeRange.JapaneseFWKatakana; + if (0x3400 <= code && code <= 0x4db5) return UnicodeRange.JapaneseKanji; + if (0x4e00 <= code && code <= 0x9fcb) return UnicodeRange.JapaneseKanji; + if (0xf900 <= code && code <= 0xfa6a) return UnicodeRange.JapaneseKanji; + if (0xff61 <= code && code <= 0xff65) return UnicodeRange.JapaneseHWPunctuation; + if (0xff66 <= code && code <= 0xff9f) return UnicodeRange.JapaneseHWKatakana; + if (0x31f0 <= code && code <= 0x31ff) return UnicodeRange.JapaneseAuxiliary; + if (0x3220 <= code && code <= 0x3243) return UnicodeRange.JapaneseAuxiliary; + if (0x3280 <= code && code <= 0x337f) return UnicodeRange.JapaneseAuxiliary; + if (0xff01 <= code && code <= 0xff5e) return UnicodeRange.JapaneseFWLatin; return UnicodeRange.Unknown; } @@ -71,11 +91,11 @@ String.prototype.parseTags = function() { return parseTags(this as string); } -String.prototype.removeAll = function(searchValue: string | RegExp) { +String.prototype.removeAll = function(searchValue) { return this.replaceAll(searchValue, ""); } -String.prototype.json = function(fallback?: any) { +String.prototype.json = function(fallback) { if (fallback) { try { return JSON.parse(this as string); @@ -87,3 +107,9 @@ String.prototype.json = function(fallback?: any) { } } +String.prototype.map = function(mapFn) { + var out = ""; + for (var char of this) out += mapFn(char); + return out; +} + |