From 67dbb6421976254658c5e38045513129dd18187a Mon Sep 17 00:00:00 2001 From: lonkaars Date: Wed, 28 Jun 2023 23:59:50 +0200 Subject: initial public commit --- .gitignore | 3 + .vim/coc-settings.json | 6 ++ .vscode/settings.json | 3 + assets/the-dream.png | Bin 0 -> 334535 bytes core/api.ts | 16 ++++ core/direct/client.ts | 11 +++ core/http/client.ts | 31 +++++++ core/http/props.ts | 10 ++ core/http/server.ts | 33 +++++++ core/raw/api.ts | 27 ++++++ db/.gitignore | 1 + db/db.ts | 109 ++++++++++++++++++++++ db/dict/.gitignore | 10 ++ db/dict/deinflections.sql | 183 +++++++++++++++++++++++++++++++++++++ db/dict/init.sql | 115 +++++++++++++++++++++++ db/dict/reset.sql | 12 +++ db/dict/tags.sql | 11 +++ db/dict/template.sql.m4 | 104 +++++++++++++++++++++ db/dict/test_a.dict.sql | 25 +++++ db/dict/test_b.dict.sql | 9 ++ db/dict/test_pitch_accent.dict.sql | 10 ++ db/find.sql | 96 +++++++++++++++++++ db/makefile | 45 +++++++++ db/readme.md | 168 ++++++++++++++++++++++++++++++++++ db/test/find | 20 ++++ db/user/.gitignore | 2 + db/user/init.sql | 16 ++++ db/user/reset.sql | 4 + db/user/root.sql | 6 ++ deno.jsonc | 5 + deno.lock | 79 ++++++++++++++++ language/japanese.ts | 137 +++++++++++++++++++++++++++ language/parser.ts | 121 ++++++++++++++++++++++++ language/readme.md | 53 +++++++++++ language/tags.ts | 102 +++++++++++++++++++++ language/types.ts | 49 ++++++++++ license | 21 +++++ main.ts | 61 +++++++++++++ makefile | 22 +++++ readme.md | 67 ++++++++++++++ util/array.ts | 17 ++++ util/error.ts | 7 ++ util/readme.md | 10 ++ util/set.ts | 17 ++++ util/string.ts | 74 +++++++++++++++ 45 files changed, 1928 insertions(+) create mode 100644 .gitignore create mode 100644 .vim/coc-settings.json create mode 100644 .vscode/settings.json create mode 100644 assets/the-dream.png create mode 100644 core/api.ts create mode 100644 core/direct/client.ts create mode 100644 core/http/client.ts create mode 100644 core/http/props.ts create mode 100644 core/http/server.ts create mode 100644 
core/raw/api.ts create mode 100644 db/.gitignore create mode 100644 db/db.ts create mode 100644 db/dict/.gitignore create mode 100644 db/dict/deinflections.sql create mode 100644 db/dict/init.sql create mode 100644 db/dict/reset.sql create mode 100644 db/dict/tags.sql create mode 100644 db/dict/template.sql.m4 create mode 100644 db/dict/test_a.dict.sql create mode 100644 db/dict/test_b.dict.sql create mode 100644 db/dict/test_pitch_accent.dict.sql create mode 100644 db/find.sql create mode 100644 db/makefile create mode 100644 db/readme.md create mode 100755 db/test/find create mode 100644 db/user/.gitignore create mode 100644 db/user/init.sql create mode 100644 db/user/reset.sql create mode 100644 db/user/root.sql create mode 100644 deno.jsonc create mode 100644 deno.lock create mode 100644 language/japanese.ts create mode 100644 language/parser.ts create mode 100644 language/readme.md create mode 100644 language/tags.ts create mode 100644 language/types.ts create mode 100644 license create mode 100644 main.ts create mode 100644 makefile create mode 100644 readme.md create mode 100644 util/array.ts create mode 100644 util/error.ts create mode 100644 util/readme.md create mode 100644 util/set.ts create mode 100644 util/string.ts diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e7cf3e7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +*.bkp +*.dtmp +ext diff --git a/.vim/coc-settings.json b/.vim/coc-settings.json new file mode 100644 index 0000000..497ca74 --- /dev/null +++ b/.vim/coc-settings.json @@ -0,0 +1,6 @@ +{ + "tsserver.enable": false, + "deno.enable": true, + "deno.lint": false, + "deno.unstable": true +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 0000000..b943dbc --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,3 @@ +{ + "deno.enable": true +} \ No newline at end of file diff --git a/assets/the-dream.png b/assets/the-dream.png new file mode 100644 index 
0000000..9cde41f Binary files /dev/null and b/assets/the-dream.png differ diff --git a/core/api.ts b/core/api.ts new file mode 100644 index 0000000..0891081 --- /dev/null +++ b/core/api.ts @@ -0,0 +1,16 @@ +import { ParseResult } from "../language/types.ts"; + +/** + * @summary API interface + * + * This interface gets implemented by all API clients, so clients can be + * swapped easily. + */ +export default abstract class API { + /** @summary prepare API client */ + abstract prepare(): Promise; + + /** @summary parse sentence */ + abstract parseSentence(input: string): Promise; +}; + diff --git a/core/direct/client.ts b/core/direct/client.ts new file mode 100644 index 0000000..fda4b60 --- /dev/null +++ b/core/direct/client.ts @@ -0,0 +1,11 @@ +import API from "../api.ts"; +import YomikunRAWAPI from "../raw/api.ts"; + +/** + * @summary Yomikun direct API + * + * Alias to YomikunRAWAPI. calls API methods directly, and thus only works + * server-side. Used to test the API locally without HTTP overhead. + */ +export default class YomikunDirectAPIClient extends YomikunRAWAPI implements API { } + diff --git a/core/http/client.ts b/core/http/client.ts new file mode 100644 index 0000000..42d75f0 --- /dev/null +++ b/core/http/client.ts @@ -0,0 +1,31 @@ +import { ParseDepth, ParseResult } from "../../language/types.ts"; +import YomikunError from "../../util/error.ts"; +import API from "../api.ts"; +import { ConnectionProps, ConnectionPropsDefault } from "./props.ts"; + +/** + * @summary Yomikun HTTP API + * + * Uses the Yomikun server to call API methods. Handles (de)serialization + * automatically. 
+ */ +export default class YomikunRemoteAPIClient implements API { + private props: ConnectionProps; + + constructor(options?: ConnectionProps) { + this.props = { ...ConnectionPropsDefault, ...options }; + } + + async prepare() { } + + async parseSentence(input: string) { + var response = await fetch(`http://${this.props.host}:${this.props.port}/parseSentence`); + console.log(response.body); + + return { + depth: ParseDepth.Term, + tokens: [], + } as ParseResult; + } +} + diff --git a/core/http/props.ts b/core/http/props.ts new file mode 100644 index 0000000..d69ae55 --- /dev/null +++ b/core/http/props.ts @@ -0,0 +1,10 @@ +export interface ConnectionProps { + host: string; + port: number; +}; + +export const ConnectionPropsDefault: ConnectionProps = { + host: "localhost", + port: 9400, +}; + diff --git a/core/http/server.ts b/core/http/server.ts new file mode 100644 index 0000000..8a6786e --- /dev/null +++ b/core/http/server.ts @@ -0,0 +1,33 @@ +import { serve } from "https://deno.land/std@0.192.0/http/server.ts"; + +import { ParseResult } from "../../language/types.ts"; +import YomikunRAWAPI from "../raw/api.ts"; +import { ConnectionProps, ConnectionPropsDefault } from "./props.ts"; + +interface Endpoint { + endpoint: string; +}; + +export default class YomikunRemoteAPIServer extends YomikunRAWAPI { + private props: ConnectionProps; + + constructor(options?: ConnectionProps) { + super(); + this.props = { ...ConnectionPropsDefault, ...options }; + } + + async parseSentence(input: string) { + return await super.parseSentence(input); + } + + async start() { + serve((req) => { + return new Response("Hello world!"); + }, { port: this.props.port }); + } + + async prepare() { + await super.prepare(); + } +} + diff --git a/core/raw/api.ts b/core/raw/api.ts new file mode 100644 index 0000000..2d29eed --- /dev/null +++ b/core/raw/api.ts @@ -0,0 +1,27 @@ +import API from "../api.ts"; +import Parser from "../../language/parser.ts"; +import YomikunError from 
"../../util/error.ts"; + +/** @summary internal Yomikun API client (DO NOT USE DIRECTLY) */ +export default class YomikunRAWAPI implements API { + private _parser: Parser; + + constructor() { + if (this.constructor === YomikunRAWAPI) { + throw new YomikunError("YomikunRAWAPI instantiated! please use YomikunDirectAPIClient instead"); + } + + this._parser = new Parser(); + } + + async prepare() { + await Promise.all([ + this._parser.prepare(), + ]); + } + + async parseSentence(input: string) { + return this._parser.parse(input); + } +}; + diff --git a/db/.gitignore b/db/.gitignore new file mode 100644 index 0000000..98e6ef6 --- /dev/null +++ b/db/.gitignore @@ -0,0 +1 @@ +*.db diff --git a/db/db.ts b/db/db.ts new file mode 100644 index 0000000..d5a2b76 --- /dev/null +++ b/db/db.ts @@ -0,0 +1,109 @@ +import { Database, Statement } from "https://deno.land/x/sqlite3@0.9.1/mod.ts"; +import * as path from 'https://deno.land/std@0.102.0/path/mod.ts'; + +import { TokenTags } from "../language/tags.ts"; +import "../util/string.ts"; +import YomikunError from "../util/error.ts"; + +export interface DBDictInfo { + id: number; + name: string; + language: string; + priority: number; +}; + +export interface FindResult { + id: number; + expression: string; + reading: string; + tags: TokenTags; + sort: number; + depth: number; + original: string; + match: { + kanji: boolean; + reading: boolean; + }; +} + +interface DBFindResult { + id: number; + expression: string; + reading: string; + tags: string; + rules: string; + depth: number; + original: string; + deinflected: string; + root_overlay: number; + user_overlay: number; +} + +/** + * @summary dictionary database connection, handles deconjugation and lookup in SQL + * + * @example + * const db = new DB(); + * await db.prepare(); + * const results = db.findTerm("なった"); + */ +export default class DB { + private connection: Database; + public ready: boolean = false; + private paths = { + db: { + dict: path.resolve('db', 'dict.db'), + 
user: path.resolve('db', 'user.db'), + }, + query: { + find: path.resolve('db', 'find.sql'), + }, + } as const; + private statement: { + attach: Statement; + queryTerm: Statement; + }; + + constructor() { + this.connection = new Database(":memory:", { create: false }); + this.statement = { + attach: this.connection.prepare("attach database ? as ?"), + queryTerm: this.connection.prepare(""), // initialized in prepare() + }; + this.attach(this.paths.db.dict, 'dict'); + this.attach(this.paths.db.user, 'user'); + } + + private attach(dbPath: string, alias?: string) { + this.statement.attach.run(dbPath, alias); + } + + async prepare() { + const statement = await Deno.readTextFile(this.paths.query.find); + this.statement.queryTerm = this.connection.prepare(statement); + this.ready = true; + } + + findTerm(term: string): FindResult[] { + if (!this.ready) throw new YomikunError("DB not ready yet, call `async DB::prepare()` first"); + + var results = this.statement.queryTerm.all({ term }) as unknown as DBFindResult[]; + var terms: FindResult[] = results?.map(term => { + if (term.rules == null) term.rules = ""; + return { + id: term.id, + expression: term.expression, + reading: term.reading, + tags: (term.tags + ' ' + term.rules).parseTags(), + sort: term.user_overlay ?? term.root_overlay ?? 
100, + depth: term.depth, + original: term.original, + match: { + kanji: term.expression == term.deinflected, + reading: term.reading == term.deinflected, + }, + }; + }); + return terms; + } +}; diff --git a/db/dict/.gitignore b/db/dict/.gitignore new file mode 100644 index 0000000..96bd267 --- /dev/null +++ b/db/dict/.gitignore @@ -0,0 +1,10 @@ +base.sql +full.sql + +test_a.sql +test_b.sql +test_pitch_accent.sql + +jmdict* + +dict.sql diff --git a/db/dict/deinflections.sql b/db/dict/deinflections.sql new file mode 100644 index 0000000..d13f313 --- /dev/null +++ b/db/dict/deinflections.sql @@ -0,0 +1,183 @@ +-- deinflection rules (ordered by appearance in tae kim's japanese grammar guide) +create temporary table deinflection_temp (tag, kana_in, kana_out, rules_in, rules_out); +insert into deinflection_temp values + -- negative + ('infl:negative', 'ない', 'る', 'a', 'ru'), + ('infl:negative', 'わない', 'う', 'a', 'u'), + ('infl:negative', 'かない', 'く', 'a', 'u'), + ('infl:negative', 'がない', 'ぐ', 'a', 'u'), + ('infl:negative', 'さない', 'す', 'a', 'u'), + ('infl:negative', 'たない', 'つ', 'a', 'u'), + ('infl:negative', 'なない', 'ぬ', 'a', 'u'), + ('infl:negative', 'ばない', 'ぶ', 'a', 'u'), + ('infl:negative', 'まない', 'む', 'a', 'u'), + ('infl:negative', 'らない', 'る', 'a', 'u'), + ('infl:negative', 'しない', 'する', 'a', 's'), + ('infl:negative', 'こない', 'くる', 'a', 'k'), + ('infl:negative', '来ない', '来る', 'a', 'k'), + ('infl:negative', 'ない', 'ある', 'a', 'ru'), -- this one may cause problems (?) + -- ('infl:negative', 'ない', '', 'a', 'ru'), -- this one may cause problems (?) 
+ + -- past tense + ('infl:tense:past', 'た', 'る', 'a', 'ru'), + ('infl:tense:past', 'した', 'す', 'a', 'u'), + ('infl:tense:past', 'いた', 'く', 'a', 'u'), + ('infl:tense:past', 'いだ', 'ぐ', 'a', 'u'), + ('infl:tense:past', 'んだ', 'む', 'a', 'u'), + ('infl:tense:past', 'んだ', 'ぬ', 'a', 'u'), + ('infl:tense:past', 'んだ', 'ぶ', 'a', 'u'), + ('infl:tense:past', 'った', 'う', 'a', 'u'), + ('infl:tense:past', 'った', 'つ', 'a', 'u'), + ('infl:tense:past', 'った', 'る', 'a', 'u'), + ('infl:tense:past', 'した', 'する', 'a', 's'), + ('infl:tense:past', 'きた', 'くる', 'a', 'k'), + ('infl:tense:past', '来た', 'くる', 'a', 'k'), + ('infl:tense:past', '行った', '行く', 'a', ''), + + -- adjective to adverb + ('infl:adverb', 'く', 'い', 'a', 'i'), + -- TODO: na-adjectives aren't deconjugated + + -- polite form + ('infl:polite:masu', 'ます', 'る', 'a', 'ru'), + ('infl:polite:masu', 'います', 'う', 'a', 'u'), + ('infl:polite:masu', 'きます', 'く', 'a', 'u'), + ('infl:polite:masu', 'ぎます', 'ぐ', 'a', 'u'), + ('infl:polite:masu', 'します', 'す', 'a', 'u'), + ('infl:polite:masu', 'ちます', 'つ', 'a', 'u'), + ('infl:polite:masu', 'にます', 'ぬ', 'a', 'u'), + ('infl:polite:masu', 'びます', 'ぶ', 'a', 'u'), + ('infl:polite:masu', 'みます', 'む', 'a', 'u'), + ('infl:polite:masu', 'ります', 'る', 'a', 'u'), + ('infl:polite:masu', 'します', 'する', 'a', 's'), + ('infl:polite:masu', 'きます', 'くる', 'a', 'k'), + ('infl:polite:masu', '来ます', '来る', 'a', 'k'), + ('infl:polite:masu infl:negative', 'ません', 'る', 'a', 'ru'), + ('infl:polite:masu infl:negative', 'いません', 'う', 'a', 'u'), + ('infl:polite:masu infl:negative', 'きません', 'く', 'a', 'u'), + ('infl:polite:masu infl:negative', 'ぎません', 'ぐ', 'a', 'u'), + ('infl:polite:masu infl:negative', 'しません', 'す', 'a', 'u'), + ('infl:polite:masu infl:negative', 'ちません', 'つ', 'a', 'u'), + ('infl:polite:masu infl:negative', 'にません', 'ぬ', 'a', 'u'), + ('infl:polite:masu infl:negative', 'びません', 'ぶ', 'a', 'u'), + ('infl:polite:masu infl:negative', 'みません', 'む', 'a', 'u'), + ('infl:polite:masu infl:negative', 'りません', 'る', 'a', 'u'), + ('infl:polite:masu 
infl:negative', 'しません', 'する', 'a', 's'), + ('infl:polite:masu infl:negative', 'きません', 'くる', 'a', 'k'), + ('infl:polite:masu infl:negative', '来ません', '来る', 'a', 'k'), + ('infl:polite:masu infl:tense:past', 'ました', 'る', 'a', 'ru'), + ('infl:polite:masu infl:tense:past', 'いました', 'う', 'a', 'u'), + ('infl:polite:masu infl:tense:past', 'きました', 'く', 'a', 'u'), + ('infl:polite:masu infl:tense:past', 'ぎました', 'ぐ', 'a', 'u'), + ('infl:polite:masu infl:tense:past', 'しました', 'す', 'a', 'u'), + ('infl:polite:masu infl:tense:past', 'ちました', 'つ', 'a', 'u'), + ('infl:polite:masu infl:tense:past', 'にました', 'ぬ', 'a', 'u'), + ('infl:polite:masu infl:tense:past', 'びました', 'ぶ', 'a', 'u'), + ('infl:polite:masu infl:tense:past', 'みました', 'む', 'a', 'u'), + ('infl:polite:masu infl:tense:past', 'りました', 'る', 'a', 'u'), + ('infl:polite:masu infl:tense:past', 'しました', 'する', 'a', 's'), + ('infl:polite:masu infl:tense:past', 'きました', 'くる', 'a', 'k'), + ('infl:polite:masu infl:tense:past', '来ました', '来る', 'a', 'k'), + ('infl:polite:masu infl:tense:past infl:negative', 'ませんでした', 'る', 'a', 'ru'), + ('infl:polite:masu infl:tense:past infl:negative', 'いませんでした', 'う', 'a', 'u'), + ('infl:polite:masu infl:tense:past infl:negative', 'きませんでした', 'く', 'a', 'u'), + ('infl:polite:masu infl:tense:past infl:negative', 'ぎませんでした', 'ぐ', 'a', 'u'), + ('infl:polite:masu infl:tense:past infl:negative', 'しませんでした', 'す', 'a', 'u'), + ('infl:polite:masu infl:tense:past infl:negative', 'ちませんでした', 'つ', 'a', 'u'), + ('infl:polite:masu infl:tense:past infl:negative', 'にませんでした', 'ぬ', 'a', 'u'), + ('infl:polite:masu infl:tense:past infl:negative', 'びませんでした', 'ぶ', 'a', 'u'), + ('infl:polite:masu infl:tense:past infl:negative', 'みませんでした', 'む', 'a', 'u'), + ('infl:polite:masu infl:tense:past infl:negative', 'りませんでした', 'る', 'a', 'u'), + ('infl:polite:masu infl:tense:past infl:negative', 'しませんでした', 'する', 'a', 's'), + ('infl:polite:masu infl:tense:past infl:negative', 'きませんでした', 'くる', 'a', 'k'), + ('infl:polite:masu infl:tense:past 
infl:negative', '来ませんでした', '来る', 'a', 'k'), + -- ('infl:polite:masu infl:tense:past infl:negative', 'くありません', 'い', 'a', 'i'), -- reality check (should not be needed) + + -- TODO: compound nouns and adjectives + + -- te-form + ('infl:suffix:te', 'て', 'た', 'a', 'ru'), + ('infl:suffix:te', 'して', 'した', 'a', 's u'), + ('infl:suffix:te', 'いて', 'いた', 'a', 'u'), + ('infl:suffix:te', 'いで', 'いだ', 'a', 'u'), + ('infl:suffix:te', 'んで', 'んだ', 'a', 'u'), + ('infl:suffix:te', 'って', 'った', 'a', 'u'), + ('infl:suffix:te', 'きて', 'きた', 'a', 'k'), + ('infl:suffix:te', '来て', '来た', 'a', 'k'), + ('infl:suffix:te', 'くて', 'い', 'a', ''), -- TODO: rules_out of this one is i? + + -- -tari lists + ('infl:suffix:tari', 'たり', 'た', 'a', 'ru'), + ('infl:suffix:tari', 'したり', 'した', 'a', 's u'), + ('infl:suffix:tari', 'いたり', 'いた', 'a', 'u'), + ('infl:suffix:tari', 'いだり', 'いだ', 'a', 'u'), + ('infl:suffix:tari', 'んだり', 'んだ', 'a', 'u'), + ('infl:suffix:tari', 'ったり', 'った', 'a', 'u'), + ('infl:suffix:tari', 'きたり', 'きた', 'a', 'k'), + ('infl:suffix:tari', '来たり', '来た', 'a', 'k'), + + -- auxiliary rules + ('class:verb:suru-included', 'する', '', 's', ''); -- deconjugate suru verbs into stem + +-- rule/bitmask lookup table +create temporary table rule_map (tag, name, mask); +insert into rule_map values + (null, 'a', -1 ), -- all (allow all rules in) + (null, '', 0 ), -- (nothing) + ('infl:reason:ru', 'ru', 1 << 0), -- 一段活用 (ichidan a.k.a. ru-verbs in tae kim's japanese grammar guide) + ('infl:reason:u', 'u', 1 << 1), -- 五段活用 (godan a.k.a. 
u-verbs in tae kim's japanese grammar guide) + ('infl:reason:suru', 's', 1 << 2), -- する (suru) + ('infl:reason:kuru', 'k', 1 << 3), -- くる (kuru) + (null, 'z', 1 << 4), -- ずる (zuru) + ('infl:reason:adj-i', 'i', 1 << 5), -- 形容詞 (i-adjective) + (null, 'iru', 1 << 6); -- 〜いる (temporary iru for progressive tense) + +-- add tags to db +insert into deinflection_rules (mask, tag) +select mask, tag +from rule_map +where tag is not null; + +-- convert space-separated strings to bitmasks and insert into deinflection table +insert into deinflection (tag, kana_in, kana_out, rules_in, rules_out) +with temp_deinflection_map(tag, kana_in, kana_out, rules_in_stack, rules_in, rules_out_stack, rules_out) as ( + select + tag, + kana_in, + kana_out, + rules_in || ' ', + '', + rules_out || ' ', + '' + from deinflection_temp + union + select + tag, + kana_in, + kana_out, + substr(rules_in_stack, instr(rules_in_stack, ' ') + 1), + replace(substr(rules_in_stack, 0, instr(rules_in_stack, ' ')), ' ', ''), + substr(rules_out_stack, instr(rules_out_stack, ' ') + 1), + replace(substr(rules_out_stack, 0, instr(rules_out_stack, ' ')), ' ', '') + from temp_deinflection_map + where + (length(rules_in_stack) > 1) or + (length(rules_out_stack) > 1) +) +select + temp_deinflection_map.tag, + temp_deinflection_map.kana_in, + temp_deinflection_map.kana_out, + -- NOTE: sum() should really be bitwise or across all rows after group by + -- here. be careful to not specify rules more than once in these columns, or + -- any other rule if the column has 'a'. 
+ sum(rule_map_in.mask) as rules_in, + sum(rule_map_out.mask) as rules_out +from temp_deinflection_map +join rule_map as rule_map_in on rule_map_in.name = temp_deinflection_map.rules_in +join rule_map as rule_map_out on rule_map_out.name = temp_deinflection_map.rules_out +group by + temp_deinflection_map.tag, + temp_deinflection_map.kana_in, + temp_deinflection_map.kana_out; + diff --git a/db/dict/init.sql b/db/dict/init.sql new file mode 100644 index 0000000..4e9fcc9 --- /dev/null +++ b/db/dict/init.sql @@ -0,0 +1,115 @@ +-- deinflection map (see dict/deinflections.sql) +create table if not exists deinflection ( + tag text not null, + kana_in text not null, + kana_out text not null, + rules_in int not null default 0, + rules_out int not null default 0 +); +create index deinflection_kana_in on deinflection (kana_in); + +-- lookup table for deinflection rule bitmasks +create table if not exists deinflection_rules ( + mask int not null default 0, + tag text not null +); + +-- dictionary metadata +create table if not exists dict ( + id integer primary key autoincrement, + tag text not null, -- TODO: dict.tag -> dict.tag_id + language text null default null, + priority int not null default 1 +); + +-- tag code<->id lookup +create table if not exists tag ( + id integer primary key autoincrement, + code text not null, + unique(code) on conflict ignore +); +create index tag_code on tag (code); + + +-- definition of term (with single meaning) +-- +-- terms that have multiple meanings should have multiple entries in the +-- definition table with the same term_id. glossary contains an explanation or +-- list of synonyms to illustrate the meaning of the term. 
+create table if not exists definition ( + id integer primary key autoincrement, + term_id int not null, -- reference to term (combined writing and reading) + sort int not null default 1, -- sort multiple meanings/glossaries within dictionary + -- search results for terms are grouped by term, then dictionary, then sort + -- within each dictionary + glossary text null default null, -- explanation or list of synonyms for term + dict_id int not null, -- origin dictionary of term + foreign key(term_id) references term(id), + foreign key(dict_id) references dict(id), + unique(glossary) on conflict ignore +); + +-- human-readable tag label lookup table (used in UI and for translations) +create table if not exists tag_label ( + id integer primary key autoincrement, + tag_id int not null, -- reference to tag + language text not null default "en-US", -- label locale as RFC 5646 tag (e.g. ja) + -- label locale is used when displaying tags in a UI frontend. labels are + -- sorted based on language preference, with "en-US" being inserted last as a + -- fallback. + label_short text null default null, -- abbreviated label (e.g. ティアキン) + label text not null, -- full label (e.g. ティアーズ・オブ・ザ・キングダム) + description text null default null, -- short description (e.g. ゼルダの伝説シリーズより) + foreign key(tag_id) references tag(id) +); + +-- standalone combination of kanji and reading +-- +-- terms are added automatically when importing a dictionary, but are shared +-- between dictionaries and thus not removed when disabling/removing a +-- dictionary. terms are ranked outside the database, and the starting rank +-- score can be changed using the user database (TODO) +create table if not exists term ( + id integer primary key autoincrement, + expression text not null, -- writing of term (e.g. 乗り越える) + reading text not null, -- reading of term (e.g. 
のりこえる) + alt int null default null, -- reference to main writing if this is an alternate + -- alternate readings are stored as normal terms, but point to their main + -- reading. also used internally to store alternate readings of irregular + -- verbs example: + -- id expression reading alt + -- 1 言葉 ことば NULL + -- 2 詞 ことば 1 + -- 3 辞 ことば 1 + -- 4 来る くる NULL + -- 5 来た きた 4 + -- NOTE: alternate writings can technically be stored recursively this way, + -- but are only resolved one level deep TODO: resolve alternates + foreign key(alt) references term(id), + unique(expression, reading) on conflict ignore +); +create index term_expression on term (expression); +create index term_reading on term (reading); +-- TODO: (automatically) remove unused terms from db (using constraints?) + + +-- allow many<->many relation between definition and tag +create table if not exists definition_tag ( + id integer primary key autoincrement, + definition_id int not null, + tag_id int not null, + foreign key(definition_id) references definition(id), + foreign key(tag_id) references tag(id), + unique(definition_id, tag_id) on conflict ignore +); + +-- allow many<->many relation between term and tag +create table if not exists term_tag ( + id integer primary key autoincrement, + term_id int not null, + tag_id int not null, + foreign key(term_id) references term(id), + foreign key(tag_id) references tag(id), + unique(term_id, tag_id) on conflict ignore +); + diff --git a/db/dict/reset.sql b/db/dict/reset.sql new file mode 100644 index 0000000..757e31b --- /dev/null +++ b/db/dict/reset.sql @@ -0,0 +1,12 @@ +drop table if exists definition_tag; +drop table if exists term_tag; + +drop table if exists definition; +drop table if exists tag_label; +drop table if exists term; + +drop table if exists deinflection; +drop table if exists deinflection_rules; +drop table if exists dict; +drop table if exists tag; + diff --git a/db/dict/tags.sql b/db/dict/tags.sql new file mode 100644 index 
0000000..a200abb --- /dev/null +++ b/db/dict/tags.sql @@ -0,0 +1,11 @@ +insert into tag (code) values + ('class:verb'), + ('class:verb:suru'), + ('class:verb:suru-included'), + ('class:noun'), + ('class:suffix'), + ('class:part'), + ('class:expr'), + ('name:place'), + ('name:female'), + ('name:male'); diff --git a/db/dict/template.sql.m4 b/db/dict/template.sql.m4 new file mode 100644 index 0000000..00de413 --- /dev/null +++ b/db/dict/template.sql.m4 @@ -0,0 +1,104 @@ +-- create temporary ingest table +drop table if exists ingest; +-- TODO: ingest pitch-accent dictionaries +-- TODO: ingest alternate writings (space-separated) +create temporary table ingest( + -- term fields + expression text not null, -- kanji of term (e.g. 読み込む) + reading text not null, -- reading of term (e.g. よみこむ) + term_tags text not null default '', -- space-separated *term* tags, merged if term already exists in DB + + -- definition fields + glossary_sort int not null default 1, -- order of multiple meanings (glossaries) + glossary text null default null, -- glossary content (support for basic HTML markup/styling) + glossary_tags text null default null -- add tags to single glossary entry +); + +include(`/dev/stdin')dnl --' +-- the apostrophe is so my editor highlighting keeps working if I force the +-- filetype to sql instead of m4 + +-- create dict id +insert into dict (tag, language) values ('dict:' || :dict, :lang); + +-- add terms +insert into term (expression, reading) +select expression, reading +from ingest; + +-- add definitions +insert into definition (term_id, sort, glossary, dict_id) +select + term.id, + ingest.glossary_sort, + ingest.glossary, + (select id from dict where tag = 'dict:' || :dict) +from ingest +join term on term.expression = ingest.expression and term.reading = ingest.reading; + +-- create map of term_id and tag code +drop table if exists term_tag_map; +create temporary table term_tag_map (term_id, tag); +insert into term_tag_map +with tag_map(term_id, temp, tag) 
as ( + select + (select id from term where expression is ingest.expression and reading is ingest.reading), + term_tags || ' ', + '' + from ingest + union + select + term_id, + `substr'(temp, instr(temp, ' ') + 1), + `substr'(temp, 0, instr(temp, ' ')) + from tag_map + where length(temp) > 1 +) +select term_id, replace(tag, ' ', '') +from tag_map +where length(tag) > 0; + +-- create map of definition_id and tag code +drop table if exists definition_tag_map; +create temporary table definition_tag_map (definition_id, tag); +insert into definition_tag_map +with tag_map(definition_id, temp, tag) as ( + select + (select id from definition where glossary is ingest.glossary), + glossary_tags || ' ', + '' + from ingest + union + select + definition_id, + `substr'(temp, instr(temp, ' ') + 1), + `substr'(temp, 0, instr(temp, ' ')) + from tag_map + where length(temp) > 1 +) +select definition_id, replace(tag, ' ', '') +from tag_map +where length(tag) > 0; + +-- make sure tags exist +insert into tag (code) +select tag from term_tag_map +union +select tag from definition_tag_map; + +-- add tags to terms +insert into term_tag (term_id, tag_id) +select + term_id, + tag.id +from term_tag_map +join tag on tag.code = term_tag_map.tag; + +-- add tags to definitions +insert into definition_tag (definition_id, tag_id) +select + definition_id, + tag.id +from definition_tag_map +join tag on tag.code = definition_tag_map.tag; + diff --git a/db/dict/test_a.dict.sql b/db/dict/test_a.dict.sql new file mode 100644 index 0000000..854d207 --- /dev/null +++ b/db/dict/test_a.dict.sql @@ -0,0 +1,25 @@ +-- set these +.param set :dict 'test_a' +.param set :lang 'nl_NL' + +-- dictionary content +insert into ingest(expression, reading, term_tags, glossary_sort, glossary, glossary_tags) values + ('浮上', 'ふじょう', 'class:verb:suru class:noun', 1, 'opstijgen, zweven', ''), + ('城', 'しろ', 'class:noun', 1, 'kasteel', ''), + ('城', 'じょう', 'class:suffix', 1, '-burcht, -burg (suffix voor kastelen)', ''), + ('下', 
'した', 'class:noun', 1, 'onder, beneden, omlaag', ''), + ('の下に', 'のもとに', 'class:expr', 1, 'onder leiding van', ''), + ('ハイラル', 'はいらる', 'series:zelda name:place class:noun', 1, 'Hyrule', ''), + ('にて', 'にて', 'class:part', 1, '{duidt aanwezigheid op een plaats aan}', ''), + ('ゼルダ', 'ぜるだ', 'series:zelda name:female class:noun', 1, 'Zelda', ''), + ('様', 'さま', 'class:suffix', 1, 'meneer, mijnheer, mevrouw, madame', ''), + ('様', 'よう', 'class:noun', 1, '(eruit zien) als, zoals, manier', ''), + ('達', 'たち', 'class:suffix', 1, '{meervoudssuffix}', ''), + ('の', 'の', 'class:part', 1, '{bezitspartikel}', ''), + ('と', 'と', 'class:part', 1, '{opsommingspartikel}', ''), + ('捜索', 'そうさく', 'class:verb:suru class:noun', 1, 'onderzoek, opspeuren, achterhalen', ''), + ('を', 'を', 'class:part', 1, '{lijdend voornaamwoord partikel}', ''), + ('行う', 'おこなう', 'class:verb', 1, 'uitvoeren', ''), + ('こと', 'こと', 'class:part class:noun', 1, '{eindpartikel dat een bevel / waarschuwing uitdrukt}', ''), + ('成る', 'なる', 'class:verb aux:test', 1, 'worden, veranderen in', ''); + diff --git a/db/dict/test_b.dict.sql b/db/dict/test_b.dict.sql new file mode 100644 index 0000000..0140189 --- /dev/null +++ b/db/dict/test_b.dict.sql @@ -0,0 +1,9 @@ +-- set these +.param set :dict 'test_b' +.param set :lang 'en_US' + +-- dictionary content +insert into ingest(expression, reading, term_tags, glossary_sort, glossary, glossary_tags) values + ('城', 'しろ', 'aux:test_b', 1, 'castle', ''), + ('城', 'じょう', 'aux:test_b aux:test_b aux:nog_een_test_b', 1, '-burg (suffix for castles)', 'aux:gert aux:test aux:nog_wat'); + diff --git a/db/dict/test_pitch_accent.dict.sql b/db/dict/test_pitch_accent.dict.sql new file mode 100644 index 0000000..3b8298f --- /dev/null +++ b/db/dict/test_pitch_accent.dict.sql @@ -0,0 +1,10 @@ +-- this is an example pitch accent dictionary +-- set these +.param set :dict 'pitch_accent' +.param set :lang null + +-- dictionary content +insert into ingest (expression, reading, glossary_tags) values + 
-- this statement is prepared and run using :term and :user as inputs (see
-- db.ts or test/find)
--
-- :term is the text to look up; every prefix of it (down to one character) is
--       deinflected and matched against the dictionary
-- :user is the username whose sort_overlay rows are reported next to the rows
--       stored for user_id 0
--
-- output columns: id, expression, reading, tags, rules, depth, original,
-- deinflected, root_overlay, user_overlay

-- this file is kind of messy because it needs to be one large query, instead
-- of separate phases creating temporary tables. queries with more than one
-- statement can't return results because of the way sqlite3 works.

-- TODO: add more comments in this file to explain what is going on

-- explain query plan -- testing only

-- results: dictionary terms whose expression or reading equals one of the
-- generated (de)inflected strings, one row per (term, input prefix) pair
with results(id, expression, reading, tags, depth, rules, original, deinflected) as (
	-- stripped deinflection table (remove some columns and duplicates)
	with deinflections(term, tags, depth, original, rules) as (
		-- recursively generated deinflection table
		-- NOTE: the column lists of these CTEs are positional; the selects below
		-- fill them by position, not by the name of the selected expression
		with deinflect(length, term, tags, rules, rules_in, rules_out, depth) as (
			-- input term all substrings until length 1
			with inputs(length, term, tags, rules, rules_in, rules_out, depth) as (
				-- seed row: the complete input, no tags, rules = -1 (all bits set, so
				-- the bitwise test in the join below passes for every rule whose
				-- rules_in is non-zero), rules_in/rules_out unused (0), depth 0
				select length(:term), :term, '', -1, 0, 0, 0
				union
				-- recursive step: same row with the last character removed
				select
					inputs.length - 1,
					substr(inputs.term, 1, inputs.length - 1),
					inputs.tags,
					inputs.rules,
					inputs.rules_in,
					inputs.rules_out,
					inputs.depth
				from inputs
				where inputs.length > 1
			)
			-- depth 0 rows: the undeinflected prefixes themselves
			select * from inputs
			union -- join all recursive rows into one large table
			-- recursive step: apply one matching deinflection rule to a row that
			-- is already in the table
			select
				-- length of the input prefix this chain started from (never changes)
				deinflect.length,
				-- cut kana_in off the end of the term and append kana_out
				substr(deinflect.term, 1, length(deinflect.term)-length(deinflection.kana_in)) || deinflection.kana_out,
				deinflect.tags || ' ' || deinflection.tag, -- parsed to TokenTag[] on (sql) client-side
				-- lands in column `rules`: rules of the row after this step
				deinflection.rules_out,
				-- lands in column `rules_in`
				deinflection.rules_in,
				-- lands in column `rules_out`: the `rules` value of the row this
				-- one was derived from
				deinflect.rules,
				deinflect.depth + 1
			from deinflect -- temp table
			inner join deinflection -- deinflection rules table
			on
				-- rules_in has to contain any of the current deconjugation rules
				(deinflect.rules & deinflection.rules_in != 0) and
				-- term.endsWith(kana_in)
				(substr(term, length(term) - length(kana_in) + 1) = kana_in) and
				-- can't deconjugate to length <1
				(length(term) > 0)
			-- NOTE(review): per the SQLite WITH-clause docs a LIMIT on a recursive
			-- CTE appears to cap the total number of rows it produces (the prefix
			-- rows from `inputs` included) -- confirm 50 leaves enough room
			limit 50 -- failsafe to catch any infinite loops
		)
		-- `original` is the prefix of :term this row was derived from; the
		-- `rules` column of this table is filled from deinflect.rules_out
		select term, tags, depth, substr(:term, 1, deinflect.length), rules_out
		from deinflect
	)
	select
		term.id,
		term.expression,
		term.reading,
		-- deinflection tags followed by the codes of all tags linked to the term
		deinflections.tags || ' ' || group_concat(tag.code, ' ') as tags,
		deinflections.depth,
		rules,
		deinflections.original,
		deinflections.term
	from deinflections
	-- exact match on either the written form or the reading
	inner join term on (term.expression = deinflections.term) or (term.reading = deinflections.term)
	-- inner joins: a term without any term_tag/tag row produces no result row
	inner join term_tag on term_tag.term_id = term.id
	inner join tag on term_tag.tag_id = tag.id
	-- one row per term and input prefix; group_concat above collects the tags
	group by term.id, deinflections.original
	having term.id is not null
)
select
	results.id,
	results.expression,
	results.reading,
	results.tags,
	-- tags of all deinflection_rules rows whose mask shares a bit with the
	-- result's rules bitmask
	group_concat(deinflection_rules.tag, ' ') as rules,
	results.depth,
	results.original,
	results.deinflected,
	-- both overlay columns are null when no matching sort_overlay row exists
	root_overlay.sort as root_overlay,
	user_overlay.sort as user_overlay
from results
left join deinflection_rules
	on results.rules & deinflection_rules.mask != 0
-- overlay stored for user_id 0
left join sort_overlay
	as root_overlay
	on (root_overlay.expression = results.expression) and
	   (root_overlay.reading = results.reading) and
	   (root_overlay.user_id = 0)
-- overlay stored for the user named :user
left join sort_overlay
	as user_overlay
	on (user_overlay.expression = results.expression) and
	   (user_overlay.reading = results.reading) and
	   (user_overlay.user_id = (select id from user where username = :user))
group by results.id, results.original;
cat $^ > $@ + +dict/full.sql: dict/base.sql dict/dict.sql + cat $^ > $@ + +dict/dict.sql: dict/test_a.sql dict/test_b.sql dict/test_pitch_accent.sql dict/jmdict.sql + cat $^ > $@ + +user/base.sql: user/reset.sql user/init.sql + cat $^ > $@ + +user/full.sql: user/base.sql user/root.sql + cat $^ > $@ + +%.sql: %.dict.sql $(DICT_TEMPLATE) + m4 $(DICT_TEMPLATE) < $< > $@ + +# delete generated sql files and database +clean: + $(RM) $(DICT_DB) $(USER_DB) dict/base.sql dict/full.sql dict/dict.sql dict/test_a.sql dict/test_b.sql dict/test_pitch_accent.sql user/base.sql user/full.sql + +test: $(DICT_DB) find.sql + ./test/find '浮上しました' + ./test/find 'ならない' + ./test/find '浮上した' + ./test/find 'なって' + ./test/find 'の下にて' + diff --git a/db/readme.md b/db/readme.md new file mode 100644 index 0000000..985ee15 --- /dev/null +++ b/db/readme.md @@ -0,0 +1,168 @@ +# DB + +Yomikun's database (sqlite3) handles search scoping and deinflection. For full +details on how Yomikun's search algorithm works, see [search +algorithm](#search-algorithm). + +This directory contains: + +- Database initialization scripts +- Deinflection rules +- User-submitted directories +- A makefile to initialize an empty database +- Typescript Database interface + +## ERD + +```mermaid +erDiagram + +%% tables +deinflection +dict +tag +definition +tag_label +term + +%% relations +definition }|--|| term : "" %% definition has one term, term can have multiple definitions +tag_label }o--|| tag : "" %% tag_label has one tag, tag can have 0 or more labels (though it should have 1) +term }o--o{ tag : term_tag +definition }o--o{ tag : definition_tag +dict ||--|{ definition : "" +``` + +See [dictionary init.sql](dict/init.sql) for details on tables. + +## Search algorithm + +The search algorithm runs in part in the database engine, and in Typescript. +The Typescript part is implemented in the [Language +subdirectory](../language). 
+ +|step|implemented in| +|-|-| +|Scoping|Typescript and SQL| +|Deinflection|SQL| +|Lookup|SQL| +|Ranking|Typescript| + +### Scoping + +Sentences are parsed in chunks. Each step of the sentence parser moves the +chunk start forward one or more character, depending on the results of the +previous chunk. + +The Typescript part of the parser shifts forward the beginning of each chunk +(vertical sections in diagram). The database engine (sqlite3) generates all +possible substrings from the beginning of the chunk (triangle pattern in +diagram). + +This diagram shows how the sentence 「大福を食べようかな」 would be scoped with +a maximum lookahead of 5 (real parser uses 15). Red is wanted results, other +valid matches are highlighted in light blue. Vertical axis represents time +(forward = down), and horizontal axis is character indices for the input +string. + + +```mermaid +gantt +dateFormat X +axisFormat %s + +section 大福 +大福を食べ : 0,5s +大福を食 : 0,4s +大福を : 0,3s +大福 : crit,0,2s +大 : active,0,1s +section を +を食べよう : 2,5s +を食べよ : 2,4s +を食べ : 2,3s +を食 : 2,2s +を : crit,2,1s +section 食べる +食べようか : 3,5s +食べよう : crit,3,4s +食べよ : active,3,3s +食べ : active,3,2s +食 : active,3,1s +section かな +かな : crit,7,2s +か : active,7,1s +``` + +### Deinflection + +The deinflection step uses a table with simple find/replace rules, similar to +Yomichan's deinflection rules file. Each rule has a `kana_in`, `kana_out`, +`rules_in` and `rules_out` column. 
The rules are applied using the following +algorithm (pseudocode; the real implementation is an almost unreadably large SQL +query): + +```python +possibilities = [] +function deconjugate(original_input, depth) { + for (rule in deinflection_rules) { + # reset input after each rule check + input = original_input; + + # check if rule matches at all + if (input does not end in rule.kana_in) continue; + # make sure (for example) godan deconjugations don't get applied + # after ichidan deconjugations + if (rule.rules_in does not contain input.rules) continue; + # swap kana_in for kana_out on input string + input.replace_end(rule.kana_in, rule.kana_out); + # check if deconjugation didn't clear the input + if (input.length < 1) continue; + + # apply new rules to input + input.rules = rule.rules_out; + + # attempt another deconjugation step + depth += 1; + deconjugate(input, depth); + } +} +``` + +The deinflection rules' `rules_in` and `rules_out` are checked using bitwise +operators, and each bit has the same meaning as Yomichan's `rulesIn` and +`rulesOut`: + +|alias|bitmask|meaning| +|-|-|-| +|a|`-1`|all (allow all rules)| +||`0`|nothing| +|ru|`1 << 0`|一段活用 (ichidan a.k.a. ru-verbs in tae kim's japanese grammar guide)| +|u|`1 << 1`|五段活用 (godan a.k.a. u-verbs in tae kim's japanese grammar guide)| +|s|`1 << 2`|する (suru)| +|k|`1 << 3`|くる (kuru)| +|z|`1 << 4`|ずる (zuru)| +|i|`1 << 5`|形容詞 (i-adjective)| +|iru|`1 << 6`|〜いる (temporary iru for progressive tense)| + +The deinflection rules are mostly derived from [Tae Kim's Japanese grammar +guide][taekim] and are initialized in [deinflections.sql](dict/deinflections.sql). + +### Lookup + +Lookup is done on the results of the deinflection step in the database. All +possible deinflections are checked for an exact match on either the reading or +writing of a word in all dictionaries. This step appends any ~dictionary and~ +word tags to the tags added by the deinflector. + +### Ranking + +Ranking happens in Typescript. 
Ranking also removes additional illegally +deconjugated words and gives priority to words with certain tags depending on +context. The filtering/ranking code is intentionally kept as readable as +possible because this code is mostly responsible for generating readings as +accurately as possible. Read the code [here](../language/parser.ts). + +[taekim]: https://guidetojapanese.org/learn/ + diff --git a/db/test/find b/db/test/find new file mode 100755 index 0000000..d22d711 --- /dev/null +++ b/db/test/find @@ -0,0 +1,20 @@ +#!/bin/sh + +TERM="$1" +USER="root" + +DICT_DB="../dict.db" +USER_DB="../user.db" +FIND_SQL="../find.sql" + +cd "$(dirname "$0")" + +sqlite3 \ + -markdown \ + -cmd ".timer ON" \ + -cmd ".param set :term '$TERM'" \ + -cmd ".param set :user '$USER'" \ + -cmd "attach '$DICT_DB' as dict" \ + -cmd "attach '$USER_DB' as user" \ + < "$FIND_SQL" + diff --git a/db/user/.gitignore b/db/user/.gitignore new file mode 100644 index 0000000..7b59807 --- /dev/null +++ b/db/user/.gitignore @@ -0,0 +1,2 @@ +base.sql +full.sql diff --git a/db/user/init.sql b/db/user/init.sql new file mode 100644 index 0000000..1d0e830 --- /dev/null +++ b/db/user/init.sql @@ -0,0 +1,16 @@ +create table if not exists user ( + id integer primary key autoincrement, + username text not null, + unique(username) +); + +create table if not exists sort_overlay ( + id integer primary key autoincrement, + user_id int not null default 0, + expression text not null, + reading text not null, + sort int not null, + foreign key(user_id) references user(id), + unique(user_id, expression, reading) on conflict replace +); + diff --git a/db/user/reset.sql b/db/user/reset.sql new file mode 100644 index 0000000..d136af3 --- /dev/null +++ b/db/user/reset.sql @@ -0,0 +1,4 @@ +drop table if exists sort_overlay; + +drop table if exists user; + diff --git a/db/user/root.sql b/db/user/root.sql new file mode 100644 index 0000000..70671ce --- /dev/null +++ b/db/user/root.sql @@ -0,0 +1,6 @@ +-- default user_id = 
0 = root (apply to everyone) +insert into sort_overlay (expression, reading, sort) values + ('達','だち',-1), + ('の下に','のもとに',-1), + ('下に','しもに',-1); + diff --git a/deno.jsonc b/deno.jsonc new file mode 100644 index 0000000..3c5130f --- /dev/null +++ b/deno.jsonc @@ -0,0 +1,5 @@ +{ + "tasks": { + "dev": "deno run --watch main.ts" + } +} diff --git a/deno.lock b/deno.lock new file mode 100644 index 0000000..84f16e8 --- /dev/null +++ b/deno.lock @@ -0,0 +1,79 @@ +{ + "version": "2", + "remote": { + "https://deno.land/std@0.102.0/_util/assert.ts": "2f868145a042a11d5ad0a3c748dcf580add8a0dbc0e876eaa0026303a5488f58", + "https://deno.land/std@0.102.0/_util/os.ts": "4bfbbb53cf87ec0c0f562b5b836c4d5392faa0365a30580cdfd3acb9944dcca1", + "https://deno.land/std@0.102.0/path/_constants.ts": "1247fee4a79b70c89f23499691ef169b41b6ccf01887a0abd131009c5581b853", + "https://deno.land/std@0.102.0/path/_interface.ts": "1fa73b02aaa24867e481a48492b44f2598cd9dfa513c7b34001437007d3642e4", + "https://deno.land/std@0.102.0/path/_util.ts": "2e06a3b9e79beaf62687196bd4b60a4c391d862cfa007a20fc3a39f778ba073b", + "https://deno.land/std@0.102.0/path/common.ts": "eaf03d08b569e8a87e674e4e265e099f237472b6fd135b3cbeae5827035ea14a", + "https://deno.land/std@0.102.0/path/glob.ts": "697fec7edbada26ed12b696023646a28c3897370c94ef7a555d317494b8bc270", + "https://deno.land/std@0.102.0/path/mod.ts": "4465dc494f271b02569edbb4a18d727063b5dbd6ed84283ff906260970a15d12", + "https://deno.land/std@0.102.0/path/posix.ts": "b81974c768d298f8dcd2c720229639b3803ca4a241fa9a355c762fa2bc5ef0c1", + "https://deno.land/std@0.102.0/path/separator.ts": "8fdcf289b1b76fd726a508f57d3370ca029ae6976fcde5044007f062e643ff1c", + "https://deno.land/std@0.102.0/path/win32.ts": "f4a3d4a3f2c9fe894da046d5eac48b5e789a0ebec5152b2c0985efe96a9f7ae1", + "https://deno.land/std@0.176.0/_util/asserts.ts": "178dfc49a464aee693a7e285567b3d0b555dc805ff490505a8aae34f9cfb1462", + "https://deno.land/std@0.176.0/_util/os.ts": 
"d932f56d41e4f6a6093d56044e29ce637f8dcc43c5a90af43504a889cf1775e3", + "https://deno.land/std@0.176.0/encoding/hex.ts": "50f8c95b52eae24395d3dfcb5ec1ced37c5fe7610ef6fffdcc8b0fdc38e3b32f", + "https://deno.land/std@0.176.0/fmt/colors.ts": "938c5d44d889fb82eff6c358bea8baa7e85950a16c9f6dae3ec3a7a729164471", + "https://deno.land/std@0.176.0/fs/_util.ts": "65381f341af1ff7f40198cee15c20f59951ac26e51ddc651c5293e24f9ce6f32", + "https://deno.land/std@0.176.0/fs/copy.ts": "14214efd94fc3aa6db1e4af2b4b9578e50f7362b7f3725d5a14ad259a5df26c8", + "https://deno.land/std@0.176.0/fs/empty_dir.ts": "c3d2da4c7352fab1cf144a1ecfef58090769e8af633678e0f3fabaef98594688", + "https://deno.land/std@0.176.0/fs/ensure_dir.ts": "724209875497a6b4628dfb256116e5651c4f7816741368d6c44aab2531a1e603", + "https://deno.land/std@0.176.0/fs/ensure_file.ts": "c38602670bfaf259d86ca824a94e6cb9e5eb73757fefa4ebf43a90dd017d53d9", + "https://deno.land/std@0.176.0/fs/ensure_link.ts": "c0f5b2f0ec094ed52b9128eccb1ee23362a617457aa0f699b145d4883f5b2fb4", + "https://deno.land/std@0.176.0/fs/ensure_symlink.ts": "2955cc8332aeca9bdfefd05d8d3976b94e282b0f353392a71684808ed2ffdd41", + "https://deno.land/std@0.176.0/fs/eol.ts": "f1f2eb348a750c34500741987b21d65607f352cf7205f48f4319d417fff42842", + "https://deno.land/std@0.176.0/fs/exists.ts": "b8c8a457b71e9d7f29b9d2f87aad8dba2739cbe637e8926d6ba6e92567875f8e", + "https://deno.land/std@0.176.0/fs/expand_glob.ts": "45d17e89796a24bd6002e4354eda67b4301bb8ba67d2cac8453cdabccf1d9ab0", + "https://deno.land/std@0.176.0/fs/mod.ts": "bc3d0acd488cc7b42627044caf47d72019846d459279544e1934418955ba4898", + "https://deno.land/std@0.176.0/fs/move.ts": "4cb47f880e3f0582c55e71c9f8b1e5e8cfaacb5e84f7390781dd563b7298ec19", + "https://deno.land/std@0.176.0/fs/walk.ts": "ea95ffa6500c1eda6b365be488c056edc7c883a1db41ef46ec3bf057b1c0fe32", + "https://deno.land/std@0.176.0/path/_constants.ts": "e49961f6f4f48039c0dfed3c3f93e963ca3d92791c9d478ac5b43183413136e0", + 
"https://deno.land/std@0.176.0/path/_interface.ts": "6471159dfbbc357e03882c2266d21ef9afdb1e4aa771b0545e90db58a0ba314b", + "https://deno.land/std@0.176.0/path/_util.ts": "d7abb1e0dea065f427b89156e28cdeb32b045870acdf865833ba808a73b576d0", + "https://deno.land/std@0.176.0/path/common.ts": "ee7505ab01fd22de3963b64e46cff31f40de34f9f8de1fff6a1bd2fe79380000", + "https://deno.land/std@0.176.0/path/glob.ts": "d479e0a695621c94d3fd7fe7abd4f9499caf32a8de13f25073451c6ef420a4e1", + "https://deno.land/std@0.176.0/path/mod.ts": "4b83694ac500d7d31b0cdafc927080a53dc0c3027eb2895790fb155082b0d232", + "https://deno.land/std@0.176.0/path/posix.ts": "8b7c67ac338714b30c816079303d0285dd24af6b284f7ad63da5b27372a2c94d", + "https://deno.land/std@0.176.0/path/separator.ts": "0fb679739d0d1d7bf45b68dacfb4ec7563597a902edbaf3c59b50d5bcadd93b1", + "https://deno.land/std@0.176.0/path/win32.ts": "d186344e5583bcbf8b18af416d13d82b35a317116e6460a5a3953508c3de5bba", + "https://deno.land/std@0.179.0/_util/asserts.ts": "178dfc49a464aee693a7e285567b3d0b555dc805ff490505a8aae34f9cfb1462", + "https://deno.land/std@0.179.0/_util/os.ts": "d932f56d41e4f6a6093d56044e29ce637f8dcc43c5a90af43504a889cf1775e3", + "https://deno.land/std@0.179.0/path/_constants.ts": "e49961f6f4f48039c0dfed3c3f93e963ca3d92791c9d478ac5b43183413136e0", + "https://deno.land/std@0.179.0/path/_interface.ts": "6471159dfbbc357e03882c2266d21ef9afdb1e4aa771b0545e90db58a0ba314b", + "https://deno.land/std@0.179.0/path/_util.ts": "d7abb1e0dea065f427b89156e28cdeb32b045870acdf865833ba808a73b576d0", + "https://deno.land/std@0.179.0/path/common.ts": "ee7505ab01fd22de3963b64e46cff31f40de34f9f8de1fff6a1bd2fe79380000", + "https://deno.land/std@0.179.0/path/glob.ts": "d479e0a695621c94d3fd7fe7abd4f9499caf32a8de13f25073451c6ef420a4e1", + "https://deno.land/std@0.179.0/path/mod.ts": "4b83694ac500d7d31b0cdafc927080a53dc0c3027eb2895790fb155082b0d232", + "https://deno.land/std@0.179.0/path/posix.ts": 
"8b7c67ac338714b30c816079303d0285dd24af6b284f7ad63da5b27372a2c94d", + "https://deno.land/std@0.179.0/path/separator.ts": "0fb679739d0d1d7bf45b68dacfb4ec7563597a902edbaf3c59b50d5bcadd93b1", + "https://deno.land/std@0.179.0/path/win32.ts": "d186344e5583bcbf8b18af416d13d82b35a317116e6460a5a3953508c3de5bba", + "https://deno.land/std@0.192.0/_util/asserts.ts": "178dfc49a464aee693a7e285567b3d0b555dc805ff490505a8aae34f9cfb1462", + "https://deno.land/std@0.192.0/async/abortable.ts": "fd682fa46f3b7b16b4606a5ab52a7ce309434b76f820d3221bdfb862719a15d7", + "https://deno.land/std@0.192.0/async/deadline.ts": "58f72a3cc0fcb731b2cc055ba046f4b5be3349ff6bf98f2e793c3b969354aab2", + "https://deno.land/std@0.192.0/async/debounce.ts": "adab11d04ca38d699444ac8a9d9856b4155e8dda2afd07ce78276c01ea5a4332", + "https://deno.land/std@0.192.0/async/deferred.ts": "42790112f36a75a57db4a96d33974a936deb7b04d25c6084a9fa8a49f135def8", + "https://deno.land/std@0.192.0/async/delay.ts": "73aa04cec034c84fc748c7be49bb15cac3dd43a57174bfdb7a4aec22c248f0dd", + "https://deno.land/std@0.192.0/async/mod.ts": "f04344fa21738e5ad6bea37a6bfffd57c617c2d372bb9f9dcfd118a1b622e576", + "https://deno.land/std@0.192.0/async/mux_async_iterator.ts": "70c7f2ee4e9466161350473ad61cac0b9f115cff4c552eaa7ef9d50c4cbb4cc9", + "https://deno.land/std@0.192.0/async/pool.ts": "f1b8d3df4d7fd3c73f8cbc91cc2e8b8e950910f1eab94230b443944d7584c657", + "https://deno.land/std@0.192.0/async/retry.ts": "6521c061a5ab24e8b1ae624bdc581c4243d1d574f99dc7f5a2a195c2241fb1b8", + "https://deno.land/std@0.192.0/async/tee.ts": "47e42d35f622650b02234d43803d0383a89eb4387e1b83b5a40106d18ae36757", + "https://deno.land/std@0.192.0/http/server.ts": "1b23463b5b36e4eebc495417f6af47a6f7d52e3294827a1226d2a1aab23d9d20", + "https://deno.land/x/plug@1.0.1/deps.ts": "35ea2acd5e3e11846817a429b7ef4bec47b80f2d988f5d63797147134cbd35c2", + "https://deno.land/x/plug@1.0.1/download.ts": "8d6a023ade0806a0653b48cd5f6f8b15fcfaa1dbf2aa1f4bc90fc5732d27b144", + 
"https://deno.land/x/plug@1.0.1/mod.ts": "5dec80ee7a3a325be45c03439558531bce7707ac118f4376cebbd6740ff24bfb", + "https://deno.land/x/plug@1.0.1/types.ts": "d8eb738fc6ed883e6abf77093442c2f0b71af9090f15c7613621d4039e410ee1", + "https://deno.land/x/plug@1.0.1/util.ts": "5ba8127b9adc36e070b9e22971fb8106869eea1741f452a87b4861e574f13481", + "https://deno.land/x/sqlite3@0.9.1/deno.json": "50895b0bb0c13ae38b93413d7f9f62652f6e7076cd99b9876f6b3b7f6c488dca", + "https://deno.land/x/sqlite3@0.9.1/deps.ts": "f6035f0884a730c0d55b0cdce68846f13bbfc14e8afbf0b3cd4f12a52b4107b7", + "https://deno.land/x/sqlite3@0.9.1/mod.ts": "d41b8b30e1b20b537ef4d78cae98d90f6bd65c727b64aa1a18bffbb28f7d6ec3", + "https://deno.land/x/sqlite3@0.9.1/src/blob.ts": "3681353b3c97bc43f9b02f8d1c3269c0dc4eb9cb5d3af16c7ce4d1e1ec7507c4", + "https://deno.land/x/sqlite3@0.9.1/src/constants.ts": "85fd27aa6e199093f25f5f437052e16fd0e0870b96ca9b24a98e04ddc8b7d006", + "https://deno.land/x/sqlite3@0.9.1/src/database.ts": "c326446463955f276dcbe18547ede4b19ea3085bef0980548c0a58d830b3b5d9", + "https://deno.land/x/sqlite3@0.9.1/src/ffi.ts": "b83f6d16179be7a97a298d6e8172941dbf532058e7c2b3df3a708beefe285c90", + "https://deno.land/x/sqlite3@0.9.1/src/statement.ts": "4773bc8699a9084b93e65126cd5f9219c248de1fce447270bdae2c3630637150", + "https://deno.land/x/sqlite3@0.9.1/src/util.ts": "3892904eb057271d4072215c3e7ffe57a9e59e4df78ac575046eb278ca6239cd" + } +} diff --git a/language/japanese.ts b/language/japanese.ts new file mode 100644 index 0000000..c0ad825 --- /dev/null +++ b/language/japanese.ts @@ -0,0 +1,137 @@ +import { UnicodeRange } from "../util/string.ts"; + +enum StringOnlyReturnValue { + TallyAdd, + TallyIgnore, + TallyStop, +} + +export default class JapaneseString extends String { + /** @summary check tally for allowed scripts (internal use only) */ + private stringOnly(check: (key: string, val: number) => StringOnlyReturnValue): boolean { + var tally = this.rangeTally(); + var ok = false; + for (var [key, val] of 
/** Verdict a `stringOnly` callback gives for one `[range, count]` entry of the tally. */
enum StringOnlyReturnValue {
	/** entry belongs to the wanted script(s) */
	TallyAdd,
	/** entry has no influence on the outcome */
	TallyIgnore,
	/** entry disqualifies the string */
	TallyStop,
}

export default class JapaneseString extends String {
	/**
	 * @summary check tally for allowed scripts (internal use only)
	 *
	 * @argument check called once for every `[range, count]` entry of `this.rangeTally()`
	 *
	 * returns `false` as soon as `check` answers `TallyStop`, otherwise `true`
	 * if `check` answered `TallyAdd` at least once
	 */
	private stringOnly(check: (key: string, val: number) => StringOnlyReturnValue): boolean {
		let ok = false;
		for (const [key, val] of Object.entries(this.rangeTally())) {
			const verdict = check(key, val);
			if (verdict === StringOnlyReturnValue.TallyStop) return false;
			if (verdict === StringOnlyReturnValue.TallyAdd) ok = true;
		}
		return ok;
	}

	/**
	 * @summary shared implementation of all public `*Only` checks
	 *
	 * @argument wanted decides if a unicode range key counts as the wanted script
	 * @argument strict don't allow `any-` ranges (ascii whitespace and punctuation)
	 *
	 * ranges with a count of zero are skipped entirely, so a wanted range that
	 * is listed in the tally without containing any characters does not make
	 * the check succeed ("at least one character" is required)
	 */
	private onlyRanges(wanted: (key: string) => boolean, strict: boolean): boolean {
		return this.stringOnly((key, val) => {
			if (!(val > 0)) return StringOnlyReturnValue.TallyIgnore; // empty range
			if (wanted(key)) return StringOnlyReturnValue.TallyAdd; // count wanted characters
			if (!strict && key.startsWith("any-"))
				return StringOnlyReturnValue.TallyIgnore; // allow any- (ascii whitespace and punctuation)
			return StringOnlyReturnValue.TallyStop; // don't allow any other ranges
		});
	}

	/**
	 * @summary check if string is hiragana only
	 *
	 * @argument strict don't allow ascii whitespace and punctuation (default: false)
	 *
	 * return `true` if at least one hiragana character is in string, and no other
	 * unicode ranges are found. ascii whitespace and punctuation is still allowed,
	 * but not counted as hiragana. this behavior can be turned off by setting
	 * `strict` to true
	 */
	hiraganaOnly(strict = false): boolean {
		return this.onlyRanges(key => key === UnicodeRange.JapaneseHiragana, strict);
	}

	/**
	 * @summary check if string is katakana only
	 *
	 * @argument strict don't allow ascii whitespace and punctuation (default: false)
	 *
	 * return `true` if at least one katakana character is in string, and no other
	 * unicode ranges are found. ascii whitespace and punctuation is still allowed,
	 * but not counted as katakana. this behavior can be turned off by setting
	 * `strict` to true
	 */
	katakanaOnly(strict = false): boolean {
		return this.onlyRanges(key => key === UnicodeRange.JapaneseKatakana, strict);
	}

	/**
	 * @summary check if string is kanji only
	 *
	 * @argument strict don't allow ascii whitespace and punctuation (default: false)
	 *
	 * return `true` if at least one kanji character is in string, and no other
	 * unicode ranges are found. ascii whitespace and punctuation is still allowed,
	 * but not counted as kanji. this behavior can be turned off by setting
	 * `strict` to true
	 */
	kanjiOnly(strict = false): boolean {
		return this.onlyRanges(key => key === UnicodeRange.JapaneseKanji, strict);
	}

	/**
	 * @summary check if string is kana only
	 *
	 * @argument strict don't allow ascii whitespace and punctuation (default: false)
	 *
	 * return `true` if at least one kana character is in string, and no other
	 * unicode ranges are found. ascii whitespace and punctuation is still allowed,
	 * but not counted as kana. this behavior can be turned off by setting `strict`
	 * to true
	 */
	kanaOnly(strict = false): boolean {
		return this.onlyRanges(
			key => key === UnicodeRange.JapaneseHiragana || key === UnicodeRange.JapaneseKatakana,
			strict,
		);
	}

	/**
	 * @summary check if string is japanese only
	 *
	 * @argument strict don't allow ascii whitespace and punctuation (default: false)
	 *
	 * return `true` if at least one japanese character is in string, and no other
	 * unicode ranges are found. ascii whitespace and punctuation is still allowed,
	 * but not counted as japanese. this behavior can be turned off by setting
	 * `strict` to true
	 */
	japaneseOnly(strict = false): boolean {
		return this.onlyRanges(key => key.startsWith("jp-"), strict);
	}
}
/** @summary main Parser class */
export default class Parser {
	db: DB;

	constructor() {
		this.db = new DB();
	}

	/** @summary prepare the database connection, must be awaited before parsing */
	async prepare(): Promise<void> {
		await Promise.all([
			this.db.prepare(),
		]);
	}

	/**
	 * @summary parse sentence up to the requested level of detail
	 *
	 * @argument sentence input text
	 * @argument options parser settings, `options.depth` defaults to `ParseDepth.Term`
	 *
	 * each stage only runs if the requested depth is deeper than the stage
	 * before it
	 */
	parse(sentence: string, options?: InputSentenceProps): ParseResult {
		const depth = options?.depth ?? ParseDepth.Term;

		let parseResult = this.parseTerms(sentence, options);
		if (depth <= ParseDepth.Term) return parseResult;

		parseResult = this.addGlossary(parseResult, options);
		if (depth <= ParseDepth.Glossary) return parseResult;

		return parseResult;
	}

	/**
	 * @summary parse sentence into terms with readings
	 *
	 * walks over the sentence from left to right. for every starting point the
	 * database is asked for all terms matching the next `lookahead` characters,
	 * invalid matches are dropped, the rest is ranked and the best match
	 * becomes the next token. characters without any valid match are skipped.
	 */
	private parseTerms(sentence: string, options?: InputSentenceProps): ParseResult {
		const MAX_LOOKAHEAD = options?.lookahead ?? 15;
		const PRIORITY_MOD_HIGHER = options?.priorityMod?.high ?? 10;
		const PRIORITY_MOD_LOWER = options?.priorityMod?.low ?? 0.1;

		const parseResult: ParseResult = {
			tokens: [],
			depth: ParseDepth.Term,
		};

		for (let start = 0; start < sentence.length; start++) {
			const found = this.db.findTerm(sentence.substring(start, start + MAX_LOOKAHEAD));
			// current starting point did not yield results, try again at next character or until end of input
			if (found.length == 0) continue;

			const results = found.filter(result => {
				// ignore ignored by user terms
				if (result.sort < 0) return false;

				// deconjugated words
				if (result.depth > 0) {
					// can't be conjugated at all
					if (!result.tags.anyOf(Object.values(Tag.Class.Verb))) return false;

					// ignore other wrong deconjugations
					if (result.tags.has(Tag.Class.Verb.U) &&
					    !result.tags.has(Tag.Inflection.Reason.U)) return false;
					if (result.tags.has(Tag.Class.Verb.Ru) &&
					    !result.tags.has(Tag.Inflection.Reason.Ru)) return false;
					if (result.tags.has(Tag.Class.Verb.Suru) &&
					    !result.tags.has(Tag.Inflection.Reason.Suru)) return false;
				}

				// all other results should be valid grammatically
				return true;
			});

			// no valid results left after filter, try again at next character or until end of input
			if (results.length == 0) continue;

			// true if last token was a name else false. this is the same for
			// every result of this starting point, so it is looked up once
			// (assumes `peek()` from util/array.ts only reads the last element)
			const lastTokenName = parseResult.tokens.peek()?.tags.anyOf(Object.values(Tag.Name));

			// bias search results by modifying sort value
			for (const result of results) {
				// give higher priority to suffixes when last token was a name, else lower priority
				if (result.tags.has(Tag.Class.Suffix))
					result.sort *= lastTokenName ? PRIORITY_MOD_HIGHER : PRIORITY_MOD_LOWER;

				// give lower priority to terms matched only by their readings, and are
				// usually written in kanji
				if (!result.tags.has(Tag.Auxiliary.UsuallyKana) && !result.match.kanji)
					result.sort *= PRIORITY_MOD_LOWER;
			}

			results.sort((a, b) => {
				// sort by original string length (long to short)
				if (a.original.length != b.original.length) return b.original.length - a.original.length;
				// then by sort index (high to low)
				if (a.sort != b.sort) return b.sort - a.sort;
				// then by depth (high to low)
				if (a.depth != b.depth) return b.depth - a.depth;
				// else keep current order (random)
				return 0;
			});

			// pick top result
			const result = results[0];

			parseResult.tokens.push({
				reading: [ { text: result.expression, ruby: result.reading } ], // TODO: source to reading + separate kanji/kana
				tags: result.tags,
				term_id: result.id,
				source: result.original,
			});

			start += result.original.length - 1; // -1 because loop already increments start
		}
		return parseResult;
	}

	/** @summary add glossary info to parsed tokens (not implemented, returns input unchanged) */
	private addGlossary(input: ParseResult, options?: InputSentenceProps): ParseResult {
		// TODO: annotate input with glossaries from DB
		void options; // prevent unused warning
		return input;
	}
}
Tags are combined from term info, dictionary +info, and glossary info. Tags can have subcategories separated by `:`. A +separate tags table handles displaying tags for different display languages, +including abbreviated versions. + +Tags that may alter behavior are stored as constants in [tags.ts](./tags.ts). +Dictionary importers should map the dictionary-specific version of these tags +to Yomikun's tags for compatibility. Other tags include: + +|tag|description| +|-|-| +|`series:*`|abbreviated series name, e.g. "The Legend of Zelda" is `series:zelda`, and "Tears of the Kingdom" is `series:totk`. series with multiple entries should split the series and entry into separate tags, e.g. `series:zelda series:totk` instead of `series:zelda_totk`. +|`dict:*`|dictionary tag. e.g. `dict:jmdict_dutch` or `dict:daijisen`| +|`pitch:*`|`pitch:0` for 平板, `pitch:1` for 頭高, etc. +|`aux:*`|used for other tags (joyo kanji, commonly used term, usually kana, etc.) + +### Behavior-altering tags + +Some tag classes impact the parser's behavior. For example, the input text +「完了しました」 will be parsed as just 「完了」, but with the +`class:verb:suru-included` tag added by the parser. This is because the word +「完了」 has the tag `class:verb:suru` in the database, which allows the parser +to deconjugate a noun with the verb 「する」 back into the stem. + +Other uses of this behavior include more accurate automatic kanji reading +generation, for example 「城」 being read as 「じょう」 in 「ハイラル城」 +because 「ハイラル」 has the tag `name:place` in the database, and +「城(じょう)」 has `class:suffix`, while 「城(しろ)」 has `class:noun`. + +Yomikun encourages homebrew dictionary sharing, and encourages using +behavior-altering tags for fixing readings for cases like the above examples. +As another example of this, it is encouraged that a dictionary for (for +example) Zelda add 「トト」 as a term with tags `class:noun` and `name:place`, +instead of 「トト湖(こ)」 as an expression to fix the reading of the kanji +「湖(みずうみ)」. 
+ +If Yomikun doesn't generate the correct reading, and the reading isn't based on +natural language context (=a computer *could* accurately decide which reading +is correct based on other words/tags in the sentence), please submit a pull +request with the sentence and it's (expected) reading. An example of a +non-deterministic reading is 「何」 in the sentence 「何できた?」 which can be +read as both 「なん」 in which case 「何で」 turns into a single word, or +「なに」 where 「何」 is a regular word and 「で」 is particle. + +[taekim]: https://guidetojapanese.org/learn/ + diff --git a/language/tags.ts b/language/tags.ts new file mode 100644 index 0000000..4c1f134 --- /dev/null +++ b/language/tags.ts @@ -0,0 +1,102 @@ +/** @constant Tags that have significant meaning to the parser (declared `as const`, so each value keeps its string literal type) */ +export const Tag = { + /** @constant grammatical classes */ + Class: { + /** @constant verb subgroup */ + Verb: { + /** @constant any verb (fallback for vague dictionaries) */ + Unspecified: "class:verb", + /** @constant noun that can be conjugated into a verb by adding する */ + Suru: "class:verb:suru", + /** + * @constant verb stored as conjugated noun in database + * + * some dictionaries do this, also used internally to represent + * conjugation if found for suru-verb + */ + SuruIncluded: "class:verb:suru-included", + /** @constant godan verbs (〜う in [taekim]) */ + U: "class:verb:u", + /** @constant ichidan verbs (〜る in [taekim]) */ + Ru: "class:verb:ru", + }, + /** @constant regular nouns or words that can be treated as nouns */ + Noun: "class:noun", + /** @constant terms that are read differently when used as a suffix */ + Suffix: "class:suffix", // TODO: specify place, honorific, counter suffix types + /** @constant grammatical particles (e.g. の, と, は, を, etc.) */ + Particle: "class:part", + /** @constant expressions and idioms + * + * Can also be used for longer strings that are read in a special way, but + * this is discouraged. 
+ * + * @see ./readme.md#behavior-altering-tags + */ + Expression: "class:expr", + }, + /** @constant types of names */ + Name: { + /** @constant name of a place/location. allows suffixes */ + Place: "name:place", + /** @constant feminine name. allows suffixes and honorifics */ + Female: "name:female", + /** @constant masculine name. allows suffixes and honorifics */ + Male: "name:male", + }, + /** + * @constant added to a word when deconjugated by the deinflection table + * + * Some inflections are used as steps in others, like the -tari suffix which + * is conjugated after the past tense. In this case, the past tense tag would + * be removed when it comes after the -tari tag. (see String.parseTags in ../util/string.ts, which also drops it after the -te tag) + * + * e.g. 来ない -> 来る [infl:negative] + */ + Inflection: { + /** @constant negative conjugations */ + Negative: "infl:negative", + /** @constant time-related conjugations */ + Tense: { + /** @constant past tense (e.g. 叩いた) */ + Past: "infl:tense:past", + }, + /** @constant adverbs (e.g. 早く) */ + Adverb: "infl:adverb", + /** @constant polite conjugations */ + Polite: { + /** @constant 丁寧語 〜ます conjugations (e.g. 食べました) */ + Masu: "infl:polite:masu", + }, + /** @constant common ending conjugations */ + Suffix: { + /** @constant -te ending (e.g. 売って) */ + Te: "infl:suffix:te", + /** @constant -tari ending (e.g. 
遊んだり) */ + Tari: "infl:suffix:tari", + }, + /** @constant internal deinflection rules */ + Reason: { + /** @constant applied if word was deconjugated as -ru (ichidan) verb */ + Ru: "infl:reason:ru", + /** @constant applied if word was deconjugated as -u (godan) verb */ + U: "infl:reason:u", + /** @constant applied if word was deconjugated as suru verb */ + Suru: "infl:reason:suru", + /** @constant applied if word was deconjugated as kuru verb */ + Kuru: "infl:reason:kuru", + /** @constant applied if word was deconjugated as i-adjective */ + AdjI: "infl:reason:adj-i", + }, + }, + /** @constant uncategorized tags */ + Auxiliary: { + /** @constant word usually written using only kana (but also has kanji) */ + UsuallyKana: "aux:uk", + }, +} as const; + +export type TokenTag = string; // no way around it + +/** @summary tags attached to a token. NOTE(review): `Set` has no type argument here, which TypeScript rejects; presumably `Set<TokenTag>` whose `<…>` was lost when the patch was rendered as text. Confirm against the repository */ export type TokenTags = Set; + diff --git a/language/types.ts b/language/types.ts new file mode 100644 index 0000000..40ea4ba --- /dev/null +++ b/language/types.ts @@ -0,0 +1,49 @@ +import { TokenTags } from "./tags.ts"; + +/** @summary level of detail for a parse; see the `depth` option of InputSentenceProps */ export enum ParseDepth { + Term, + Glossary, +}; + +/** @summary fragment of a token's text with an optional ruby string (presumably the furigana to show over `text`; confirm) */ export interface TokenReading { + text: string; + ruby?: string; +}; + +/** @summary placeholder: no fields are declared yet */ export interface GlossaryDefinition { + +}; + +/** @summary definitions grouped under a numeric id */ export interface Glossary { + id: number; + definitions: GlossaryDefinition[]; +}; + +/** @summary one token of a parsed sentence: reading fragments, tags, optional glossary, term id and source text (field meanings inferred from names; confirm against the parser) */ export interface ParseToken { + reading: TokenReading[]; + tags: TokenTags; + glossary?: Glossary; + term_id: number; + source: string; +}; + +/** @summary parser output: the tokens plus a ParseDepth value */ export interface ParseResult { + depth: ParseDepth; + tokens: ParseToken[] +}; + +/** @summary option struct for Parser */ +export interface InputSentenceProps { + /** @prop max amount of characters to look ahead when attempting to deconjugate */ + lookahead?: number; + /** @prop amount of detail to return in search results */ + depth?: ParseDepth; + /** @prop search bias multipliers */ + priorityMod?: { + /** @prop multiplier for negative bias */ + low?: number; + /** @prop multiplier for positive bias */ + high?: number; + }; +}; + diff --git a/license 
b/license new file mode 100644 index 0000000..a86647f --- /dev/null +++ b/license @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 lonkaars + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/main.ts b/main.ts new file mode 100644 index 0000000..00be856 --- /dev/null +++ b/main.ts @@ -0,0 +1,61 @@ +import * as path from 'https://deno.land/std@0.102.0/path/mod.ts'; +Deno.chdir(path.dirname(path.fromFileUrl(Deno.mainModule))); + +import { ParseResult } from "./language/types.ts"; + +/** @summary print one line per token (term id, text, reading, tags) followed by the space-separated token sources */ function prettyprintParseResult(input: ParseResult) { + for (var token of input.tokens) { + var out = ""; + + out += token.term_id; + out += ": "; + out += token.reading.map(r => r.text).join(""); /* join, unlike reduce without a seed, also accepts a token with no reading fragments */ + out += " ("; + out += token.reading.map(r => r.ruby ? 
r.ruby : r.text).join(""); + out += ") "; + out += Array.from(token.tags).map(a => `[${a}]`).join(" "); /* Array.from: this file does not import util/set.ts, so Set.prototype.arr may not be installed */ + + console.log(out); + } + console.log(input.tokens.map(t => t.source).join(" ")); +} + +import API from "./core/api.ts"; + +import YomikunDirectAPIClient from "./core/direct/client.ts"; + +import YomikunRemoteAPIServer from "./core/http/server.ts"; +import YomikunRemoteAPIClient from "./core/http/client.ts"; + +/** @summary parse three fixed sample sentences through `api` and pretty-print each result */ async function apiTest(api: API) { + prettyprintParseResult(await api.parseSentence("浮上したハイラル城の下にてゼルダ様達の捜索を行うこととなった")); + console.log("-------------"); + prettyprintParseResult(await api.parseSentence("浮上した城の様")); + console.log("-------------"); + prettyprintParseResult(await api.parseSentence("迷子になってしまった")); +} + +// test 1 (direct api) +const directTest = (async () => { + var api = new YomikunDirectAPIClient(); + await api.prepare(); + + console.log("Prepare direct api done"); + await apiTest(api); +})(); + +// test 2 (remote api) +(async () => { await directTest; /* test 2 ends in Deno.exit(), so let test 1 finish first instead of racing it */ + // default host = localhost:9400 + var server = new YomikunRemoteAPIServer(); + await server.prepare(); + server.start(); + + var api = new YomikunRemoteAPIClient(); + await api.prepare(); + + console.log("Prepare remote api done"); + await apiTest(api); + + Deno.exit(0); +})(); diff --git a/makefile b/makefile new file mode 100644 index 0000000..cad47ab --- /dev/null +++ b/makefile @@ -0,0 +1,22 @@ +TARGET = yomikun + +SRCS += ./main.ts +SRCS += ./core/api.ts +SRCS += ./db/db.ts +SRCS += ./language/tags.ts +SRCS += ./language/parser.ts +SRCS += ./language/types.ts +SRCS += ./util/array.ts +SRCS += ./util/error.ts +SRCS += ./util/string.ts + +DENO_FLAGS += --unstable +DENO_FLAGS += --allow-ffi +DENO_FLAGS += --allow-env +DENO_FLAGS += --allow-read +DENO_FLAGS += --allow-write +DENO_FLAGS += --allow-net + +$(TARGET): $(SRCS) + deno compile $(DENO_FLAGS) --output $@ $< + diff --git a/readme.md b/readme.md new file mode 100644 index 0000000..d1b527a --- /dev/null +++ b/readme.md @@ -0,0 +1,67 @@ +# Yomikun + +> 
This project is very much a work in progress. Crossed out text in this +> document is for features that will be implemented, but aren't yet. See [the +> dream](#the-dream) for ideas/features that are in the works. Documentation is +> currently also incomplete, but is intended to be fixed before an official +> 'dev-ready' pre-release. + +Yomikun is a spiritual successor to Yomichan, but implemented from scratch. +Because Yomichan is no longer maintained, and extension manifest v2 is being +deprecated, this project was created. Yomikun aims to provide all the original +features of Yomichan, but in a standalone application for tighter integration +with OS services and other applications. ~Yomikun has a very flexible extension +API~, and aims to provide extra features that help with immersion and sentence +mining on top of the base word lookup that Yomichan provided. **While Yomikun's +scope is larger than Yomichan's, it's still focused on Japanese only.** + +## TODO + +- [x] working proof of concept sentence lookup using deno/sqlite3 +- [ ] port dictionaries for more advanced testing + - [x] JMdict (WIP) + - [ ] JMNedict +- [ ] add separate kanji readings/info table +- [ ] add separate frequency dictionary +- [ ] add more deinflections to db/dict/deinflections.sql +- [ ] set up unit tests for sentence reading generation +- [ ] port server-internal API to simple HTTP JSON API +- [ ] create primitive search page ui +- [ ] add code formatter config +- [ ] complete documentation + +## ~New features (from Yomichan)~ + +NONE OF THESE ARE IMPLEMENTED YET + +- [ ] standalone server/client architecture for integration outside of browsers +- [ ] text input from other sources than clipboard and manual input + - [ ] integration with accessibility frameworks provided by OSes to extract + text from applications without using OCR + - [ ] OCR for games or manga + - [ ] subtitle tracking for video players such as YouTube, Jellyfin and MPV +- [ ] dedicated sentence mining features +- [ ] flexible 
dictionary editing, importing, exporting, toggling and sharing +- [ ] user dictionaries containing series-specific words or readings of kanji +- [ ] support for front-end UI translations +- [ ] slightly more accurate automatic reading generation +- [ ] bookmark words/sentences +- [ ] support for extensions + +## Documentation + +Some general documentation is done in markdown, but other general documentation +should be done in JSDoc format in the corresponding code files. The +documentation also makes frequent references to [Tae Kim's Japanese grammar +guide][taekim], which is abbreviated to [taekim] instead of copying the link +into the source code each time. + +## The dream + +Here's a Figma mockup for what the search page could look like (contains +questionable content/translations, this image is mainly for illustration +purposes) + +!["the dream"](assets/the-dream.png) + +[taekim]: https://guidetojapanese.org/learn/ diff --git a/util/array.ts b/util/array.ts new file mode 100644 index 0000000..76e2a9e --- /dev/null +++ b/util/array.ts @@ -0,0 +1,17 @@ +/** @summary global Array extensions, installed on Array.prototype below. NOTE(review): `interface Array` must repeat the built-in type parameter to merge with `Array<T>`, and `peek(): T` refers to it; the `<T>` was presumably lost when the patch was rendered as text. Confirm against the repository */ declare global { + interface Array { + anyOf(arr2: Array): boolean; + peek(): T; + } +} + +/** @summary check if any of the elements of `arr2` are included in `this` (membership is tested with `Array.prototype.includes`) */ +Array.prototype.anyOf = function(arr2) { + return !!this.filter(e => arr2.includes(e)).length; +}; + +/** @summary return last element of array without removing it (`undefined` when the array is empty) */ +Array.prototype.peek = function() { + return this[this.length - 1]; +}; + diff --git a/util/error.ts b/util/error.ts new file mode 100644 index 0000000..1e9d6eb --- /dev/null +++ b/util/error.ts @@ -0,0 +1,7 @@ +/** @summary Error subclass whose `name` is "YomikunError"; `message` defaults to an empty string and extra constructor arguments are forwarded to `Error` */ export default class YomikunError extends Error { + constructor(message = "", ...args: any) { + super(message, ...args); + this.message = message; /* NOTE(review): looks redundant, super() already receives the same message */ + this.name = "YomikunError"; + } +} diff --git a/util/readme.md b/util/readme.md new file mode 100644 index 0000000..e642629 --- /dev/null +++ b/util/readme.md @@ -0,0 +1,10 @@ +# Utilities + +This directory contains files that 
extend JavaScript built-in types with handy +functions. To use these, simply import them: + +```typescript +import "../util/array.ts"; // array extensions example +["foo", "bar"].anyOf(["bar", "baz"]); // -> true +``` + diff --git a/util/set.ts b/util/set.ts new file mode 100644 index 0000000..9790682 --- /dev/null +++ b/util/set.ts @@ -0,0 +1,17 @@ +/** @summary global Set extensions, installed on Set.prototype below */ declare global { + interface Set<T> { + anyOf(arr2: Array<T>): boolean; + arr(): Array<T>; + } +} + +/** @summary return set items as array */ +Set.prototype.arr = function() { + return Array.from(this); +}; + +/** @summary check if any of the elements of `arr2` are included in `this` (stops at the first match, no intermediate array) */ +Set.prototype.anyOf = function(arr2) { + return arr2.some(e => this.has(e)); +}; + diff --git a/util/string.ts b/util/string.ts new file mode 100644 index 0000000..d94f5a3 --- /dev/null +++ b/util/string.ts @@ -0,0 +1,74 @@ +import { TokenTags, TokenTag, Tag } from "../language/tags.ts"; +import JapaneseString from "../language/japanese.ts"; + +declare global { + /** @summary extended String prototype functions */ + interface String { + range(): UnicodeRange; + rangeTally(): RangeTally; + + jp(): JapaneseString; + + parseTags(): TokenTags; + } +} + +/** @summary character classes returned by String.range(); the string values double as RangeTally keys */ export enum UnicodeRange { + BasicLatin = "latin", + Whitespace = "any-whitespace", + Punctuation = "any-punctuation", + Unknown = "any-unknown", + JapanesePunctuation = "jp-punctuation", + JapaneseHiragana = "jp-hiragana", + JapaneseKatakana = "jp-katakana", + JapaneseFWLatinHWKatakana = "jp-full-width-latin-half-width-katakana", + JapaneseKanji = "jp-kanji", +} + +/** @summary number of characters counted per UnicodeRange value */ type RangeTally = Record<UnicodeRange, number>; + +/** @summary get UnicodeRange for character at index 0 (an empty string yields UnicodeRange.Unknown) */ +String.prototype.range = function() { + var code = this.charCodeAt(0); + + if (0x09 == code) return UnicodeRange.Whitespace; // tab + if (0x20 == code) return UnicodeRange.Whitespace; // space + if (0x21 == code) return UnicodeRange.Punctuation; // exclamation mark + if (0x2e == code) return UnicodeRange.Punctuation; // full stop 
+ if (0x3f == code) return UnicodeRange.Punctuation; // question mark + + if (0x0000 <= code && code <= 0x007f) return UnicodeRange.BasicLatin; + if (0x3000 <= code && code <= 0x303f) return UnicodeRange.JapanesePunctuation; + if (0x3040 <= code && code <= 0x309f) return UnicodeRange.JapaneseHiragana; + if (0x30a0 <= code && code <= 0x30ff) return UnicodeRange.JapaneseKatakana; + if (0xff00 <= code && code <= 0xffef) return UnicodeRange.JapaneseFWLatinHWKatakana; + if (0x4e00 <= code && code <= 0x9faf) return UnicodeRange.JapaneseKanji; + return UnicodeRange.Unknown; +} + +/** @summary create a RangeTally object for counting used unicode ranges in string; every range starts at 0 and is keyed by enum value, which is what range() returns */ +String.prototype.rangeTally = function() { + var tally = Object.fromEntries(Object.values(UnicodeRange).map(r => [r, 0])) as RangeTally; + for (var char of this) tally[char.range()]++; + return tally; +}; + +/** @summary get JapaneseString from this string */ +String.prototype.jp = function() { + return new JapaneseString(this); +} + +/** @summary parse concatenated tag string to TokenTags (tags are separated by one or more spaces; an empty string gives an empty set) */ +String.prototype.parseTags = function() { + var tags = this.replaceAll(/ +/g, " ").trim().split(" ").filter(tag => tag != "") as TokenTag[]; + var filteredTags: TokenTag[] = []; + for (var tag of tags) { + // skip past tense tags after -te and -tari deinflection + if (tag == Tag.Inflection.Tense.Past && + (filteredTags.includes(Tag.Inflection.Suffix.Te) || filteredTags.includes(Tag.Inflection.Suffix.Tari))) continue; + + filteredTags.push(tag); + } + return new Set(filteredTags) as TokenTags; +} + -- cgit v1.2.3