initial public commit

author: lonkaars <loek@pipeframe.xyz> 2023-06-28 23:59:50 +0200
committer: lonkaars <loek@pipeframe.xyz> 2023-06-28 23:59:50 +0200
commit: 67dbb6421976254658c5e38045513129dd18187a (patch)
tree: 288b599d1097b26bdbcad3b6749b38e133017cf2 /db
19 files changed, 946 insertions, 0 deletions
diff --git a/db/.gitignore b/db/.gitignore
new file mode 100644
index 0000000..98e6ef6
--- /dev/null
+++ b/db/.gitignore
@@ -0,0 +1 @@
+*.db
diff --git a/db/db.ts b/db/db.ts
new file mode 100644
index 0000000..d5a2b76
--- /dev/null
+++ b/db/db.ts
@@ -0,0 +1,109 @@
+import { Database, Statement } from "https://deno.land/x/sqlite3@0.9.1/mod.ts";
+import * as path from 'https://deno.land/std@0.102.0/path/mod.ts';
+
+import { TokenTags } from "../language/tags.ts";
+import "../util/string.ts";
+import YomikunError from "../util/error.ts";
+
+export interface DBDictInfo {
+	id: number;
+	name: string;
+	language: string;
+	priority: number;
+};
+
+export interface FindResult {
+	id: number;
+	expression: string;
+	reading: string;
+	tags: TokenTags;
+	sort: number;
+	depth: number;
+	original: string;
+	match: {
+		kanji: boolean;
+		reading: boolean;
+	};
+}
+
+interface DBFindResult {
+	id: number;
+	expression: string;
+	reading: string;
+	tags: string;
+  rules: string;
+	depth: number;
+	original: string;
+	deinflected: string;
+	root_overlay: number;
+	user_overlay: number;
+}
+
+/**
+ * @summary dictionary database connection, handles deconjugation and lookup in SQL
+ *
+ * @example
+ * const db = new DB();
+ * await db.prepare();
+ * const results = db.findTerm("なった");
+ */
+export default class DB {
+	private connection: Database;
+	public ready: boolean = false;
+	private paths = {
+		db: {
+			dict: path.resolve('db', 'dict.db'),
+			user: path.resolve('db', 'user.db'),
+		},
+		query: {
+			find: path.resolve('db', 'find.sql'),
+		},
+	} as const;
+	private statement: {
+		attach: Statement;
+		queryTerm: Statement;
+	};
+
+	constructor() {
+		this.connection = new Database(":memory:", { create: false });
+		this.statement = {
+			attach: this.connection.prepare("attach database ? as ?"),
+			queryTerm: this.connection.prepare(""), // initialized in prepare()
+		};
+		this.attach(this.paths.db.dict, 'dict');
+		this.attach(this.paths.db.user, 'user');
+	}
+	
+	private attach(dbPath: string, alias?: string) {
+		this.statement.attach.run(dbPath, alias);
+	}
+
+	async prepare() {
+		const statement = await Deno.readTextFile(this.paths.query.find);
+		this.statement.queryTerm = this.connection.prepare(statement);
+		this.ready = true;
+	}
+
+	findTerm(term: string): FindResult[] {
+		if (!this.ready) throw new YomikunError("DB not ready yet, call `async DB::prepare()` first");
+
+		var results = this.statement.queryTerm.all({ term }) as unknown as DBFindResult[];
+		var terms: FindResult[] = results?.map(term => {
+			if (term.rules == null) term.rules = "";
+			return {
+				id: term.id,
+				expression: term.expression,
+				reading: term.reading,
+				tags: (term.tags + ' ' + term.rules).parseTags(),
+				sort: term.user_overlay ?? term.root_overlay ?? 100,
+				depth: term.depth,
+				original: term.original,
+				match: {
+					kanji: term.expression == term.deinflected,
+					reading: term.reading == term.deinflected,
+				},
+			};
+		});
+		return terms;
+	}
+};
diff --git a/db/dict/.gitignore b/db/dict/.gitignore
new file mode 100644
index 0000000..96bd267
--- /dev/null
+++ b/db/dict/.gitignore
@@ -0,0 +1,10 @@
+base.sql
+full.sql
+
+test_a.sql
+test_b.sql
+test_pitch_accent.sql
+
+jmdict*
+
+dict.sql
diff --git a/db/dict/deinflections.sql b/db/dict/deinflections.sql
new file mode 100644
index 0000000..d13f313
--- /dev/null
+++ b/db/dict/deinflections.sql
@@ -0,0 +1,183 @@
+-- deinflection rules (ordered by appearance in tae kim's japanese grammar guide)
+create temporary table deinflection_temp (tag, kana_in, kana_out, rules_in, rules_out);
+insert into deinflection_temp values
+	-- negative <https://guidetojapanese.org/learn/grammar/negativeverbs>
+	('infl:negative',   'ない', 'る', 'a', 'ru'),
+	('infl:negative', 'わない', 'う', 'a', 'u'),
+	('infl:negative', 'かない', 'く', 'a', 'u'),
+	('infl:negative', 'がない', 'ぐ', 'a', 'u'),
+	('infl:negative', 'さない', 'す', 'a', 'u'),
+	('infl:negative', 'たない', 'つ', 'a', 'u'),
+	('infl:negative', 'なない', 'ぬ', 'a', 'u'),
+	('infl:negative', 'ばない', 'ぶ', 'a', 'u'),
+	('infl:negative', 'まない', 'む', 'a', 'u'),
+	('infl:negative', 'らない', 'る', 'a', 'u'),
+	('infl:negative', 'しない', 'する', 'a', 's'),
+	('infl:negative', 'こない', 'くる', 'a', 'k'),
+	('infl:negative', '来ない', '来る', 'a', 'k'),
+	('infl:negative',   'ない', 'ある', 'a', 'ru'), -- this one may cause problems (?)
+	-- ('infl:negative',   'ない', '', 'a', 'ru'), -- this one may cause problems (?)
+
+	-- past tense <https://guidetojapanese.org/learn/grammar/past_tense>
+	('infl:tense:past',   'た', 'る', 'a', 'ru'),
+	('infl:tense:past', 'した', 'す', 'a', 'u'),
+	('infl:tense:past', 'いた', 'く', 'a', 'u'),
+	('infl:tense:past', 'いだ', 'ぐ', 'a', 'u'),
+	('infl:tense:past', 'んだ', 'む', 'a', 'u'),
+	('infl:tense:past', 'んだ', 'ぬ', 'a', 'u'),
+	('infl:tense:past', 'んだ', 'ぶ', 'a', 'u'),
+	('infl:tense:past', 'った', 'う', 'a', 'u'),
+	('infl:tense:past', 'った', 'つ', 'a', 'u'),
+	('infl:tense:past', 'った', 'る', 'a', 'u'),
+	('infl:tense:past', 'した', 'する', 'a', 's'),
+	('infl:tense:past', 'きた', 'くる', 'a', 'k'),
+	('infl:tense:past', '来た', 'くる', 'a', 'k'),
+	('infl:tense:past', '行った', '行く', 'a', ''),
+
+	-- adjective to adverb <https://guidetojapanese.org/learn/grammar/adverbs>
+	('infl:adverb', 'く', 'い', 'a', 'i'),
+	-- TODO: na-adjectives aren't deconjugated
+
+	-- polite form <https://guidetojapanese.org/learn/grammar/polite>
+	('infl:polite:masu',   'ます', 'る', 'a', 'ru'),
+	('infl:polite:masu', 'います', 'う', 'a', 'u'),
+	('infl:polite:masu', 'きます', 'く', 'a', 'u'),
+	('infl:polite:masu', 'ぎます', 'ぐ', 'a', 'u'),
+	('infl:polite:masu', 'します', 'す', 'a', 'u'),
+	('infl:polite:masu', 'ちます', 'つ', 'a', 'u'),
+	('infl:polite:masu', 'にます', 'ぬ', 'a', 'u'),
+	('infl:polite:masu', 'びます', 'ぶ', 'a', 'u'),
+	('infl:polite:masu', 'みます', 'む', 'a', 'u'),
+	('infl:polite:masu', 'ります', 'る', 'a', 'u'),
+	('infl:polite:masu', 'します', 'する', 'a', 's'),
+	('infl:polite:masu', 'きます', 'くる', 'a', 'k'),
+	('infl:polite:masu', '来ます', '来る', 'a', 'k'),
+	('infl:polite:masu infl:negative',   'ません', 'る', 'a', 'ru'),
+	('infl:polite:masu infl:negative', 'いません', 'う', 'a', 'u'),
+	('infl:polite:masu infl:negative', 'きません', 'く', 'a', 'u'),
+	('infl:polite:masu infl:negative', 'ぎません', 'ぐ', 'a', 'u'),
+	('infl:polite:masu infl:negative', 'しません', 'す', 'a', 'u'),
+	('infl:polite:masu infl:negative', 'ちません', 'つ', 'a', 'u'),
+	('infl:polite:masu infl:negative', 'にません', 'ぬ', 'a', 'u'),
+	('infl:polite:masu infl:negative', 'びません', 'ぶ', 'a', 'u'),
+	('infl:polite:masu infl:negative', 'みません', 'む', 'a', 'u'),
+	('infl:polite:masu infl:negative', 'りません', 'る', 'a', 'u'),
+	('infl:polite:masu infl:negative', 'しません', 'する', 'a', 's'),
+	('infl:polite:masu infl:negative', 'きません', 'くる', 'a', 'k'),
+	('infl:polite:masu infl:negative', '来ません', '来る', 'a', 'k'),
+	('infl:polite:masu infl:tense:past',   'ました', 'る', 'a', 'ru'),
+	('infl:polite:masu infl:tense:past', 'いました', 'う', 'a', 'u'),
+	('infl:polite:masu infl:tense:past', 'きました', 'く', 'a', 'u'),
+	('infl:polite:masu infl:tense:past', 'ぎました', 'ぐ', 'a', 'u'),
+	('infl:polite:masu infl:tense:past', 'しました', 'す', 'a', 'u'),
+	('infl:polite:masu infl:tense:past', 'ちました', 'つ', 'a', 'u'),
+	('infl:polite:masu infl:tense:past', 'にました', 'ぬ', 'a', 'u'),
+	('infl:polite:masu infl:tense:past', 'びました', 'ぶ', 'a', 'u'),
+	('infl:polite:masu infl:tense:past', 'みました', 'む', 'a', 'u'),
+	('infl:polite:masu infl:tense:past', 'りました', 'る', 'a', 'u'),
+	('infl:polite:masu infl:tense:past', 'しました', 'する', 'a', 's'),
+	('infl:polite:masu infl:tense:past', 'きました', 'くる', 'a', 'k'),
+	('infl:polite:masu infl:tense:past', '来ました', '来る', 'a', 'k'),
+	('infl:polite:masu infl:tense:past infl:negative',   'ませんでした', 'る', 'a', 'ru'),
+	('infl:polite:masu infl:tense:past infl:negative', 'いませんでした', 'う', 'a', 'u'),
+	('infl:polite:masu infl:tense:past infl:negative', 'きませんでした', 'く', 'a', 'u'),
+	('infl:polite:masu infl:tense:past infl:negative', 'ぎませんでした', 'ぐ', 'a', 'u'),
+	('infl:polite:masu infl:tense:past infl:negative', 'しませんでした', 'す', 'a', 'u'),
+	('infl:polite:masu infl:tense:past infl:negative', 'ちませんでした', 'つ', 'a', 'u'),
+	('infl:polite:masu infl:tense:past infl:negative', 'にませんでした', 'ぬ', 'a', 'u'),
+	('infl:polite:masu infl:tense:past infl:negative', 'びませんでした', 'ぶ', 'a', 'u'),
+	('infl:polite:masu infl:tense:past infl:negative', 'みませんでした', 'む', 'a', 'u'),
+	('infl:polite:masu infl:tense:past infl:negative', 'りませんでした', 'る', 'a', 'u'),
+	('infl:polite:masu infl:tense:past infl:negative', 'しませんでした', 'する', 'a', 's'),
+	('infl:polite:masu infl:tense:past infl:negative', 'きませんでした', 'くる', 'a', 'k'),
+	('infl:polite:masu infl:tense:past infl:negative', '来ませんでした', '来る', 'a', 'k'),
+	-- ('infl:polite:masu infl:tense:past infl:negative', 'くありません', 'い', 'a', 'i'), -- reality check (should not be needed)
+
+	-- TODO: compound nouns and adjectives <https://guidetojapanese.org/learn/grammar/compound>
+
+	-- te-form <https://guidetojapanese.org/learn/grammar/compound>
+	('infl:suffix:te',   'て',   'た', 'a', 'ru'),
+	('infl:suffix:te', 'して', 'した', 'a', 's u'),
+	('infl:suffix:te', 'いて', 'いた', 'a', 'u'),
+	('infl:suffix:te', 'いで', 'いだ', 'a', 'u'),
+	('infl:suffix:te', 'んで', 'んだ', 'a', 'u'),
+	('infl:suffix:te', 'って', 'った', 'a', 'u'),
+	('infl:suffix:te', 'きて', 'きた', 'a', 'k'),
+	('infl:suffix:te', '来て', '来た', 'a', 'k'),
+	('infl:suffix:te', 'くて', 'い', 'a', ''), -- TODO: rules_out of this one is i?
+
+	-- -tari lists <https://guidetojapanese.org/learn/grammar/compound>
+	('infl:suffix:tari',   'たり',   'た', 'a', 'ru'),
+	('infl:suffix:tari', 'したり', 'した', 'a', 's u'),
+	('infl:suffix:tari', 'いたり', 'いた', 'a', 'u'),
+	('infl:suffix:tari', 'いだり', 'いだ', 'a', 'u'),
+	('infl:suffix:tari', 'んだり', 'んだ', 'a', 'u'),
+	('infl:suffix:tari', 'ったり', 'った', 'a', 'u'),
+	('infl:suffix:tari', 'きたり', 'きた', 'a', 'k'),
+	('infl:suffix:tari', '来たり', '来た', 'a', 'k'),
+
+	-- auxiliary rules
+	('class:verb:suru-included', 'する', '', 's', ''); -- deconjugate suru verbs into stem
+
+-- rule/bitmask lookup table
+create temporary table rule_map (tag, name, mask);
+insert into rule_map values
+	(null,                'a',   -1    ), -- all      (allow all rules in)
+	(null,                '',    0     ), --          (nothing)
+	('infl:reason:ru',    'ru',  1 << 0), -- 一段活用 (ichidan a.k.a. ru-verbs in tae kim's japanese grammar guide)
+	('infl:reason:u',     'u',   1 << 1), -- 五段活用 (godan a.k.a. u-verbs in tae kim's japanese grammar guide)
+	('infl:reason:suru',  's',   1 << 2), -- する     (suru)
+	('infl:reason:kuru',  'k',   1 << 3), -- くる     (kuru)
+	(null,                'z',   1 << 4), -- ずる     (zuru)
+	('infl:reason:adj-i', 'i',   1 << 5), -- 形容詞   (i-adjective)
+	(null,                'iru', 1 << 6); -- 〜いる   (temporary iru for progressive tense)
+
+-- add tags to db
+insert into deinflection_rules (mask, tag)
+select mask, tag
+from rule_map
+where tag is not null;
+
+-- convert space-separated strings to bitmasks and insert into deinflection table
+insert into deinflection (tag, kana_in, kana_out, rules_in, rules_out)
+with temp_deinflection_map(tag, kana_in, kana_out, rules_in_stack, rules_in, rules_out_stack, rules_out) as (
+	select
+		tag,
+		kana_in,
+		kana_out,
+		rules_in || ' ',
+		'',
+		rules_out || ' ',
+		''
+	from deinflection_temp
+	union
+	select
+		tag,
+		kana_in,
+		kana_out,
+		substr(rules_in_stack, instr(rules_in_stack, ' ') + 1),
+		replace(substr(rules_in_stack, 0, instr(rules_in_stack, ' ')), ' ', ''),
+		substr(rules_out_stack, instr(rules_out_stack, ' ') + 1),
+		replace(substr(rules_out_stack, 0, instr(rules_out_stack, ' ')), ' ', '')
+	from temp_deinflection_map
+	where
+		(length(rules_in_stack) > 1) or
+		(length(rules_out_stack) > 1)
+)
+select
+	temp_deinflection_map.tag,
+	temp_deinflection_map.kana_in,
+	temp_deinflection_map.kana_out,
+	-- NOTE: sum() should really be bitwise and across all rows after group by
+	-- here. be careful to not specify rules more than once in these columns, or
+	-- any other rule if the column has 'a'.
+	sum(rule_map_in.mask) as rules_in,
+	sum(rule_map_out.mask) as rules_out
+from temp_deinflection_map
+join rule_map as rule_map_in on rule_map_in.name = temp_deinflection_map.rules_in
+join rule_map as rule_map_out on rule_map_out.name = temp_deinflection_map.rules_out
+group by
+	temp_deinflection_map.tag,
+	temp_deinflection_map.kana_in,
+	temp_deinflection_map.kana_out;
+
diff --git a/db/dict/init.sql b/db/dict/init.sql
new file mode 100644
index 0000000..4e9fcc9
--- /dev/null
+++ b/db/dict/init.sql
@@ -0,0 +1,115 @@
+-- deinflection map (see dict/deinflections.sql)
+create table if not exists deinflection (
+	tag text not null,
+	kana_in text not null,
+	kana_out text not null,
+	rules_in int not null default 0,
+	rules_out int not null default 0
+);
+create index deinflection_kana_in on deinflection (kana_in);
+
+-- lookup table for deinflection rule bitmasks
+create table if not exists deinflection_rules (
+	mask int not null default 0,
+	tag text not null
+);
+
+-- dictionary metadata
+create table if not exists dict (
+	id integer primary key autoincrement,
+	tag text not null, -- TODO: dict.tag -> dict.tag_id
+	language text null default null,
+	priority int not null default 1
+);
+
+-- tag code<->id lookup
+create table if not exists tag (
+	id integer primary key autoincrement,
+	code text not null,
+	unique(code) on conflict ignore
+);
+create index tag_code on tag (code);
+
+
+-- definition of term (with single meaning)
+--
+-- terms that have multiple meanings should have multiple entries in the
+-- definition table with the same term_id. glossary contains an explanation or
+-- list of synonyms to illustrate the meaning of the term.
+create table if not exists definition (
+	id integer primary key autoincrement,
+	term_id int not null, -- reference to term (combined writing and reading)
+	sort int not null default 1, -- sort multiple meanings/glossaries within dictionary
+  -- search results for terms are grouped by term, then dictionary, then sort
+  -- within each dictionary
+	glossary text null default null, -- explanation or list of synonyms for term
+	dict_id int not null, -- origin dictionary of term
+	foreign key(term_id) references term(id),
+	foreign key(dict_id) references dict(id),
+	unique(glossary) on conflict ignore
+);
+
+-- human-readable tag label lookup table (used in UI and for translations)
+create table if not exists tag_label (
+	id integer primary key autoincrement,
+	tag_id int not null, -- reference to tag
+	language text not null default "en-US", -- label locale as RFC 5646 tag (e.g. ja)
+  -- label locale is used when displaying tags in a UI frontend. labels are
+  -- sorted based on langauge preference, with "en-US" being inserted last as a
+  -- fallback.
+	label_short text null default null, -- abbreviated label (e.g. ティアキン)
+	label text not null, -- full label (e.g. ティアーズ・オブ・ザ・キングダム)
+	description text null default null, -- short description (e.g. ゼルダの伝説シリーズより)
+	foreign key(tag_id) references tag(id)
+);
+
+-- standalone combination of kanji and reading
+--
+-- terms are added automatically when importing a dictionary, but are shared
+-- between dictionaries and thus not removed when disabling/removing a
+-- dictionary. terms are ranked outside the database, and the starting rank
+-- score can be changed using the user database (TODO)
+create table if not exists term (
+	id integer primary key autoincrement,
+	expression text not null, -- writing of term (e.g. 乗り越える)
+	reading text not null, -- reading of term (e.g. のりこえる)
+  alt int null default null, -- reference to main writing if this is is an alternate
+  -- alternate readings are stored as normal terms, but point to their main
+  -- reading. also used internally to store alternate readings of irregular
+  -- verbs example:
+  -- id  expression  reading  alt
+  -- 1   言葉        ことば   NULL
+  -- 2   詞          ことば   1
+  -- 3   辞          ことば   1
+  -- 4   来る        くる     NULL
+  -- 5   来た        きた     4
+  -- NOTE: alternate writings can technically be stored recursively this way,
+  -- but are only resolved one level deep TODO: resolve alternates
+  foreign key(alt) references term(id),
+	unique(expression, reading) on conflict ignore
+);
+create index term_expression on term (expression);
+create index term_reading on term (reading);
+-- TODO: (automatically) remove unused terms from db (using constraints?)
+
+
+-- allow many<->many relation between definition and tag
+create table if not exists definition_tag (
+	id integer primary key autoincrement,
+	definition_id int not null,
+	tag_id int not null,
+	foreign key(definition_id) references definition(id),
+	foreign key(tag_id) references tag(id),
+	unique(definition_id, tag_id) on conflict ignore
+);
+
+-- allow many<->many relation between term and tag
+create table if not exists term_tag (
+	id integer primary key autoincrement,
+	term_id int not null,
+	tag_id int not null,
+	foreign key(term_id) references term(id),
+	foreign key(tag_id) references tag(id),
+	unique(term_id, tag_id) on conflict ignore
+);
+
diff --git a/db/dict/reset.sql b/db/dict/reset.sql
new file mode 100644
index 0000000..757e31b
--- /dev/null
+++ b/db/dict/reset.sql
@@ -0,0 +1,12 @@
+drop table if exists definition_tag;
+drop table if exists term_tag;
+
+drop table if exists definition;
+drop table if exists tag_label;
+drop table if exists term;
+
+drop table if exists deinflection;
+drop table if exists deinflection_rules;
+drop table if exists dict;
+drop table if exists tag;
+
diff --git a/db/dict/tags.sql b/db/dict/tags.sql
new file mode 100644
index 0000000..a200abb
--- /dev/null
+++ b/db/dict/tags.sql
@@ -0,0 +1,11 @@
+insert into tag (code) values
+  ('class:verb'),
+  ('class:verb:suru'),
+  ('class:verb:suru-included'),
+  ('class:noun'),
+  ('class:suffix'),
+  ('class:part'),
+  ('class:expr'),
+  ('name:place'),
+  ('name:female'),
+  ('name:male');
diff --git a/db/dict/template.sql.m4 b/db/dict/template.sql.m4
new file mode 100644
index 0000000..00de413
--- /dev/null
+++ b/db/dict/template.sql.m4
@@ -0,0 +1,104 @@
+-- create temporary ingest table
+drop table if exists ingest;
+-- TODO: ingest pitch-accent dictionaries
+-- TODO: ingest alternate writings (space-separated)
+create temporary table ingest(
+	-- term fields
+	expression text not null, -- kanji of term (e.g. 読み込む)
+	reading text not null, -- reading of term (e.g. よみこむ)
+	term_tags text not null default '', -- space-separated *term* tags, merged if term already exists in DB
+
+	-- definition fields
+	glossary_sort int not null default 1, -- order of multiple meanings (glossaries)
+	glossary text null default null, -- glossary content (support for basic HTML markup/styling)
+	glossary_tags text null default null -- add tags to single glossary entry
+);
+
+include(`/dev/stdin')dnl --'
+-- the apostrophe is so my editor highlighting keeps working if I force the
+-- filetype to sql instead of m4
+
+-- create dict id
+insert into dict (tag, language) values ('dict:' || :dict, :lang);
+
+-- add terms
+insert into term (expression, reading)
+select expression, reading
+from ingest;
+
+-- add definitions
+insert into definition (term_id, sort, glossary, dict_id)
+select
+	term.id,
+	ingest.glossary_sort,
+	ingest.glossary,
+	(select id from dict where tag = 'dict:' || :dict)
+from ingest
+join term on term.expression = ingest.expression and term.reading = ingest.reading;
+
+-- create map of term_id and tag code
+drop table if exists term_tag_map;
+create temporary table term_tag_map (term_id, tag);
+insert into term_tag_map
+with tag_map(term_id, temp, tag) as (
+	select
+		(select id from term where expression is ingest.expression and reading is ingest.reading),
+		term_tags || ' ',
+		''
+	from ingest
+	union
+	select
+		term_id,
+		`substr'(temp, instr(temp, ' ') + 1),
+		`substr'(temp, 0, instr(temp, ' '))
+	from tag_map
+	where length(temp) > 1
+)
+select term_id, replace(tag, ' ', '')
+from tag_map
+where length(tag) > 0;
+
+-- create map of definition_id and tag code
+drop table if exists definition_tag_map;
+create temporary table definition_tag_map (definition_id, tag);
+insert into definition_tag_map
+with tag_map(definition_id, temp, tag) as (
+	select
+		(select id from definition where glossary is ingest.glossary),
+		glossary_tags || ' ',
+		''
+	from ingest
+	union
+	select
+		definition_id,
+		`substr'(temp, instr(temp, ' ') + 1),
+		`substr'(temp, 0, instr(temp, ' '))
+	from tag_map
+	where length(temp) > 1
+)
+select definition_id, replace(tag, ' ', '')
+from tag_map
+where length(tag) > 0;
+
+-- make sure tags exist
+insert into tag (code)
+select tag from term_tag_map
+union
+select tag from definition_tag_map;
+
+-- add tags to terms
+insert into term_tag (term_id, tag_id)
+select
+	term_id,
+	tag.id
+from term_tag_map
+join tag on tag.code = term_tag_map.tag;
+
+-- add tags to definitions
+insert into definition_tag (definition_id, tag_id)
+select
+	definition_id,
+	tag.id
+from definition_tag_map
+join tag on tag.code = definition_tag_map.tag;
+
diff --git a/db/dict/test_a.dict.sql b/db/dict/test_a.dict.sql
new file mode 100644
index 0000000..854d207
--- /dev/null
+++ b/db/dict/test_a.dict.sql
@@ -0,0 +1,25 @@
+-- set these
+.param set :dict 'test_a'
+.param set :lang 'nl_NL'
+
+-- dictionary content
+insert into ingest(expression, reading, term_tags, glossary_sort, glossary, glossary_tags) values
+	('浮上', 'ふじょう', 'class:verb:suru class:noun', 1, 'opstijgen, zweven', ''),
+	('城', 'しろ', 'class:noun', 1, 'kasteel', ''),
+	('城', 'じょう', 'class:suffix', 1, '-burcht, -burg (suffix voor kastelen)', ''),
+	('下', 'した', 'class:noun', 1, 'onder, beneden, omlaag', ''),
+	('の下に', 'のもとに', 'class:expr', 1, 'onder leiding van', ''),
+	('ハイラル', 'はいらる', 'series:zelda name:place class:noun', 1, 'Hyrule', ''),
+	('にて', 'にて', 'class:part', 1, '{duidt aanwezigheid op een plaats aan}', ''),
+	('ゼルダ', 'ぜるだ', 'series:zelda name:female class:noun', 1, 'Zelda', ''),
+	('様', 'さま', 'class:suffix', 1, 'meneer, mijnheer, mevrouw, madame', ''),
+	('様', 'よう', 'class:noun', 1, '(eruit zien) als, zoals, manier', ''),
+	('達', 'たち', 'class:suffix', 1, '{meervoudssuffix}', ''),
+	('の', 'の', 'class:part', 1, '{bezitspartikel}', ''),
+	('と', 'と', 'class:part', 1, '{opsommingspartikel}', ''),
+	('捜索', 'そうさく', 'class:verb:suru class:noun', 1, 'onderzoek, opspeuren, achterhalen', ''),
+	('を', 'を', 'class:part', 1, '{lijdend voornaamwoord partikel}', ''),
+	('行う', 'おこなう', 'class:verb', 1, 'uitvoeren', ''),
+	('こと', 'こと', 'class:part class:noun', 1, '{eindpartikel dat een bevel / waarschuwing uitdrukt}', ''),
+	('成る', 'なる', 'class:verb aux:test', 1, 'worden, veranderen in', '');
+
diff --git a/db/dict/test_b.dict.sql b/db/dict/test_b.dict.sql
new file mode 100644
index 0000000..0140189
--- /dev/null
+++ b/db/dict/test_b.dict.sql
@@ -0,0 +1,9 @@
+-- set these
+.param set :dict 'test_b'
+.param set :lang 'en_US'
+
+-- dictionary content
+insert into ingest(expression, reading, term_tags, glossary_sort, glossary, glossary_tags) values
+	('城', 'しろ', 'aux:test_b', 1, 'castle', ''),
+	('城', 'じょう', 'aux:test_b aux:test_b aux:nog_een_test_b', 1, '-burg (suffix for castles)', 'aux:gert aux:test aux:nog_wat');
+
diff --git a/db/dict/test_pitch_accent.dict.sql b/db/dict/test_pitch_accent.dict.sql
new file mode 100644
index 0000000..3b8298f
--- /dev/null
+++ b/db/dict/test_pitch_accent.dict.sql
@@ -0,0 +1,10 @@
+-- this is an example pitch accent dictionary
+-- set these
+.param set :dict 'pitch_accent'
+.param set :lang null
+
+-- dictionary content
+insert into ingest (expression, reading, glossary_tags) values
+	('浮上', 'ふじょう', 'pitch:0'),
+	('成る', 'なる', 'pitch:1');
+
diff --git a/db/find.sql b/db/find.sql
new file mode 100644
index 0000000..cdaebb3
--- /dev/null
+++ b/db/find.sql
@@ -0,0 +1,96 @@
+-- this statement is prepared and run using :term and :user as inputs (see
+-- db.ts or test/find)
+
+-- this file is kind of messy because it needs to be one large query, instead
+-- of separate phases creating temporary tables. queries with more than one
+-- statement can't return results because of the way sqlite3 works.
+
+-- TODO: add more comments in this file to explain what is going on
+
+-- explain query plan -- testing only
+with results(id, expression, reading, tags, depth, rules, original, deinflected) as (
+  -- stripped deinflection table (remove some columns and duplicates)
+  with deinflections(term, tags, depth, original, rules) as (
+    -- recursively generated deinflection table
+    with deinflect(length, term, tags, rules, rules_in, rules_out, depth) as (
+      -- input term all substrings until length 1
+      with inputs(length, term, tags, rules, rules_in, rules_out, depth) as (
+        select length(:term), :term, '', -1, 0, 0, 0
+        union
+        select
+          inputs.length - 1,
+          substr(inputs.term, 1, inputs.length - 1),
+          inputs.tags,
+          inputs.rules,
+          inputs.rules_in,
+          inputs.rules_out,
+          inputs.depth
+        from inputs
+        where inputs.length > 1
+      )
+      select * from inputs
+      union -- join all recursive rows into one large table
+      select
+        deinflect.length,
+        substr(deinflect.term, 1, length(deinflect.term)-length(deinflection.kana_in)) || deinflection.kana_out,
+        deinflect.tags || ' ' || deinflection.tag, -- parsed to TokenTag[] on (sql) client-side
+        deinflection.rules_out,
+        deinflection.rules_in,
+        deinflect.rules,
+        deinflect.depth + 1
+      from deinflect -- temp table
+      inner join deinflection -- deinflection rules table
+      on
+        -- rules_in has to contain any of the current deconjugation rules
+        (deinflect.rules & deinflection.rules_in != 0) and
+        -- term.endsWith(kana_in)
+        (substr(term, length(term) - length(kana_in) + 1) = kana_in) and
+        -- can't deconjugate to length <1
+        (length(term) > 0)
+      limit 50 -- failsafe to catch any infinite loops
+    )
+    select term, tags, depth, substr(:term, 1, deinflect.length), rules_out
+    from deinflect
+  )
+  select
+    term.id,
+    term.expression,
+    term.reading,
+    deinflections.tags || ' ' || group_concat(tag.code, ' ') as tags,
+    deinflections.depth,
+    rules,
+    deinflections.original,
+		deinflections.term
+  from deinflections
+  inner join term on (term.expression = deinflections.term) or (term.reading = deinflections.term)
+  inner join term_tag on term_tag.term_id = term.id
+  inner join tag on term_tag.tag_id = tag.id
+  group by term.id, deinflections.original
+  having term.id is not null
+)
+select
+  results.id,
+  results.expression,
+  results.reading,
+  results.tags,
+  group_concat(deinflection_rules.tag, ' ') as rules,
+  results.depth,
+  results.original,
+	results.deinflected,
+	root_overlay.sort as root_overlay,
+	user_overlay.sort as user_overlay
+from results
+left join deinflection_rules
+	on results.rules & deinflection_rules.mask != 0
+left join sort_overlay
+	as root_overlay
+	on (root_overlay.expression = results.expression) and
+	(root_overlay.reading = results.reading) and
+	(root_overlay.user_id = 0)
+left join sort_overlay
+	as user_overlay
+	on (user_overlay.expression = results.expression) and
+	(user_overlay.reading = results.reading) and
+	(user_overlay.user_id = (select id from user where username = :user))
+group by results.id, results.original;
+
diff --git a/db/makefile b/db/makefile
new file mode 100644
index 0000000..c1e527e
--- /dev/null
+++ b/db/makefile
@@ -0,0 +1,45 @@
+SQL = sqlite3
+DICT_DB = dict.db
+USER_DB = user.db
+
+DICT_TEMPLATE = dict/template.sql.m4
+
+.PHONY: clean test
+
+all: $(DICT_DB) $(USER_DB)
+
+$(DICT_DB): dict/full.sql
+	$(SQL) $@ < $<
+
+$(USER_DB): user/full.sql
+	$(SQL) $@ < $<
+
+dict/base.sql: dict/reset.sql dict/init.sql dict/deinflections.sql dict/tags.sql
+	cat $^ > $@
+
+dict/full.sql: dict/base.sql dict/dict.sql
+	cat $^ > $@
+
+dict/dict.sql: dict/test_a.sql dict/test_b.sql dict/test_pitch_accent.sql dict/jmdict.sql
+	cat $^ > $@
+
+user/base.sql: user/reset.sql user/init.sql
+	cat $^ > $@
+
+user/full.sql: user/base.sql user/root.sql
+	cat $^ > $@
+
+%.sql: %.dict.sql $(DICT_TEMPLATE)
+	m4 $(DICT_TEMPLATE) < $< > $@
+
+# delete generated sql files and database
+clean:
+	$(RM) $(DICT_DB) $(USER_DB) dict/base.sql dict/full.sql dict/dict.sql dict/test_a.sql dict/test_b.sql dict/test_pitch_accent.sql user/base.sql user/full.sql
+
+test: $(DICT_DB) find.sql
+	./test/find '浮上しました'
+	./test/find 'ならない'
+	./test/find '浮上した'
+	./test/find 'なって'
+	./test/find 'の下にて'
+
diff --git a/db/readme.md b/db/readme.md
new file mode 100644
index 0000000..985ee15
--- /dev/null
+++ b/db/readme.md
@@ -0,0 +1,168 @@
+# DB
+
+Yomikun's database (sqlite3) handles search scoping and deinflection. For full
+details on how Yomikun's serach algorithm works, see [search
+algorithm](#search-algorithm).
+
+This directory contains:
+
+- Database initialization scripts
+- Deinflection rules
+- User-submitted directories
+- A makefile to initialize an empty database
+- Typescript Database interface
+
+## ERD
+
+```mermaid
+erDiagram
+
+%% tables
+deinflection
+dict
+tag
+definition
+tag_label
+term
+
+%% relations
+definition  }|--|| term : "" %% definition has one term, term can have multiple definitions
+tag_label }o--|| tag : "" %% tag_label has one tag, tag can have 0 or more labels (though it should have 1)
+term }o--o{ tag : term_tag
+definition }o--o{ tag : definition_tag
+dict ||--|{ definition : ""
+```
+
+See [dictionary init.sql](dict/init.sql) for details on tables.
+
+## Search algorithm
+
+The search algorithm runs in part in the database engine, and in Typescript.
+The Typescript part is implemented in the [Language
+subdirectory](../language).
+
+|step|implemented in|
+|-|-|
+|Scoping|Typescript and SQL|
+|Deinflection|SQL|
+|Lookup|SQL|
+|Ranking|Typescript|
+
+### Scoping
+
+Sentences are parsed in chunks. Each step of the sentence parser moves the
+chunk start forward one or more character, depending on the results of the
+previous chunk.
+
+The Typescript part of the parser shifts forward the beginning of each chunk
+(vertical sections in diagram). The database engine (sqlite3) generates all
+possible substrings from the beginning of the chunk (triangle pattern in
+diagram).
+
+This diagram shows how the sentence 「大福を食べようかな」 would be scoped with
+a maximum lookahead of 5 (real parser uses 15). Red is wanted results, other
+valid matches are highlighted in light blue. Vertical axis represents time
+(forward = down), and horizontal axis is character indices for the input
+string.
+
+<!-- I know a Gantt chart isn't the 'right' chart to use here, but it gets the
+point across -->
+```mermaid
+gantt
+dateFormat X
+axisFormat %s
+
+section 大福
+大福を食べ : 0,5s
+大福を食 : 0,4s
+大福を : 0,3s
+大福 : crit,0,2s
+大 : active,0,1s
+section を
+を食べよう : 2,5s
+を食べよ : 2,4s
+を食べ : 2,3s
+を食 : 2,2s
+を : crit,2,1s
+section 食べる
+食べようか : 3,5s
+食べよう : crit,3,4s
+食べよ : active,3,3s
+食べ : active,3,2s
+食 : active,3,1s
+section かな
+かな : crit,7,2s
+か : active,7,1s
+```
+
+### Deinflection
+
+The deinflection step uses a table with simple find/replace rules, similar to
+Yomichan's deinflection rules file. Each rule has a `kana_in`, `kana_out`,
+`rules_in` and `rules_out` column. The rules are applied using the following
+algorithm (psuedocode, real implementation is almost unreadably large SQL
+query):
+
+```python
+possibilities = []
+function deconjugate(original_input, depth) {
+  for (rule in deinflection_rules) {
+    # reset input after each rule check
+    input = original_input;
+
+    # check if rule matches at all
+    if (input does not end in rule.kana_in) continue;
+    # make sure (for example) godan deconjugations don't get applied
+    # after ichidan deconjugations
+    if (rule.rules_in does not contain input.rules) continue;
+    # swap kana_in for kana_out on input string
+    input.replace_end(rule.kana_in, rule.kana_out);
+    # check if deconjugation didn't clear the input
+    if (input.length < 1) continue;
+
+    # apply new rules to input
+    input.rules = rule.rules_out;
+    
+    # attempt another deconjugation step
+    depth += 1;
+    deconjugate(input, depth);
+  }
+}
+```
+
+The deinflection rules' `rules_in` and `rules_out` are checked using bitwise
+operators, and each bit has the same meaning as Yomichan's `rulesIn` and
+`rulesOut`:
+
+|alias|bitmask|meaning|
+|-|-|-|
+|a|`-1`|all (allow all rules)|
+||`0`|nothing|
+|ru|`1 << 0`|一段活用 (ichidan a.k.a. ru-verbs in tae kim's japanese grammar guide)|
+|u|`1 << 1`|五段活用 (godan a.k.a. u-verbs in tae kim's japanese grammar guide)|
+|s|`1 << 2`|する (suru)|
+|k|`1 << 3`|くる (kuru)|
+|z|`1 << 4`|ずる (zuru)|
+|i|`1 << 5`|形容詞 (i-adjective)|
+|iru|`1 << 6`|〜いる (temporary iru for progressive tense)|
+
+The deinflection rules are mostly derived from [Tae Kim's Japanese grammar
+guide][taekim] and are initialized in [deinflections.sql](dict/deinflections.sql).
+
+### Lookup
+
+Lookup is done on the results of the deinflection step in the database. All
+possible deinflections are checked for an exact match on either the reading or
+writing of a word in all dictionaries. This step appends any ~dictionary and~
+word tags to the tags added by the deinflector.
+
+### Ranking
+
+Ranking happens in Typescript. Ranking also removes additional illegally
+deconjugated words and gives priority to words with certain tags depending on
+context. The filtering/ranking code is intentionally kept as readable as
+possible because this code is mostly responsible for generating readings as
+accurately as possible. Read the code [here](../language/parser.ts).
+
+[taekim]: https://guidetojapanese.org/learn/
+
diff --git a/db/test/find b/db/test/find
new file mode 100755
index 0000000..d22d711
--- /dev/null
+++ b/db/test/find
@@ -0,0 +1,20 @@
+#!/bin/sh
+
+TERM="$1"
+USER="root"
+
+DICT_DB="../dict.db"
+USER_DB="../user.db"
+FIND_SQL="../find.sql"
+
+cd "$(dirname "$0")"
+
+sqlite3 \
+	-markdown \
+	-cmd ".timer ON" \
+	-cmd ".param set :term '$TERM'" \
+	-cmd ".param set :user '$USER'" \
+	-cmd "attach '$DICT_DB' as dict" \
+	-cmd "attach '$USER_DB' as user" \
+	< "$FIND_SQL"
+
diff --git a/db/user/.gitignore b/db/user/.gitignore
new file mode 100644
index 0000000..7b59807
--- /dev/null
+++ b/db/user/.gitignore
@@ -0,0 +1,2 @@
+base.sql
+full.sql
diff --git a/db/user/init.sql b/db/user/init.sql
new file mode 100644
index 0000000..1d0e830
--- /dev/null
+++ b/db/user/init.sql
@@ -0,0 +1,16 @@
+create table if not exists user (
+	id integer primary key autoincrement,
+	username text not null,
+	unique(username)
+);
+
+create table if not exists sort_overlay (
+	id integer primary key autoincrement,
+	user_id int not null default 0,
+	expression text not null,
+	reading text not null,
+	sort int not null,
+	foreign key(user_id) references user(id),
+	unique(user_id, expression, reading) on conflict replace
+);
+
diff --git a/db/user/reset.sql b/db/user/reset.sql
new file mode 100644
index 0000000..d136af3
--- /dev/null
+++ b/db/user/reset.sql
@@ -0,0 +1,4 @@
+drop table if exists sort_overlay;
+
+drop table if exists user;
+
diff --git a/db/user/root.sql b/db/user/root.sql
new file mode 100644
index 0000000..70671ce
--- /dev/null
+++ b/db/user/root.sql
@@ -0,0 +1,6 @@
+-- default user_id = 0 = root (apply to everyone)
+insert into sort_overlay (expression, reading, sort) values
+	('達','だち',-1),
+	('の下に','のもとに',-1),
+	('下に','しもに',-1);
+
author	lonkaars <loek@pipeframe.xyz>	2023-06-28 23:59:50 +0200
committer	lonkaars <loek@pipeframe.xyz>	2023-06-28 23:59:50 +0200
commit	67dbb6421976254658c5e38045513129dd18187a (patch)
tree	288b599d1097b26bdbcad3b6749b38e133017cf2 /db