diff options
Diffstat (limited to 'db/dict')
-rw-r--r-- | db/dict/.gitignore | 10 | ||||
-rw-r--r-- | db/dict/deinflections.sql | 183 | ||||
-rw-r--r-- | db/dict/init.sql | 115 | ||||
-rw-r--r-- | db/dict/reset.sql | 12 | ||||
-rw-r--r-- | db/dict/tags.sql | 11 | ||||
-rw-r--r-- | db/dict/template.sql.m4 | 104 | ||||
-rw-r--r-- | db/dict/test_a.dict.sql | 25 | ||||
-rw-r--r-- | db/dict/test_b.dict.sql | 9 | ||||
-rw-r--r-- | db/dict/test_pitch_accent.dict.sql | 10 |
9 files changed, 479 insertions, 0 deletions
diff --git a/db/dict/.gitignore b/db/dict/.gitignore new file mode 100644 index 0000000..96bd267 --- /dev/null +++ b/db/dict/.gitignore @@ -0,0 +1,10 @@ +base.sql +full.sql + +test_a.sql +test_b.sql +test_pitch_accent.sql + +jmdict* + +dict.sql diff --git a/db/dict/deinflections.sql b/db/dict/deinflections.sql new file mode 100644 index 0000000..d13f313 --- /dev/null +++ b/db/dict/deinflections.sql @@ -0,0 +1,183 @@ +-- deinflection rules (ordered by appearance in tae kim's japanese grammar guide) +create temporary table deinflection_temp (tag, kana_in, kana_out, rules_in, rules_out); +insert into deinflection_temp values + -- negative <https://guidetojapanese.org/learn/grammar/negativeverbs> + ('infl:negative', 'ない', 'る', 'a', 'ru'), + ('infl:negative', 'わない', 'う', 'a', 'u'), + ('infl:negative', 'かない', 'く', 'a', 'u'), + ('infl:negative', 'がない', 'ぐ', 'a', 'u'), + ('infl:negative', 'さない', 'す', 'a', 'u'), + ('infl:negative', 'たない', 'つ', 'a', 'u'), + ('infl:negative', 'なない', 'ぬ', 'a', 'u'), + ('infl:negative', 'ばない', 'ぶ', 'a', 'u'), + ('infl:negative', 'まない', 'む', 'a', 'u'), + ('infl:negative', 'らない', 'る', 'a', 'u'), + ('infl:negative', 'しない', 'する', 'a', 's'), + ('infl:negative', 'こない', 'くる', 'a', 'k'), + ('infl:negative', '来ない', '来る', 'a', 'k'), + ('infl:negative', 'ない', 'ある', 'a', 'ru'), -- this one may cause problems (?) + -- ('infl:negative', 'ない', '', 'a', 'ru'), -- this one may cause problems (?) + + -- past tense <https://guidetojapanese.org/learn/grammar/past_tense> + ('infl:tense:past', 'た', 'る', 'a', 'ru'), + ('infl:tense:past', 'した', 'す', 'a', 'u'), + ('infl:tense:past', 'いた', 'く', 'a', 'u'), + ('infl:tense:past', 'いだ', 'ぐ', 'a', 'u'), + ('infl:tense:past', 'んだ', 'む', 'a', 'u'), + ('infl:tense:past', 'んだ', 'ぬ', 'a', 'u'), + ('infl:tense:past', 'んだ', 'ぶ', 'a', 'u'), + ('infl:tense:past', 'った', 'う', 'a', 'u'), + ('infl:tense:past', 'った', 'つ', 'a', 'u'), + ('infl:tense:past', 'った', 'る', 'a', 'u'), + ('infl:tense:past', 'した', 'する', 'a', 's'), + ('infl:tense:past', 'きた', 'くる', 'a', 'k'), + ('infl:tense:past', '来た', 'くる', 'a', 'k'), + ('infl:tense:past', '行った', '行く', 'a', ''), + + -- adjective to adverb <https://guidetojapanese.org/learn/grammar/adverbs> + ('infl:adverb', 'く', 'い', 'a', 'i'), + -- TODO: na-adjectives aren't deconjugated + + -- polite form <https://guidetojapanese.org/learn/grammar/polite> + ('infl:polite:masu', 'ます', 'る', 'a', 'ru'), + ('infl:polite:masu', 'います', 'う', 'a', 'u'), + ('infl:polite:masu', 'きます', 'く', 'a', 'u'), + ('infl:polite:masu', 'ぎます', 'ぐ', 'a', 'u'), + ('infl:polite:masu', 'します', 'す', 'a', 'u'), + ('infl:polite:masu', 'ちます', 'つ', 'a', 'u'), + ('infl:polite:masu', 'にます', 'ぬ', 'a', 'u'), + ('infl:polite:masu', 'びます', 'ぶ', 'a', 'u'), + ('infl:polite:masu', 'みます', 'む', 'a', 'u'), + ('infl:polite:masu', 'ります', 'る', 'a', 'u'), + ('infl:polite:masu', 'します', 'する', 'a', 's'), + ('infl:polite:masu', 'きます', 'くる', 'a', 'k'), + ('infl:polite:masu', '来ます', '来る', 'a', 'k'), + ('infl:polite:masu infl:negative', 'ません', 'る', 'a', 'ru'), + ('infl:polite:masu infl:negative', 'いません', 'う', 'a', 'u'), + ('infl:polite:masu infl:negative', 'きません', 'く', 'a', 'u'), + ('infl:polite:masu infl:negative', 'ぎません', 'ぐ', 'a', 'u'), + ('infl:polite:masu infl:negative', 'しません', 'す', 'a', 'u'), + ('infl:polite:masu infl:negative', 'ちません', 'つ', 'a', 'u'), + ('infl:polite:masu infl:negative', 'にません', 'ぬ', 'a', 'u'), + ('infl:polite:masu infl:negative', 'びません', 'ぶ', 'a', 'u'), + ('infl:polite:masu infl:negative', 'みません', 'む', 'a', 'u'), + ('infl:polite:masu infl:negative', 'りません', 'る', 'a', 'u'), + ('infl:polite:masu infl:negative', 'しません', 'する', 'a', 's'), + ('infl:polite:masu infl:negative', 'きません', 'くる', 'a', 'k'), + ('infl:polite:masu infl:negative', '来ません', '来る', 'a', 'k'), + ('infl:polite:masu infl:tense:past', 'ました', 'る', 'a', 'ru'), + ('infl:polite:masu infl:tense:past', 'いました', 'う', 'a', 'u'), + ('infl:polite:masu infl:tense:past', 'きました', 'く', 'a', 'u'), + ('infl:polite:masu infl:tense:past', 'ぎました', 'ぐ', 'a', 'u'), + ('infl:polite:masu infl:tense:past', 'しました', 'す', 'a', 'u'), + ('infl:polite:masu infl:tense:past', 'ちました', 'つ', 'a', 'u'), + ('infl:polite:masu infl:tense:past', 'にました', 'ぬ', 'a', 'u'), + ('infl:polite:masu infl:tense:past', 'びました', 'ぶ', 'a', 'u'), + ('infl:polite:masu infl:tense:past', 'みました', 'む', 'a', 'u'), + ('infl:polite:masu infl:tense:past', 'りました', 'る', 'a', 'u'), + ('infl:polite:masu infl:tense:past', 'しました', 'する', 'a', 's'), + ('infl:polite:masu infl:tense:past', 'きました', 'くる', 'a', 'k'), + ('infl:polite:masu infl:tense:past', '来ました', '来る', 'a', 'k'), + ('infl:polite:masu infl:tense:past infl:negative', 'ませんでした', 'る', 'a', 'ru'), + ('infl:polite:masu infl:tense:past infl:negative', 'いませんでした', 'う', 'a', 'u'), + ('infl:polite:masu infl:tense:past infl:negative', 'きませんでした', 'く', 'a', 'u'), + ('infl:polite:masu infl:tense:past infl:negative', 'ぎませんでした', 'ぐ', 'a', 'u'), + ('infl:polite:masu infl:tense:past infl:negative', 'しませんでした', 'す', 'a', 'u'), + ('infl:polite:masu infl:tense:past infl:negative', 'ちませんでした', 'つ', 'a', 'u'), + ('infl:polite:masu infl:tense:past infl:negative', 'にませんでした', 'ぬ', 'a', 'u'), + ('infl:polite:masu infl:tense:past infl:negative', 'びませんでした', 'ぶ', 'a', 'u'), + ('infl:polite:masu infl:tense:past infl:negative', 'みませんでした', 'む', 'a', 'u'), + ('infl:polite:masu infl:tense:past infl:negative', 'りませんでした', 'る', 'a', 'u'), + ('infl:polite:masu infl:tense:past infl:negative', 'しませんでした', 'する', 'a', 's'), + ('infl:polite:masu infl:tense:past infl:negative', 'きませんでした', 'くる', 'a', 'k'), + ('infl:polite:masu infl:tense:past infl:negative', '来ませんでした', '来る', 'a', 'k'), + -- ('infl:polite:masu infl:tense:past infl:negative', 'くありません', 'い', 'a', 'i'), -- reality check (should not be needed) + + -- TODO: compound nouns and adjectives <https://guidetojapanese.org/learn/grammar/compound> + + -- te-form <https://guidetojapanese.org/learn/grammar/compound> + ('infl:suffix:te', 'て', 'た', 'a', 'ru'), + ('infl:suffix:te', 'して', 'した', 'a', 's u'), + ('infl:suffix:te', 'いて', 'いた', 'a', 'u'), + ('infl:suffix:te', 'いで', 'いだ', 'a', 'u'), + ('infl:suffix:te', 'んで', 'んだ', 'a', 'u'), + ('infl:suffix:te', 'って', 'った', 'a', 'u'), + ('infl:suffix:te', 'きて', 'きた', 'a', 'k'), + ('infl:suffix:te', '来て', '来た', 'a', 'k'), + ('infl:suffix:te', 'くて', 'い', 'a', ''), -- TODO: rules_out of this one is i? + + -- -tari lists <https://guidetojapanese.org/learn/grammar/compound> + ('infl:suffix:tari', 'たり', 'た', 'a', 'ru'), + ('infl:suffix:tari', 'したり', 'した', 'a', 's u'), + ('infl:suffix:tari', 'いたり', 'いた', 'a', 'u'), + ('infl:suffix:tari', 'いだり', 'いだ', 'a', 'u'), + ('infl:suffix:tari', 'んだり', 'んだ', 'a', 'u'), + ('infl:suffix:tari', 'ったり', 'った', 'a', 'u'), + ('infl:suffix:tari', 'きたり', 'きた', 'a', 'k'), + ('infl:suffix:tari', '来たり', '来た', 'a', 'k'), + + -- auxiliary rules + ('class:verb:suru-included', 'する', '', 's', ''); -- deconjugate suru verbs into stem + +-- rule/bitmask lookup table +create temporary table rule_map (tag, name, mask); +insert into rule_map values + (null, 'a', -1 ), -- all (allow all rules in) + (null, '', 0 ), -- (nothing) + ('infl:reason:ru', 'ru', 1 << 0), -- 一段活用 (ichidan a.k.a. ru-verbs in tae kim's japanese grammar guide) + ('infl:reason:u', 'u', 1 << 1), -- 五段活用 (godan a.k.a. u-verbs in tae kim's japanese grammar guide) + ('infl:reason:suru', 's', 1 << 2), -- する (suru) + ('infl:reason:kuru', 'k', 1 << 3), -- くる (kuru) + (null, 'z', 1 << 4), -- ずる (zuru) + ('infl:reason:adj-i', 'i', 1 << 5), -- 形容詞 (i-adjective) + (null, 'iru', 1 << 6); -- 〜いる (temporary iru for progressive tense) + +-- add tags to db +insert into deinflection_rules (mask, tag) +select mask, tag +from rule_map +where tag is not null; + +-- convert space-separated strings to bitmasks and insert into deinflection table +insert into deinflection (tag, kana_in, kana_out, rules_in, rules_out) +with temp_deinflection_map(tag, kana_in, kana_out, rules_in_stack, rules_in, rules_out_stack, rules_out) as ( + select + tag, + kana_in, + kana_out, + rules_in || ' ', + '', + rules_out || ' ', + '' + from deinflection_temp + union + select + tag, + kana_in, + kana_out, + substr(rules_in_stack, instr(rules_in_stack, ' ') + 1), + replace(substr(rules_in_stack, 0, instr(rules_in_stack, ' ')), ' ', ''), + substr(rules_out_stack, instr(rules_out_stack, ' ') + 1), + replace(substr(rules_out_stack, 0, instr(rules_out_stack, ' ')), ' ', '') + from temp_deinflection_map + where + (length(rules_in_stack) > 1) or + (length(rules_out_stack) > 1) +) +select + temp_deinflection_map.tag, + temp_deinflection_map.kana_in, + temp_deinflection_map.kana_out, + -- NOTE: sum() should really be bitwise and across all rows after group by + -- here. be careful to not specify rules more than once in these columns, or + -- any other rule if the column has 'a'. + sum(rule_map_in.mask) as rules_in, + sum(rule_map_out.mask) as rules_out +from temp_deinflection_map +join rule_map as rule_map_in on rule_map_in.name = temp_deinflection_map.rules_in +join rule_map as rule_map_out on rule_map_out.name = temp_deinflection_map.rules_out +group by + temp_deinflection_map.tag, + temp_deinflection_map.kana_in, + temp_deinflection_map.kana_out; + diff --git a/db/dict/init.sql b/db/dict/init.sql new file mode 100644 index 0000000..4e9fcc9 --- /dev/null +++ b/db/dict/init.sql @@ -0,0 +1,115 @@ +-- deinflection map (see dict/deinflections.sql) +create table if not exists deinflection ( + tag text not null, + kana_in text not null, + kana_out text not null, + rules_in int not null default 0, + rules_out int not null default 0 +); +create index deinflection_kana_in on deinflection (kana_in); + +-- lookup table for deinflection rule bitmasks +create table if not exists deinflection_rules ( + mask int not null default 0, + tag text not null +); + +-- dictionary metadata +create table if not exists dict ( + id integer primary key autoincrement, + tag text not null, -- TODO: dict.tag -> dict.tag_id + language text null default null, + priority int not null default 1 +); + +-- tag code<->id lookup +create table if not exists tag ( + id integer primary key autoincrement, + code text not null, + unique(code) on conflict ignore +); +create index tag_code on tag (code); + + +-- definition of term (with single meaning) +-- +-- terms that have multiple meanings should have multiple entries in the +-- definition table with the same term_id. glossary contains an explanation or +-- list of synonyms to illustrate the meaning of the term. +create table if not exists definition ( + id integer primary key autoincrement, + term_id int not null, -- reference to term (combined writing and reading) + sort int not null default 1, -- sort multiple meanings/glossaries within dictionary + -- search results for terms are grouped by term, then dictionary, then sort + -- within each dictionary + glossary text null default null, -- explanation or list of synonyms for term + dict_id int not null, -- origin dictionary of term + foreign key(term_id) references term(id), + foreign key(dict_id) references dict(id), + unique(glossary) on conflict ignore +); + +-- human-readable tag label lookup table (used in UI and for translations) +create table if not exists tag_label ( + id integer primary key autoincrement, + tag_id int not null, -- reference to tag + language text not null default "en-US", -- label locale as RFC 5646 tag (e.g. ja) + -- label locale is used when displaying tags in a UI frontend. labels are + -- sorted based on langauge preference, with "en-US" being inserted last as a + -- fallback. + label_short text null default null, -- abbreviated label (e.g. ティアキン) + label text not null, -- full label (e.g. ティアーズ・オブ・ザ・キングダム) + description text null default null, -- short description (e.g. ゼルダの伝説シリーズより) + foreign key(tag_id) references tag(id) +); + +-- standalone combination of kanji and reading +-- +-- terms are added automatically when importing a dictionary, but are shared +-- between dictionaries and thus not removed when disabling/removing a +-- dictionary. terms are ranked outside the database, and the starting rank +-- score can be changed using the user database (TODO) +create table if not exists term ( + id integer primary key autoincrement, + expression text not null, -- writing of term (e.g. 乗り越える) + reading text not null, -- reading of term (e.g. のりこえる) + alt int null default null, -- reference to main writing if this is is an alternate + -- alternate readings are stored as normal terms, but point to their main + -- reading. also used internally to store alternate readings of irregular + -- verbs example: + -- id expression reading alt + -- 1 言葉 ことば NULL + -- 2 詞 ことば 1 + -- 3 辞 ことば 1 + -- 4 来る くる NULL + -- 5 来た きた 4 + -- NOTE: alternate writings can technically be stored recursively this way, + -- but are only resolved one level deep TODO: resolve alternates + foreign key(alt) references term(id), + unique(expression, reading) on conflict ignore +); +create index term_expression on term (expression); +create index term_reading on term (reading); +-- TODO: (automatically) remove unused terms from db (using constraints?) + + +-- allow many<->many relation between definition and tag +create table if not exists definition_tag ( + id integer primary key autoincrement, + definition_id int not null, + tag_id int not null, + foreign key(definition_id) references definition(id), + foreign key(tag_id) references tag(id), + unique(definition_id, tag_id) on conflict ignore +); + +-- allow many<->many relation between term and tag +create table if not exists term_tag ( + id integer primary key autoincrement, + term_id int not null, + tag_id int not null, + foreign key(term_id) references term(id), + foreign key(tag_id) references tag(id), + unique(term_id, tag_id) on conflict ignore +); + diff --git a/db/dict/reset.sql b/db/dict/reset.sql new file mode 100644 index 0000000..757e31b --- /dev/null +++ b/db/dict/reset.sql @@ -0,0 +1,12 @@ +drop table if exists definition_tag; +drop table if exists term_tag; + +drop table if exists definition; +drop table if exists tag_label; +drop table if exists term; + +drop table if exists deinflection; +drop table if exists deinflection_rules; +drop table if exists dict; +drop table if exists tag; + diff --git a/db/dict/tags.sql b/db/dict/tags.sql new file mode 100644 index 0000000..a200abb --- /dev/null +++ b/db/dict/tags.sql @@ -0,0 +1,11 @@ +insert into tag (code) values + ('class:verb'), + ('class:verb:suru'), + ('class:verb:suru-included'), + ('class:noun'), + ('class:suffix'), + ('class:part'), + ('class:expr'), + ('name:place'), + ('name:female'), + ('name:male'); diff --git a/db/dict/template.sql.m4 b/db/dict/template.sql.m4 new file mode 100644 index 0000000..00de413 --- /dev/null +++ b/db/dict/template.sql.m4 @@ -0,0 +1,104 @@ +-- create temporary ingest table +drop table if exists ingest; +-- TODO: ingest pitch-accent dictionaries +-- TODO: ingest alternate writings (space-separated) +create temporary table ingest( + -- term fields + expression text not null, -- kanji of term (e.g. 読み込む) + reading text not null, -- reading of term (e.g. よみこむ) + term_tags text not null default '', -- space-separated *term* tags, merged if term already exists in DB + + -- definition fields + glossary_sort int not null default 1, -- order of multiple meanings (glossaries) + glossary text null default null, -- glossary content (support for basic HTML markup/styling) + glossary_tags text null default null -- add tags to single glossary entry +); + +include(`/dev/stdin')dnl --' +-- the apostrophe is so my editor highlighting keeps working if I force the +-- filetype to sql instead of m4 + +-- create dict id +insert into dict (tag, language) values ('dict:' || :dict, :lang); + +-- add terms +insert into term (expression, reading) +select expression, reading +from ingest; + +-- add definitions +insert into definition (term_id, sort, glossary, dict_id) +select + term.id, + ingest.glossary_sort, + ingest.glossary, + (select id from dict where tag = 'dict:' || :dict) +from ingest +join term on term.expression = ingest.expression and term.reading = ingest.reading; + +-- create map of term_id and tag code +drop table if exists term_tag_map; +create temporary table term_tag_map (term_id, tag); +insert into term_tag_map +with tag_map(term_id, temp, tag) as ( + select + (select id from term where expression is ingest.expression and reading is ingest.reading), + term_tags || ' ', + '' + from ingest + union + select + term_id, + `substr'(temp, instr(temp, ' ') + 1), + `substr'(temp, 0, instr(temp, ' ')) + from tag_map + where length(temp) > 1 +) +select term_id, replace(tag, ' ', '') +from tag_map +where length(tag) > 0; + +-- create map of definition_id and tag code +drop table if exists definition_tag_map; +create temporary table definition_tag_map (definition_id, tag); +insert into definition_tag_map +with tag_map(definition_id, temp, tag) as ( + select + (select id from definition where glossary is ingest.glossary), + glossary_tags || ' ', + '' + from ingest + union + select + definition_id, + `substr'(temp, instr(temp, ' ') + 1), + `substr'(temp, 0, instr(temp, ' ')) + from tag_map + where length(temp) > 1 +) +select definition_id, replace(tag, ' ', '') +from tag_map +where length(tag) > 0; + +-- make sure tags exist +insert into tag (code) +select tag from term_tag_map +union +select tag from definition_tag_map; + +-- add tags to terms +insert into term_tag (term_id, tag_id) +select + term_id, + tag.id +from term_tag_map +join tag on tag.code = term_tag_map.tag; + +-- add tags to definitions +insert into definition_tag (definition_id, tag_id) +select + definition_id, + tag.id +from definition_tag_map +join tag on tag.code = definition_tag_map.tag; + diff --git a/db/dict/test_a.dict.sql b/db/dict/test_a.dict.sql new file mode 100644 index 0000000..854d207 --- /dev/null +++ b/db/dict/test_a.dict.sql @@ -0,0 +1,25 @@ +-- set these +.param set :dict 'test_a' +.param set :lang 'nl_NL' + +-- dictionary content +insert into ingest(expression, reading, term_tags, glossary_sort, glossary, glossary_tags) values + ('浮上', 'ふじょう', 'class:verb:suru class:noun', 1, 'opstijgen, zweven', ''), + ('城', 'しろ', 'class:noun', 1, 'kasteel', ''), + ('城', 'じょう', 'class:suffix', 1, '-burcht, -burg (suffix voor kastelen)', ''), + ('下', 'した', 'class:noun', 1, 'onder, beneden, omlaag', ''), + ('の下に', 'のもとに', 'class:expr', 1, 'onder leiding van', ''), + ('ハイラル', 'はいらる', 'series:zelda name:place class:noun', 1, 'Hyrule', ''), + ('にて', 'にて', 'class:part', 1, '{duidt aanwezigheid op een plaats aan}', ''), + ('ゼルダ', 'ぜるだ', 'series:zelda name:female class:noun', 1, 'Zelda', ''), + ('様', 'さま', 'class:suffix', 1, 'meneer, mijnheer, mevrouw, madame', ''), + ('様', 'よう', 'class:noun', 1, '(eruit zien) als, zoals, manier', ''), + ('達', 'たち', 'class:suffix', 1, '{meervoudssuffix}', ''), + ('の', 'の', 'class:part', 1, '{bezitspartikel}', ''), + ('と', 'と', 'class:part', 1, '{opsommingspartikel}', ''), + ('捜索', 'そうさく', 'class:verb:suru class:noun', 1, 'onderzoek, opspeuren, achterhalen', ''), + ('を', 'を', 'class:part', 1, '{lijdend voornaamwoord partikel}', ''), + ('行う', 'おこなう', 'class:verb', 1, 'uitvoeren', ''), + ('こと', 'こと', 'class:part class:noun', 1, '{eindpartikel dat een bevel / waarschuwing uitdrukt}', ''), + ('成る', 'なる', 'class:verb aux:test', 1, 'worden, veranderen in', ''); + diff --git a/db/dict/test_b.dict.sql b/db/dict/test_b.dict.sql new file mode 100644 index 0000000..0140189 --- /dev/null +++ b/db/dict/test_b.dict.sql @@ -0,0 +1,9 @@ +-- set these +.param set :dict 'test_b' +.param set :lang 'en_US' + +-- dictionary content +insert into ingest(expression, reading, term_tags, glossary_sort, glossary, glossary_tags) values + ('城', 'しろ', 'aux:test_b', 1, 'castle', ''), + ('城', 'じょう', 'aux:test_b aux:test_b aux:nog_een_test_b', 1, '-burg (suffix for castles)', 'aux:gert aux:test aux:nog_wat'); + diff --git a/db/dict/test_pitch_accent.dict.sql b/db/dict/test_pitch_accent.dict.sql new file mode 100644 index 0000000..3b8298f --- /dev/null +++ b/db/dict/test_pitch_accent.dict.sql @@ -0,0 +1,10 @@ +-- this is an example pitch accent dictionary +-- set these +.param set :dict 'pitch_accent' +.param set :lang null + +-- dictionary content +insert into ingest (expression, reading, glossary_tags) values + ('浮上', 'ふじょう', 'pitch:0'), + ('成る', 'なる', 'pitch:1'); + |