diff options
author | lonkaars <loek@pipeframe.xyz> | 2023-06-29 11:33:23 +0200 |
---|---|---|
committer | lonkaars <loek@pipeframe.xyz> | 2023-06-29 11:33:23 +0200 |
commit | c998e1c0477d51c886f9e4246e102dec4d7ef8dd (patch) | |
tree | 4d979c57f16b138ff4b2ce5fb3151ce241af6881 /db/dict/template.sql | |
parent | 67dbb6421976254658c5e38045513129dd18187a (diff) |
add jmdict importer to repo
Diffstat (limited to 'db/dict/template.sql')
-rw-r--r-- | db/dict/template.sql | 113 |
1 files changed, 113 insertions, 0 deletions
diff --git a/db/dict/template.sql b/db/dict/template.sql new file mode 100644 index 0000000..1a07252 --- /dev/null +++ b/db/dict/template.sql @@ -0,0 +1,113 @@ +-- create temporary ingest table +drop table if exists ingest; +-- TODO: ingest pitch-accent dictionaries +-- TODO: ingest alternate writings (space-separated) +create temporary table ingest( + -- term fields + expression text not null, -- kanji of term (e.g. 読み込む) + reading text not null, -- reading of term (e.g. よみこむ) + term_tags text not null default '', -- space-separated *term* tags, merged if term already exists in DB + + -- definition fields + glossary_sort int not null default 1, -- order of multiple meanings (glossaries) + glossary text null default null, -- glossary content (support for basic HTML markup/styling) + glossary_tags text null default null -- add tags to single glossary entry +); + +-- #DICTIONARY_CONTENT_BEGIN +-- this template is 'rendered' by pasting a .dict.sql file in between these +-- DICTIONARY_CONTENT markers. the makefile can render these using the +-- following m4 code (called using m4 -P template.sql < any.dict.sql): +-- +-- m4_undivert(`/dev/stdin') +-- +-- this breaks when the first line of the input file is not a comment or empty +-- line, so the makefile accounts for this by concatenating an empty line with +-- the dict first. the runtime typescript dictionary importer handles this by +-- not calling m4 for this. +-- #DICTIONARY_CONTENT_END + +-- create dict id +insert into dict (tag, language) values ('dict:' || :dict, :lang); + +-- add terms +insert into term (expression, reading) +select expression, reading +from ingest; + +-- add definitions +insert into definition (term_id, sort, glossary, dict_id) +select + term.id, + ingest.glossary_sort, + ingest.glossary, + (select id from dict where tag = 'dict:' || :dict) +from ingest +join term on term.expression = ingest.expression and term.reading = ingest.reading; + +-- create map of term_id and tag code +drop table if exists term_tag_map; +create temporary table term_tag_map (term_id, tag); +insert into term_tag_map +with tag_map(term_id, temp, tag) as ( + select + (select id from term where expression is ingest.expression and reading is ingest.reading), + term_tags || ' ', + '' + from ingest + union + select + term_id, + substr(temp, instr(temp, ' ') + 1), + substr(temp, 0, instr(temp, ' ')) + from tag_map + where length(temp) > 1 +) +select term_id, replace(tag, ' ', '') +from tag_map +where length(tag) > 0; + +-- create map of definition_id and tag code +drop table if exists definition_tag_map; +create temporary table definition_tag_map (definition_id, tag); +insert into definition_tag_map +with tag_map(definition_id, temp, tag) as ( + select + (select id from definition where glossary is ingest.glossary), + glossary_tags || ' ', + '' + from ingest + union + select + definition_id, + substr(temp, instr(temp, ' ') + 1), + substr(temp, 0, instr(temp, ' ')) + from tag_map + where length(temp) > 1 +) +select definition_id, replace(tag, ' ', '') +from tag_map +where length(tag) > 0; + +-- make sure tags exist +insert into tag (code) +select tag from term_tag_map +union +select tag from definition_tag_map; + +-- add tags to terms +insert into term_tag (term_id, tag_id) +select + term_id, + tag.id +from term_tag_map +join tag on tag.code = term_tag_map.tag; + +-- add tags to definitions +insert into definition_tag (definition_id, tag_id) +select + definition_id, + tag.id +from definition_tag_map +join tag on tag.code = definition_tag_map.tag; + |