From c998e1c0477d51c886f9e4246e102dec4d7ef8dd Mon Sep 17 00:00:00 2001 From: lonkaars Date: Thu, 29 Jun 2023 11:33:23 +0200 Subject: add jmdict importer to repo --- db/dict/template.sql | 113 +++++++++++++++++++++++++++++++++++++++++++++++ db/dict/template.sql.m4 | 104 ------------------------------------------- db/makefile | 14 ++++-- import/jmdict/.gitignore | 4 ++ import/jmdict/jmdict.ts | 64 +++++++++++++++++++++++++++ import/jmdict/makefile | 13 ++++++ import/readme.md | 17 +++++++ import/util.ts | 0 language/tags.ts | 16 ++++++- util/string.ts | 13 +----- 10 files changed, 238 insertions(+), 120 deletions(-) create mode 100644 db/dict/template.sql delete mode 100644 db/dict/template.sql.m4 create mode 100644 import/jmdict/.gitignore create mode 100644 import/jmdict/jmdict.ts create mode 100644 import/jmdict/makefile create mode 100644 import/readme.md create mode 100644 import/util.ts diff --git a/db/dict/template.sql b/db/dict/template.sql new file mode 100644 index 0000000..1a07252 --- /dev/null +++ b/db/dict/template.sql @@ -0,0 +1,113 @@ +-- create temporary ingest table +drop table if exists ingest; +-- TODO: ingest pitch-accent dictionaries +-- TODO: ingest alternate writings (space-separated) +create temporary table ingest( + -- term fields + expression text not null, -- kanji of term (e.g. 読み込む) + reading text not null, -- reading of term (e.g. よみこむ) + term_tags text not null default '', -- space-separated *term* tags, merged if term already exists in DB + + -- definition fields + glossary_sort int not null default 1, -- order of multiple meanings (glossaries) + glossary text null default null, -- glossary content (support for basic HTML markup/styling) + glossary_tags text null default null -- add tags to single glossary entry +); + +-- #DICTIONARY_CONTENT_BEGIN +-- this template is 'rendered' by pasting a .dict.sql file in between these +-- DICTIONARY_CONTENT markers. 
the makefile can render these using the +-- following m4 code (called using m4 -P template.sql < any.dict.sql): +-- +-- m4_undivert(`/dev/stdin') +-- +-- this breaks when the first line of the input file is not a comment or empty +-- line, so the makefile accounts for this by concatenating an empty line with +-- the dict first. the runtime typescript dictionary importer handles this by +-- not calling m4 for this. +-- #DICTIONARY_CONTENT_END + +-- create dict id +insert into dict (tag, language) values ('dict:' || :dict, :lang); + +-- add terms +insert into term (expression, reading) +select expression, reading +from ingest; + +-- add definitions +insert into definition (term_id, sort, glossary, dict_id) +select + term.id, + ingest.glossary_sort, + ingest.glossary, + (select id from dict where tag = 'dict:' || :dict) +from ingest +join term on term.expression = ingest.expression and term.reading = ingest.reading; + +-- create map of term_id and tag code +drop table if exists term_tag_map; +create temporary table term_tag_map (term_id, tag); +insert into term_tag_map +with tag_map(term_id, temp, tag) as ( + select + (select id from term where expression is ingest.expression and reading is ingest.reading), + term_tags || ' ', + '' + from ingest + union + select + term_id, + substr(temp, instr(temp, ' ') + 1), + substr(temp, 0, instr(temp, ' ')) + from tag_map + where length(temp) > 1 +) +select term_id, replace(tag, ' ', '') +from tag_map +where length(tag) > 0; + +-- create map of definition_id and tag code +drop table if exists definition_tag_map; +create temporary table definition_tag_map (definition_id, tag); +insert into definition_tag_map +with tag_map(definition_id, temp, tag) as ( + select + (select id from definition where glossary is ingest.glossary), + glossary_tags || ' ', + '' + from ingest + union + select + definition_id, + substr(temp, instr(temp, ' ') + 1), + substr(temp, 0, instr(temp, ' ')) + from tag_map + where length(temp) > 1 +) +select 
definition_id, replace(tag, ' ', '') +from tag_map +where length(tag) > 0; + +-- make sure tags exist +insert into tag (code) +select tag from term_tag_map +union +select tag from definition_tag_map; + +-- add tags to terms +insert into term_tag (term_id, tag_id) +select + term_id, + tag.id +from term_tag_map +join tag on tag.code = term_tag_map.tag; + +-- add tags to definitions +insert into definition_tag (definition_id, tag_id) +select + definition_id, + tag.id +from definition_tag_map +join tag on tag.code = definition_tag_map.tag; + diff --git a/db/dict/template.sql.m4 b/db/dict/template.sql.m4 deleted file mode 100644 index 00de413..0000000 --- a/db/dict/template.sql.m4 +++ /dev/null @@ -1,104 +0,0 @@ --- create temporary ingest table -drop table if exists ingest; --- TODO: ingest pitch-accent dictionaries --- TODO: ingest alternate writings (space-separated) -create temporary table ingest( - -- term fields - expression text not null, -- kanji of term (e.g. 読み込む) - reading text not null, -- reading of term (e.g. 
よみこむ) - term_tags text not null default '', -- space-separated *term* tags, merged if term already exists in DB - - -- definition fields - glossary_sort int not null default 1, -- order of multiple meanings (glossaries) - glossary text null default null, -- glossary content (support for basic HTML markup/styling) - glossary_tags text null default null -- add tags to single glossary entry -); - -include(`/dev/stdin')dnl --' --- the apostrophe is so my editor highlighting keeps working if I force the --- filetype to sql instead of m4 - --- create dict id -insert into dict (tag, language) values ('dict:' || :dict, :lang); - --- add terms -insert into term (expression, reading) -select expression, reading -from ingest; - --- add definitions -insert into definition (term_id, sort, glossary, dict_id) -select - term.id, - ingest.glossary_sort, - ingest.glossary, - (select id from dict where tag = 'dict:' || :dict) -from ingest -join term on term.expression = ingest.expression and term.reading = ingest.reading; - --- create map of term_id and tag code -drop table if exists term_tag_map; -create temporary table term_tag_map (term_id, tag); -insert into term_tag_map -with tag_map(term_id, temp, tag) as ( - select - (select id from term where expression is ingest.expression and reading is ingest.reading), - term_tags || ' ', - '' - from ingest - union - select - term_id, - `substr'(temp, instr(temp, ' ') + 1), - `substr'(temp, 0, instr(temp, ' ')) - from tag_map - where length(temp) > 1 -) -select term_id, replace(tag, ' ', '') -from tag_map -where length(tag) > 0; - --- create map of definition_id and tag code -drop table if exists definition_tag_map; -create temporary table definition_tag_map (definition_id, tag); -insert into definition_tag_map -with tag_map(definition_id, temp, tag) as ( - select - (select id from definition where glossary is ingest.glossary), - glossary_tags || ' ', - '' - from ingest - union - select - definition_id, - `substr'(temp, instr(temp, ' ') + 
1), - `substr'(temp, 0, instr(temp, ' ')) - from tag_map - where length(temp) > 1 -) -select definition_id, replace(tag, ' ', '') -from tag_map -where length(tag) > 0; - --- make sure tags exist -insert into tag (code) -select tag from term_tag_map -union -select tag from definition_tag_map; - --- add tags to terms -insert into term_tag (term_id, tag_id) -select - term_id, - tag.id -from term_tag_map -join tag on tag.code = term_tag_map.tag; - --- add tags to definitions -insert into definition_tag (definition_id, tag_id) -select - definition_id, - tag.id -from definition_tag_map -join tag on tag.code = definition_tag_map.tag; - diff --git a/db/makefile b/db/makefile index c1e527e..88d4bba 100644 --- a/db/makefile +++ b/db/makefile @@ -2,7 +2,13 @@ SQL = sqlite3 DICT_DB = dict.db USER_DB = user.db -DICT_TEMPLATE = dict/template.sql.m4 +DICT_TEMPLATE = dict/template.sql + +# comment any of these lines to disable including in the default DB +DEFAULT_DICTS += dict/test_a.sql +DEFAULT_DICTS += dict/test_b.sql +DEFAULT_DICTS += dict/test_pitch_accent.sql +DEFAULT_DICTS += ../import/jmdict/jmdict.sql .PHONY: clean test @@ -20,7 +26,7 @@ dict/base.sql: dict/reset.sql dict/init.sql dict/deinflections.sql dict/tags.sql dict/full.sql: dict/base.sql dict/dict.sql cat $^ > $@ -dict/dict.sql: dict/test_a.sql dict/test_b.sql dict/test_pitch_accent.sql dict/jmdict.sql +dict/dict.sql: $(DEFAULT_DICTS) cat $^ > $@ user/base.sql: user/reset.sql user/init.sql @@ -30,11 +36,11 @@ user/full.sql: user/base.sql user/root.sql cat $^ > $@ %.sql: %.dict.sql $(DICT_TEMPLATE) - m4 $(DICT_TEMPLATE) < $< > $@ + echo "" | cat - $< | m4 -P $(DICT_TEMPLATE) > $@ # delete generated sql files and database clean: - $(RM) $(DICT_DB) $(USER_DB) dict/base.sql dict/full.sql dict/dict.sql dict/test_a.sql dict/test_b.sql dict/test_pitch_accent.sql user/base.sql user/full.sql + $(RM) $(DICT_DB) $(USER_DB) dict/base.sql dict/full.sql dict/dict.sql $(DEFAULT_DICTS) user/base.sql user/full.sql test: $(DICT_DB) 
find.sql
 	./test/find '浮上しました'
diff --git a/import/jmdict/.gitignore b/import/jmdict/.gitignore
new file mode 100644
index 0000000..d4d466d
--- /dev/null
+++ b/import/jmdict/.gitignore
@@ -0,0 +1,4 @@
+jmdict.dict.sql
+jmdict.sql
+jmdict.json
+jmdict.zip
diff --git a/import/jmdict/jmdict.ts b/import/jmdict/jmdict.ts
new file mode 100644
index 0000000..1f391e5
--- /dev/null
+++ b/import/jmdict/jmdict.ts
@@ -0,0 +1,83 @@
+import type { JMdict } from "npm:@scriptin/jmdict-simplified-types";
+
+// Converts a jmdict-simplified JSON dump (read from stdin) into a .dict.sql
+// file (written to stdout) that fills the `ingest` table.
+//
+// this script is very messy right now, and doesn't transfer all information
+// present in the dictionary.
+//
+// proceed with caution
+
+const LANG = "eng";
+
+// no simple way to do this on non-unix using Deno.stdin
+const input = await Deno.readFile("/dev/stdin");
+
+const jmdict = JSON.parse(new TextDecoder().decode(input)) as JMdict;
+
+// TODO: more tags
+const tagLookup: Record<string, string> = {
+	"misc/uk": "aux:uk",
+	"class/adv": "class:adverb",
+	"class/vs": "class:verb:suru",
+	"class/v1": "class:verb:ru",
+	"class/v5": "class:verb:u",
+	"class/n": "class:noun",
+	"class/suf": "class:suffix",
+	"class/prt": "class:part",
+	"class/exp": "class:expr",
+};
+
+/** @summary quote text as an SQL string literal, doubling embedded single quotes */
+function sqlString(text: string): string {
+	return `'${text.replaceAll("'", "''")}'`;
+}
+
+// Rows are collected before printing: words without a LANG definition are
+// skipped, so which row is the last one (and needs the terminating ';' instead
+// of ',') is only known once every word has been visited.
+const rows: { values: string; note: string }[] = [];
+
+for (const term of jmdict.words) {
+	// TODO: properly resolve appliesToKanji/appliesToKana
+	const definitions = term.sense
+		.filter(s => s.gloss[0]?.lang == LANG) // senses without glosses are skipped
+		.map(s => s.gloss.map(g => g.text).join(", "));
+	if (definitions.length == 0 || term.kana.length == 0) continue;
+
+	const reading = term.kana[0].text;
+	// kana-only words use their kana as expression (without modifying the input)
+	const writings = term.kanji.length == 0 ? term.kana : term.kanji;
+	const writing = writings[0].text;
+	const otherWritings = writings.filter(e => e.text != writing).map(e => e.text);
+
+	const tags = [...new Set([
+		...term.sense.flatMap(s => s.field).map(i => `field/${i}`),
+		...term.sense.flatMap(s => s.dialect).map(i => `dialect/${i}`),
+		...term.sense.flatMap(s => s.misc).map(i => `misc/${i}`),
+		...term.sense.flatMap(s => s.partOfSpeech).map(i => `class/${i}`),
+	])].filter(i => i in tagLookup).map(i => tagLookup[i]);
+
+	definitions.forEach((definition, j) => {
+		const columns = [sqlString(writing), sqlString(reading), sqlString(tags.join(" ")), j + 1, sqlString(definition)];
+		const alts = j == 0 && otherWritings.length > 0;
+		rows.push({
+			values: `(${columns.join(", ")})`,
+			note: alts ? ` -- TODO: alts: ${otherWritings.join(", ")}` : "",
+		});
+	});
+}
+
+console.log(`.param set :dict 'jmdict_${LANG}'`);
+console.log(".param set :lang 'en'");
+
+// an insert statement without any rows is a syntax error, so skip it entirely
+if (rows.length > 0) {
+	// TODO: separate term and glossary tags
+	console.log("insert into ingest(expression, reading, term_tags, glossary_sort, glossary) values");
+	rows.forEach(({ values, note }, i) => {
+		const terminator = i == rows.length - 1 ? ";" : ",";
+		console.log(`\t${values}${terminator}${note}`);
+	});
+}
+
diff --git a/import/jmdict/makefile b/import/jmdict/makefile
new file mode 100644
index 0000000..d19c9af
--- /dev/null
+++ b/import/jmdict/makefile
@@ -0,0 +1,13 @@
+CURL = curl
+UNZIP = unzip
+
+all: jmdict.dict.sql
+
+jmdict.zip:
+	$(CURL) -Ls 'https://github.com/scriptin/jmdict-simplified/releases/download/3.5.0%2B20230619121907/jmdict-all-3.5.0+20230619121907.json.zip' > $@
+
+jmdict.json: jmdict.zip
+	$(UNZIP) -p $< > $@
+
+jmdict.dict.sql: jmdict.json jmdict.ts
+	deno run -A --unstable ./jmdict.ts < $< > $@
diff --git a/import/readme.md b/import/readme.md
new file mode 100644
index 0000000..48adf4b
--- /dev/null
+++ b/import/readme.md
@@ -0,0 +1,17 @@
+# Dictionary imports
+
+This folder contains import scripts for various dictionaries. Publicly
+available dictionaries can automatically be downloaded, others have to be
+user-provided. **None of the dictionary files will be hosted in this
+repository.
All dictionaries keep their original license.**
+
+To generate a Yomikun dictionary, run `make` in one of these subdirectories.
+This command will output one or more .dict.sql files, which can be imported
+manually, or included in the default dictionary database file by editing [the
+dictionary makefile](../db/makefile).
+
+## JMdict
+
+[License](https://www.edrdg.org/edrdg/licence.html).
+
+
diff --git a/import/util.ts b/import/util.ts
new file mode 100644
index 0000000..e69de29
diff --git a/language/tags.ts b/language/tags.ts
index 4c1f134..d56ce98 100644
--- a/language/tags.ts
+++ b/language/tags.ts
@@ -5,7 +5,7 @@ export const Tag = {
 		/** @constant verb subgroup */
 		Verb: {
 			/** @constant any verb (fallback for vague dictionaries) */
-			Unspecified: "class:verb",
+			Unspecified: "class:verb", // TODO: deprecate this property and implement verb classifier in ../import/util.ts
 			/** @constant noun that can be conjugated into a verb by adding する */
 			Suru: "class:verb:suru",
 			/**
@@ -100,3 +100,28 @@ export type TokenTag = string; // no way around it
 
 export type TokenTags = Set<TokenTag>;
 
+/**
+ * @summary parse concatenated tag string to TokenTags
+ *
+ * Tags are separated by one or more spaces; leading and trailing whitespace is
+ * ignored. Empty or blank input yields an empty set.
+ *
+ * @param input space-separated tag codes
+ * @returns set of tags in order of first appearance
+ */
+export function parseTags(input: string): TokenTags {
+	const filteredTags: TokenTag[] = [];
+	for (const tag of input.trim().split(/ +/)) {
+		// "".split() yields [""], which would otherwise add an empty tag to the set
+		if (tag.length === 0) continue;
+
+		// skip past tense tags after -te and -tari deinflection
+		// NOTE(review): anyOf is not a built-in Array method -- confirm the module defining it is loaded here
+		if (tag === Tag.Inflection.Tense.Past &&
+			filteredTags.anyOf([Tag.Inflection.Suffix.Te, Tag.Inflection.Suffix.Tari])) continue;
+
+		filteredTags.push(tag);
+	}
+	return new Set(filteredTags);
+}
+
diff --git a/util/string.ts b/util/string.ts
index d94f5a3..e0cc5eb 100644
--- a/util/string.ts
+++ b/util/string.ts
@@ -1,4 +1,4 @@
-import { TokenTags, TokenTag, Tag } from "../language/tags.ts";
+import { TokenTags, parseTags } from "../language/tags.ts";
 import JapaneseString from "../language/japanese.ts";
 
 declare global {
@@ -60,15 +60,6 @@ String.prototype.jp = function() {
 
 /** @summary parse concatenated tag string to TokenTags */
 String.prototype.parseTags = function() {
-	var tags = this.replaceAll(/ +/g, " ").trim().split(" ") as TokenTag[];
-	var filteredTags: TokenTag[] = [];
-	for (var tag of tags) {
-		// skip past tense tags after -te and -tari deinflection
-		if (tag == Tag.Inflection.Tense.Past &&
-			filteredTags.anyOf([Tag.Inflection.Suffix.Te, Tag.Inflection.Suffix.Tari])) continue;
-
-		filteredTags.push(tag);
-	}
-	return new Set(filteredTags) as TokenTags;
+	return parseTags(this as string);
 }
 
-- 
cgit v1.2.3