diff options
author | lonkaars <loek@pipeframe.xyz> | 2023-06-29 11:33:23 +0200 |
---|---|---|
committer | lonkaars <loek@pipeframe.xyz> | 2023-06-29 11:33:23 +0200 |
commit | c998e1c0477d51c886f9e4246e102dec4d7ef8dd (patch) | |
tree | 4d979c57f16b138ff4b2ce5fb3151ce241af6881 | |
parent | 67dbb6421976254658c5e38045513129dd18187a (diff) |
add jmdict importer to repo
-rw-r--r-- | db/dict/template.sql (renamed from db/dict/template.sql.m4) | 23 | ||||
-rw-r--r-- | db/makefile | 14 | ||||
-rw-r--r-- | import/jmdict/.gitignore | 4 | ||||
-rw-r--r-- | import/jmdict/jmdict.ts | 64 | ||||
-rw-r--r-- | import/jmdict/makefile | 13 | ||||
-rw-r--r-- | import/readme.md | 17 | ||||
-rw-r--r-- | import/util.ts | 0 | ||||
-rw-r--r-- | language/tags.ts | 16 | ||||
-rw-r--r-- | util/string.ts | 13 |
9 files changed, 141 insertions, 23 deletions
diff --git a/db/dict/template.sql.m4 b/db/dict/template.sql index 00de413..1a07252 100644 --- a/db/dict/template.sql.m4 +++ b/db/dict/template.sql @@ -14,9 +14,18 @@ create temporary table ingest( glossary_tags text null default null -- add tags to single glossary entry ); -include(`/dev/stdin')dnl --' --- the apostrophe is so my editor highlighting keeps working if I force the --- filetype to sql instead of m4 +-- #DICTIONARY_CONTENT_BEGIN +-- this template is 'rendered' by pasting a .dict.sql file in between these +-- DICTIONARY_CONTENT markers. the makefile can render these using the +-- following m4 code (called using m4 -P template.sql < any.dict.sql): +-- +-- m4_undivert(`/dev/stdin') +-- +-- this breaks when the first line of the input file is not a comment or empty +-- line, so the makefile accounts for this by concatenating an empty line with +-- the dict first. the runtime typescript dictionary importer handles this by +-- not calling m4 for this. +-- #DICTIONARY_CONTENT_END -- create dict id insert into dict (tag, language) values ('dict:' || :dict, :lang); @@ -49,8 +58,8 @@ with tag_map(term_id, temp, tag) as ( union select term_id, - `substr'(temp, instr(temp, ' ') + 1), - `substr'(temp, 0, instr(temp, ' ')) + substr(temp, instr(temp, ' ') + 1), + substr(temp, 0, instr(temp, ' ')) from tag_map where length(temp) > 1 ) @@ -71,8 +80,8 @@ with tag_map(definition_id, temp, tag) as ( union select definition_id, - `substr'(temp, instr(temp, ' ') + 1), - `substr'(temp, 0, instr(temp, ' ')) + substr(temp, instr(temp, ' ') + 1), + substr(temp, 0, instr(temp, ' ')) from tag_map where length(temp) > 1 ) diff --git a/db/makefile b/db/makefile index c1e527e..88d4bba 100644 --- a/db/makefile +++ b/db/makefile @@ -2,7 +2,13 @@ SQL = sqlite3 DICT_DB = dict.db USER_DB = user.db -DICT_TEMPLATE = dict/template.sql.m4 +DICT_TEMPLATE = dict/template.sql + +# comment any of these lines to disable including in the default DB +DEFAULT_DICTS += dict/test_a.sql +DEFAULT_DICTS += dict/test_b.sql +DEFAULT_DICTS += dict/test_pitch_accent.sql +DEFAULT_DICTS += ../import/jmdict/jmdict.sql .PHONY: clean test @@ -20,7 +26,7 @@ dict/base.sql: dict/reset.sql dict/init.sql dict/deinflections.sql dict/tags.sql dict/full.sql: dict/base.sql dict/dict.sql cat $^ > $@ -dict/dict.sql: dict/test_a.sql dict/test_b.sql dict/test_pitch_accent.sql dict/jmdict.sql +dict/dict.sql: $(DEFAULT_DICTS) cat $^ > $@ user/base.sql: user/reset.sql user/init.sql @@ -30,11 +36,11 @@ user/full.sql: user/base.sql user/root.sql cat $^ > $@ %.sql: %.dict.sql $(DICT_TEMPLATE) - m4 $(DICT_TEMPLATE) < $< > $@ + echo "" | cat - $< | m4 -P $(DICT_TEMPLATE) > $@ # delete generated sql files and database clean: - $(RM) $(DICT_DB) $(USER_DB) dict/base.sql dict/full.sql dict/dict.sql dict/test_a.sql dict/test_b.sql dict/test_pitch_accent.sql user/base.sql user/full.sql + $(RM) $(DICT_DB) $(USER_DB) dict/base.sql dict/full.sql dict/dict.sql $(DEFAULT_DICTS) user/base.sql user/full.sql test: $(DICT_DB) find.sql ./test/find '浮上しました' diff --git a/import/jmdict/.gitignore b/import/jmdict/.gitignore new file mode 100644 index 0000000..d4d466d --- /dev/null +++ b/import/jmdict/.gitignore @@ -0,0 +1,4 @@ +jmdict.dict.sql +jmdict.sql +jmdict.json +jmdict.zip diff --git a/import/jmdict/jmdict.ts b/import/jmdict/jmdict.ts new file mode 100644 index 0000000..1f391e5 --- /dev/null +++ b/import/jmdict/jmdict.ts @@ -0,0 +1,64 @@ +import type { JMdict } from "npm:@scriptin/jmdict-simplified-types"; + +// this script is very messy right now, and doesn't transfer all information +// present in the dictionary. +// +// proceed with caution + +const LANG = "eng"; + +// no simple way to do this on non-unix using Deno.stdin +const input = await Deno.readFile("/dev/stdin"); + +const jmdict = JSON.parse(new TextDecoder().decode(input)) as JMdict; + +// TODO: more tags +const tagLookup = { + ["misc/uk"]: "aux:uk", + ["class/adv"]: "class:adverb", + ["class/vs"]: "class:verb:suru", + ["class/v1"]: "class:verb:ru", + ["class/v5"]: "class:verb:u", + ["class/n"]: "class:noun", + ["class/suf"]: "class:suffix", + ["class/prt"]: "class:part", + ["class/exp"]: "class:expr", +} as { [map: string]: string }; + +console.log(`.param set :dict 'jmdict_${LANG}'`); +console.log(".param set :lang 'en'"); + +// TODO: separate term and glossary tags +console.log("insert into ingest(expression, reading, term_tags, glossary_sort, glossary) values"); + +// var max = -100; +for (let i = 0; i < jmdict.words.length; i++) { + // max++; + // if (max < 0) continue; + // if (max > 400) break; + let term = jmdict.words[i]; + let last = i == jmdict.words.length - 1; + + // TODO: properly resolve appliesToKanji/appliesToKana + var definitions = term.sense + .filter(s => s.gloss[0].lang == LANG) + .map(s => s.gloss.map(g => g.text).join(", ")); + if (definitions.length == 0) continue; + var reading = term.kana[0].text; + if (term.kanji.length == 0) term.kanji = term.kana; + var writing = term.kanji[0].text; + var other_writings = term.kanji.filter(e => e.text != writing).map(e => e.text); + var tags = [... new Set([ + ...term.sense.map(s => s.field).reduce((acc, current) => [...acc, ...current], []).map(i => `field/${i}`), + ...term.sense.map(s => s.dialect).reduce((acc, current) => [...acc, ...current], []).map(i => `dialect/${i}`), + ...term.sense.map(s => s.misc).reduce((acc, current) => [...acc, ...current], []).map(i => `misc/${i}`), + ...term.sense.map(s => s.partOfSpeech).reduce((acc, current) => [...acc, ...current], []).map(i => `class/${i}`), + ])]; + var tags = tags.filter(i => i in tagLookup).map(i => tagLookup[i]); + for (let j = 0; j < definitions.length; j++) { + var out = `\t('${writing}', '${reading}', '${tags.join(" ")}', ${j+1}, '${definitions[j].replaceAll("'", "''")}')${(last && j == definitions.length-1) ? ';' : ','}`; + if (j == 0 && other_writings.length > 0) out += ` -- TODO: alts: ${other_writings.join(", ")}`; + console.log(out); + } +} + diff --git a/import/jmdict/makefile b/import/jmdict/makefile new file mode 100644 index 0000000..d19c9af --- /dev/null +++ b/import/jmdict/makefile @@ -0,0 +1,13 @@ +CURL = curl +UNZIP = unzip + +all: jmdict.dict.sql + +jmdict.zip: + $(CURL) -Ls 'https://github.com/scriptin/jmdict-simplified/releases/download/3.5.0%2B20230619121907/jmdict-all-3.5.0+20230619121907.json.zip' > $@ + +jmdict.json: jmdict.zip + $(UNZIP) -p $< > $@ + +jmdict.dict.sql: jmdict.json jmdict.ts + deno run -A --unstable ./jmdict.ts < $< > $@ diff --git a/import/readme.md b/import/readme.md new file mode 100644 index 0000000..48adf4b --- /dev/null +++ b/import/readme.md @@ -0,0 +1,17 @@ +# Dictionary imports + +This folder contains import scripts for various dictionaries. Publicly +available dictionaries can automatically be downloaded, others have to be +user-provided. **None of the dictionary files will be hosted in this +repository. All dictionaries keep their original license.** + +To generate a Yomikun dictionary, run `make` in one of these subdirectories. +This command will output one or more .dict.sql files, which can be imported +manually, or included in the default dictionary database file by editing [the +dictionary makefile](../db/makefile). + +## JMdict + +[License](https://www.edrdg.org/edrdg/licence.html). + + diff --git a/import/util.ts b/import/util.ts new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/import/util.ts diff --git a/language/tags.ts b/language/tags.ts index 4c1f134..d56ce98 100644 --- a/language/tags.ts +++ b/language/tags.ts @@ -5,7 +5,7 @@ export const Tag = { /** @constant verb subgroup */ Verb: { /** @constant any verb (fallback for vague dictionaries) */ - Unspecified: "class:verb", + Unspecified: "class:verb", // TODO: deprecate this property and implement verb classifier in ../import/util.ts /** @constant noun that can be conjugated into a verb by adding する */ Suru: "class:verb:suru", /** @@ -100,3 +100,17 @@ export type TokenTag = string; // no way around it export type TokenTags = Set<TokenTag>; +/** @summary parse concatenated tag string to TokenTags */ +export function parseTags(input: string) { + var tags = input.replaceAll(/ +/g, " ").trim().split(" ") as TokenTag[]; + var filteredTags: TokenTag[] = []; + for (var tag of tags) { + // skip past tense tags after -te and -tari deinflection + if (tag == Tag.Inflection.Tense.Past && + filteredTags.anyOf([Tag.Inflection.Suffix.Te, Tag.Inflection.Suffix.Tari])) continue; + + filteredTags.push(tag); + } + return new Set(filteredTags) as TokenTags; +} + diff --git a/util/string.ts b/util/string.ts index d94f5a3..e0cc5eb 100644 --- a/util/string.ts +++ b/util/string.ts @@ -1,4 +1,4 @@ -import { TokenTags, TokenTag, Tag } from "../language/tags.ts"; +import { TokenTags, parseTags } from "../language/tags.ts"; import JapaneseString from "../language/japanese.ts"; declare global { @@ -60,15 +60,6 @@ String.prototype.jp = function() { /** @summary parse concatenated tag string to TokenTags */ String.prototype.parseTags = function() { - var tags = this.replaceAll(/ +/g, " ").trim().split(" ") as TokenTag[]; - var filteredTags: TokenTag[] = []; - for (var tag of tags) { - // skip past tense tags after -te and -tari deinflection - if (tag == Tag.Inflection.Tense.Past && - filteredTags.anyOf([Tag.Inflection.Suffix.Te, Tag.Inflection.Suffix.Tari])) continue; - - filteredTags.push(tag); - } - return new Set(filteredTags) as TokenTags; + return parseTags(this as string); } |