diff options
author | lonkaars <loek@pipeframe.xyz> | 2023-06-29 11:33:23 +0200 |
---|---|---|
committer | lonkaars <loek@pipeframe.xyz> | 2023-06-29 11:33:23 +0200 |
commit | c998e1c0477d51c886f9e4246e102dec4d7ef8dd (patch) | |
tree | 4d979c57f16b138ff4b2ce5fb3151ce241af6881 /import/jmdict | |
parent | 67dbb6421976254658c5e38045513129dd18187a (diff) |
add jmdict importer to repo
Diffstat (limited to 'import/jmdict')
-rw-r--r-- | import/jmdict/.gitignore | 4 | ||||
-rw-r--r-- | import/jmdict/jmdict.ts | 64 | ||||
-rw-r--r-- | import/jmdict/makefile | 13 |
3 files changed, 81 insertions, 0 deletions
diff --git a/import/jmdict/.gitignore b/import/jmdict/.gitignore
new file mode 100644
index 0000000..d4d466d
--- /dev/null
+++ b/import/jmdict/.gitignore
@@ -0,0 +1,4 @@
+jmdict.dict.sql
+jmdict.sql
+jmdict.json
+jmdict.zip
diff --git a/import/jmdict/jmdict.ts b/import/jmdict/jmdict.ts
new file mode 100644
index 0000000..1f391e5
--- /dev/null
+++ b/import/jmdict/jmdict.ts
@@ -0,0 +1,64 @@
+import type { JMdict } from "npm:@scriptin/jmdict-simplified-types";
+
+// Reads a JMdict JSON dump from stdin and prints `.param` dot-commands plus one
+// `insert into ingest(...)` statement to stdout. Still messy; it doesn't transfer
+// all information present in the dictionary, so proceed with caution.
+
+const LANG = "eng";
+
+// no simple way to do this on non-unix using Deno.stdin
+const input = await Deno.readFile("/dev/stdin");
+const jmdict = JSON.parse(new TextDecoder().decode(input)) as JMdict;
+
+// TODO: more tags (NOTE(review): confirm "v5" occurs verbatim in the input data)
+const tagLookup: Record<string, string> = {
+	"misc/uk": "aux:uk",
+	"class/adv": "class:adverb",
+	"class/vs": "class:verb:suru",
+	"class/v1": "class:verb:ru",
+	"class/v5": "class:verb:u",
+	"class/n": "class:noun",
+	"class/suf": "class:suffix",
+	"class/prt": "class:part",
+	"class/exp": "class:expr",
+};
+
+// Quotes text as a SQL string literal, doubling embedded single quotes.
+const quote = (text: string) => `'${text.replaceAll("'", "''")}'`;
+
+// Rows are buffered so that the statement terminator always lands on the row
+// that is really printed last, even when trailing words are skipped.
+const rows: { values: string; note: string }[] = [];
+
+for (const term of jmdict.words) {
+	// TODO: properly resolve appliesToKanji/appliesToKana
+	const definitions = term.sense
+		.filter((s) => s.gloss[0]?.lang === LANG) // senses without glosses are skipped
+		.map((s) => s.gloss.map((g) => g.text).join(", "));
+	if (definitions.length === 0) continue;
+
+	const reading = term.kana[0].text;
+	// kana-only words are written as their readings (the parsed input is not modified)
+	const writings = (term.kanji.length > 0 ? term.kanji : term.kana).map((e) => e.text);
+	const writing = writings[0];
+	const otherWritings = writings.filter((text) => text !== writing);
+
+	// unique source tags in field, dialect, misc, class order; unknown ones are dropped
+	const tags = [...new Set([
+		...term.sense.flatMap((s) => s.field).map((t) => `field/${t}`),
+		...term.sense.flatMap((s) => s.dialect).map((t) => `dialect/${t}`),
+		...term.sense.flatMap((s) => s.misc).map((t) => `misc/${t}`),
+		...term.sense.flatMap((s) => s.partOfSpeech).map((t) => `class/${t}`),
+	])].filter((t) => t in tagLookup).map((t) => tagLookup[t]);
+
+	definitions.forEach((definition, j) => rows.push({
+		values: `\t(${quote(writing)}, ${quote(reading)}, ${quote(tags.join(" "))}, ${j + 1}, ${quote(definition)})`,
+		note: j === 0 && otherWritings.length > 0 ? ` -- TODO: alts: ${otherWritings.join(", ")}` : "",
+	}));
+}
+
+console.log(`.param set :dict 'jmdict_${LANG}'`);
+console.log(".param set :lang 'en'");
+// TODO: separate term and glossary tags
+if (rows.length > 0) console.log("insert into ingest(expression, reading, term_tags, glossary_sort, glossary) values");
+rows.forEach((row, n) => console.log(row.values + (n === rows.length - 1 ? ";" : ",") + row.note));
diff --git a/import/jmdict/makefile b/import/jmdict/makefile
new file mode 100644
index 0000000..d19c9af
--- /dev/null
+++ b/import/jmdict/makefile
@@ -0,0 +1,13 @@
+CURL = curl
+UNZIP = unzip
+
+all: jmdict.dict.sql
+
+jmdict.zip:
+	$(CURL) -Ls 'https://github.com/scriptin/jmdict-simplified/releases/download/3.5.0%2B20230619121907/jmdict-all-3.5.0+20230619121907.json.zip' > $@
+
+jmdict.json: jmdict.zip
+	$(UNZIP) -p $< > $@
+
+jmdict.dict.sql: jmdict.json jmdict.ts
+	deno run -A --unstable ./jmdict.ts < $< > $@