aboutsummaryrefslogtreecommitdiff
path: root/import/jmdict
diff options
context:
space:
mode:
author: lonkaars <loek@pipeframe.xyz> 2023-06-29 11:33:23 +0200
committer: lonkaars <loek@pipeframe.xyz> 2023-06-29 11:33:23 +0200
commit: c998e1c0477d51c886f9e4246e102dec4d7ef8dd (patch)
tree: 4d979c57f16b138ff4b2ce5fb3151ce241af6881 /import/jmdict
parent: 67dbb6421976254658c5e38045513129dd18187a (diff)
add jmdict importer to repo
Diffstat (limited to 'import/jmdict')
-rw-r--r-- import/jmdict/.gitignore 4
-rw-r--r-- import/jmdict/jmdict.ts 64
-rw-r--r-- import/jmdict/makefile 13
3 files changed, 81 insertions, 0 deletions
diff --git a/import/jmdict/.gitignore b/import/jmdict/.gitignore
new file mode 100644
index 0000000..d4d466d
--- /dev/null
+++ b/import/jmdict/.gitignore
@@ -0,0 +1,4 @@
+jmdict.dict.sql
+jmdict.sql
+jmdict.json
+jmdict.zip
diff --git a/import/jmdict/jmdict.ts b/import/jmdict/jmdict.ts
new file mode 100644
index 0000000..1f391e5
--- /dev/null
+++ b/import/jmdict/jmdict.ts
@@ -0,0 +1,64 @@
+import type { JMdict } from "npm:@scriptin/jmdict-simplified-types";
+
+// Converts a jmdict-simplified JSON dump (stdin) into SQL for the `ingest` table (stdout).
+// This script doesn't transfer all information present in the dictionary: only the
+// first kanji/kana form and the senses whose first gloss is in LANG are emitted.
+
+const LANG = "eng";
+
+// no simple way to do this on non-unix using Deno.stdin
+const input = await Deno.readFile("/dev/stdin");
+const jmdict: JMdict = JSON.parse(new TextDecoder().decode(input));
+
+// maps "<category>/<jmdict tag>" to the tag written to term_tags; TODO: more tags
+const tagLookup: Record<string, string> = {
+	"misc/uk": "aux:uk",
+	"class/adv": "class:adverb",
+	"class/vs": "class:verb:suru",
+	"class/v1": "class:verb:ru",
+	"class/v5": "class:verb:u",
+	"class/n": "class:noun",
+	"class/suf": "class:suffix",
+	"class/prt": "class:part",
+	"class/exp": "class:expr",
+};
+
+// renders text as a single-quoted SQL string literal, doubling embedded quotes
+const sqlString = (text: string) => `'${text.replaceAll("'", "''")}'`;
+
+console.log(`.param set :dict 'jmdict_${LANG}'`);
+console.log(".param set :lang 'en'");
+
+// each row is held back until the next row (or the end of input) is known, so the
+// last emitted row ends in ';' even when the words after it are skipped
+let pending: [row: string, note: string] | undefined;
+function emit(row: string, note: string) {
+	// TODO: separate term and glossary tags
+	if (pending) console.log(`${pending[0]},${pending[1]}`);
+	else console.log("insert into ingest(expression, reading, term_tags, glossary_sort, glossary) values");
+	pending = [row, note];
+}
+
+for (const term of jmdict.words) {
+	// TODO: properly resolve appliesToKanji/appliesToKana
+	const definitions = term.sense
+		.filter(s => s.gloss[0]?.lang === LANG)
+		.map(s => s.gloss.map(g => g.text).join(", "));
+	const reading = term.kana[0]?.text;
+	if (definitions.length === 0 || reading === undefined) continue;
+	// words without kanji forms are written with their kana forms
+	const forms = (term.kanji.length > 0 ? term.kanji : term.kana).map(e => e.text);
+	const alts = forms.filter(text => text !== forms[0]);
+	const tags = [...new Set([
+		...term.sense.flatMap(s => s.field).map(t => `field/${t}`),
+		...term.sense.flatMap(s => s.dialect).map(t => `dialect/${t}`),
+		...term.sense.flatMap(s => s.misc).map(t => `misc/${t}`),
+		...term.sense.flatMap(s => s.partOfSpeech).map(t => `class/${t}`),
+	])].map(t => tagLookup[t]).filter(t => t !== undefined).join(" ");
+	definitions.forEach((definition, j) => {
+		const row = `\t(${sqlString(forms[0])}, ${sqlString(reading)}, ${sqlString(tags)}, ${j+1}, ${sqlString(definition)})`;
+		emit(row, j === 0 && alts.length > 0 ? ` -- TODO: alts: ${alts.join(", ")}` : "");
+	});
+}
+if (pending) console.log(`${pending[0]};${pending[1]}`);
+
diff --git a/import/jmdict/makefile b/import/jmdict/makefile
new file mode 100644
index 0000000..d19c9af
--- /dev/null
+++ b/import/jmdict/makefile
@@ -0,0 +1,13 @@
+CURL = curl
+UNZIP = unzip
+
+# delete a target whose recipe fails, so a partial `> $@` file is never treated as up to date
+.DELETE_ON_ERROR:
+all: jmdict.dict.sql
+# -f makes curl fail on HTTP errors instead of saving the error page as the archive
+jmdict.zip:
+	$(CURL) -fLs 'https://github.com/scriptin/jmdict-simplified/releases/download/3.5.0%2B20230619121907/jmdict-all-3.5.0+20230619121907.json.zip' > $@
+jmdict.json: jmdict.zip
+	$(UNZIP) -p $< > $@
+jmdict.dict.sql: jmdict.json jmdict.ts
+	deno run -A --unstable ./jmdict.ts < $< > $@