1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
|
import type { JMdict } from "npm:@scriptin/jmdict-simplified-types";
import { Tag } from "../../search/tags.ts";
import "../../util/string.ts";
// this script is very messy right now, and doesn't transfer all information
// present in the dictionary.
//
// proceed with caution
//
// TODO: separate term and glossary tags
// TODO: dictionary normalization (numbers/half-width/長音符)
// TODO: use sql synthesis library instead of garbo format strings
// Language code of the glosses to extract; it is compared against each gloss's
// `lang` field and also embedded in the emitted dictionary name.
const LANG = "eng";

// Read the whole dictionary from standard input. The stdin stream is consumed
// directly rather than opening "/dev/stdin", so the script also works on
// platforms that have no such device file.
const input = new Uint8Array(
  await new Response(Deno.stdin.readable).arrayBuffer(),
);
const jmdict = JSON.parse(new TextDecoder().decode(input)) as JMdict;

// JSON.parse gives no shape guarantee; fail here with a readable message
// instead of an opaque TypeError once the words are iterated.
if (!Array.isArray(jmdict?.words)) {
  throw new Error(
    "stdin did not contain a jmdict-simplified JSON document (no `words` array)",
  );
}
// Translation table from "<category>/<JMdict tag>" strings to this project's
// own tags. Any JMdict tag without an entry here is dropped during conversion.
// TODO: more tags
const tagLookup = {
  // miscellaneous
  "misc/uk": Tag.Auxiliary.UsuallyKana,

  // adverbs
  "class/adv": Tag.Class.Adverb,

  // suru verbs
  "class/vs": Tag.Class.Verb.Suru,
  "class/vs-i": Tag.Class.Verb.Suru,

  // ru verbs
  "class/v1": Tag.Class.Verb.Ru,

  // u verbs: every v5* subclass collapses into the same tag
  "class/v5": Tag.Class.Verb.U,
  "class/v5k": Tag.Class.Verb.U,
  "class/v5k-s": Tag.Class.Verb.U,
  "class/v5uru": Tag.Class.Verb.U,
  "class/v5r-i": Tag.Class.Verb.U,
  "class/v5u-s": Tag.Class.Verb.U,
  "class/v5aru": Tag.Class.Verb.U,
  "class/v5b": Tag.Class.Verb.U,
  "class/v5g": Tag.Class.Verb.U,
  "class/v5n": Tag.Class.Verb.U,
  "class/v5m": Tag.Class.Verb.U,
  "class/v5r": Tag.Class.Verb.U,
  "class/v5t": Tag.Class.Verb.U,
  "class/v5s": Tag.Class.Verb.U,
  "class/v5u": Tag.Class.Verb.U,

  // TODO: this is possibly risky? (should be kuru, but kuru is a ru verb)
  "class/vk": Tag.Class.Verb.Ru,

  // remaining word classes
  "class/n": Tag.Class.Noun,
  "class/suf": Tag.Class.Suffix,
  "class/prt": Tag.Class.Particle,
  "class/exp": Tag.Class.Expression,
  "class/adj-i": Tag.Class.Adjective.I,
  "class/adj-na": Tag.Class.Adjective.Na,
} as Record<string, string>;
/**
 * Render a string as a single-quoted SQL literal, doubling any embedded
 * single quotes so the value cannot terminate the literal early.
 */
function sqlString(value: string): string {
  return `'${value.replaceAll("'", "''")}'`;
}

// Bind the :dict and :lang parameters (`.param set` is sqlite3 shell syntax).
console.log(`.param set :dict 'jmdict_${LANG}'`);
console.log(".param set :lang 'en'");

// One "\t(...)" value tuple per row. Rows are collected and joined at the end
// so the statement terminator never has to be patched into the text afterwards.
const altRows: string[] = [];
const ingestRows: string[] = [];

for (const term of jmdict.words) {
  // TODO: properly resolve appliesToKanji/appliesToKana
  // A sense is kept when its first gloss is in the target language; a sense
  // with no glosses at all is skipped rather than crashing the run.
  const definitions = term.sense
    .filter((s) => s.gloss[0]?.lang === LANG)
    .map((s) => s.gloss.map((g) => g.text).join(", "));
  if (definitions.length === 0) continue;

  const reading = term.kana[0].text;

  // Kana-only words use their kana forms as writings. This is computed into a
  // local list so the parsed dictionary entry itself is left unmodified.
  const writings = term.kanji.length > 0
    ? term.kanji.map((k) => k.text)
    : term.kana.map((k) => k.text);
  const writing = writings[0];
  const otherWritings = writings.filter((text) => text !== writing);

  // Gather the tags of every sense, grouped by category in a fixed order
  // (field, dialect, misc, part of speech).
  const rawTags = [
    ...term.sense.flatMap((s) => s.field).map((t) => `field/${t}`),
    ...term.sense.flatMap((s) => s.dialect).map((t) => `dialect/${t}`),
    ...term.sense.flatMap((s) => s.misc).map((t) => `misc/${t}`),
    ...term.sense.flatMap((s) => s.partOfSpeech).map((t) => `class/${t}`),
  ];
  // De-duplicate AFTER translating: several JMdict tags map to the same
  // internal tag (e.g. every v5* class), which would otherwise be repeated.
  const tags = [
    ...new Set(rawTags.filter((t) => t in tagLookup).map((t) => tagLookup[t])),
  ];

  // One ingest row per definition; glossary_sort is its 1-based position.
  definitions.forEach((definition, j) => {
    ingestRows.push(
      `\t(${sqlString(writing)}, ${sqlString(reading)}, ${sqlString(tags.join(" "))}, ${j + 1}, ${sqlString(definition)})`,
    );
  });

  // Every alternative writing points back at the primary writing/reading.
  for (const alt of otherWritings) {
    altRows.push(
      `\t(${sqlString(alt)}, ${sqlString(reading)}, ${sqlString(writing)}, ${sqlString(reading)})`,
    );
  }
}

// Emit each statement only when it has rows: an `insert ... values` with an
// empty value list is not valid SQL.
if (ingestRows.length > 0) {
  console.log(
    "insert into ingest(expression, reading, term_tags, glossary_sort, glossary) values\n" +
      ingestRows.join(",\n") + ";\n",
  );
}
if (altRows.length > 0) {
  console.log(
    "insert into alts(expression, reading, normal_expression, normal_reading) values\n" +
      altRows.join(",\n") + ";\n",
  );
}
|