import/jmdict/jmdict.ts


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96

import type { JMdict } from "npm:@scriptin/jmdict-simplified-types";
import { Tag } from "../../search/tags.ts";
import "../../util/string.ts";

// this script is very messy right now, and doesn't transfer all information
// present in the dictionary.
//
// proceed with caution
//
// TODO: separate term and glossary tags
// TODO: dictionary normalization (numbers/half-width/長音符)
// TODO: use sql synthesis library instead of garbo format strings

// Language code of the glosses to import; also used in the emitted dictionary name.
const LANG = "eng";

// Drain stdin through the Deno.stdin stream instead of opening "/dev/stdin",
// which only exists on unix-like systems.
const input = new Uint8Array(await new Response(Deno.stdin.readable).arrayBuffer());

const jmdict = JSON.parse(new TextDecoder().decode(input)) as JMdict;

// JSON.parse output is unchecked: fail here with a clear message rather than
// with a TypeError later when the word list is iterated.
if (!Array.isArray(jmdict?.words)) {
	throw new Error("expected jmdict-simplified JSON with a top-level `words` array on stdin");
}

// TODO: more tags
// Translation table from "<category>/<code>" keys to internal search tags.
// Keys absent from this table have no internal equivalent yet.
const tagLookup = {
	// usage notes
	"misc/uk": Tag.Auxiliary.UsuallyKana,

	// non-verb word classes
	"class/n": Tag.Class.Noun,
	"class/adv": Tag.Class.Adverb,
	"class/suf": Tag.Class.Suffix,
	"class/prt": Tag.Class.Particle,
	"class/exp": Tag.Class.Expression,
	"class/adj-i": Tag.Class.Adjective.I,
	"class/adj-na": Tag.Class.Adjective.Na,

	// suru verbs
	"class/vs": Tag.Class.Verb.Suru,
	"class/vs-i": Tag.Class.Verb.Suru,

	// ru verbs
	"class/v1": Tag.Class.Verb.Ru,
	"class/vk": Tag.Class.Verb.Ru, // TODO: this is possibly risky? (should be kuru, but kuru is a ru verb)

	// every v5* code collapses to the single u-verb tag
	"class/v5": Tag.Class.Verb.U,
	"class/v5k": Tag.Class.Verb.U,
	"class/v5k-s": Tag.Class.Verb.U,
	"class/v5uru": Tag.Class.Verb.U,
	"class/v5r-i": Tag.Class.Verb.U,
	"class/v5u-s": Tag.Class.Verb.U,
	"class/v5aru": Tag.Class.Verb.U,
	"class/v5b": Tag.Class.Verb.U,
	"class/v5g": Tag.Class.Verb.U,
	"class/v5n": Tag.Class.Verb.U,
	"class/v5m": Tag.Class.Verb.U,
	"class/v5r": Tag.Class.Verb.U,
	"class/v5t": Tag.Class.Verb.U,
	"class/v5s": Tag.Class.Verb.U,
	"class/v5u": Tag.Class.Verb.U,
} as Record<string, string>;

// Bind the dictionary name and language as parameters for the statements below.
console.log(`.param set :dict 'jmdict_${LANG}'`);
console.log(".param set :lang 'en'");

/** Renders `value` as a single-quoted SQL string literal, doubling embedded quotes. */
function sqlString(value: string): string {
	return `'${value.replace(/'/g, "''")}'`;
}

/**
 * Joins pre-rendered row tuples into one multi-row insert statement.
 *
 * @param header the `insert into table(columns)` prefix, without `values`
 * @param rows   rendered `(…)` tuples, one per row
 * @returns the statement terminated by `;` and a newline, or an empty string
 *          when there are no rows (an insert without rows is not valid SQL)
 */
function insertStatement(header: string, rows: string[]): string {
	if (rows.length === 0) return "";
	return `${header} values\n${rows.map((row) => `\t${row}`).join(",\n")};\n`;
}

const ingestRows: string[] = [];
const altRows: string[] = [];

for (const term of jmdict.words) {
	// TODO: properly resolve appliesToKanji/appliesToKana
	// One definition per sense, built only from glosses in the target language;
	// senses without any such gloss (including empty ones) are skipped.
	const definitions = term.sense
		.map((s) => s.gloss.filter((g) => g.lang === LANG))
		.filter((glosses) => glosses.length > 0)
		.map((glosses) => glosses.map((g) => g.text).join(", "));
	// A term without a definition or without a reading cannot be ingested.
	if (definitions.length === 0 || term.kana.length === 0) continue;

	const reading = term.kana[0].text;
	// Kana-only terms use their kana forms as writings; the parsed entry is left unmodified.
	const writings = term.kanji.length > 0
		? term.kanji.map((k) => k.text)
		: term.kana.map((k) => k.text);
	const writing = writings[0];
	const otherWritings = writings.filter((w) => w !== writing);

	// Collect tags from every sense, keep those with an internal equivalent,
	// and de-duplicate after mapping, since several codes share one internal tag.
	const tags = [...new Set(
		[
			...term.sense.flatMap((s) => s.field.map((f) => `field/${f}`)),
			...term.sense.flatMap((s) => s.dialect.map((d) => `dialect/${d}`)),
			...term.sense.flatMap((s) => s.misc.map((m) => `misc/${m}`)),
			...term.sense.flatMap((s) => s.partOfSpeech.map((p) => `class/${p}`)),
		]
			.filter((key) => key in tagLookup)
			.map((key) => tagLookup[key]),
	)];
	const termTags = sqlString(tags.join(" "));

	// glossary_sort is the 1-based position of the definition within the term.
	definitions.forEach((definition, j) => {
		ingestRows.push(
			`(${sqlString(writing)}, ${sqlString(reading)}, ${termTags}, ${j + 1}, ${sqlString(definition)})`,
		);
	});
	// Every alternative writing is normalized to the primary writing and reading.
	for (const alt of otherWritings) {
		altRows.push(
			`(${sqlString(alt)}, ${sqlString(reading)}, ${sqlString(writing)}, ${sqlString(reading)})`,
		);
	}
}

console.log(insertStatement(
	"insert into ingest(expression, reading, term_tags, glossary_sort, glossary)",
	ingestRows,
));
console.log(insertStatement(
	"insert into alts(expression, reading, normal_expression, normal_reading)",
	altRows,
));