import type { JMdict } from "npm:@scriptin/jmdict-simplified-types";
import { Tag } from "../../search/tags.ts";
import "../../util/string.ts";

// Converts a jmdict-simplified JSON dump (read from stdin) into SQL text
// (written to stdout): two `.param set` lines followed by bulk inserts into
// the `ingest` and `alts` tables.
//
// This script is very messy right now, and doesn't transfer all information
// present in the dictionary.
//
// Proceed with caution.
//
// TODO: separate term and glossary tags
// TODO: dictionary normalization (numbers/half-width/長音符)
// TODO: use sql synthesis library instead of garbo format strings

const LANG = "eng";

// no simple way to do this on non-unix using Deno.stdin
const input = await Deno.readFile("/dev/stdin");
// NOTE(review): the parsed JSON is trusted to match the JMdict shape; it is
// not validated at runtime.
const jmdict = JSON.parse(new TextDecoder().decode(input)) as JMdict;

type Word = JMdict["words"][number];

// Maps "<category>/<jmdict code>" keys to the internal tag values.
// TODO: more tags
const tagLookup: Record<string, string> = {
  "misc/uk": Tag.Auxiliary.UsuallyKana,
  "class/adv": Tag.Class.Adverb,
  "class/vs": Tag.Class.Verb.Suru,
  "class/vs-i": Tag.Class.Verb.Suru,
  "class/v1": Tag.Class.Verb.Ru,
  "class/v5": Tag.Class.Verb.U,
  "class/v5k": Tag.Class.Verb.U,
  "class/v5k-s": Tag.Class.Verb.U,
  "class/v5uru": Tag.Class.Verb.U,
  "class/v5r-i": Tag.Class.Verb.U,
  "class/v5u-s": Tag.Class.Verb.U,
  "class/v5aru": Tag.Class.Verb.U,
  "class/v5b": Tag.Class.Verb.U,
  "class/v5g": Tag.Class.Verb.U,
  "class/v5n": Tag.Class.Verb.U,
  "class/v5m": Tag.Class.Verb.U,
  "class/v5r": Tag.Class.Verb.U,
  "class/v5t": Tag.Class.Verb.U,
  "class/v5s": Tag.Class.Verb.U,
  "class/v5u": Tag.Class.Verb.U,
  // TODO: this is possibly risky? (should be kuru, but kuru is a ru verb)
  "class/vk": Tag.Class.Verb.Ru,
  "class/n": Tag.Class.Noun,
  "class/suf": Tag.Class.Suffix,
  "class/prt": Tag.Class.Particle,
  "class/exp": Tag.Class.Expression,
  "class/adj-i": Tag.Class.Adjective.I,
  "class/adj-na": Tag.Class.Adjective.Na,
};

/** Quotes `value` as a SQL string literal, doubling embedded single quotes. */
function sqlQuote(value: string): string {
  return `'${value.replaceAll("'", "''")}'`;
}

/**
 * Gathers the field, dialect, misc and part-of-speech codes of every sense of
 * `term` and translates them through `tagLookup`.
 *
 * Codes without a `tagLookup` entry are dropped. Deduplication happens after
 * the translation because several JMdict codes map to the same tag; the order
 * of first occurrence is preserved.
 */
function collectTags(term: Word): string[] {
  const rawKeys = [
    ...term.sense.flatMap((s) => s.field).map((code) => `field/${code}`),
    ...term.sense.flatMap((s) => s.dialect).map((code) => `dialect/${code}`),
    ...term.sense.flatMap((s) => s.misc).map((code) => `misc/${code}`),
    ...term.sense.flatMap((s) => s.partOfSpeech).map((code) => `class/${code}`),
  ];

  const tags = new Set<string>();
  for (const key of rawKeys) {
    const tag = tagLookup[key];
    if (tag !== undefined) tags.add(tag);
  }
  return [...tags];
}

/**
 * Prints one multi-row insert statement: `header`, the rows separated by
 * ",\n", and a terminating ";".
 *
 * Prints nothing when `rows` is empty, since an insert without any value
 * tuples is not valid SQL.
 */
function printInsert(header: string, rows: readonly string[]): void {
  if (rows.length === 0) return;
  console.log(`${header}${rows.join(",\n")};\n`);
}

console.log(`.param set :dict 'jmdict_${LANG}'`);
console.log(".param set :lang 'en'");

const altRows: string[] = [];
const ingestRows: string[] = [];

// Debugging aid: uncomment to only process a window of entries.
// var max = -100;
for (const term of jmdict.words) {
  // max++;
  // if (max < 0) continue;
  // if (max > 400) break;

  // TODO: properly resolve appliesToKanji/appliesToKana
  // A sense is kept when its first gloss is in LANG; all of its gloss texts
  // are then joined into a single definition string.
  const definitions = term.sense
    .filter((s) => s.gloss[0]?.lang === LANG)
    .map((s) => s.gloss.map((g) => g.text).join(", "));
  if (definitions.length === 0) continue;

  // Skip (rather than crash on) an entry that has no kana reading at all.
  const reading = term.kana[0]?.text;
  if (reading === undefined) continue;

  // Kana-only entries use their kana forms as the written forms. This is done
  // on local copies so the parsed dictionary itself is left untouched.
  const kanjiTexts = term.kanji.map((k) => k.text);
  const writings = kanjiTexts.length > 0
    ? kanjiTexts
    : term.kana.map((k) => k.text);
  const writing = writings[0];
  const otherWritings = writings.filter((text) => text !== writing);

  const tags = collectTags(term);

  // if (writing == "来る") console.log(term);

  definitions.forEach((definition, j) => {
    // glossary_sort is the 1-based position of the definition within the term.
    ingestRows.push(
      `\t(${sqlQuote(writing)}, ${sqlQuote(reading)}, ${sqlQuote(tags.join(" "))}, ${j + 1}, ${sqlQuote(definition)})`,
    );
  });

  // Every other written form is recorded as an alternative of the first one.
  for (const alt of otherWritings) {
    altRows.push(
      `\t(${sqlQuote(alt)}, ${sqlQuote(reading)}, ${sqlQuote(writing)}, ${sqlQuote(reading)})`,
    );
  }
}

printInsert(
  "insert into ingest(expression, reading, term_tags, glossary_sort, glossary) values\n",
  ingestRows,
);
printInsert(
  "insert into alts(expression, reading, normal_expression, normal_reading) values\n",
  altRows,
);