From 5b0b8c82a8477cfe49a538f267805488daa7f5bd Mon Sep 17 00:00:00 2001 From: lonkaars Date: Mon, 3 Jul 2023 16:38:56 +0200 Subject: more correct sentence parsing --- import/jmdict/jmdict.ts | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) (limited to 'import') diff --git a/import/jmdict/jmdict.ts b/import/jmdict/jmdict.ts index 1f391e5..6109c9b 100644 --- a/import/jmdict/jmdict.ts +++ b/import/jmdict/jmdict.ts @@ -1,4 +1,5 @@ import type { JMdict } from "npm:@scriptin/jmdict-simplified-types"; +import { Tag } from "../../language/tags.ts"; // this script is very messy right now, and doesn't transfer all information // present in the dictionary. @@ -14,15 +15,31 @@ const jmdict = JSON.parse(new TextDecoder().decode(input)) as JMdict; // TODO: more tags const tagLookup = { - ["misc/uk"]: "aux:uk", - ["class/adv"]: "class:adverb", - ["class/vs"]: "class:verb:suru", - ["class/v1"]: "class:verb:ru", - ["class/v5"]: "class:verb:u", - ["class/n"]: "class:noun", - ["class/suf"]: "class:suffix", - ["class/prt"]: "class:part", - ["class/exp"]: "class:expr", + ["misc/uk"]: Tag.Auxiliary.UsuallyKana, + ["class/adv"]: Tag.Class.Adverb, + ["class/vs"]: Tag.Class.Verb.Suru, + ["class/v1"]: Tag.Class.Verb.Ru, + ["class/v5"]: Tag.Class.Verb.U, + ["class/v5k"]: Tag.Class.Verb.U, + ["class/v5uru"]: Tag.Class.Verb.U, + ["class/v5r-i"]: Tag.Class.Verb.U, + ["class/v5u-s"]: Tag.Class.Verb.U, + ["class/v5aru"]: Tag.Class.Verb.U, + ["class/v5b"]: Tag.Class.Verb.U, + ["class/v5g"]: Tag.Class.Verb.U, + ["class/v5n"]: Tag.Class.Verb.U, + ["class/v5m"]: Tag.Class.Verb.U, + ["class/v5r"]: Tag.Class.Verb.U, + ["class/v5t"]: Tag.Class.Verb.U, + ["class/v5s"]: Tag.Class.Verb.U, + ["class/v5u"]: Tag.Class.Verb.U, + ["class/vk"]: Tag.Class.Verb.Ru, // TODO: this is possibly risky? 
(should be kuru; kuru is irregular and only conjugates roughly like a ru verb) + ["class/n"]: Tag.Class.Noun, + ["class/suf"]: Tag.Class.Suffix, + ["class/prt"]: Tag.Class.Particle, + ["class/exp"]: Tag.Class.Expression, + ["class/adj-i"]: Tag.Class.Adjective.I, + ["class/adj-na"]: Tag.Class.Adjective.Na, } as { [map: string]: string }; console.log(`.param set :dict 'jmdict_${LANG}'`); @@ -55,6 +72,7 @@ for (let i = 0; i < jmdict.words.length; i++) { ...term.sense.map(s => s.partOfSpeech).reduce((acc, current) => [...acc, ...current], []).map(i => `class/${i}`), ])]; var tags = tags.filter(i => i in tagLookup).map(i => tagLookup[i]); + // if (writing == "来る") console.log(term); for (let j = 0; j < definitions.length; j++) { var out = `\t('${writing}', '${reading}', '${tags.join(" ")}', ${j+1}, '${definitions[j].replaceAll("'", "''")}')${(last && j == definitions.length-1) ? ';' : ','}`; if (j == 0 && other_writings.length > 0) out += ` -- TODO: alts: ${other_writings.join(", ")}`; -- cgit v1.2.3