about summary refs log tree commit diff
path: root/import
diff options
context:
space:
mode:
author lonkaars <loek@pipeframe.xyz> 2023-07-03 16:38:56 +0200
committer lonkaars <loek@pipeframe.xyz> 2023-07-03 16:38:56 +0200
commit 5b0b8c82a8477cfe49a538f267805488daa7f5bd (patch)
tree 2f8e45d3edb6d1848882b10596980e002053298e /import
parent dab9bee4b46aaa1241cdb6b565ddbe0f19137c5e (diff)
more correct sentence parsing
Diffstat (limited to 'import')
-rw-r--r-- import/jmdict/jmdict.ts 36
1 files changed, 27 insertions, 9 deletions
diff --git a/import/jmdict/jmdict.ts b/import/jmdict/jmdict.ts
index 1f391e5..6109c9b 100644
--- a/import/jmdict/jmdict.ts
+++ b/import/jmdict/jmdict.ts
@@ -1,4 +1,5 @@
import type { JMdict } from "npm:@scriptin/jmdict-simplified-types";
+import { Tag } from "../../language/tags.ts";
// this script is very messy right now, and doesn't transfer all information
// present in the dictionary.
@@ -14,15 +15,31 @@ const jmdict = JSON.parse(new TextDecoder().decode(input)) as JMdict;
// TODO: more tags
const tagLookup = {
- ["misc/uk"]: "aux:uk",
- ["class/adv"]: "class:adverb",
- ["class/vs"]: "class:verb:suru",
- ["class/v1"]: "class:verb:ru",
- ["class/v5"]: "class:verb:u",
- ["class/n"]: "class:noun",
- ["class/suf"]: "class:suffix",
- ["class/prt"]: "class:part",
- ["class/exp"]: "class:expr",
+ ["misc/uk"]: Tag.Auxiliary.UsuallyKana,
+ ["class/adv"]: Tag.Class.Adverb,
+ ["class/vs"]: Tag.Class.Verb.Suru,
+ ["class/v1"]: Tag.Class.Verb.Ru,
+ ["class/v5"]: Tag.Class.Verb.U,
+ ["class/v5k"]: Tag.Class.Verb.U,
+ ["class/v5uru"]: Tag.Class.Verb.U,
+ ["class/v5r-i"]: Tag.Class.Verb.U,
+ ["class/v5u-s"]: Tag.Class.Verb.U,
+ ["class/v5aru"]: Tag.Class.Verb.U,
+ ["class/v5b"]: Tag.Class.Verb.U,
+ ["class/v5g"]: Tag.Class.Verb.U,
+ ["class/v5n"]: Tag.Class.Verb.U,
+ ["class/v5m"]: Tag.Class.Verb.U,
+ ["class/v5r"]: Tag.Class.Verb.U,
+ ["class/v5t"]: Tag.Class.Verb.U,
+ ["class/v5s"]: Tag.Class.Verb.U,
+ ["class/v5u"]: Tag.Class.Verb.U,
+ ["class/vk"]: Tag.Class.Verb.Ru, // TODO: this is possibly risky: "vk" should map to a dedicated kuru class, but kuru is treated as a ru verb here (it is actually an irregular verb)
+ ["class/n"]: Tag.Class.Noun,
+ ["class/suf"]: Tag.Class.Suffix,
+ ["class/prt"]: Tag.Class.Particle,
+ ["class/exp"]: Tag.Class.Expression,
+ ["class/adj-i"]: Tag.Class.Adjective.I,
+ ["class/adj-na"]: Tag.Class.Adjective.Na,
} as { [map: string]: string };
console.log(`.param set :dict 'jmdict_${LANG}'`);
@@ -55,6 +72,7 @@ for (let i = 0; i < jmdict.words.length; i++) {
...term.sense.map(s => s.partOfSpeech).reduce((acc, current) => [...acc, ...current], []).map(i => `class/${i}`),
])];
var tags = tags.filter(i => i in tagLookup).map(i => tagLookup[i]);
+ // if (writing == "来る") console.log(term);
for (let j = 0; j < definitions.length; j++) {
var out = `\t('${writing}', '${reading}', '${tags.join(" ")}', ${j+1}, '${definitions[j].replaceAll("'", "''")}')${(last && j == definitions.length-1) ? ';' : ','}`;
if (j == 0 && other_writings.length > 0) out += ` -- TODO: alts: ${other_writings.join(", ")}`;