diff options
author | lonkaars <loek@pipeframe.xyz> | 2023-07-03 16:38:56 +0200 |
---|---|---|
committer | lonkaars <loek@pipeframe.xyz> | 2023-07-03 16:38:56 +0200 |
commit | 5b0b8c82a8477cfe49a538f267805488daa7f5bd (patch) | |
tree | 2f8e45d3edb6d1848882b10596980e002053298e | |
parent | dab9bee4b46aaa1241cdb6b565ddbe0f19137c5e (diff) |
more correct sentence parsing
-rw-r--r-- | db/dict/deinflections.sql | 23 | ||||
-rw-r--r-- | examples/furigana-html.ts | 11 | ||||
-rw-r--r-- | import/jmdict/jmdict.ts | 36 | ||||
-rw-r--r-- | language/parser.ts | 14 | ||||
-rw-r--r-- | language/tags.ts | 17 | ||||
-rw-r--r-- | main.ts | 36 |
6 files changed, 86 insertions, 51 deletions
diff --git a/db/dict/deinflections.sql b/db/dict/deinflections.sql index d13f313..ff177e2 100644 --- a/db/dict/deinflections.sql +++ b/db/dict/deinflections.sql @@ -116,21 +116,26 @@ insert into deinflection_temp values ('infl:suffix:tari', 'きたり', 'きた', 'a', 'k'), ('infl:suffix:tari', '来たり', '来た', 'a', 'k'), + -- -sa (adjective->noun) <https://guidetojapanese.org/learn/grammar/amount> + ('infl:suffix:sa class:noun', 'さ', '', 'a', 'na'), + ('infl:suffix:sa class:noun', 'さ', 'い', 'a', 'i'), + -- auxiliary rules ('class:verb:suru-included', 'する', '', 's', ''); -- deconjugate suru verbs into stem -- rule/bitmask lookup table create temporary table rule_map (tag, name, mask); insert into rule_map values - (null, 'a', -1 ), -- all (allow all rules in) - (null, '', 0 ), -- (nothing) - ('infl:reason:ru', 'ru', 1 << 0), -- 一段活用 (ichidan a.k.a. ru-verbs in tae kim's japanese grammar guide) - ('infl:reason:u', 'u', 1 << 1), -- 五段活用 (godan a.k.a. u-verbs in tae kim's japanese grammar guide) - ('infl:reason:suru', 's', 1 << 2), -- する (suru) - ('infl:reason:kuru', 'k', 1 << 3), -- くる (kuru) - (null, 'z', 1 << 4), -- ずる (zuru) - ('infl:reason:adj-i', 'i', 1 << 5), -- 形容詞 (i-adjective) - (null, 'iru', 1 << 6); -- 〜いる (temporary iru for progressive tense) + (null, 'a', -1 ), -- all (allow all rules in) + (null, '', 0 ), -- (nothing) + ('infl:reason:ru', 'ru', 1 << 0), -- 一段活用 (ichidan a.k.a. ru-verbs in tae kim's japanese grammar guide) + ('infl:reason:u', 'u', 1 << 1), -- 五段活用 (godan a.k.a. 
u-verbs in tae kim's japanese grammar guide) + ('infl:reason:suru', 's', 1 << 2), -- する (suru) + ('infl:reason:kuru', 'k', 1 << 3), -- くる (kuru) + (null, 'z', 1 << 4), -- ずる (zuru) + ('infl:reason:adj:i', 'i', 1 << 5), -- 形容詞 (i-adjective) + (null, 'iru', 1 << 6), -- 〜いる (temporary iru for progressive tense) + ('infl:reason:adj:na', 'na', 1 << 7); -- 形容動詞 (na-adjective) -- add tags to db insert into deinflection_rules (mask, tag) diff --git a/examples/furigana-html.ts b/examples/furigana-html.ts index 6a0e801..f0ff067 100644 --- a/examples/furigana-html.ts +++ b/examples/furigana-html.ts @@ -14,17 +14,6 @@ var sentence = await api.sentence("日本に来て一番驚いたことは自動 // Copy the sentence verbatim but add furigana to each word's kanji var furigana = sentence.furigana("HTML"); -// TODO: sentence is not copied verbatim, words are replaced by their kanji if they matched by kana only console.log(furigana); -// this sentence works :tada: -// console.log((await api.sentence("浮上したハイラル城の下にてゼルダ様達の捜索を行うこととなった")).furigana("HTML")); - -var test = "日本に来て一番驚いたことは自動販売機の多さだ。"; -console.log(test); -console.log((await api.sentence(test)).furigana("parenthesis")); - -test = "にほんに来て一番驚いたことは自動販売機の多さだ。"; -console.log(test); -console.log((await api.sentence(test)).furigana("parenthesis")); diff --git a/import/jmdict/jmdict.ts b/import/jmdict/jmdict.ts index 1f391e5..6109c9b 100644 --- a/import/jmdict/jmdict.ts +++ b/import/jmdict/jmdict.ts @@ -1,4 +1,5 @@ import type { JMdict } from "npm:@scriptin/jmdict-simplified-types"; +import { Tag } from "../../language/tags.ts"; // this script is very messy right now, and doesn't transfer all information // present in the dictionary. 
@@ -14,15 +15,31 @@ const jmdict = JSON.parse(new TextDecoder().decode(input)) as JMdict; // TODO: more tags const tagLookup = { - ["misc/uk"]: "aux:uk", - ["class/adv"]: "class:adverb", - ["class/vs"]: "class:verb:suru", - ["class/v1"]: "class:verb:ru", - ["class/v5"]: "class:verb:u", - ["class/n"]: "class:noun", - ["class/suf"]: "class:suffix", - ["class/prt"]: "class:part", - ["class/exp"]: "class:expr", + ["misc/uk"]: Tag.Auxiliary.UsuallyKana, + ["class/adv"]: Tag.Class.Adverb, + ["class/vs"]: Tag.Class.Verb.Suru, + ["class/v1"]: Tag.Class.Verb.Ru, + ["class/v5"]: Tag.Class.Verb.U, + ["class/v5k"]: Tag.Class.Verb.U, + ["class/v5uru"]: Tag.Class.Verb.U, + ["class/v5r-i"]: Tag.Class.Verb.U, + ["class/v5u-s"]: Tag.Class.Verb.U, + ["class/v5aru"]: Tag.Class.Verb.U, + ["class/v5b"]: Tag.Class.Verb.U, + ["class/v5g"]: Tag.Class.Verb.U, + ["class/v5n"]: Tag.Class.Verb.U, + ["class/v5m"]: Tag.Class.Verb.U, + ["class/v5r"]: Tag.Class.Verb.U, + ["class/v5t"]: Tag.Class.Verb.U, + ["class/v5s"]: Tag.Class.Verb.U, + ["class/v5u"]: Tag.Class.Verb.U, + ["class/vk"]: Tag.Class.Verb.Ru, // TODO: this is possibly risky? (should be kuru, but kuru is a ru verb) + ["class/n"]: Tag.Class.Noun, + ["class/suf"]: Tag.Class.Suffix, + ["class/prt"]: Tag.Class.Particle, + ["class/exp"]: Tag.Class.Expression, + ["class/adj-i"]: Tag.Class.Adjective.I, + ["class/adj-na"]: Tag.Class.Adjective.Na, } as { [map: string]: string }; console.log(`.param set :dict 'jmdict_${LANG}'`); @@ -55,6 +72,7 @@ for (let i = 0; i < jmdict.words.length; i++) { ...term.sense.map(s => s.partOfSpeech).reduce((acc, current) => [...acc, ...current], []).map(i => `class/${i}`), ])]; var tags = tags.filter(i => i in tagLookup).map(i => tagLookup[i]); + // if (writing == "来る") console.log(term); for (let j = 0; j < definitions.length; j++) { var out = `\t('${writing}', '${reading}', '${tags.join(" ")}', ${j+1}, '${definitions[j].replaceAll("'", "''")}')${(last && j == definitions.length-1) ? 
';' : ','}`; if (j == 0 && other_writings.length > 0) out += ` -- TODO: alts: ${other_writings.join(", ")}`; diff --git a/language/parser.ts b/language/parser.ts index bb4ac1e..40bdd81 100644 --- a/language/parser.ts +++ b/language/parser.ts @@ -5,6 +5,11 @@ import "../util/array.ts"; import "../util/set.ts"; import { DeepPartial } from "../util/types.ts"; +const CONJUGABLE_TAGS = [ + ...Object.values(Tag.Class.Verb), + ...Object.values(Tag.Class.Adjective), +]; + // TODO: rename Parser to Search /** @summary main Parser class */ export default class Parser { @@ -20,6 +25,7 @@ export default class Parser { }); } + // Search.sentence() async parse(sentence: string, optional?: DeepPartial<InputSentenceProps>): Promise<ParseResult> { await this.ready; @@ -64,8 +70,8 @@ export default class Parser { // deconjugated words if (result.depth > 0) { - // can't be conjugated at all - if (!result.tags.anyOf(Object.values(Tag.Class.Verb))) return false; + // check if this word can be conjugated at all + if (!result.tags.anyOf(CONJUGABLE_TAGS)) return false; // ignore other wrong deconjugations if (result.tags.includes(Tag.Class.Verb.U) && @@ -74,6 +80,10 @@ export default class Parser { !result.tags.includes(Tag.Inflection.Reason.Ru)) return false; if (result.tags.includes(Tag.Class.Verb.Suru) && !result.tags.includes(Tag.Inflection.Reason.Suru)) return false; + if (result.tags.includes(Tag.Class.Adjective.I) && + !result.tags.includes(Tag.Inflection.Reason.Adjective.I)) return false; + if (result.tags.includes(Tag.Class.Adjective.Na) && + !result.tags.includes(Tag.Inflection.Reason.Adjective.Na)) return false; } // all other results should be valid grammatically diff --git a/language/tags.ts b/language/tags.ts index d40904f..3065c77 100644 --- a/language/tags.ts +++ b/language/tags.ts @@ -6,8 +6,6 @@ export const Tag = { Class: { /** @constant verb subgroup */ Verb: { - /** @constant any verb (fallback for vague dictionaries) */ - Unspecified: "class:verb", // TODO: deprecate 
this property and implement verb classifier in ../import/util.ts /** @constant noun that can be conjugated into a verb by adding する */ Suru: "class:verb:suru", /** @@ -21,6 +19,14 @@ export const Tag = { U: "class:verb:u", /** @constant ichidan verbs (〜る in [taekim]) */ Ru: "class:verb:ru", + /** @constant kuru (来る) */ + Kuru: "class:verb:kuru", + }, + Adjective: { + /** @constant adjectives that end in 〜い */ + I: "class:adj:i", + /** @constant adjectives that need to be conjugated using な */ + Na: "class:adj:na", }, /** @constant regular nouns or words that can be treated as nouns */ Noun: "class:noun", @@ -36,6 +42,8 @@ export const Tag = { * @see ./readme.md#behavior-altering-tags */ Expression: "class:expr", + /** @constant adverbs (e.g. 早く) */ + Adverb: "class:adverb", }, /** @constant types of names */ Name: { @@ -88,7 +96,10 @@ export const Tag = { /** @constant applied if word was deconjugated as kuru verb */ Kuru: "infl:reason:kuru", /** @constant applied if word was deconjugated as i-adjective */ - AdjI: "infl:reason:adj-i", + Adjective: { + I: "infl:reason:adj:i", + Na: "infl:reason:adj:na", + }, }, }, /** @constant uncategorized tags */ diff --git a/main.ts b/main.ts --- a/main.ts +++ b/main.ts @@ -6,9 +6,9 @@ function prettyprintParseResult(input: ParseResult) { out += token.term_id; out += ": "; - out += token.reading.map(r => r.text).reduce((a, b) => a + b); + out += token.writing; out += " ("; - out += token.reading.map(r => r.ruby ? 
r.ruby : r.text).reduce((a, b) => a + b); + out += token.reading; out += ") "; out += token.tags.map(a => `[${a}]`).join(" "); @@ -30,6 +30,8 @@ async function coreTest(core: Core) { prettyprintParseResult(await core.parseSentence("浮上した城の様")); console.log("-------------"); prettyprintParseResult(await core.parseSentence("迷子になってしまった")); + console.log("-------------"); + prettyprintParseResult(await core.parseSentence("日本に来て一番驚いたことは自動販売機の多さだ。")); } // test 1 (direct core) @@ -41,18 +43,18 @@ await (async () => { await coreTest(core); })(); -console.log("\n".repeat(2)); - -// test 2 (remote core) -await (async () => { - // default host = localhost:9400 - new RemoteCoreServer().start(); - - var core = new RemoteCoreClient(); - await core.ready; - - console.log("Prepare remote core done"); - await coreTest(core); - - Deno.exit(0); -})(); +// console.log("\n".repeat(2)); +// +// // test 2 (remote core) +// await (async () => { +// // default host = localhost:9400 +// new RemoteCoreServer().start(); +// +// var core = new RemoteCoreClient(); +// await core.ready; +// +// console.log("Prepare remote core done"); +// await coreTest(core); +// +// Deno.exit(0); +// })(); |