about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author lonkaars <loek@pipeframe.xyz> 2023-07-03 16:38:56 +0200
committer lonkaars <loek@pipeframe.xyz> 2023-07-03 16:38:56 +0200
commit 5b0b8c82a8477cfe49a538f267805488daa7f5bd (patch)
tree 2f8e45d3edb6d1848882b10596980e002053298e
parent dab9bee4b46aaa1241cdb6b565ddbe0f19137c5e (diff)
more correct sentence parsing
-rw-r--r-- db/dict/deinflections.sql 23
-rw-r--r-- examples/furigana-html.ts 11
-rw-r--r-- import/jmdict/jmdict.ts 36
-rw-r--r-- language/parser.ts 14
-rw-r--r-- language/tags.ts 17
-rw-r--r-- main.ts 36
6 files changed, 86 insertions, 51 deletions
diff --git a/db/dict/deinflections.sql b/db/dict/deinflections.sql
index d13f313..ff177e2 100644
--- a/db/dict/deinflections.sql
+++ b/db/dict/deinflections.sql
@@ -116,21 +116,26 @@ insert into deinflection_temp values
('infl:suffix:tari', 'きたり', 'きた', 'a', 'k'),
('infl:suffix:tari', '来たり', '来た', 'a', 'k'),
+ -- -sa (adjective->noun) <https://guidetojapanese.org/learn/grammar/amount>
+ ('infl:suffix:sa class:noun', 'さ', '', 'a', 'na'),
+ ('infl:suffix:sa class:noun', 'さ', 'い', 'a', 'i'),
+
-- auxiliary rules
('class:verb:suru-included', 'する', '', 's', ''); -- deconjugate suru verbs into stem
-- rule/bitmask lookup table
create temporary table rule_map (tag, name, mask);
insert into rule_map values
- (null, 'a', -1 ), -- all (allow all rules in)
- (null, '', 0 ), -- (nothing)
- ('infl:reason:ru', 'ru', 1 << 0), -- 一段活用 (ichidan a.k.a. ru-verbs in tae kim's japanese grammar guide)
- ('infl:reason:u', 'u', 1 << 1), -- 五段活用 (godan a.k.a. u-verbs in tae kim's japanese grammar guide)
- ('infl:reason:suru', 's', 1 << 2), -- する (suru)
- ('infl:reason:kuru', 'k', 1 << 3), -- くる (kuru)
- (null, 'z', 1 << 4), -- ずる (zuru)
- ('infl:reason:adj-i', 'i', 1 << 5), -- 形容詞 (i-adjective)
- (null, 'iru', 1 << 6); -- 〜いる (temporary iru for progressive tense)
+ (null, 'a', -1 ), -- all (allow all rules in)
+ (null, '', 0 ), -- (nothing)
+ ('infl:reason:ru', 'ru', 1 << 0), -- 一段活用 (ichidan a.k.a. ru-verbs in tae kim's japanese grammar guide)
+ ('infl:reason:u', 'u', 1 << 1), -- 五段活用 (godan a.k.a. u-verbs in tae kim's japanese grammar guide)
+ ('infl:reason:suru', 's', 1 << 2), -- する (suru)
+ ('infl:reason:kuru', 'k', 1 << 3), -- くる (kuru)
+ (null, 'z', 1 << 4), -- ずる (zuru)
+ ('infl:reason:adj:i', 'i', 1 << 5), -- 形容詞 (i-adjective)
+ (null, 'iru', 1 << 6), -- 〜いる (temporary iru for progressive tense)
+ ('infl:reason:adj:na', 'na', 1 << 7); -- 形容動詞 (na-adjective)
-- add tags to db
insert into deinflection_rules (mask, tag)
diff --git a/examples/furigana-html.ts b/examples/furigana-html.ts
index 6a0e801..f0ff067 100644
--- a/examples/furigana-html.ts
+++ b/examples/furigana-html.ts
@@ -14,17 +14,6 @@ var sentence = await api.sentence("日本に来て一番驚いたことは自動
// Copy the sentence verbatim but add furigana to each word's kanji
var furigana = sentence.furigana("HTML");
-// TODO: sentence is not copied verbatim, words are replaced by their kanji if they matched by kana only
console.log(furigana);
-// this sentence works :tada:
-// console.log((await api.sentence("浮上したハイラル城の下にてゼルダ様達の捜索を行うこととなった")).furigana("HTML"));
-
-var test = "日本に来て一番驚いたことは自動販売機の多さだ。";
-console.log(test);
-console.log((await api.sentence(test)).furigana("parenthesis"));
-
-test = "にほんに来て一番驚いたことは自動販売機の多さだ。";
-console.log(test);
-console.log((await api.sentence(test)).furigana("parenthesis"));
diff --git a/import/jmdict/jmdict.ts b/import/jmdict/jmdict.ts
index 1f391e5..6109c9b 100644
--- a/import/jmdict/jmdict.ts
+++ b/import/jmdict/jmdict.ts
@@ -1,4 +1,5 @@
import type { JMdict } from "npm:@scriptin/jmdict-simplified-types";
+import { Tag } from "../../language/tags.ts";
// this script is very messy right now, and doesn't transfer all information
// present in the dictionary.
@@ -14,15 +15,31 @@ const jmdict = JSON.parse(new TextDecoder().decode(input)) as JMdict;
// TODO: more tags
const tagLookup = {
- ["misc/uk"]: "aux:uk",
- ["class/adv"]: "class:adverb",
- ["class/vs"]: "class:verb:suru",
- ["class/v1"]: "class:verb:ru",
- ["class/v5"]: "class:verb:u",
- ["class/n"]: "class:noun",
- ["class/suf"]: "class:suffix",
- ["class/prt"]: "class:part",
- ["class/exp"]: "class:expr",
+ ["misc/uk"]: Tag.Auxiliary.UsuallyKana,
+ ["class/adv"]: Tag.Class.Adverb,
+ ["class/vs"]: Tag.Class.Verb.Suru,
+ ["class/v1"]: Tag.Class.Verb.Ru,
+ ["class/v5"]: Tag.Class.Verb.U,
+ ["class/v5k"]: Tag.Class.Verb.U,
+ ["class/v5uru"]: Tag.Class.Verb.U,
+ ["class/v5r-i"]: Tag.Class.Verb.U,
+ ["class/v5u-s"]: Tag.Class.Verb.U,
+ ["class/v5aru"]: Tag.Class.Verb.U,
+ ["class/v5b"]: Tag.Class.Verb.U,
+ ["class/v5g"]: Tag.Class.Verb.U,
+ ["class/v5n"]: Tag.Class.Verb.U,
+ ["class/v5m"]: Tag.Class.Verb.U,
+ ["class/v5r"]: Tag.Class.Verb.U,
+ ["class/v5t"]: Tag.Class.Verb.U,
+ ["class/v5s"]: Tag.Class.Verb.U,
+ ["class/v5u"]: Tag.Class.Verb.U,
+ ["class/vk"]: Tag.Class.Verb.Ru, // TODO: this is possibly risky? (should be kuru, but kuru is a ru verb)
+ ["class/n"]: Tag.Class.Noun,
+ ["class/suf"]: Tag.Class.Suffix,
+ ["class/prt"]: Tag.Class.Particle,
+ ["class/exp"]: Tag.Class.Expression,
+ ["class/adj-i"]: Tag.Class.Adjective.I,
+ ["class/adj-na"]: Tag.Class.Adjective.Na,
} as { [map: string]: string };
console.log(`.param set :dict 'jmdict_${LANG}'`);
@@ -55,6 +72,7 @@ for (let i = 0; i < jmdict.words.length; i++) {
...term.sense.map(s => s.partOfSpeech).reduce((acc, current) => [...acc, ...current], []).map(i => `class/${i}`),
])];
var tags = tags.filter(i => i in tagLookup).map(i => tagLookup[i]);
+ // if (writing == "来る") console.log(term);
for (let j = 0; j < definitions.length; j++) {
var out = `\t('${writing}', '${reading}', '${tags.join(" ")}', ${j+1}, '${definitions[j].replaceAll("'", "''")}')${(last && j == definitions.length-1) ? ';' : ','}`;
if (j == 0 && other_writings.length > 0) out += ` -- TODO: alts: ${other_writings.join(", ")}`;
diff --git a/language/parser.ts b/language/parser.ts
index bb4ac1e..40bdd81 100644
--- a/language/parser.ts
+++ b/language/parser.ts
@@ -5,6 +5,11 @@ import "../util/array.ts";
import "../util/set.ts";
import { DeepPartial } from "../util/types.ts";
+const CONJUGABLE_TAGS = [
+ ...Object.values(Tag.Class.Verb),
+ ...Object.values(Tag.Class.Adjective),
+];
+
// TODO: rename Parser to Search
/** @summary main Parser class */
export default class Parser {
@@ -20,6 +25,7 @@ export default class Parser {
});
}
+ // Search.sentence()
async parse(sentence: string, optional?: DeepPartial<InputSentenceProps>): Promise<ParseResult> {
await this.ready;
@@ -64,8 +70,8 @@ export default class Parser {
// deconjugated words
if (result.depth > 0) {
- // can't be conjugated at all
- if (!result.tags.anyOf(Object.values(Tag.Class.Verb))) return false;
+ // check if this word can be conjugated at all
+ if (!result.tags.anyOf(CONJUGABLE_TAGS)) return false;
// ignore other wrong deconjugations
if (result.tags.includes(Tag.Class.Verb.U) &&
@@ -74,6 +80,10 @@ export default class Parser {
!result.tags.includes(Tag.Inflection.Reason.Ru)) return false;
if (result.tags.includes(Tag.Class.Verb.Suru) &&
!result.tags.includes(Tag.Inflection.Reason.Suru)) return false;
+ if (result.tags.includes(Tag.Class.Adjective.I) &&
+ !result.tags.includes(Tag.Inflection.Reason.Adjective.I)) return false;
+ if (result.tags.includes(Tag.Class.Adjective.Na) &&
+ !result.tags.includes(Tag.Inflection.Reason.Adjective.Na)) return false;
}
// all other results should be valid grammatically
diff --git a/language/tags.ts b/language/tags.ts
index d40904f..3065c77 100644
--- a/language/tags.ts
+++ b/language/tags.ts
@@ -6,8 +6,6 @@ export const Tag = {
Class: {
/** @constant verb subgroup */
Verb: {
- /** @constant any verb (fallback for vague dictionaries) */
- Unspecified: "class:verb", // TODO: deprecate this property and implement verb classifier in ../import/util.ts
/** @constant noun that can be conjugated into a verb by adding する */
Suru: "class:verb:suru",
/**
@@ -21,6 +19,14 @@ export const Tag = {
U: "class:verb:u",
/** @constant ichidan verbs (〜る in [taekim]) */
Ru: "class:verb:ru",
+ /** @constant kuru (来る) */
+ Kuru: "class:verb:kuru",
+ },
+ Adjective: {
+ /** @constant adjectives that end in 〜い */
+ I: "class:adj:i",
+ /** @constant adjectives that need to be conjugated using な */
+ Na: "class:adj:na",
},
/** @constant regular nouns or words that can be treated as nouns */
Noun: "class:noun",
@@ -36,6 +42,8 @@ export const Tag = {
* @see ./readme.md#behavior-altering-tags
*/
Expression: "class:expr",
+ /** @constant adverbs (e.g. 早く) */
+ Adverb: "class:adverb",
},
/** @constant types of names */
Name: {
@@ -88,7 +96,10 @@ export const Tag = {
/** @constant applied if word was deconjugated as kuru verb */
Kuru: "infl:reason:kuru",
/** @constant applied if word was deconjugated as i-adjective */
- AdjI: "infl:reason:adj-i",
+ Adjective: {
+ I: "infl:reason:adj:i",
+ Na: "infl:reason:adj:na",
+ },
},
},
/** @constant uncategorized tags */
diff --git a/main.ts b/main.ts
index e4c5b2e..2d15a47 100644
--- a/main.ts
+++ b/main.ts
@@ -6,9 +6,9 @@ function prettyprintParseResult(input: ParseResult) {
out += token.term_id;
out += ": ";
- out += token.reading.map(r => r.text).reduce((a, b) => a + b);
+ out += token.writing;
out += " (";
- out += token.reading.map(r => r.ruby ? r.ruby : r.text).reduce((a, b) => a + b);
+ out += token.reading;
out += ") ";
out += token.tags.map(a => `[${a}]`).join(" ");
@@ -30,6 +30,8 @@ async function coreTest(core: Core) {
prettyprintParseResult(await core.parseSentence("浮上した城の様"));
console.log("-------------");
prettyprintParseResult(await core.parseSentence("迷子になってしまった"));
+ console.log("-------------");
+ prettyprintParseResult(await core.parseSentence("日本に来て一番驚いたことは自動販売機の多さだ。"));
}
// test 1 (direct core)
@@ -41,18 +43,18 @@ await (async () => {
await coreTest(core);
})();
-console.log("\n".repeat(2));
-
-// test 2 (remote core)
-await (async () => {
- // default host = localhost:9400
- new RemoteCoreServer().start();
-
- var core = new RemoteCoreClient();
- await core.ready;
-
- console.log("Prepare remote core done");
- await coreTest(core);
-
- Deno.exit(0);
-})();
+// console.log("\n".repeat(2));
+//
+// // test 2 (remote core)
+// await (async () => {
+// // default host = localhost:9400
+// new RemoteCoreServer().start();
+//
+// var core = new RemoteCoreClient();
+// await core.ready;
+//
+// console.log("Prepare remote core done");
+// await coreTest(core);
+//
+// Deno.exit(0);
+// })();