aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author lonkaars <loek@pipeframe.xyz> 2023-07-15 21:52:57 +0200
committer lonkaars <loek@pipeframe.xyz> 2023-07-15 21:52:57 +0200
commit 8e179a43e909ce4683f753a90bb3505630f05ad8 (patch)
tree 5e46594af33ba7f82d1bd5ea954b99b4a92d0093
parent 3dc9484fc81db8f3c8ffd4ebb4bab042e66c6214 (diff)
implement alternate writings (failing tests down to 500)
-rw-r--r-- db/dict/deinflections.sql 4
-rw-r--r-- db/dict/init.sql 2
-rw-r--r-- db/dict/tags.sql 12
-rw-r--r-- db/dict/template.sql 26
-rw-r--r-- import/jmdict/jmdict.ts 28
-rw-r--r-- search/readme.md 16
-rw-r--r-- search/search.ts 2
-rw-r--r-- search/tags.ts 20
-rw-r--r-- test/base.ts 8
-rw-r--r-- test/deinflection/test.ts 6
-rw-r--r-- test/reading/test.ts 7
11 files changed, 80 insertions, 51 deletions
diff --git a/db/dict/deinflections.sql b/db/dict/deinflections.sql
index 7472122..1fb1ebe 100644
--- a/db/dict/deinflections.sql
+++ b/db/dict/deinflections.sql
@@ -286,8 +286,8 @@ insert into deinflection_temp values
('infl:passive', 'こられる', 'くる', 'ru', 'k'),
('infl:passive', '来られる', '来る', 'ru', 'k'),
- -- auxiliary rules
- ('class:verb:suru-included', 'する', '', 's', ''); -- deconjugate suru verbs into stem
+ -- suru verbs <https://guidetojapanese.org/learn/grammar/surunaru>
+ ('infl:suru', 'する', '', 's', ''); -- deconjugate suru verbs into stem
-- rule/bitmask lookup table
create temporary table rule_map (tag, name, mask);
diff --git a/db/dict/init.sql b/db/dict/init.sql
index 4e9fcc9..3c6dc50 100644
--- a/db/dict/init.sql
+++ b/db/dict/init.sql
@@ -92,8 +92,8 @@ create index term_expression on term (expression);
create index term_reading on term (reading);
-- TODO: (automatically) remove unused terms from db (using constraints?)
-
-- allow many<->many relation between definition and tag
+-- TODO: remove this table!
create table if not exists definition_tag (
id integer primary key autoincrement,
definition_id int not null,
diff --git a/db/dict/tags.sql b/db/dict/tags.sql
index a200abb..2088831 100644
--- a/db/dict/tags.sql
+++ b/db/dict/tags.sql
@@ -1,11 +1 @@
-insert into tag (code) values
- ('class:verb'),
- ('class:verb:suru'),
- ('class:verb:suru-included'),
- ('class:noun'),
- ('class:suffix'),
- ('class:part'),
- ('class:expr'),
- ('name:place'),
- ('name:female'),
- ('name:male');
+-- TODO: generate this file from TypeScript
diff --git a/db/dict/template.sql b/db/dict/template.sql
index 1a07252..6b17a0c 100644
--- a/db/dict/template.sql
+++ b/db/dict/template.sql
@@ -1,7 +1,5 @@
-- create temporary ingest table
drop table if exists ingest;
--- TODO: ingest pitch-accent dictionaries
--- TODO: ingest alternate writings (space-separated)
create temporary table ingest(
-- term fields
expression text not null, -- kanji of term (e.g. 読み込む)
@@ -13,6 +11,16 @@ create temporary table ingest(
glossary text null default null, -- glossary content (support for basic HTML markup/styling)
glossary_tags text null default null -- add tags to single glossary entry
);
+-- TODO: ingest pitch-accent dictionaries
+
+-- create temporary alternate writings table (one row per alternate writing, pointing at its parent term)
+drop table if exists alts;
+create temporary table alts(
+ expression text not null, -- kanji of alternate version
+ reading text not null, -- reading of alternate version
+ normal_expression text not null, -- kanji of parent (original), used to look up the parent term
+ normal_reading text not null -- reading of parent (original), used to look up the parent term
+);
-- #DICTIONARY_CONTENT_BEGIN
-- this template is 'rendered' by pasting a .dict.sql file in between these
@@ -35,6 +43,11 @@ insert into term (expression, reading)
select expression, reading
from ingest;
+-- add alternates as terms; alt = id of the parent term matched on (normal_expression, normal_reading)
+insert into term (expression, reading, alt)
+select expression, reading, (select id from term where expression = normal_expression and reading = normal_reading)
+from alts;
+
-- add definitions
insert into definition (term_id, sort, glossary, dict_id)
select
@@ -103,6 +116,15 @@ select
from term_tag_map
join tag on tag.code = term_tag_map.tag;
+-- add tags to alternates: every alternate writing gets the same tags as its parent term
+insert into term_tag (term_id, tag_id)
+select term_alt.id, term_tag.tag_id
+from alts
+inner join term as term_normal on term_normal.expression = alts.normal_expression and term_normal.reading = alts.normal_reading -- parent term
+inner join term as term_alt on term_alt.expression = alts.expression and term_alt.reading = alts.reading -- alternate term
+left join term_tag on term_tag.term_id = term_normal.id
+where term_tag.tag_id is not null; -- skip parents that have no tags (unmatched left join rows)
+
-- add tags to definitions
insert into definition_tag (definition_id, tag_id)
select
diff --git a/import/jmdict/jmdict.ts b/import/jmdict/jmdict.ts
index 155c423..bf3614e 100644
--- a/import/jmdict/jmdict.ts
+++ b/import/jmdict/jmdict.ts
@@ -1,10 +1,15 @@
import type { JMdict } from "npm:@scriptin/jmdict-simplified-types";
-import { Tag } from "../../language/tags.ts";
+import { Tag } from "../../search/tags.ts";
+import "../../util/string.ts";
// this script is very messy right now, and doesn't transfer all information
// present in the dictionary.
//
// proceed with caution
+//
+// TODO: separate term and glossary tags
+// TODO: dictionary normalization (numbers/half-width/長音符)
+// TODO: use sql synthesis library instead of garbo format strings
const LANG = "eng";
@@ -47,8 +52,8 @@ const tagLookup = {
console.log(`.param set :dict 'jmdict_${LANG}'`);
console.log(".param set :lang 'en'");
-// TODO: separate term and glossary tags
-console.log("insert into ingest(expression, reading, term_tags, glossary_sort, glossary) values");
+var alts = "insert into alts(expression, reading, normal_expression, normal_reading) values\n";
+var ingest = "insert into ingest(expression, reading, term_tags, glossary_sort, glossary) values\n"
// var max = -100;
for (let i = 0; i < jmdict.words.length; i++) {
@@ -75,10 +80,17 @@ for (let i = 0; i < jmdict.words.length; i++) {
])];
var tags = tags.filter(i => i in tagLookup).map(i => tagLookup[i]);
// if (writing == "来る") console.log(term);
- for (let j = 0; j < definitions.length; j++) {
- var out = `\t('${writing}', '${reading}', '${tags.join(" ")}', ${j+1}, '${definitions[j].replaceAll("'", "''")}')${(last && j == definitions.length-1) ? ';' : ','}`;
- if (j == 0 && other_writings.length > 0) out += ` -- TODO: alts: ${other_writings.join(", ")}`;
- console.log(out);
- }
+ definitions.forEach((definition, j) => {
+ ingest += `\t('${writing}', '${reading}', '${tags.join(" ")}', ${j+1}, '${definition.replaceAll("'", "''")}'),\n`;
+ });
+ other_writings.forEach(alt => {
+ alts += `\t('${alt}', '${reading}', '${writing}', '${reading}'),\n`;
+ });
}
+ingest = ingest.replaceLast(",", ";");
+alts = alts.replaceLast(",", ";");
+
+console.log(ingest);
+console.log(alts);
+
diff --git a/search/readme.md b/search/readme.md
index 400c8ce..164dc9f 100644
--- a/search/readme.md
+++ b/search/readme.md
@@ -23,11 +23,11 @@ to Yomikun's tags for compatibility. Other tags include:
### Behavior-altering tags
-Some tag classes impact the parser's behavior. For example, the input text
+A word's class can impact the parser's behavior. For example, the input text
「完了しました」 will be parsed as just 「完了」, but with the
-`class:verb:suru-included` tag added by the parser. This is because the word
+`infl:suru` inflection tag added by the parser. This is because the word
「完了」 has the tag `class:verb:suru` in the database, which allows the parser
-to deconjugate a noun with the verb 「する」 back into the stem.
+to deconjugate a noun with the verb 「する」 attached back into the stem.
Other uses of this behavior include more accurate automatic kanji reading
generation, for example 「城」 being read as 「じょう」 in 「ハイラル城」
@@ -35,11 +35,11 @@ because 「ハイラル」 has the tag `name:place` in the database, and
「城(じょう)」 has `class:suffix`, while 「城(しろ)」 has `class:noun`.
Yomikun encourages homebrew dictionary sharing, and encourages using
-behavior-altering tags for fixing readings for cases like the above examples.
-As another example of this, it is encouraged that a dictionary for (for
-example) Zelda add 「トト」 as a term with tags `class:noun` and `name:place`,
-instead of 「トト湖(こ)」 as an expression to fix the reading of the kanji
-「湖(みずうみ)」.
+behavior-altering tags instead of expressions for fixing readings in cases
+like the above examples. As another example, a dictionary for Zelda (for
+instance) is encouraged to add 「トト」 as a term with the tags `class:noun`
+and `name:place`, rather than adding 「トト湖(こ)」 as an expression, in order
+to fix the reading of the kanji 「湖(みずうみ)」.
If Yomikun doesn't generate the correct reading, and the reading isn't based on
natural language context (=a computer *could* accurately decide which reading
diff --git a/search/search.ts b/search/search.ts
index 89c8289..81ca937 100644
--- a/search/search.ts
+++ b/search/search.ts
@@ -48,7 +48,7 @@ export default class Search {
if (result.tags.includes(Tag.Class.Verb.Ru) &&
!result.tags.includes(Tag.Inflection.Reason.Ru)) return false;
if (result.tags.includes(Tag.Class.Verb.Suru) &&
- !result.tags.anyOf([ Tag.Inflection.Reason.Suru, Tag.Class.Verb.SuruIncluded ])) return false;
+ !result.tags.anyOf([ Tag.Inflection.Reason.Suru, Tag.Inflection.Suru ])) return false;
if (result.tags.includes(Tag.Class.Adjective.I) &&
!result.tags.includes(Tag.Inflection.Reason.Adjective.I)) return false;
if (result.tags.includes(Tag.Class.Adjective.Na) &&
diff --git a/search/tags.ts b/search/tags.ts
index 32ce02f..0f21eea 100644
--- a/search/tags.ts
+++ b/search/tags.ts
@@ -6,22 +6,12 @@ export const Tag = {
Class: {
/** @constant verb subgroup */
Verb: {
- /** @constant noun that can be conjugated into a verb by adding する and する itself */
- Suru: "class:verb:suru",
- /**
- * @constant verb stored as conjugated noun in database (nominal verb)
- *
- * @deprecated The use of conjugated forms in dictionaries is discouraged.
- *
- * This tag is added by the deconjugation code to check for a legal
- * deconjugation if する has been deconjugated away for a word marked
- * suru-verb.
- */
- SuruIncluded: "class:verb:suru-included",
/** @constant 〜う verbs in [taekim] (godan) */
U: "class:verb:u",
/** @constant 〜る verbs in [taekim] (ichidan) */
Ru: "class:verb:ru",
+ /** @constant noun that can be conjugated into a verb by adding する and する itself */
+ Suru: "class:verb:suru",
/** @constant kuru (来る) */
Kuru: "class:verb:kuru",
},
@@ -47,6 +37,10 @@ export const Tag = {
Expression: "class:expr",
/** @constant adverbs (e.g. 早く) */
Adverb: "class:adverb",
+ Special: { // TODO: remove or start using instead of hardcoding する and くる
+ Suru: "class:special:suru", // special-case tag for する
+ Kuru: "class:special:kuru", // special-case tag for くる; needs its own code so it is not conflated with Suru
+ }
},
/** @constant types of names */
Name: {
@@ -148,6 +142,8 @@ export const Tag = {
/** @constant 〜とする attempts (e.g. 入ろうとしている) */
ToSuru: "infl:attempt:tosuru",
},
+ /** @constant suru verbs conjugated with 〜する (e.g. 説明する) */
+ Suru: "infl:suru",
},
/** @constant uncategorized tags */
Auxiliary: {
diff --git a/test/base.ts b/test/base.ts
index 79c39ce..8369bfe 100644
--- a/test/base.ts
+++ b/test/base.ts
@@ -2,9 +2,17 @@ export { assertEquals } from "https://deno.land/std@0.193.0/testing/asserts.ts";
import Yomikun from "../api/yomikun.ts";
import DirectCoreClient from '../core/direct/client.ts';
+import { Wrap } from "../util/wrap.ts";
export const core = new DirectCoreClient();
export const api = new Yomikun(core);
await api.ready;
+/** Format case `i` (0-based) of `total` as a zero-padded "n/total" counter, wrapped using Wrap.parenthesis. */
+export function formatCaseIndex(i: number, total: number) {
+ const totalStr = total.toString();
+ // pad to the exact digit count of total; Math.log10(total) + 1 is fractional (and -Infinity for total = 0)
+ const index = (i + 1).toString().padStart(totalStr.length, '0');
+ return `${index}/${totalStr}`.wrap(Wrap.parenthesis);
+}
diff --git a/test/deinflection/test.ts b/test/deinflection/test.ts
index fac757e..0dfcdf9 100644
--- a/test/deinflection/test.ts
+++ b/test/deinflection/test.ts
@@ -1,10 +1,10 @@
import cases from "./cases.ts";
-import { core } from '../base.ts';
+import { core, formatCaseIndex } from '../base.ts';
import { Tag, TokenTag } from "../../search/tags.ts";
import { recursiveValues } from "../../util/object.ts";
-cases.forEach(({ input, mustHave, mustNotHave, force }) => {
- Deno.test(`deinflection - ${input}`, async () => {
+cases.forEach(({ input, mustHave, mustNotHave, force }, i) => {
+ Deno.test(`Deinflection ${formatCaseIndex(i, cases.length)} - ${input}`, async () => {
var terms = await core.search.terms(input);
if (terms.length == 0)
diff --git a/test/reading/test.ts b/test/reading/test.ts
index a2524de..c1e7de4 100644
--- a/test/reading/test.ts
+++ b/test/reading/test.ts
@@ -1,8 +1,9 @@
-import { api, assertEquals } from "../base.ts";
+import { api, assertEquals, formatCaseIndex } from "../base.ts";
import cases from "./cases.ts";
-cases.forEach(({input, output}) => {
- Deno.test(`Sentence reading - ${input}`, async () => {
+cases.forEach(({input, output}, i) => {
+ // if (i != 1) return;
+ Deno.test(`Sentence reading ${formatCaseIndex(i, cases.length)} - ${input}`, async () => {
// TODO: use sentence reading and tags
var sentence = await api.sentence(input);
assertEquals(sentence.furigana("refold-tools"), output);