author     lonkaars <loek@pipeframe.xyz>  2023-08-06 15:01:13 +0200
committer  lonkaars <loek@pipeframe.xyz>  2023-08-06 15:01:13 +0200
commit     e14b315a088d375b80e46978bd769c7d13e71001 (HEAD, master)
tree       d62a36d47b9865a6c38273d07c682e3082e4b86c
parent     578af2663f9094f8b852d32cf16868b37518d648

move todo and refactor db/find.sql
-rw-r--r--  db/find.sql            119
-rw-r--r--  readme.md               34
-rw-r--r--  temp/test.ts             7
-rw-r--r--  test/reading/cases.ts   26
-rw-r--r--  test/reading/test.ts     1
-rw-r--r--  todo.md                150

6 files changed, 251 insertions(+), 86 deletions(-)
diff --git a/db/find.sql b/db/find.sql
index 1d31217..d678a15 100644
--- a/db/find.sql
+++ b/db/find.sql
@@ -1,74 +1,81 @@
--- this statement is prepared and run using :term and :user as inputs (see
+-- this statement is prepared and run using :term and :user as inputs (see
-- db.ts or test/find)
-- this file is kind of messy because it needs to be one large query, instead
-- of separate phases creating temporary tables. queries with more than one
-- statement can't return results because of the way sqlite3 works.
+--
+-- the innermost `with` clause contains step 1, the middle one step 2, and the
+-- outermost step 3; the final select is step 4. the steps are:
+-- 1. Create all possible substrings from the start of input `:term`
+-- 2. Apply deconjugation rules recursively to `:term`
+-- 3. Find terms matching deconjugations
+-- 4. Clean up and add sort_overlay
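+--
+-- illustration (hypothetical input, not part of the query): for :term =
+-- '食べました', step 1 yields the prefixes 食べました, 食べまし, 食べま,
+-- 食べ and 食; step 2 can then deconjugate the longest prefix to 食べる
+-- (tagged as polite past), which step 3 matches against the term table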
--- TODO: add more comments in this file to explain what is going on
-
--- explain query plan -- testing only
-with results(id, expression, reading, tags, depth, rules, original, deinflected) as (
- -- stripped deinflection table (remove some columns and duplicates)
- with deinflections(term, tags, depth, original, rules) as (
- -- recursively generated deinflection table
- with deinflect(length, term, tags, rules, rules_in, rules_out, depth) as (
- -- input term all substrings until length 1
- with inputs(length, term, tags, rules, rules_in, rules_out, depth) as (
- select length(:term), :term, '', -1, 0, 0, 0
- union
- select
- inputs.length - 1,
- substr(inputs.term, 1, inputs.length - 1),
- inputs.tags,
- inputs.rules,
- inputs.rules_in,
- inputs.rules_out,
- inputs.depth
- from inputs
- where inputs.length > 1
- )
- select * from inputs
- union -- join all recursive rows into one large table
- select
- deinflect.length,
- substr(deinflect.term, 1, length(deinflect.term)-length(deinflection.kana_in)) || deinflection.kana_out,
- deinflect.tags || ' ' || deinflection.tag, -- parsed to TokenTag[] on (sql) client-side
- deinflection.rules_out,
- deinflection.rules_in,
- deinflect.rules,
- deinflect.depth + 1
- from deinflect -- temp table
- inner join deinflection -- deinflection rules table
- on
- -- rules_in has to contain any of the current deconjugation rules
- (deinflect.rules & deinflection.rules_in != 0) and
- -- term.endsWith(kana_in)
- (substr(term, length(term) - length(kana_in) + 1) = kana_in) and
- -- can't deconjugate to length <1
- (length(term) > 0)
- limit 100 -- failsafe to catch any infinite loops
+-- STEP 3: match deconjugated strings to terms
+with results as (
+	-- STEP 2: apply deconjugation rules recursively
+ with deinflections as (
+ -- STEP 1: create all possible substrings from start of `:term`
+ with input_substrings as (
+ select -- first row of recursive query
+ length(:term) as length, -- original term length
+ :term as string -- original term
+ union
+ select -- recursive rows
+ input_substrings.length - 1, -- length = length - 1
+ substr(input_substrings.string, 1, input_substrings.length - 1) -- remove last character
+ from input_substrings
+ where input_substrings.length > 1 -- until only one character remains
)
- select term, tags, depth, substr(:term, 1, deinflect.length), rules
- from deinflect
+ select
+ input_substrings.length as length, -- input length
+ input_substrings.string as string, -- input substring
+ input_substrings.string as original, -- input substring (stays unmodified)
+ '' as tags, -- space-separated deinflection tag list
+ -1 as rules, -- bitmask rules (limits order of applied deinflection rules)
+ 0 as rules_in, -- allowed rules for current deinflection step
+ 0 as rules_out, -- replacement rules for next deinflection step
+ 0 as depth -- amount of applied deconjugations (steps)
+ from input_substrings
+ union
+ select
+ deinflections.length, -- length (passthrough)
+ substr(deinflections.string, 1, length(deinflections.string)-length(deinflection.kana_in)) || deinflection.kana_out, -- replace kana_in with kana_out
+ deinflections.original, -- original (passthrough)
+ deinflections.tags || ' ' || deinflection.tag, -- append deinflection reason tag(s)
+ deinflection.rules_out, -- get next rules (pass rules_out to rules of next iteration)
+ deinflection.rules_in, -- get rules_in
+ deinflections.rules, -- rules (passthrough)
+ deinflections.depth + 1 -- increment depth
+ from deinflections -- recursive table
+ inner join deinflection -- internal deinflection rules table
+ on
+ -- rules_in has to contain any of the current deconjugation rules
+ (deinflections.rules & deinflection.rules_in != 0) and
+ -- string.endsWith(kana_in)
+ (substr(string, length(string) - length(kana_in) + 1) = kana_in) and
+ -- can't deconjugate to length <1
+ (length(string) > 0)
+ limit 100 -- failsafe to catch any infinite loops
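+		-- note on the bitmask: initial rows start with rules = -1 (all bits
+		-- set), so any rule with a nonzero rules_in can be the first to apply;
+		-- e.g. with illustrative values rules = 6 (0b110) and rules_in = 4
+		-- (0b100), the bitwise and is nonzero and the rule matches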
)
select
- term.id,
- term.expression,
- term.reading,
- deinflections.tags || ' ' || group_concat(tag.code, ' ') as tags,
- deinflections.depth,
- rules,
- deinflections.original,
- deinflections.term
+ term.id as id, -- term group id
+ term.expression as expression, -- term writing (kanji)
+ term.reading as reading, -- term reading (kana)
+ deinflections.tags || ' ' || group_concat(tag.code, ' ') as tags, -- deinflection reasons + dictionary tags
+ deinflections.depth as depth, -- deinflection depth
+ rules as rules, -- deinflection rules (bitmask)
+ deinflections.original as original, -- original input
+ deinflections.string as deinflected -- deinflected input (search term)
from deinflections
- inner join term on (term.expression = deinflections.term) or (term.reading = deinflections.term)
+ inner join term on (term.expression = deinflections.string) or (term.reading = deinflections.string)
left join term_tag on term_tag.term_id = term.id
left join tag on term_tag.tag_id = tag.id
group by term.id, deinflections.original, deinflections.rules
having term.id is not null
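+	-- e.g. when step 2 deconjugated 食べました to 食べる, the inner join above
+	-- matches the term row(s) whose expression or reading is exactly 食べる
+	-- (illustrative example)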
)
-select
+select -- STEP 4: clean up and join `sort_overlay`
results.id,
results.expression,
results.reading,
diff --git a/readme.md b/readme.md
index 1960076..7cde012 100644
--- a/readme.md
+++ b/readme.md
@@ -15,23 +15,6 @@ API~, and aims to provide extra features that help with immersion and sentence
mining on top of the base word lookup that Yomichan provided. **While Yomikun's
scope is larger than Yomichan's, it's still focused on Japanese only.**
-## TODO
-
-- [x] working proof of concept sentence lookup using deno/sqlite3
-- [ ] port dictionaries for more advanced testing
- - [x] JMdict (WIP)
- - [ ] JMNedict
-- [ ] add separate kanji readings/info table
-- [ ] add separate frequency dictionary
-- [x] add more deinflections to db/deinflections.sql
-- [x] set up unit tests for sentence reading generation
-- [x] port server-internal API to simple HTTP JSON API
-- [ ] create primitive search page ui
-- [ ] add code formatter config
-- [ ] complete documentation
-- [ ] remove makefiles for database initialization
-- [ ] replace .sql script files with typescript sql query generation library
-
## ~New features (from Yomichan)~
NONE OF THESE ARE IMPLEMENTED YET
@@ -52,14 +35,15 @@ NONE OF THESE ARE IMPLEMENTED YET
## Documentation
-Some general documentation is done in markdown, but other general documentation
-should be done in JSDoc format in the corresponding code files. The
-documentation also makes frequent references to, and uses terminology from [Tae
-Kim's Japanese grammar guide][taekim], which is abbreviated to [taekim] instead
-of copying the link into the source code each time. Tae Kim uses slightly
-different terms for grammatical concepts. The 'Tae Kim-version' of these terms
-is used for named constants in code. See [tags.ts](language/tags.ts) for an
-overview of relevant grammatical terms for the Yomikun parser.
+Some general project structure documentation is done in markdown, but other
+specific documentation (on algorithms, data structures, etc.) should be done in
+JSDoc format in the corresponding code files. The documentation also makes
+frequent references to, and uses terminology from [Tae Kim's Japanese grammar
+guide][taekim], which is abbreviated to [taekim] instead of copying the link
+into the source code each time. Tae Kim uses slightly different terms for
+grammatical concepts. The 'Tae Kim-version' of these terms is used for named
+constants in code. See [tags.ts](search/tags.ts) for an overview of relevant
+grammatical terms for the Yomikun parser.
## The dream
diff --git a/temp/test.ts b/temp/test.ts
new file mode 100644
index 0000000..5d5dc68
--- /dev/null
+++ b/temp/test.ts
@@ -0,0 +1,7 @@
+import Yomikun from "../api/yomikun.ts";
+import DirectCoreClient from "../core/direct/client.ts";
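+
+// quick manual test: parse one sentence through the direct (in-process) core
+// client and print the parsed words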
+var api = new Yomikun(new DirectCoreClient());
+await api.ready;
+var sentence = await api.sentence("コツとかあんの?");
+console.log(sentence.words);
+
diff --git a/test/reading/cases.ts b/test/reading/cases.ts
index bb7b6c2..3ebac9b 100644
--- a/test/reading/cases.ts
+++ b/test/reading/cases.ts
@@ -5914,7 +5914,7 @@ export default [
{
input: "移り住んだその年の秋には種蒔きをしとかなきゃ",
reading: "うつりすんだそのとしのあきにはたねまきをしとかなきゃ",
- output: "[移](うつ)り[住](す)んだその[年](とし)の[秋](あき)には[種蒔](たねま)きをしとかなきゃ",
+ output: "[移](うつ)り[住](す)んだその[年](ねん)の[秋](あき)には[種蒔](たねま)きをしとかなきゃ",
tags: [ "ヴィンランドサガ" ]
},
{
@@ -7273,10 +7273,10 @@ export default [
output: "あなたにできる[唯一](ゆいいつ)のことです",
tags: [ "見える子ちゃん" ]
},
- {
+ { // NOTE: くんえんざい is normally written as 燻煙剤 (this term is only listed correctly in my jp-jp dictionaries)
input: "くん煙剤です。大量に焚いときました。",
- reading: "くんけむりざいです。たいりょうにたいときました。",
- output: "くん[煙](けむり)[剤](ざい)です。[大量](たいりょう)に[焚](た)いときました。",
+ reading: "くんえんざいです。たいりょうにたいときました。",
+ output: "くん[煙](えん)[剤](ざい)です。[大量](たいりょう)に[焚](た)いときました。",
tags: [ "見える子ちゃん" ]
},
{
@@ -9476,4 +9476,22 @@ export default [
output: "ぜひ[目次](もくじ)を[見](み)ながら[欲](ほ)しいものを[探](さが)していってもらえると[嬉](うれ)しいです",
tags: [ "YouTube", "トバログ" ]
},
+ { // エロマンガ先生 episode 07 @ 13:13 (test for 点 + くらい as separate words)
+ input: "百万点くらい面白かったです!",
+ reading: "ひゃくまんてんくらいおもしろかったです!",
+ output: "[百万](ひゃくまん)[点](てん)くらい[面白](おもしろ)かったです!",
+ tags: [ "エロマンガ先生" ],
+ },
+ { // 無職転生 season 2 episode 3 @ 16:06 (test for 〜じまう -> しまう)
+ input: "とりあえず 今日は飲め 死ぬほど飲んじまえ",
+ reading: "とりあえず きょうはのめ しぬほどのんじまえ",
+ output: "とりあえず [今日](きょう)は[飲](の)め [死](し)ぬほど[飲](の)んじまえ",
+ tags: [ "無職転生:異世界行ったら本気だす" ],
+ },
+ { // よふかしのうた episode 02 @ 02:14 (test for しまえ -> 閉める +imperative)
+ input: "うおー びっくりした しまえ しまえ 早く",
+ reading: "うおー びっくりした しまえ しまえ 早く",
+ output: "うおー びっくりした しまえ しまえ 早く",
+ tags: [ "よふかしのうた" ],
+ },
] satisfies Test[];
diff --git a/test/reading/test.ts b/test/reading/test.ts
index d74228c..42aa43c 100644
--- a/test/reading/test.ts
+++ b/test/reading/test.ts
@@ -4,7 +4,6 @@ import cases from "./cases.ts";
cases.forEach(({input, output}, i) => {
// if (i != 1) return;
Deno.test(`Sentence reading ${formatCaseIndex(i, cases.length)} - ${input}`, async () => {
- // TODO: use domain/series tags
var sentence = await api.sentence(input);
assertStrDiff(output, sentence.furigana("refold-tools"));
});
diff --git a/todo.md b/todo.md
new file mode 100644
index 0000000..6840877
--- /dev/null
+++ b/todo.md
@@ -0,0 +1,150 @@
+# generic list of concrete todo items that don't need further consideration
+
+## 0.0.1 (standalone API)
+
+- [x] working proof of concept sentence lookup using deno/sqlite3
+- [ ] port dictionaries for more advanced testing
+ - [x] JMdict (WIP)
+ - [ ] JMNedict
+- [x] add more deinflections to db/deinflections.sql
+- [x] set up unit tests for sentence reading generation
+- [x] port server-internal API to simple HTTP JSON API
+- [ ] [improve DB schema](#how-to-store-multiple-readingswritings-in-db)
+- [ ] finish [API examples](examples/readme.md)
+- [ ] remove makefiles for database initialization
+- [ ] add separate kanji readings/info table
+- [ ] add separate frequency dictionary
+- [ ] complete documentation
+- [ ] add code formatter config
+- [ ] ~replace .sql script files with typescript sql query generation library~ ([the problem](https://www.reddit.com/r/Deno/comments/ss6568/alternative_to_knexjs_on_deno/))
+
+## 0.1.0 (front-end UI)
+
+- [ ] create primitive search page ui
+
+## always
+
+- [ ] improve sentence parser accuracy
+ - [ ] have the parser recursively explore N shorter terms at each word
+ found and rank resulting possible sentences (by frequency?)
+ - [ ] use domain-specific tags in reading tests (create domain-specific
+ dictionaries first)
+ - [ ] normalize dictionary before import
+ - [ ] remove "baked" combinations of word + suffix
+ - [ ] automatically create combinations of kanji replaced by kana as
+ alternate writings
+ - [ ] add more deinflections for casual speech and other colloquialisms
+
+# how to store multiple readings/writings in DB
+
+## idea 1
+
+positives:
+- allows multiple alternate readings/writings for terms
+- easy to find primary reading or writing for a term
+- efficiently stores kana-only words
+- allows parser to parse alternatively written words (currently requires manual
+  typescript intervention to resolve `alt` field back to actual term to get
+  its tags)
+
+negatives:
+- ~creates duplicates in `text` column for readings of terms with different
+ kanji but the same reading~
+
+ I consider this a non-issue because this simplifies the sentence lookup
+ query. The alternative (a term\<-\>reading/writing reference table) would
+ save disk space in exchange for processing time and complexity.
+- ~unclear how to reference a specific word without using its `term_id` (which
+ can vary from user to user when different dictionaries are installed), or
+ *what identifies a unique term in this case?*~
+
+ `user.sort_overlay` needs to be able to uniquely identify a `term_id`, but
+ also needs to be in-/exportable by users with different imported dictionaries
+ (ideally with minimal failure).
+
+  options to consider:
+ - ~just use (primary) writing only~
+
+ this doesn't work for terms with multiple readings to distinguish between
+ meanings, e.g.
+ <ruby>人気<rt>ひとけ</rt></ruby>/<ruby>人気<rt>にんき</rt></ruby>
+ - ~identify as "term with text X and another text Y"~
+
+ this feels like a janky solution but is what is currently being used, where
+ X is always the default way of writing and Y the default reading
+  - directly reference `term_id` in `user.sort_overlay` and convert to matching
+    all known readings/writings at time of export/import (see the sketch after
+    this list)
+
+ good:
+
+ - faster `user.sort_overlay` lookups
+ - still allows user preference import/exporting
+
+ bad:
+
+ - ~all data in `user.db` becomes useless when `dict.db` is lost or corrupted~
+
+ `user.sort_overlay` will be moved to `dict.db`, and `user.db` will only
+ be used for storing (mostly portable) user preferences and identifiers
+ (username, id, etc.).
+ - importing/exporting will take longer and require more complicated sql code
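+
+a rough sketch of the export-time conversion option (TypeScript; every name
+here is hypothetical, since the actual schema is still undecided):
+
+```ts
+// portable form: identify a term by all its known texts instead of term_id
+interface PortableSortOverlay { texts: string[]; sort: number; }
+
+function exportOverlay(
+  rows: { term_id: number; sort: number }[],
+  textsForTerm: (termId: number) => string[], // dict.db lookup (hypothetical)
+): PortableSortOverlay[] {
+  return rows.map(({ term_id, sort }) => ({ texts: textsForTerm(term_id), sort }));
+}
+
+function importOverlay(
+  portable: PortableSortOverlay[],
+  termsForText: (text: string) => number[], // dict.db lookup (hypothetical)
+): { term_id: number; sort: number }[] {
+  // a row survives import when any of its known texts resolves locally;
+  // unresolvable rows are dropped (the "minimal failure" case)
+  return portable.flatMap(({ texts, sort }) => {
+    const ids = new Set(texts.flatMap(termsForText));
+    return [...ids].map((term_id) => ({ term_id, sort }));
+  });
+}
+```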
+
+
+### example tables
+
+#### readwritings (should have better name)
+
+(bit indexes from LSB; see the sketch after the table)
+
+- `flags[0]` = primary writing
+- `flags[1]` = primary reading
+
+|`id`|`term_id`|`text`|`flags`|
+|-|-|-|-|
+|1|1|繰り返す|1|
+|2|1|くり返す|0|
+|3|1|繰返す|0|
+|4|1|繰りかえす|0|
+|5|1|くりかえす|2|
+|6|2|変える|1|
+|7|2|かえる|2|
+|8|3|帰る|1|
+|9|3|かえる|2|
+|10|4|にやにや|3|
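+
+a minimal sketch of decoding `flags` (TypeScript; the row shape is
+hypothetical, mirroring the table above):
+
+```ts
+interface ReadWriting { id: number; term_id: number; text: string; flags: number; }
+
+const PRIMARY_WRITING = 1 << 0; // flags[0]
+const PRIMARY_READING = 1 << 1; // flags[1]
+
+// row 10 above (にやにや, flags = 3) sets both bits: kana-only terms store
+// their single text as both primary writing and primary reading
+function primaryWriting(rows: ReadWriting[]): ReadWriting | undefined {
+  return rows.find((r) => (r.flags & PRIMARY_WRITING) !== 0);
+}
+function primaryReading(rows: ReadWriting[]): ReadWriting | undefined {
+  return rows.find((r) => (r.flags & PRIMARY_READING) !== 0);
+}
+```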
+
+# how/where to deal with irregular readings
+
+WIP
+
+ideally one way of storing reading exceptions for:
+
+- 来る + する (conjugation-dependent)
+- 入る as (はいる) or (いる) (not sure about this one?)
+- counters (counter type + amount specific)
+- numbers (exceptions for certain powers of 10)
+
+# way to link expressions to a list of conjugated terms
+
+WIP
+
+this may be useful for dictionaries that provide meanings for expressions but
+don't provide readings for those expressions? (新和英大辞典 has some of these)
+
+examples:
+- 村長選 -> 村長 + 選[suf]
+- 花より団子 -> 花 + より[grammar] + 団子
+
+# random thoughts
+
+this project has 0 planning so here's a list of things that may eventually need
+some thought
+
+- how can a series-specific dictionary also 'encourage' the use of another
+ domain-specific category? (e.g. anime about programming makes computer domain
+ specific terms rank slightly higher or something?)
+- maybe have a mode in the front-end that captures preedit text from a user
+ typing japanese text to infer readings of kanji, or rank different terms
+ slightly higher? (using [compositionupdate
+ events](https://developer.mozilla.org/en-US/docs/Web/API/Element/compositionupdate_event))
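+
+a minimal sketch of the preedit idea (hypothetical element and selector, for
+illustration only):
+
+```ts
+// capture IME preedit text while the user composes japanese input
+const input = document.querySelector<HTMLInputElement>("#search")!;
+input.addEventListener("compositionupdate", (event) => {
+  // event.data holds the in-progress preedit string (e.g. "たべ" while the
+  // user is still composing 食べ), hinting at intended kanji readings
+  console.log("preedit:", event.data);
+});
+```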
+