From e14b315a088d375b80e46978bd769c7d13e71001 Mon Sep 17 00:00:00 2001 From: lonkaars Date: Sun, 6 Aug 2023 15:01:13 +0200 Subject: move todo and refactor db/find.sql --- db/find.sql | 119 ++++++++++++++++++++------------------- readme.md | 34 +++--------- temp/test.ts | 7 +++ test/reading/cases.ts | 26 +++++++-- test/reading/test.ts | 1 - todo.md | 150 ++++++++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 251 insertions(+), 86 deletions(-) create mode 100644 temp/test.ts create mode 100644 todo.md diff --git a/db/find.sql b/db/find.sql index 1d31217..d678a15 100644 --- a/db/find.sql +++ b/db/find.sql @@ -1,74 +1,81 @@ --- this statement is prepared and run using :term and :user as inputs (see +-- this statement is prepared and run using :term and :user as input_substrings (see -- db.ts or test/find) -- this file is kind of messy because it needs to be one large query, instead -- of separate phases creating temporary tables. queries with more than one -- statement can't return results because of the way sqlite3 works. +-- +-- the innermost `with` clause contains step 1, the outermost contains step 3, +-- the last select being step 4. the steps are: +-- 1. Create all possible substrings from input `:term` +-- 2. Apply deconjugation rules recursively to `:term` +-- 3. Find terms matching deconjugations +-- 4. 
Clean up and add sort_overlay --- TODO: add more comments in this file to explain what is going on - --- explain query plan -- testing only -with results(id, expression, reading, tags, depth, rules, original, deinflected) as ( - -- stripped deinflection table (remove some columns and duplicates) - with deinflections(term, tags, depth, original, rules) as ( - -- recursively generated deinflection table - with deinflect(length, term, tags, rules, rules_in, rules_out, depth) as ( - -- input term all substrings until length 1 - with inputs(length, term, tags, rules, rules_in, rules_out, depth) as ( - select length(:term), :term, '', -1, 0, 0, 0 - union - select - inputs.length - 1, - substr(inputs.term, 1, inputs.length - 1), - inputs.tags, - inputs.rules, - inputs.rules_in, - inputs.rules_out, - inputs.depth - from inputs - where inputs.length > 1 - ) - select * from inputs - union -- join all recursive rows into one large table - select - deinflect.length, - substr(deinflect.term, 1, length(deinflect.term)-length(deinflection.kana_in)) || deinflection.kana_out, - deinflect.tags || ' ' || deinflection.tag, -- parsed to TokenTag[] on (sql) client-side - deinflection.rules_out, - deinflection.rules_in, - deinflect.rules, - deinflect.depth + 1 - from deinflect -- temp table - inner join deinflection -- deinflection rules table - on - -- rules_in has to contain any of the current deconjugation rules - (deinflect.rules & deinflection.rules_in != 0) and - -- term.endsWith(kana_in) - (substr(term, length(term) - length(kana_in) + 1) = kana_in) and - -- can't deconjugate to length <1 - (length(term) > 0) - limit 100 -- failsafe to catch any infinite loops +-- STEP 3: match deconjugated strings to terms +with results as ( + -- STEP 2: apply deinflections recursively + with deinflections as ( + -- STEP 1: create all possible substrings from start of `:term` + with input_substrings as ( + select -- first row of recursive query + length(:term) as length, -- original term length + 
:term as string -- original term + union + select -- recursive rows + input_substrings.length - 1, -- length = length - 1 + substr(input_substrings.string, 1, input_substrings.length - 1) -- remove last character + from input_substrings + where input_substrings.length > 1 -- until only one character remains ) - select term, tags, depth, substr(:term, 1, deinflect.length), rules - from deinflect + select + input_substrings.length as length, -- input length + input_substrings.string as string, -- input substring + input_substrings.string as original, -- input substring (stays unmodified) + '' as tags, -- space-separated deinflection tag list + -1 as rules, -- bitmask rules (limits order of applied deinflection rules) + 0 as rules_in, -- allowed rules for current deinflection step + 0 as rules_out, -- replacement rules for next deinflection step + 0 as depth -- amount of applied deconjugations (steps) + from input_substrings + union + select + deinflections.length, -- length (passthrough) + substr(deinflections.string, 1, length(deinflections.string)-length(deinflection.kana_in)) || deinflection.kana_out, -- replace kana_in with kana_out + deinflections.original, -- original (passthrough) + deinflections.tags || ' ' || deinflection.tag, -- append deinflection reason tag(s) + deinflection.rules_out, -- get next rules (pass rules_out to rules of next iteration) + deinflection.rules_in, -- get rules_in + deinflections.rules, -- rules (passthrough) + deinflections.depth + 1 -- increment depth + from deinflections -- recursive table + inner join deinflection -- internal deinflection rules table + on + -- rules_in has to contain any of the current deconjugation rules + (deinflections.rules & deinflection.rules_in != 0) and + -- string.endsWith(kana_in) + (substr(string, length(string) - length(kana_in) + 1) = kana_in) and + -- can't deconjugate to length <1 + (length(string) > 0) + limit 100 -- failsafe to catch any infinite loops ) select - term.id, - term.expression, - 
term.reading, - deinflections.tags || ' ' || group_concat(tag.code, ' ') as tags, - deinflections.depth, - rules, - deinflections.original, - deinflections.term + term.id as id, -- term group id + term.expression as expression, -- term writing (kanji) + term.reading as reading, -- term reading (kana) + deinflections.tags || ' ' || group_concat(tag.code, ' ') as tags, -- deinflection reasons + dictionary tags + deinflections.depth as depth, -- deinflection depth + rules as rules, -- deinflection rules (bitmask) + deinflections.original as original, -- original input + deinflections.string as deinflected -- deinflected input (search term) from deinflections - inner join term on (term.expression = deinflections.term) or (term.reading = deinflections.term) + inner join term on (term.expression = deinflections.string) or (term.reading = deinflections.string) left join term_tag on term_tag.term_id = term.id left join tag on term_tag.tag_id = tag.id group by term.id, deinflections.original, deinflections.rules having term.id is not null ) -select +select -- STEP 4: clean up and join `sort_overlay` results.id, results.expression, results.reading, diff --git a/readme.md b/readme.md index 1960076..7cde012 100644 --- a/readme.md +++ b/readme.md @@ -15,23 +15,6 @@ API~, and aims to provide extra features that help with immersion and sentence mining on top of the base word lookup that Yomichan provided. 
**While Yomikun's scope is larger than Yomichan, it's still focused on Japanese only.** -## TODO - -- [x] working proof of concept sentence lookup using deno/sqlite3 -- [ ] port dictionaries for more advanced testing - - [x] JMdict (WIP) - - [ ] JMNedict -- [ ] add separate kanji readings/info table -- [ ] add separate frequency dictionary -- [x] add more deinflections to db/deinflections.sql -- [x] set up unit tests for sentence reading generation -- [x] port server-internal API to simple HTTP JSON API -- [ ] create primitive search page ui -- [ ] add code formatter config -- [ ] complete documentation -- [ ] remove makefiles for database initialization -- [ ] replace .sql script files with typescript sql query generation library - ## ~New features (from Yomichan)~ NONE OF THESE ARE IMPLEMENTED YET @@ -52,14 +35,15 @@ NONE OF THESE ARE IMPLEMENTED YET ## Documentation -Some general documentation is done in markdown, but other general documentation -should be done in JSDoc format in the corresponding code files. The -documentation also makes frequent references to, and uses terminology from [Tae -Kim's Japanese grammar guide][taekim], which is abbreviated to [taekim] instead -of copying the link into the source code each time. Tae Kim uses slightly -different terms for grammatical concepts. The 'Tae Kim-version' of these terms -is used for named constants in code. See [tags.ts](language/tags.ts) for an -overview of relevant grammatical terms for the Yomikun parser. +Some general project structure documentation is done in markdown, but other +specific documentation (on algorithms, data structures, etc.) should be done in +JSDoc format in the corresponding code files. The documentation also makes +frequent references to, and uses terminology from [Tae Kim's Japanese grammar +guide][taekim], which is abbreviated to [taekim] instead of copying the link +into the source code each time. Tae Kim uses slightly different terms for +grammatical concepts. 
The 'Tae Kim-version' of these terms is used for named +constants in code. See [tags.ts](search/tags.ts) for an overview of relevant +grammatical terms for the Yomikun parser. ## The dream diff --git a/temp/test.ts b/temp/test.ts new file mode 100644 index 0000000..5d5dc68 --- /dev/null +++ b/temp/test.ts @@ -0,0 +1,7 @@ +import Yomikun from "../api/yomikun.ts"; +import DirectCoreClient from "../core/direct/client.ts"; +var api = new Yomikun(new DirectCoreClient()); +await api.ready; +var sentence = await api.sentence("コツとかあんの?"); +console.log(sentence.words); + diff --git a/test/reading/cases.ts b/test/reading/cases.ts index bb7b6c2..3ebac9b 100644 --- a/test/reading/cases.ts +++ b/test/reading/cases.ts @@ -5914,7 +5914,7 @@ export default [ { input: "移り住んだその年の秋には種蒔きをしとかなきゃ", reading: "うつりすんだそのとしのあきにはたねまきをしとかなきゃ", - output: "[移](うつ)り[住](す)んだその[年](とし)の[秋](あき)には[種蒔](たねま)きをしとかなきゃ", + output: "[移](うつ)り[住](す)んだその[年](ねん)の[秋](あき)には[種蒔](たねま)きをしとかなきゃ", tags: [ "ヴィンランドサガ" ] }, { @@ -7273,10 +7273,10 @@ export default [ output: "あなたにできる[唯一](ゆいいつ)のことです", tags: [ "見える子ちゃん" ] }, - { + { // NOTE: くんえんざい is normally written as 燻煙剤 (this term is only listed correctly in my jp-jp dictionaries) input: "くん煙剤です。大量に焚いときました。", - reading: "くんけむりざいです。たいりょうにたいときました。", - output: "くん[煙](けむり)[剤](ざい)です。[大量](たいりょう)に[焚](た)いときました。", + reading: "くんえんざいです。たいりょうにたいときました。", + output: "くん[煙](えん)[剤](ざい)です。[大量](たいりょう)に[焚](た)いときました。", tags: [ "見える子ちゃん" ] }, { @@ -9476,4 +9476,22 @@ export default [ output: "ぜひ[目次](もくじ)を[見](み)ながら[欲](ほ)しいものを[探](さが)していってもらえると[嬉](うれ)しいです", tags: [ "YouTube", "トバログ" ] }, + { // エロマンガ先生 episode 07 @ 13:13 (test for 点 + くらい as separate words) + input: "百万点くらい面白かったです!", + reading: "ひゃくまんてんくらいおもしろかったです!", + output: "[百万](ひゃくまん)[点](てん)くらい[面白](おもしろ)かったです!", + tags: [ "エロマンガ先生" ], + }, + { // 無職転生 season 2 episode 3 @ 16:06 (test for 〜じまう -> しまう) + input: "とりあえず 今日は飲め 死ぬほど飲んじまえ", + reading: "とりあえず きょうはのめ しぬほどのんじまえ", + output: "とりあえず [今日](きょう)は[飲](の)め [死](し)ぬほど[飲](の)んじまえ", + tags: [ 
"無職転生:異世界行ったら本気だす" ], + }, + { // よふかしのうた episode 02 @ 02:14 (test for しまえ -> 閉める +imperative) + input: "うおー びっくりした しまえ しまえ 早く", + reading: "うおー びっくりした しまえ しまえ 早く", + output: "うおー びっくりした しまえ しまえ 早く", + tags: [ "よふかしのうた" ], + }, ] satisfies Test[]; diff --git a/test/reading/test.ts b/test/reading/test.ts index d74228c..42aa43c 100644 --- a/test/reading/test.ts +++ b/test/reading/test.ts @@ -4,7 +4,6 @@ import cases from "./cases.ts"; cases.forEach(({input, output}, i) => { // if (i != 1) return; Deno.test(`Sentence reading ${formatCaseIndex(i, cases.length)} - ${input}`, async () => { - // TODO: use domain/series tags var sentence = await api.sentence(input); assertStrDiff(output, sentence.furigana("refold-tools")); }); diff --git a/todo.md b/todo.md new file mode 100644 index 0000000..6840877 --- /dev/null +++ b/todo.md @@ -0,0 +1,150 @@ +# generic list of concrete todo items that don't need further consideration + +## 0.0.1 (standalone API) + +- [x] working proof of concept sentence lookup using deno/sqlite3 +- [ ] port dictionaries for more advanced testing + - [x] JMdict (WIP) + - [ ] JMNedict +- [x] add more deinflections to db/deinflections.sql +- [x] set up unit tests for sentence reading generation +- [x] port server-internal API to simple HTTP JSON API +- [ ] [improve DB schema](#how-to-store-multiple-readingswritings-in-db) +- [ ] finish [API examples](examples/readme.md) +- [ ] remove makefiles for database initialization +- [ ] add separate kanji readings/info table +- [ ] add separate frequency dictionary +- [ ] complete documentation +- [ ] add code formatter config +- [ ] ~replace .sql script files with typescript sql query generation library~ ([the problem](https://www.reddit.com/r/Deno/comments/ss6568/alternative_to_knexjs_on_deno/)) + +## 0.1.0 (front-end UI) + +- [ ] create primitive search page ui + +## always + +- [ ] improve sentence parser accuracy + - [ ] have the parser recursively explore N shorter terms at each word + found and rank 
resulting possible sentences (by frequency?) + - [ ] use domain-specific tags in reading tests (create domain-specific + dictionaries first) + - [ ] normalize dictionary before import + - [ ] remove "baked" combinations of word + suffix + - [ ] automatically create combinations of kanji replaced by kana as + alternate writings + - [ ] add more deinflections for casual speech and other colloquialisms + +# how to store multiple readings/writings in DB + +## idea 1 + +positives: +- allows multiple alternate readings/writings for terms +- easy to find primary reading or writing for a term +- efficiently stores kana-only words +- allows parser to parse alternatively written words (currently requires manual + typescript intervention to resolve `alt` field back to actual term to get + its tags) + +negatives: +- ~creates duplicates in `text` column for readings of terms with different + kanji but the same reading~ + + I consider this a non-issue because this simplifies the sentence lookup + query. The alternative (a term\<-\>reading/writing reference table) would + save disk space in exchange for processing time and complexity. +- ~unclear how to reference a specific word without using its `term_id` (which + can vary from user to user when different dictionaries are installed), or + *what identifies a unique term in this case?*~ + + `user.sort_overlay` needs to be able to uniquely identify a `term_id`, but + also needs to be in-/exportable by users with different imported dictionaries + (ideally with minimal failure). + + things to consider: + + options: + - ~just use (primary) writing only~ + + this doesn't work for terms with multiple readings to distinguish between + meanings, e.g. 
+ 人気ひとけ/人気にんき + - ~identify as "term with text X and another text Y"~ + + this feels like a janky solution but is what is currently being used, where + X is always the default way of writing and Y the default reading + - directly reference `term_id` in `user.sort_overlay` and convert to matching + all known readings/writings at time of export/import + + good: + + - faster `user.sort_overlay` lookups + - still allows user preference import/exporting + + bad: + + - ~all data in `user.db` becomes useless when `dict.db` is lost or corrupted~ + + `user.sort_overlay` will be moved to `dict.db`, and `user.db` will only + be used for storing (mostly portable) user preferences and identifiers + (username, id, etc.). + - importing/exporting will take longer and require more complicated sql code + + +### example tables + +#### readwritings (should have better name) + +(indexes from LSB) +`flags[0]` = primary writing +`flags[1]` = primary reading + +|`id`|`term_id`|`text`|`flags`| +|-|-|-|-| +|1|1|繰り返す|1| +|2|1|くり返す|0| +|3|1|繰返す|0| +|4|1|繰りかえす|0| +|5|1|くりかえす|2| +|6|2|変える|1| +|7|2|かえる|2| +|8|3|帰る|1| +|9|3|かえる|2| +|10|4|にやにや|3| + +# how/where to deal with irregular readings + +WIP + +ideally one way of storing reading exceptions for: + +- 来る + する (conjugation-dependent) +- 入る as (はいる) or (いる) (not sure about this one?) +- counters (counter type + amount specific) +- numbers (exceptions for certain powers of 10) + +# way to link expressions to a list of conjugated terms + +WIP + +this may be useful for dictionaries that provide meanings for expressions but +don't provide readings for those expressions? (新和英大辞典 has some of these) + +examples: +- 村長選 -> 村長 + 選[suf] +- 花より団子 -> 花 + より[grammar] + 団子 + +# random thoughts + +this project has 0 planning so here's a list of things that may eventually need +some thought + +- how can a series-specific dictionary also 'encourage' the use of another + domain-specific category? (e.g. 
anime about programming makes computer domain + specific terms rank slightly higher or something?) +- maybe have a mode in the front-end that captures preedit text from a user + typing japanese text to infer readings of kanji, or rank different terms + slightly higher? (using [compositionupdate + events](https://developer.mozilla.org/en-US/docs/Web/API/Element/compositionupdate_event)) + -- cgit v1.2.3