author    lonkaars <loek@pipeframe.xyz>  2023-08-06 15:01:13 +0200
committer lonkaars <loek@pipeframe.xyz>  2023-08-06 15:01:13 +0200
commit    e14b315a088d375b80e46978bd769c7d13e71001 (patch)
tree      d62a36d47b9865a6c38273d07c682e3082e4b86c
parent    578af2663f9094f8b852d32cf16868b37518d648 (diff)
 db/find.sql           | 119
 readme.md             |  34
 temp/test.ts          |   7
 test/reading/cases.ts |  26
 test/reading/test.ts  |   1
 todo.md               | 150
 6 files changed, 251 insertions(+), 86 deletions(-)
diff --git a/db/find.sql b/db/find.sql
index 1d31217..d678a15 100644
--- a/db/find.sql
+++ b/db/find.sql
@@ -1,74 +1,81 @@
 -- this statement is prepared and run using :term and :user as input_substrings (see
 -- db.ts or test/find)
 
 -- this file is kind of messy because it needs to be one large query, instead
 -- of separate phases creating temporary tables. queries with more than one
 -- statement can't return results because of the way sqlite3 works.
+--
+-- the innermost `with` clause contains step 1, the outermost contains step 3,
+-- the last select being step 4. the steps are:
+-- 1. Create all possible substrings from input `:term`
+-- 2. Apply deconjugation rules recursively to `:term`
+-- 3. Find terms matching deconjugations
+-- 4. Clean up and add sort_overlay
 
--- TODO: add more comments in this file to explain what is going on
-
--- explain query plan -- testing only
-with results(id, expression, reading, tags, depth, rules, original, deinflected) as (
-	-- stripped deinflection table (remove some columns and duplicates)
-	with deinflections(term, tags, depth, original, rules) as (
-		-- recursively generated deinflection table
-		with deinflect(length, term, tags, rules, rules_in, rules_out, depth) as (
-			-- input term all substrings until length 1
-			with inputs(length, term, tags, rules, rules_in, rules_out, depth) as (
-				select length(:term), :term, '', -1, 0, 0, 0
-				union
-				select
-					inputs.length - 1,
-					substr(inputs.term, 1, inputs.length - 1),
-					inputs.tags,
-					inputs.rules,
-					inputs.rules_in,
-					inputs.rules_out,
-					inputs.depth
-				from inputs
-				where inputs.length > 1
-			)
-			select * from inputs
-			union -- join all recursive rows into one large table
-			select
-				deinflect.length,
-				substr(deinflect.term, 1, length(deinflect.term)-length(deinflection.kana_in)) || deinflection.kana_out,
-				deinflect.tags || ' ' || deinflection.tag, -- parsed to TokenTag[] on (sql) client-side
-				deinflection.rules_out,
-				deinflection.rules_in,
-				deinflect.rules,
-				deinflect.depth + 1
-			from deinflect -- temp table
-			inner join deinflection -- deinflection rules table
-			on
-				-- rules_in has to contain any of the current deconjugation rules
-				(deinflect.rules & deinflection.rules_in != 0) and
-				-- term.endsWith(kana_in)
-				(substr(term, length(term) - length(kana_in) + 1) = kana_in) and
-				-- can't deconjugate to length <1
-				(length(term) > 0)
-			limit 100 -- failsafe to catch any infinite loops
+-- STEP 3: match deconjugated strings to terms
+with results as (
+	-- STEP 2: apply definitions recursively
+	with deinflections as (
+		-- STEP 1: create all possible substrings from start of `:term`
+		with input_substrings as (
+			select -- first row of recursive query
+				length(:term) as length, -- original term length
+				:term as string -- original term
+			union
+			select -- recursive rows
+				input_substrings.length - 1, -- length = length - 1
+				substr(input_substrings.string, 1, input_substrings.length - 1) -- remove last character
+			from input_substrings
+			where input_substrings.length > 1 -- until only one character remains
 		)
-		select term, tags, depth, substr(:term, 1, deinflect.length), rules
-		from deinflect
+		select
+			input_substrings.length as length, -- input length
+			input_substrings.string as string, -- input substring
+			input_substrings.string as original, -- input substring (stays unmodified)
+			'' as tags, -- space-separated deinflection tag list
+			-1 as rules, -- bitmask rules (limits order of applied deinflection rules)
+			0 as rules_in, -- allowed rules for current deinflection step
+			0 as rules_out, -- replacement rules for next deinflection step
+			0 as depth -- amount of applied deconjugations (steps)
+		from input_substrings
+		union
+		select
+			deinflections.length, -- length (passthrough)
+			substr(deinflections.string, 1, length(deinflections.string)-length(deinflection.kana_in)) || deinflection.kana_out, -- replace kana_in with kana_out
+			deinflections.original, -- original (passthrough)
+			deinflections.tags || ' ' || deinflection.tag, -- append deinflection reason tag(s)
+			deinflection.rules_out, -- get next rules (pass rules_out to rules of next iteration)
+			deinflection.rules_in, -- get rules_in
+			deinflections.rules, -- rules (passthrough)
+			deinflections.depth + 1 -- increment depth
+		from deinflections -- recursive table
+		inner join deinflection -- internal deinflection rules table
+		on
+			-- rules_in has to contain any of the current deconjugation rules
+			(deinflections.rules & deinflection.rules_in != 0) and
+			-- string.endsWith(kana_in)
+			(substr(string, length(string) - length(kana_in) + 1) = kana_in) and
+			-- can't deconjugate to length <1
+			(length(string) > 0)
+		limit 100 -- failsafe to catch any infinite loops
 	)
 	select
-		term.id,
-		term.expression,
-		term.reading,
-		deinflections.tags || ' ' || group_concat(tag.code, ' ') as tags,
-		deinflections.depth,
-		rules,
-		deinflections.original,
-		deinflections.term
+		term.id as id, -- term group id
+		term.expression as expression, -- term writing (kanji)
+		term.reading as reading, -- term reading (kana)
+		deinflections.tags || ' ' || group_concat(tag.code, ' ') as tags, -- deinflection reasons + dictionary tags
+		deinflections.depth as depth, -- deinflection depth
+		rules as rules, -- deinflection rules (bitmask)
+		deinflections.original as original, -- original input
+		deinflections.string as deinflected -- deinflected input (search term)
 	from deinflections
-	inner join term on (term.expression = deinflections.term) or (term.reading = deinflections.term)
+	inner join term on (term.expression = deinflections.string) or (term.reading = deinflections.string)
 	left join term_tag on term_tag.term_id = term.id
 	left join tag on term_tag.tag_id = tag.id
 	group by term.id, deinflections.original, deinflections.rules
 	having term.id is not null
 )
-select
+select -- STEP 4: clean up and join `sort_overlay`
 	results.id,
 	results.expression,
 	results.reading,
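For readers less used to recursive CTEs, the following is a rough TypeScript equivalent of steps 1 and 2 of the query above. The single rule entry is a made-up stand-in; the real rules live in the `deinflection` table built by db/deinflections.sql, and the actual implementation is the SQL itself.

```ts
interface Rule {
	kanaIn: string;   // conjugated ending to match
	kanaOut: string;  // ending it deconjugates to
	tag: string;      // deinflection reason tag
	rulesIn: number;  // bitmask: which previous rules this one may follow
	rulesOut: number; // bitmask: rules allowed after this one
}

// hypothetical single-entry rule table (て-form contraction example)
const rules: Rule[] = [
	{ kanaIn: "んで", kanaOut: "む", tag: "infl:te", rulesIn: -1, rulesOut: 0b1 },
];

// STEP 1: all prefixes of the input, longest first ("飲んで" -> ["飲んで", "飲ん", "飲"])
function inputSubstrings(term: string): string[] {
	return [...term].map((_, i) => term.slice(0, term.length - i));
}

// STEP 2: recursively replace kana_in with kana_out, like the `deinflections` CTE
function deinflect(str: string, mask = -1, depth = 0): { str: string; depth: number }[] {
	const out = [{ str, depth }];
	if (depth >= 10) return out; // failsafe, like `limit 100` in the SQL
	for (const rule of rules) {
		// rules_in has to contain any of the current deconjugation rules
		if ((mask & rule.rulesIn) === 0) continue;
		// string.endsWith(kana_in), and don't deconjugate the whole string away
		if (!str.endsWith(rule.kanaIn) || str.length <= rule.kanaIn.length) continue;
		const next = str.slice(0, -rule.kanaIn.length) + rule.kanaOut;
		out.push(...deinflect(next, rule.rulesOut, depth + 1));
	}
	return out;
}

// every substring is a deconjugation candidate ("飲んで" also yields "飲む"):
console.log(inputSubstrings("飲んでいた").flatMap((s) => deinflect(s)));
```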
diff --git a/readme.md b/readme.md
--- a/readme.md
+++ b/readme.md
@@ -15,23 +15,6 @@
 API~, and aims to provide extra features that help with immersion and sentence
 mining on top of the base word lookup that Yomichan provided.
 **While Yomikun's scope is larger than Yomichan, it's still focused on Japanese
 only.**
-
-## TODO
-
-- [x] working proof of concept sentence lookup using deno/sqlite3
-- [ ] port dictionaries for more advanced testing
-  - [x] JMdict (WIP)
-  - [ ] JMNedict
-- [ ] add separate kanji readings/info table
-- [ ] add separate frequency dictionary
-- [x] add more deinflections to db/deinflections.sql
-- [x] set up unit tests for sentence reading generation
-- [x] port server-internal API to simple HTTP JSON API
-- [ ] create primitive search page ui
-- [ ] add code formatter config
-- [ ] complete documentation
-- [ ] remove makefiles for database initialization
-- [ ] replace .sql script files with typescript sql query generation library
 
 ## ~New features (from Yomichan)~
 
 NONE OF THESE ARE IMPLEMENTED YET
@@ -52,14 +35,15 @@ NONE OF THESE ARE IMPLEMENTED YET
 
 ## Documentation
 
-Some general documentation is done in markdown, but other general documentation
-should be done in JSDoc format in the corresponding code files. The
-documentation also makes frequent references to, and uses terminology from [Tae
-Kim's Japanese grammar guide][taekim], which is abbreviated to [taekim] instead
-of copying the link into the source code each time. Tae Kim uses slightly
-different terms for grammatical concepts. The 'Tae Kim-version' of these terms
-is used for named constants in code. See [tags.ts](language/tags.ts) for an
-overview of relevant grammatical terms for the Yomikun parser.
+Some general project structure documentation is done in markdown, but other
+specific documentation (on algorithms, data structures, etc.) should be done in
+JSDoc format in the corresponding code files. The documentation also makes
+frequent references to, and uses terminology from [Tae Kim's Japanese grammar
+guide][taekim], which is abbreviated to [taekim] instead of copying the link
+into the source code each time. Tae Kim uses slightly different terms for
+grammatical concepts. The 'Tae Kim-version' of these terms is used for named
+constants in code. See [tags.ts](search/tags.ts) for an overview of relevant
+grammatical terms for the Yomikun parser.
 
 ## The dream
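To make the convention in the rewritten Documentation section concrete, a hypothetical snippet in that style might look like the following. The constant names here are invented for illustration and are not taken from the actual tags.ts; only the convention (JSDoc on the code, [taekim] terminology for named constants) is from the readme.

```ts
/** Grammatical role tags, named using [taekim] terminology. */
const enum TokenTag {
	/** [taekim]: "conjunctive form" (often called 連用形 or masu-stem elsewhere) */
	CONJUNCTIVE = "infl:conjunctive",
	/** [taekim]: "negative form" (〜ない) */
	NEGATIVE = "infl:negative",
}

/**
 * Checks whether a token carries a given grammatical tag.
 * @param tags space-separated tag list, as produced by db/find.sql
 * @param tag the [taekim]-style tag to look for
 */
function hasTag(tags: string, tag: TokenTag): boolean {
	return tags.split(" ").includes(tag);
}
```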
diff --git a/temp/test.ts b/temp/test.ts
new file mode 100644
index 0000000..5d5dc68
--- /dev/null
+++ b/temp/test.ts
@@ -0,0 +1,7 @@
+import Yomikun from "../api/yomikun.ts";
+import DirectCoreClient from "../core/direct/client.ts";
+var api = new Yomikun(new DirectCoreClient());
+await api.ready;
+var sentence = await api.sentence("コツとかあんの?");
+console.log(sentence.words);
+
diff --git a/test/reading/cases.ts b/test/reading/cases.ts
index bb7b6c2..3ebac9b 100644
--- a/test/reading/cases.ts
+++ b/test/reading/cases.ts
@@ -5914,7 +5914,7 @@ export default [
 	{
 		input: "移り住んだその年の秋には種蒔きをしとかなきゃ",
 		reading: "うつりすんだそのとしのあきにはたねまきをしとかなきゃ",
-		output: "[移](うつ)り[住](す)んだその[年](とし)の[秋](あき)には[種蒔](たねま)きをしとかなきゃ",
+		output: "[移](うつ)り[住](す)んだその[年](ねん)の[秋](あき)には[種蒔](たねま)きをしとかなきゃ",
 		tags: [ "ヴィンランドサガ" ]
 	},
 	{
@@ -7273,10 +7273,10 @@
 		output: "あなたにできる[唯一](ゆいいつ)のことです",
 		tags: [ "見える子ちゃん" ]
 	},
-	{
+	{ // NOTE: くんえんざい is normally written as 燻煙剤 (this term is only listed correctly in my jp-jp dictionaries)
 		input: "くん煙剤です。大量に焚いときました。",
-		reading: "くんけむりざいです。たいりょうにたいときました。",
-		output: "くん[煙](けむり)[剤](ざい)です。[大量](たいりょう)に[焚](た)いときました。",
+		reading: "くんえんざいです。たいりょうにたいときました。",
+		output: "くん[煙](えん)[剤](ざい)です。[大量](たいりょう)に[焚](た)いときました。",
 		tags: [ "見える子ちゃん" ]
 	},
 	{
@@ -9476,4 +9476,22 @@
 		output: "ぜひ[目次](もくじ)を[見](み)ながら[欲](ほ)しいものを[探](さが)していってもらえると[嬉](うれ)しいです",
 		tags: [ "YouTube", "トバログ" ]
 	},
+	{ // エロマンガ先生 episode 07 @ 13:13 (test for 点 + くらい as separate words)
+		input: "百万点くらい面白かったです!",
+		reading: "ひゃくまんてんくらいおもしろかったです!",
+		output: "[百万](ひゃくまん)[点](てん)くらい[面白](おもしろ)かったです!",
+		tags: [ "エロマンガ先生" ],
+	},
+	{ // 無職転生 season 2 episode 3 @ 16:06 (test for 〜じまう -> しまう)
+		input: "とりあえず 今日は飲め 死ぬほど飲んじまえ",
+		reading: "とりあえず きょうはのめ しぬほどのんじまえ",
+		output: "とりあえず [今日](きょう)は[飲](の)め [死](し)ぬほど[飲](の)んじまえ",
+		tags: [ "無職転生:異世界行ったら本気だす" ],
+	},
+	{ // よふかしのうた episode 02 @ 02:14 (test for しまえ -> 閉める +imperative)
+		input: "うおー びっくりした しまえ しまえ 早く",
+		reading: "うおー びっくりした しまえ しまえ 早く",
+		output: "うおー びっくりした しまえ しまえ 早く",
+		tags: [ "よふかしのうた" ],
+	},
 ] satisfies Test[];
diff --git a/test/reading/test.ts b/test/reading/test.ts
index d74228c..42aa43c 100644
--- a/test/reading/test.ts
+++ b/test/reading/test.ts
@@ -4,7 +4,6 @@ import cases from "./cases.ts";
 cases.forEach(({input, output}, i) => {
 	// if (i != 1) return;
 	Deno.test(`Sentence reading ${formatCaseIndex(i, cases.length)} - ${input}`, async () => {
-		// TODO: use domain/series tags
 		var sentence = await api.sentence(input);
 		assertStrDiff(output, sentence.furigana("refold-tools"));
 	});
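The new cases follow the reading-test shape visible in the diff above. The sketch below spells out that shape (the real `Test` type lives in the test code and may have more fields) together with a small sanity check on the `[kanji](kana)` furigana notation: stripping the readings must give back the input sentence.

```ts
// shape implied by the test cases above (assumed, not copied from the repo)
interface Test {
	input: string;    // raw sentence
	reading: string;  // expected full-kana reading
	output: string;   // expected furigana in [kanji](kana) notation
	tags: string[];   // source tags (series the sentence was taken from)
}

// removing the (kana) readings and brackets should yield the input again
function furiganaToPlain(output: string): string {
	return output.replace(/\[([^\]]+)\]\(([^)]+)\)/g, "$1");
}

const example: Test = {
	input: "百万点くらい面白かったです!",
	reading: "ひゃくまんてんくらいおもしろかったです!",
	output: "[百万](ひゃくまん)[点](てん)くらい[面白](おもしろ)かったです!",
	tags: ["エロマンガ先生"],
};
console.log(furiganaToPlain(example.output) === example.input); // true
```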
diff --git a/todo.md b/todo.md
new file mode 100644
--- /dev/null
+++ b/todo.md
@@ -0,0 +1,150 @@
+# generic list of concrete todo items that don't need further consideration
+
+## 0.0.1 (standalone API)
+
+- [x] working proof of concept sentence lookup using deno/sqlite3
+- [ ] port dictionaries for more advanced testing
+  - [x] JMdict (WIP)
+  - [ ] JMNedict
+- [x] add more deinflections to db/deinflections.sql
+- [x] set up unit tests for sentence reading generation
+- [x] port server-internal API to simple HTTP JSON API
+- [ ] [improve DB schema](#how-to-store-multiple-readingswritings-in-db)
+- [ ] finish [API examples](examples/readme.md)
+- [ ] remove makefiles for database initialization
+- [ ] add separate kanji readings/info table
+- [ ] add separate frequency dictionary
+- [ ] complete documentation
+- [ ] add code formatter config
+- [ ] ~replace .sql script files with typescript sql query generation library~ ([the problem](https://www.reddit.com/r/Deno/comments/ss6568/alternative_to_knexjs_on_deno/))
+
+## 0.1.0 (front-end UI)
+
+- [ ] create primitive search page ui
+
+## always
+
+- [ ] improve sentence parser accuracy
+  - [ ] have the parser recursively explore N shorter terms at each word
+    found and rank resulting possible sentences (by frequency?)
+  - [ ] use domain-specific tags in reading tests (create domain-specific
+    dictionaries first)
+  - [ ] normalize dictionary before import
+    - [ ] remove "baked" combinations of word + suffix
+    - [ ] automatically create combinations of kanji replaced by kana as
+      alternate writings
+  - [ ] add more deinflections for casual speech and other colloquialisms
+
+# how to store multiple readings/writings in DB
+
+## idea 1
+
+positives:
+- allows multiple alternate readings/writings for terms
+- easy to find primary reading or writing for a term
+- efficiently stores kana-only words
+- allows parser to parse alternatively written words (currently requires manual
+  typescript intervention to resolve `alt` field back to actual term to get
+  it's tags)
+
+negatives:
+- ~creates duplicates in `text` column for readings of terms with different
+  kanji but the same reading~
+
+  I consider this a non-issue because this simplifies the sentence lookup
+  query. The alternative (a term\<-\>reading/writing reference table) would
+  save disk space in exchange for processing time and complexity.
+- ~unclear how to reference a specific word without using it's `term_id` (which
+  can vary from user to user when different dictionaries are installed), or
+  *what identifies a unique term in this case?*~
+
+  `user.sort_overlay` needs to be able to uniquely identify a `term_id`, but
+  also needs to be in-/exportable by users with different imported dictionaries
+  (ideally with minimal failure).
+
+  things to consider:
+
+  options:
+  - ~just use (primary) writing only~
+
+    this doesn't work for terms with multiple readings to distinguish between
+    meanings, e.g.
+    <ruby>人気<rt>ひとけ</rt></ruby>/<ruby>人気<rt>にんき</rt></ruby>
+  - ~identify as "term with text X and another text Y"~
+
+    this feels like a janky solution but is what is currently being used, where
+    X is always the default way of writing and Y the default reading
+  - directly reference `term_id` in `user.sort_overlay` and convert to matching
+    all known readings/writings at time of export/import
+
+    good:
+
+    - faster `user.sort_overlay` lookups
+    - still allows user preference import/exporting
+
+    bad:
+
+    - ~all data in `user.db` becomes useless when `dict.db` is lost or corrupted~
+
+      `user.sort_overlay` will be moved to `dict.db`, and `user.db` will only
+      be used for storing (mostly portable) user preferences and identifiers
+      (username, id, etc.).
+    - importing/exporting will take longer and require more complicated sql code
+
+### example tables
+
+#### readwritings (should have better name)
+
+(indexes from LSB)
+`flags[0]` = primary writing
+`flags[1]` = primary reading
+
+|`id`|`term_id`|`text`|`flags`|
+|-|-|-|-|
+|1|1|繰り返す|1|
+|2|1|くり返す|0|
+|3|1|繰返す|0|
+|4|1|繰りかえす|0|
+|5|1|くりかえす|2|
+|6|2|変える|1|
+|7|2|かえる|2|
+|8|3|帰る|1|
+|9|3|かえる|2|
+|10|4|にやにや|3|
+
+# how/where to deal with irregular readings
+
+WIP
+
+ideally one way of storing reading exceptions for:
+
+- 来る + する (conjugation-dependent)
+- 入る as (はいる) or (いる) (not sure about this one?)
+- counters (counter type + amount specific)
+- numbers (exceptions for certain powers of 10)
+
+# way to link expressions to a list of conjugated terms
+
+WIP
+
+this may be useful for dictionaries that provide meanings for expressions but
+don't provide readings for those expressions? (新和英大辞典 has some of these)
+
+examples:
+- 村長選 -> 村長 + 選[suf]
+- 花より団子 -> 花 + より[grammar] + 団子
+
+# random thoughts
+
+this project has 0 planning so here's a list of things that may eventually need
+some thought
+
+- how can a series-specific dictionary also 'encourage' the use of another
+  domain-specific category? (e.g. anime about programming makes computer domain
+  specific terms rank slightly higher or something?)
+- maybe have a mode in the front-end that captures preedit text from a user
+  typing japanese text to infer readings of kanji, or rank different terms
+  slightly higher? (using [compositionupdate
+  events](https://developer.mozilla.org/en-US/docs/Web/API/Element/compositionupdate_event))
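As a quick illustration of the `flags` bitmask proposed in the `readwritings` sketch above, here is a minimal decoding example. It assumes the bit assignment stays as listed (bit 0 = primary writing, bit 1 = primary reading, indexed from the LSB); the table and column names are from the idea sketch, not from an existing schema.

```ts
const PRIMARY_WRITING = 1 << 0; // `flags[0]` in the table above
const PRIMARY_READING = 1 << 1; // `flags[1]` in the table above

// decode a `flags` value into the roles it marks for one readwritings row
function describeFlags(flags: number): string[] {
	const roles: string[] = [];
	if (flags & PRIMARY_WRITING) roles.push("primary writing");
	if (flags & PRIMARY_READING) roles.push("primary reading");
	return roles;
}

console.log(describeFlags(1)); // 繰り返す   -> [ "primary writing" ]
console.log(describeFlags(2)); // くりかえす -> [ "primary reading" ]
console.log(describeFlags(3)); // にやにや   -> [ "primary writing", "primary reading" ]
```

A kana-only word like にやにや gets both bits set on a single row, which is how this layout stores kana-only terms without a separate writing entry.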