aboutsummaryrefslogtreecommitdiff
path: root/db/find.sql
diff options
context:
space:
mode:
Diffstat (limited to 'db/find.sql')
-rw-r--r--db/find.sql119
1 files changed, 63 insertions, 56 deletions
diff --git a/db/find.sql b/db/find.sql
index 1d31217..d678a15 100644
--- a/db/find.sql
+++ b/db/find.sql
@@ -1,74 +1,81 @@
--- this statement is prepared and run using :term and :user as inputs (see
+-- this statement is prepared and run using :term and :user as input_substrings (see
-- db.ts or test/find)
-- this file is kind of messy because it needs to be one large query, instead
-- of separate phases creating temporary tables. queries with more than one
-- statement can't return results because of the way sqlite3 works.
+--
+-- the innermost `with` clause contains step 1, the outermost contains step 3,
+-- the last select being step 4. the steps are:
+-- 1. Create all possible substrings from input `:term`
+-- 2. Apply deconjugation rules recursively to `:term`
+-- 3. Find terms matching deconjugations
+-- 4. Clean up and add sort_overlay
--- TODO: add more comments in this file to explain what is going on
-
--- explain query plan -- testing only
-with results(id, expression, reading, tags, depth, rules, original, deinflected) as (
- -- stripped deinflection table (remove some columns and duplicates)
- with deinflections(term, tags, depth, original, rules) as (
- -- recursively generated deinflection table
- with deinflect(length, term, tags, rules, rules_in, rules_out, depth) as (
- -- input term all substrings until length 1
- with inputs(length, term, tags, rules, rules_in, rules_out, depth) as (
- select length(:term), :term, '', -1, 0, 0, 0
- union
- select
- inputs.length - 1,
- substr(inputs.term, 1, inputs.length - 1),
- inputs.tags,
- inputs.rules,
- inputs.rules_in,
- inputs.rules_out,
- inputs.depth
- from inputs
- where inputs.length > 1
- )
- select * from inputs
- union -- join all recursive rows into one large table
- select
- deinflect.length,
- substr(deinflect.term, 1, length(deinflect.term)-length(deinflection.kana_in)) || deinflection.kana_out,
- deinflect.tags || ' ' || deinflection.tag, -- parsed to TokenTag[] on (sql) client-side
- deinflection.rules_out,
- deinflection.rules_in,
- deinflect.rules,
- deinflect.depth + 1
- from deinflect -- temp table
- inner join deinflection -- deinflection rules table
- on
- -- rules_in has to contain any of the current deconjugation rules
- (deinflect.rules & deinflection.rules_in != 0) and
- -- term.endsWith(kana_in)
- (substr(term, length(term) - length(kana_in) + 1) = kana_in) and
- -- can't deconjugate to length <1
- (length(term) > 0)
- limit 100 -- failsafe to catch any infinite loops
+-- STEP 3: match deconjugated strings to terms
+with results as (
+ -- STEP 2: apply definitions recursively
+ with deinflections as (
+ -- STEP 1: create all possible substrings from start of `:term`
+ with input_substrings as (
+ select -- first row of recursive query
+ length(:term) as length, -- original term length
+ :term as string -- original term
+ union
+ select -- recursive rows
+ input_substrings.length - 1, -- length = length - 1
+ substr(input_substrings.string, 1, input_substrings.length - 1) -- remove last character
+ from input_substrings
+ where input_substrings.length > 1 -- until only one character remains
)
- select term, tags, depth, substr(:term, 1, deinflect.length), rules
- from deinflect
+ select
+ input_substrings.length as length, -- input length
+ input_substrings.string as string, -- input substring
+ input_substrings.string as original, -- input substring (stays unmodified)
+ '' as tags, -- space-separated deinflection tag list
+ -1 as rules, -- bitmask rules (limits order of applied deinflection rules)
+ 0 as rules_in, -- allowed rules for current deinflection step
+ 0 as rules_out, -- replacement rules for next deinflection step
+ 0 as depth -- amount of applied deconjugations (steps)
+ from input_substrings
+ union
+ select
+ deinflections.length, -- length (passthrough)
+ substr(deinflections.string, 1, length(deinflections.string)-length(deinflection.kana_in)) || deinflection.kana_out, -- replace kana_in with kana_out
+ deinflections.original, -- original (passthrough)
+ deinflections.tags || ' ' || deinflection.tag, -- append deinflection reason tag(s)
+ deinflection.rules_out, -- get next rules (pass rules_out to rules of next iteration)
+ deinflection.rules_in, -- get rules_in
+ deinflections.rules, -- rules (passthrough)
+ deinflections.depth + 1 -- increment depth
+ from deinflections -- recursive table
+ inner join deinflection -- internal deinflection rules table
+ on
+ -- rules_in has to contain any of the current deconjugation rules
+ (deinflections.rules & deinflection.rules_in != 0) and
+ -- string.endsWith(kana_in)
+ (substr(string, length(string) - length(kana_in) + 1) = kana_in) and
+ -- can't deconjugate to length <1
+ (length(string) > 0)
+ limit 100 -- failsafe to catch any infinite loops
)
select
- term.id,
- term.expression,
- term.reading,
- deinflections.tags || ' ' || group_concat(tag.code, ' ') as tags,
- deinflections.depth,
- rules,
- deinflections.original,
- deinflections.term
+ term.id as id, -- term group id
+ term.expression as expression, -- term writing (kanji)
+ term.reading as reading, -- term reading (kana)
+ deinflections.tags || ' ' || group_concat(tag.code, ' ') as tags, -- deinflection reasons + dictionary tags
+ deinflections.depth as depth, -- deinflection depth
+ rules as rules, -- deinflection rules (bitmask)
+ deinflections.original as original, -- original input
+ deinflections.string as deinflected -- deinflected input (search term)
from deinflections
- inner join term on (term.expression = deinflections.term) or (term.reading = deinflections.term)
+ inner join term on (term.expression = deinflections.string) or (term.reading = deinflections.string)
left join term_tag on term_tag.term_id = term.id
left join tag on term_tag.tag_id = tag.id
group by term.id, deinflections.original, deinflections.rules
having term.id is not null
)
-select
+select -- STEP 4: clean up and join `sort_overlay`
results.id,
results.expression,
results.reading,