diff options
Diffstat (limited to 'db')
-rw-r--r-- | db/find.sql | 119 |
1 files changed, 63 insertions, 56 deletions
diff --git a/db/find.sql b/db/find.sql index 1d31217..d678a15 100644 --- a/db/find.sql +++ b/db/find.sql @@ -1,74 +1,81 @@ --- this statement is prepared and run using :term and :user as inputs (see +-- this statement is prepared and run using :term and :user as input_substrings (see -- db.ts or test/find) -- this file is kind of messy because it needs to be one large query, instead -- of separate phases creating temporary tables. queries with more than one -- statement can't return results because of the way sqlite3 works. +-- +-- the innermost `with` clause contains step 1, the outermost contains step 3, +-- the last select being step 4. the steps are: +-- 1. Create all possible substrings from input `:term` +-- 2. Apply deconjugation rules recursively to `:term` +-- 3. Find terms matching deconjugations +-- 4. Clean up and add sort_overlay --- TODO: add more comments in this file to explain what is going on - --- explain query plan -- testing only -with results(id, expression, reading, tags, depth, rules, original, deinflected) as ( - -- stripped deinflection table (remove some columns and duplicates) - with deinflections(term, tags, depth, original, rules) as ( - -- recursively generated deinflection table - with deinflect(length, term, tags, rules, rules_in, rules_out, depth) as ( - -- input term all substrings until length 1 - with inputs(length, term, tags, rules, rules_in, rules_out, depth) as ( - select length(:term), :term, '', -1, 0, 0, 0 - union - select - inputs.length - 1, - substr(inputs.term, 1, inputs.length - 1), - inputs.tags, - inputs.rules, - inputs.rules_in, - inputs.rules_out, - inputs.depth - from inputs - where inputs.length > 1 - ) - select * from inputs - union -- join all recursive rows into one large table - select - deinflect.length, - substr(deinflect.term, 1, length(deinflect.term)-length(deinflection.kana_in)) || deinflection.kana_out, - deinflect.tags || ' ' || deinflection.tag, -- parsed to TokenTag[] on (sql) client-side - deinflection.rules_out, - deinflection.rules_in, - deinflect.rules, - deinflect.depth + 1 - from deinflect -- temp table - inner join deinflection -- deinflection rules table - on - -- rules_in has to contain any of the current deconjugation rules - (deinflect.rules & deinflection.rules_in != 0) and - -- term.endsWith(kana_in) - (substr(term, length(term) - length(kana_in) + 1) = kana_in) and - -- can't deconjugate to length <1 - (length(term) > 0) - limit 100 -- failsafe to catch any infinite loops +-- STEP 3: match deconjugated strings to terms +with results as ( + -- STEP 2: apply definitions recursively + with deinflections as ( + -- STEP 1: create all possible substrings from start of `:term` + with input_substrings as ( + select -- first row of recursive query + length(:term) as length, -- original term length + :term as string -- original term + union + select -- recursive rows + input_substrings.length - 1, -- length = length - 1 + substr(input_substrings.string, 1, input_substrings.length - 1) -- remove last character + from input_substrings + where input_substrings.length > 1 -- until only one character remains ) - select term, tags, depth, substr(:term, 1, deinflect.length), rules - from deinflect + select + input_substrings.length as length, -- input length + input_substrings.string as string, -- input substring + input_substrings.string as original, -- input substring (stays unmodified) + '' as tags, -- space-separated deinflection tag list + -1 as rules, -- bitmask rules (limits order of applied deinflection rules) + 0 as rules_in, -- allowed rules for current deinflection step + 0 as rules_out, -- replacement rules for next deinflection step + 0 as depth -- amount of applied deconjugations (steps) + from input_substrings + union + select + deinflections.length, -- length (passthrough) + substr(deinflections.string, 1, length(deinflections.string)-length(deinflection.kana_in)) || deinflection.kana_out, -- replace kana_in with kana_out + deinflections.original, -- original (passthrough) + deinflections.tags || ' ' || deinflection.tag, -- append deinflection reason tag(s) + deinflection.rules_out, -- get next rules (pass rules_out to rules of next iteration) + deinflection.rules_in, -- get rules_in + deinflections.rules, -- rules (passthrough) + deinflections.depth + 1 -- increment depth + from deinflections -- recursive table + inner join deinflection -- internal deinflection rules table + on + -- rules_in has to contain any of the current deconjugation rules + (deinflections.rules & deinflection.rules_in != 0) and + -- string.endsWith(kana_in) + (substr(string, length(string) - length(kana_in) + 1) = kana_in) and + -- can't deconjugate to length <1 + (length(string) > 0) + limit 100 -- failsafe to catch any infinite loops ) select - term.id, - term.expression, - term.reading, - deinflections.tags || ' ' || group_concat(tag.code, ' ') as tags, - deinflections.depth, - rules, - deinflections.original, - deinflections.term + term.id as id, -- term group id + term.expression as expression, -- term writing (kanji) + term.reading as reading, -- term reading (kana) + deinflections.tags || ' ' || group_concat(tag.code, ' ') as tags, -- deinflection reasons + dictionary tags + deinflections.depth as depth, -- deinflection depth + rules as rules, -- deinflection rules (bitmask) + deinflections.original as original, -- original input + deinflections.string as deinflected -- deinflected input (search term) from deinflections - inner join term on (term.expression = deinflections.term) or (term.reading = deinflections.term) + inner join term on (term.expression = deinflections.string) or (term.reading = deinflections.string) left join term_tag on term_tag.term_id = term.id left join tag on term_tag.tag_id = tag.id group by term.id, deinflections.original, deinflections.rules having term.id is not null ) -select +select -- STEP 4: clean up and join `sort_overlay` results.id, results.expression, results.reading, |