From 65e7b2260d728a9c747d126f828e90ae34f05b40 Mon Sep 17 00:00:00 2001 From: lonkaars Date: Mon, 10 Jul 2023 18:58:15 +0200 Subject: constrain deinflection test cases more --- db/dict/deinflections.sql | 2 +- db/find.sql | 128 +++++++++++++++++++++++----------------------- 2 files changed, 65 insertions(+), 65 deletions(-) (limited to 'db') diff --git a/db/dict/deinflections.sql b/db/dict/deinflections.sql index 0dbe19c..fd6ffc8 100644 --- a/db/dict/deinflections.sql +++ b/db/dict/deinflections.sql @@ -157,7 +157,7 @@ insert into deinflection_temp values -- obligation -- TODO: manually write these out instead of splitting particle and suffix - ('infl:must infl:tmp:must:res infl:negative', 'だめ', '', 'a', 'ot'), -- built-in negative because だめ can't be deconjugated + ('infl:negative infl:must infl:tmp:must:res', 'だめ', '', 'a', 'ot'), -- built-in negative because だめ can't be deconjugated ('infl:must infl:tmp:must:res', 'いける', '', 'ru', 'ot'), -- はいけない -> positive (stored this way because obligatory could be in past) ('infl:must infl:tmp:must:res', 'なる', '', 'u', 'ot'), -- はならない -> positive ('infl:must', 'は', '', 'ot', 'nt'), -- removes particle (negative -te + は + だめ/いけない/ならない) diff --git a/db/find.sql b/db/find.sql index e2d6ad8..6c8a80e 100644 --- a/db/find.sql +++ b/db/find.sql @@ -9,73 +9,73 @@ -- explain query plan -- testing only with results(id, expression, reading, tags, depth, rules, original, deinflected) as ( - -- stripped deinflection table (remove some columns and duplicates) - with deinflections(term, tags, depth, original, rules) as ( - -- recursively generated deinflection table - with deinflect(length, term, tags, rules, rules_in, rules_out, depth) as ( - -- input term all substrings until length 1 - with inputs(length, term, tags, rules, rules_in, rules_out, depth) as ( - select length(:term), :term, '', -1, 0, 0, 0 - union - select - inputs.length - 1, - substr(inputs.term, 1, inputs.length - 1), - inputs.tags, - inputs.rules, - inputs.rules_in, - inputs.rules_out, - inputs.depth - from inputs - where inputs.length > 1 - ) - select * from inputs - union -- join all recursive rows into one large table - select - deinflect.length, - substr(deinflect.term, 1, length(deinflect.term)-length(deinflection.kana_in)) || deinflection.kana_out, - deinflect.tags || ' ' || deinflection.tag, -- parsed to TokenTag[] on (sql) client-side - deinflection.rules_out, - deinflection.rules_in, - deinflect.rules, - deinflect.depth + 1 - from deinflect -- temp table - inner join deinflection -- deinflection rules table - on - -- rules_in has to contain any of the current deconjugation rules - (deinflect.rules & deinflection.rules_in != 0) and - -- term.endsWith(kana_in) - (substr(term, length(term) - length(kana_in) + 1) = kana_in) and - -- can't deconjugate to length <1 - (length(term) > 0) - limit 100 -- failsafe to catch any infinite loops - ) - select term, tags, depth, substr(:term, 1, deinflect.length), rules - from deinflect - ) - select - term.id, - term.expression, - term.reading, - deinflections.tags || ' ' || group_concat(tag.code, ' ') as tags, - deinflections.depth, - rules, - deinflections.original, + -- stripped deinflection table (remove some columns and duplicates) + with deinflections(term, tags, depth, original, rules) as ( + -- recursively generated deinflection table + with deinflect(length, term, tags, rules, rules_in, rules_out, depth) as ( + -- input term all substrings until length 1 + with inputs(length, term, tags, rules, rules_in, rules_out, depth) as ( + select length(:term), :term, '', -1, 0, 0, 0 + union + select + inputs.length - 1, + substr(inputs.term, 1, inputs.length - 1), + inputs.tags, + inputs.rules, + inputs.rules_in, + inputs.rules_out, + inputs.depth + from inputs + where inputs.length > 1 + ) + select * from inputs + union -- join all recursive rows into one large table + select + deinflect.length, + substr(deinflect.term, 1, length(deinflect.term)-length(deinflection.kana_in)) || deinflection.kana_out, + deinflect.tags || ' ' || deinflection.tag, -- parsed to TokenTag[] on (sql) client-side + deinflection.rules_out, + deinflection.rules_in, + deinflect.rules, + deinflect.depth + 1 + from deinflect -- temp table + inner join deinflection -- deinflection rules table + on + -- rules_in has to contain any of the current deconjugation rules + (deinflect.rules & deinflection.rules_in != 0) and + -- term.endsWith(kana_in) + (substr(term, length(term) - length(kana_in) + 1) = kana_in) and + -- can't deconjugate to length <1 + (length(term) > 0) + limit 100 -- failsafe to catch any infinite loops + ) + select term, tags, depth, substr(:term, 1, deinflect.length), rules + from deinflect + ) + select + term.id, + term.expression, + term.reading, + deinflections.tags || ' ' || group_concat(tag.code, ' ') as tags, + deinflections.depth, + rules, + deinflections.original, deinflections.term - from deinflections - inner join term on (term.expression = deinflections.term) or (term.reading = deinflections.term) - inner join term_tag on term_tag.term_id = term.id - inner join tag on term_tag.tag_id = tag.id - group by term.id, deinflections.original, deinflections.rules - having term.id is not null + from deinflections + inner join term on (term.expression = deinflections.term) or (term.reading = deinflections.term) + inner join term_tag on term_tag.term_id = term.id + inner join tag on term_tag.tag_id = tag.id + group by term.id, deinflections.original, deinflections.rules + having term.id is not null ) select - results.id, - results.expression, - results.reading, - results.tags, - group_concat(deinflection_rules.tag, ' ') as rules, - results.depth, - results.original, + results.id, + results.expression, + results.reading, + results.tags, + group_concat(deinflection_rules.tag, ' ') as rules, + results.depth, + results.original, results.deinflected, root_overlay.sort as root_overlay, user_overlay.sort as user_overlay -- cgit v1.2.3