diff options
author | lonkaars <loek@pipeframe.xyz> | 2023-07-10 16:26:13 +0200 |
---|---|---|
committer | lonkaars <loek@pipeframe.xyz> | 2023-07-10 16:26:13 +0200 |
commit | a3a81530a0a30ba02b5253b762e2ccd77d3b01fc (patch) | |
tree | afad1bae0c2f7cb9d4a11b6c1c56bc8bae2f14e5 | |
parent | e430d8cb4a30640298b7fae3c93bc6329e2a0382 (diff) |
small restructuring + all deinflection tests working
-rw-r--r-- | api/sentence.ts | 16 | ||||
-rw-r--r-- | api/word.ts | 8 | ||||
-rw-r--r-- | api/yomikun.ts | 1 | ||||
-rw-r--r-- | core/api.ts | 35 | ||||
-rw-r--r-- | core/http/client.ts | 41 | ||||
-rw-r--r-- | core/http/types.ts | 35 | ||||
-rw-r--r-- | core/raw/api.ts | 30 | ||||
-rw-r--r-- | db/db.ts | 6 | ||||
-rw-r--r-- | db/dict/deinflections.sql | 53 | ||||
-rw-r--r-- | db/find.sql | 2 | ||||
-rw-r--r-- | language/parser.ts | 144 | ||||
-rw-r--r-- | language/types.ts | 49 | ||||
-rw-r--r-- | makefile | 9 | ||||
-rw-r--r-- | readme.md | 2 | ||||
-rw-r--r-- | search/readme.md (renamed from language/readme.md) | 4 | ||||
-rw-r--r-- | search/search.ts | 141 | ||||
-rw-r--r-- | search/tags.ts (renamed from language/tags.ts) | 4 | ||||
-rw-r--r-- | search/types.ts | 60 | ||||
-rw-r--r-- | test/deinflection/cases.ts | 16 | ||||
-rw-r--r-- | test/deinflection/test.ts | 17 | ||||
-rw-r--r-- | util/string.ts | 2 |
21 files changed, 373 insertions, 302 deletions
diff --git a/api/sentence.ts b/api/sentence.ts index cde66a5..1d22be3 100644 --- a/api/sentence.ts +++ b/api/sentence.ts @@ -1,11 +1,11 @@ -import { ParseResult } from "../language/types.ts"; +import { SearchSentenceResult } from "../search/types.ts"; import APIBase from "./base.ts"; import { JapaneseFormatter } from "./japanese.ts"; import Word from "./word.ts"; export default class Sentence extends APIBase { public words: Array<Word> = []; - protected query?: ParseResult; + protected query?: SearchSentenceResult; protected original: string = ""; public ready: Promise<void>; @@ -23,7 +23,7 @@ export default class Sentence extends APIBase { private async fetch(input: string) { this.original = input; - this.query = await (await this.api)["core"].parseSentence(input); + this.query = await (await this.api)["core"].search.sentence(input); await this.updateWords(); this._resolveReady(); } @@ -33,15 +33,15 @@ export default class Sentence extends APIBase { let token = 0; let i = 0; while (i < this.original.length) { - this.words.push(new Word(this.query!.tokens[token]).withParent(await this.api)); + this.words.push(new Word(this.query!.words[token]).withParent(await this.api)); - i += this.query!.tokens[token].source.length; + i += this.query!.words[token].source.length; if (i == this.original.length) break; token++; - // continue if there are no unrecognized gaps between tokens - if (this.query!.tokens[token]?.start == i) continue; - var remainder = this.original.substring(i, this.query!.tokens[token]?.start); + // continue if there are no unrecognized gaps between words + if (this.query!.words[token]?.start == i) continue; + var remainder = this.original.substring(i, this.query!.words[token]?.start); this.words.push(new Word(remainder).withParent(await this.api)); i += remainder.length; diff --git a/api/word.ts b/api/word.ts index b7fc3e6..4dad4a3 100644 --- a/api/word.ts +++ b/api/word.ts @@ -1,10 +1,10 @@ import Glossary from "./glossary.ts"; import APIBase from "./base.ts"; -import { ParseToken } from "../language/types.ts"; import Japanese, { JapaneseFormatter } from "./japanese.ts"; import "../util/string.ts"; -import { Tag, TagGroup } from "../language/tags.ts"; +import { TagGroup } from "../search/tags.ts"; +import { SearchWord } from "../search/types.ts"; export default class Word extends APIBase { /** @prop dictionary form of verb if this word is a verb */ @@ -16,7 +16,7 @@ export default class Word extends APIBase { /** @prop this word represents an unrecognized sentence part between recognized terms */ protected filler: boolean; - constructor(input: string | ParseToken) { + constructor(input: string | SearchWord) { super(); if (typeof input === "string") { this.filler = true; @@ -26,7 +26,7 @@ export default class Word extends APIBase { this.outputKanji = false; } else { this.filler = false; - input = input as ParseToken; + input = input as SearchWord; this.base = new Japanese(input.writing, input.reading); if (input.tags.anyOf(TagGroup.Conjugable as string[])) { var writingCommon = input.writing.cmpLen(input.source); diff --git a/api/yomikun.ts b/api/yomikun.ts index a7f214e..696361f 100644 --- a/api/yomikun.ts +++ b/api/yomikun.ts @@ -1,6 +1,5 @@ import Core from "../core/api.ts"; import RemoteCoreClient from "../core/http/client.ts"; -import { ParseResult } from "../language/types.ts"; import Sentence from "./sentence.ts"; export default class Yomikun { diff --git a/core/api.ts b/core/api.ts index 0720c8b..77195b2 100644 --- a/core/api.ts +++ b/core/api.ts @@ -1,6 +1,33 @@ -import { InputSentenceProps, ParseResult } from "../language/types.ts"; +import { SearchSentenceProps, SearchSentenceResult, SearchTermResult } from "../search/types.ts"; import { DeepPartial } from "../util/types.ts"; +/** @interface serach-related functions */ +export interface CoreSearch { + terms(term: string): Promise<Array<SearchTermResult>>; + sentence(sentence: string, optional?: DeepPartial<SearchSentenceProps>): Promise<SearchSentenceResult>; + // glossary: (input: string) => Promise<void>; +}; + +/** @interface user management */ +export interface CoreUser { + // TODO: list + // TODO: add + // TODO: remove + // TODO: get info +}; + +/** @interface dictionary/user data import functions */ +export interface CoreImport { + // TODO: import dictionary + // TODO: import user preferences +}; + +/** @interface dictionary/user data export functions */ +export interface CoreExport { + // TODO: export dictionary + // TODO: export user preferences +}; + /** * @summary Core interface * @@ -12,7 +39,9 @@ export default abstract class Core { /** @summary resolved when ready */ abstract ready: Promise<void>; - /** @summary parse sentence */ - abstract parseSentence(input: string, options?: DeepPartial<InputSentenceProps>): Promise<ParseResult>; + abstract search: CoreSearch; + abstract user: CoreUser; + abstract import: CoreImport; + abstract export: CoreExport; }; diff --git a/core/http/client.ts b/core/http/client.ts index 6b4e1a3..80f77b3 100644 --- a/core/http/client.ts +++ b/core/http/client.ts @@ -1,10 +1,8 @@ -import { InputSentenceProps } from "../../language/types.ts"; import "../../util/array.ts"; -import { DeepPartial } from "../../util/types.ts"; -import Core from "../api.ts"; +import Core, { CoreExport, CoreImport, CoreSearch, CoreUser } from "../api.ts"; import { ConnectionProps, ConnectionPropsDefault } from "./props.ts"; -import { CoreRequest, CoreRequestParseSentence, CoreResponseParseSentence } from "./types.ts"; +import { CoreRequest, CoreRequestSearchSentence, CoreRequestSearchTerms, CoreResponseSearchSentence, CoreResponseSearchTerms } from "./types.ts"; /** * @summary HTTP Core client @@ -13,8 +11,9 @@ import { CoreRequest, CoreRequestParseSentence, CoreResponseParseSentence } from * (de)serialization automatically. */ export default class RemoteCoreClient implements Core { + public ready: Promise<void> = Promise.resolve(); + private props: ConnectionProps; - ready: Promise<void> = Promise.resolve(); constructor(options?: ConnectionProps) { this.props = { ...ConnectionPropsDefault, ...options }; @@ -32,13 +31,29 @@ export default class RemoteCoreClient implements Core { return response.json(); } - async parseSentence(input: string, options?: DeepPartial<InputSentenceProps>) { - var request: CoreRequestParseSentence = { - command: "parseSentence", - options: { input, options, }, - }; - var { response } = await this.request(request) as CoreResponseParseSentence; - return response; - } + public search: CoreSearch = { + terms: async term => { + var request: CoreRequestSearchTerms = { + command: "search.terms", + options: { term, }, + }; + var { response } = await this.request(request) as CoreResponseSearchTerms; + return response; + }, + sentence: async (sentence, optional?) => { + var request: CoreRequestSearchSentence = { + command: "search.sentence", + options: { sentence, optional, }, + }; + var { response } = await this.request(request) as CoreResponseSearchSentence; + return response; + }, + }; + + public user: CoreUser = {}; + + public import: CoreImport = {}; + + public export: CoreExport = {}; } diff --git a/core/http/types.ts b/core/http/types.ts index 3d55a98..51c221a 100644 --- a/core/http/types.ts +++ b/core/http/types.ts @@ -1,4 +1,4 @@ -import { InputSentenceProps, ParseResult } from "../../language/types.ts"; +import { SearchTermResult, SearchSentenceResult, SearchSentenceProps } from "../../search/types.ts"; import { DeepPartial } from "../../util/types.ts"; export interface CoreRequest { @@ -6,22 +6,33 @@ export interface CoreRequest { options: any; }; -export interface CoreRequestParseSentence extends CoreRequest { - command: "parseSentence"; - options: { - input: string; - options?: DeepPartial<InputSentenceProps>; - }; -}; - export interface CoreResponse { command: string; response: any; // final: boolean; }; -export interface CoreResponseParseSentence extends CoreResponse { - command: "parseSentence"; - response: ParseResult; +export interface CoreRequestSearchSentence extends CoreRequest { + command: "search.sentence"; + options: { + sentence: string; + optional?: DeepPartial<SearchSentenceProps>; + }; }; +export interface CoreResponseSearchSentence extends CoreResponse { + command: "search.sentence"; + response: SearchSentenceResult; +}; + +export interface CoreRequestSearchTerms extends CoreRequest { + command: "search.terms"; + options: { + term: string; + }; +}; + +export interface CoreResponseSearchTerms extends CoreResponse { + command: "search.terms"; + response: Array<SearchTermResult>; +}; diff --git a/core/raw/api.ts b/core/raw/api.ts index 593b932..6046a26 100644 --- a/core/raw/api.ts +++ b/core/raw/api.ts @@ -1,29 +1,39 @@ -import Core from "../api.ts"; -import Parser from "../../language/parser.ts"; +import Core, { CoreExport, CoreImport, CoreSearch, CoreUser } from "../api.ts"; import YomikunError from "../../util/error.ts"; -import { DeepPartial } from "../../util/types.ts"; -import { InputSentenceProps } from "../../language/types.ts"; +import Search from "../../search/search.ts"; /** @summary internal Core (DO NOT USE DIRECTLY) */ export default class RawCore implements Core { - private parser: Parser; public ready: Promise<void>; + private _search: Search; + constructor() { if (this.constructor === RawCore) { throw new YomikunError("RawCore instantiated! Use DirectCoreClient instead!"); } - this.parser = new Parser(); + this._search = new Search(); this.ready = new Promise(async resolve => { - await this.parser.ready; + await this._search.ready; resolve(); }) } - async parseSentence(input: string, options?: DeepPartial<InputSentenceProps>) { - return await this.parser.parse(input, options); - } + public search: CoreSearch = { + terms: async term => { + return await this._search.terms(term); + }, + sentence: async (sentence, optional?) => { + return await this._search.sentence(sentence, optional); + }, + }; + + public user: CoreUser = {}; + + public import: CoreImport = {}; + + public export: CoreExport = {}; }; @@ -1,7 +1,7 @@ import { Database, Statement } from "https://deno.land/x/sqlite3@0.9.1/mod.ts"; import * as path from 'https://deno.land/std@0.102.0/path/mod.ts'; -import { TokenTags } from "../language/tags.ts"; +import { TokenTags } from "../search/tags.ts"; import "../util/string.ts"; export interface DBDictInfo { @@ -20,7 +20,7 @@ export interface FindResult { depth: number; original: string; match: { - kanji: boolean; + writing: boolean; reading: boolean; }; } @@ -97,7 +97,7 @@ export default class DB { depth: term.depth, original: term.original, match: { - kanji: term.expression == term.deinflected, + writing: term.expression == term.deinflected, reading: term.reading == term.deinflected, }, }; diff --git a/db/dict/deinflections.sql b/db/dict/deinflections.sql index a79fff0..e768d33 100644 --- a/db/dict/deinflections.sql +++ b/db/dict/deinflections.sql @@ -126,20 +126,20 @@ insert into deinflection_temp values ('infl:tense:cont', 'いる', '', 'a', 'a'), -- potential form <https://guidetojapanese.org/learn/grammar/potential> - ('infl:potential', 'られる', 'る', 'a', 'ru'), - ('infl:potential', 'える', 'う', 'a', 'u'), - ('infl:potential', 'ける', 'く', 'a', 'u'), - ('infl:potential', 'げる', 'ぐ', 'a', 'u'), - ('infl:potential', 'せる', 'す', 'a', 'u'), - ('infl:potential', 'てる', 'つ', 'a', 'u'), - ('infl:potential', 'ねる', 'ぬ', 'a', 'u'), - ('infl:potential', 'べる', 'ぶ', 'a', 'u'), - ('infl:potential', 'める', 'む', 'a', 'u'), - ('infl:potential', 'れる', 'る', 'a', 'u'), - ('infl:potential', 'できる', 'する', 'a', 's'), - ('infl:potential', 'こられる', 'くる', 'a', 'k'), - ('infl:potential', 'ありうる', 'ある', 'a', ''), -- exception - ('infl:potential', 'ありえる', 'ある', 'a', ''), -- exception + ('infl:potential', 'られる', 'る', 'ru', 'ru'), + ('infl:potential', 'える', 'う', 'ru', 'u'), + ('infl:potential', 'ける', 'く', 'ru', 'u'), + ('infl:potential', 'げる', 'ぐ', 'ru', 'u'), + ('infl:potential', 'せる', 'す', 'ru', 'u'), + ('infl:potential', 'てる', 'つ', 'ru', 'u'), + ('infl:potential', 'ねる', 'ぬ', 'ru', 'u'), + ('infl:potential', 'べる', 'ぶ', 'ru', 'u'), + ('infl:potential', 'める', 'む', 'ru', 'u'), + ('infl:potential', 'れる', 'る', 'ru', 'u'), + ('infl:potential', 'できる', 'する', 'ru', 's'), + ('infl:potential', 'こられる', 'くる', 'ru', 'k'), + ('infl:potential', 'ありうる', 'ある', 'ru', ''), -- exception + ('infl:potential', 'ありえる', 'ある', 'ru', ''), -- exception -- conditionals <https://guidetojapanese.org/learn/grammar/conditionals> ('infl:cond:ba', 'えば', 'う', 'nt', 'u'), @@ -271,19 +271,18 @@ insert into deinflection_temp values ('infl:causative', '来さす', '来る', 'a', 'k'), -- passive <https://guidetojapanese.org/learn/grammar/causepass> - ('infl:passive', 'られる', 'る', 'a', 'ru'), - ('infl:passive', 'われる', 'う', 'a', 'u'), - ('infl:passive', 'かれる', 'く', 'a', 'u'), - ('infl:passive', 'がれる', 'ぐ', 'a', 'u'), - ('infl:passive', 'される', 'す', 'a', 'u'), - ('infl:passive', 'たれる', 'つ', 'a', 'u'), - ('infl:passive', 'なれる', 'ぬ', 'a', 'u'), - ('infl:passive', 'ばれる', 'ぶ', 'a', 'u'), - ('infl:passive', 'まれる', 'む', 'a', 'u'), - ('infl:passive', 'られる', 'る', 'a', 'u'), - ('infl:passive', 'される', 'する', 'a', 's'), - ('infl:passive', 'こられる', 'くる', 'a', 'k'), - ('infl:passive', '来られる', '来る', 'a', 'k'), + ('infl:passive', 'られる', 'る', 'ru', 'ru u'), + ('infl:passive', 'われる', 'う', 'ru', 'u'), + ('infl:passive', 'かれる', 'く', 'ru', 'u'), + ('infl:passive', 'がれる', 'ぐ', 'ru', 'u'), + ('infl:passive', 'される', 'す', 'ru', 'u'), + ('infl:passive', 'たれる', 'つ', 'ru', 'u'), + ('infl:passive', 'なれる', 'ぬ', 'ru', 'u'), + ('infl:passive', 'ばれる', 'ぶ', 'ru', 'u'), + ('infl:passive', 'まれる', 'む', 'ru', 'u'), + ('infl:passive', 'される', 'する', 'ru', 's'), + ('infl:passive', 'こられる', 'くる', 'ru', 'k'), + ('infl:passive', '来られる', '来る', 'ru', 'k'), -- auxiliary rules ('class:verb:suru-included', 'する', '', 's', ''); -- deconjugate suru verbs into stem diff --git a/db/find.sql b/db/find.sql index dd6a011..e2d6ad8 100644 --- a/db/find.sql +++ b/db/find.sql @@ -47,7 +47,7 @@ with results(id, expression, reading, tags, depth, rules, original, deinflected) (substr(term, length(term) - length(kana_in) + 1) = kana_in) and -- can't deconjugate to length <1 (length(term) > 0) - limit 50 -- failsafe to catch any infinite loops + limit 100 -- failsafe to catch any infinite loops ) select term, tags, depth, substr(:term, 1, deinflect.length), rules from deinflect diff --git a/language/parser.ts b/language/parser.ts deleted file mode 100644 index 7fd3981..0000000 --- a/language/parser.ts +++ /dev/null @@ -1,144 +0,0 @@ -import { Tag, TagGroup } from "./tags.ts"; -import { ParseResult, InputSentenceProps, ParseDepth } from "./types.ts"; -import DB from "../db/db.ts"; -import "../util/array.ts"; -import "../util/set.ts"; -import { DeepPartial } from "../util/types.ts"; - -// TODO: rename Parser to Search -/** @summary main Parser class */ -export default class Parser { - db: DB; - ready: Promise<void>; - - constructor() { - this.db = new DB(); - - this.ready = new Promise<void>(async resolve => { - await this.db.ready; - resolve(); - }); - } - - // Search.sentence() - async parse(sentence: string, optional?: DeepPartial<InputSentenceProps>): Promise<ParseResult> { - await this.ready; - - // initialize default options - var props: InputSentenceProps = { - lookahead: optional?.lookahead ?? 15, - depth: optional?.depth ?? ParseDepth.Term, - priorityMod: { - high: optional?.priorityMod?.high ?? 10, - low: optional?.priorityMod?.low ?? -10, - }, - breaks: optional?.breaks ?? [], - } - - let parseResult = await this.parseTerms(sentence, props); - if (props.depth <= ParseDepth.Term) return parseResult; - - parseResult = await this.addGlossary(parseResult, props); - if (props.depth <= ParseDepth.Term) return parseResult; - - return parseResult; - } - - /** @summary parse sentence into terms with readings */ - private async parseTerms(sentence: string, options: InputSentenceProps): Promise<ParseResult> { - var parseResult: ParseResult = { - tokens: [], - depth: ParseDepth.Term, - input: sentence, - }; - - for (let start = 0; start < sentence.length; start++) { - var lookahead = options.lookahead; - - var results = await this.db.findTerm(sentence.substring(start, start + lookahead)); - // current starting point did not yield results, try again at next character or until end of input - if (results.length == 0) continue; - - results = results.filter(result => { - // ignore ignored by user terms - if (result.sort < 0) return false; - - // deconjugated words - if (result.depth > 0) { - // check if this word can be conjugated at all - if (!result.tags.anyOf(TagGroup.Conjugable as string[])) return false; - - // ignore other wrong deconjugations - if (result.tags.includes(Tag.Class.Verb.U) && - !result.tags.includes(Tag.Inflection.Reason.U)) return false; - if (result.tags.includes(Tag.Class.Verb.Ru) && - !result.tags.includes(Tag.Inflection.Reason.Ru)) return false; - if (result.tags.includes(Tag.Class.Verb.Suru) && - !result.tags.includes(Tag.Inflection.Reason.Suru)) return false; - if (result.tags.includes(Tag.Class.Adjective.I) && - !result.tags.includes(Tag.Inflection.Reason.Adjective.I)) return false; - if (result.tags.includes(Tag.Class.Adjective.Na) && - !result.tags.includes(Tag.Inflection.Reason.Adjective.Na)) return false; - } - - // all other results should be valid grammatically - return true; - }); - - // no valid results left after filter, try again at next character or until end of input - if (results.length == 0) continue; - - // bias search results by modifying sort value - results = results.map(result => { - // true if last token was a name else false - const lastTokenName = parseResult.tokens.peek()?.tags.anyOf(Object.values(Tag.Name)); - - // give higher priority to suffixes when last token was a name, else lower priority - if (result.tags.includes(Tag.Class.Suffix)) - result.sort += lastTokenName ? options.priorityMod.high : options.priorityMod.low; - - // give lower priority to terms matched only by their readings, and are - // usually written in kanji - if (!result.tags.includes(Tag.Auxiliary.UsuallyKana) && !result.match.kanji) - result.sort += options.priorityMod.low; - - return result; - }); - - - results.sort((a, b) => { - // sort by original string length (long to short) - if (a.original.length != b.original.length) return b.original.length - a.original.length; - // then by sort index (high to low) - if (a.sort != b.sort) return b.sort - a.sort; - // then by depth (high to low) - if (a.depth != b.depth) return b.depth - a.depth; - // else keep current order (random) - return 0; - }); - - // pick top result - const result = results[0]; - - parseResult.tokens.push({ - writing: result.expression, - reading: result.reading, - tags: result.tags, - term_id: result.id, - source: result.original, - start: start, - }); - - start += result.original.length - 1; // -1 because loop already increments start - continue; // extra verbose end of iteration - } - return parseResult; - } - - private async addGlossary(input: ParseResult, options: InputSentenceProps): Promise<ParseResult> { - // TODO: annotate input with glossaries from DB - options; // prevent unused warning - return input; - } -}; - diff --git a/language/types.ts b/language/types.ts deleted file mode 100644 index d3585f8..0000000 --- a/language/types.ts +++ /dev/null @@ -1,49 +0,0 @@ -import { TokenTags } from "./tags.ts"; - -export enum ParseDepth { - Term, - Glossary, -}; - -export interface GlossaryDefinition { - -}; - -export interface Glossary { - id: number; - definitions: GlossaryDefinition[]; -}; - -export interface ParseToken { - writing: string; - reading: string; - tags: TokenTags; - glossary?: Glossary; - term_id: number; - source: string; - start: number; -}; - -export interface ParseResult { - depth: ParseDepth; - tokens: ParseToken[]; - input: string; -}; - -/** @summary option struct for Parser */ -export interface InputSentenceProps { - /** @prop max amount of characters to look ahead when attempting to deconjugate */ - lookahead: number; - /** @prop amount of detail to return in search results */ - depth: ParseDepth; - /** @prop search bias multipliers */ - priorityMod: { - /** @prop multiplier for negative bias */ - low: number; - /** @prop multiplier for positive bias */ - high: number; - }; - /** @prop list of breaks treated as delimiter */ - breaks: Array<number>; -}; - @@ -1,14 +1,7 @@ TARGET = yomikun +# TODO: figure out distribution SRCS += ./main.ts -SRCS += ./core/yomikun.ts -SRCS += ./db/db.ts -SRCS += ./language/tags.ts -SRCS += ./language/translator.ts -SRCS += ./language/types.ts -SRCS += ./util/array.ts -SRCS += ./util/error.ts -SRCS += ./util/string.ts DENO_FLAGS += --unstable DENO_FLAGS += --allow-ffi @@ -29,6 +29,8 @@ scope is larger than Yomichan, it's still focused on Japanese only.** - [ ] create primitive search page ui - [ ] add code formatter config - [ ] complete documentation +- [ ] remove makefiles for database initialization +- [ ] replace .sql script files with typescript sql query generation library ## ~New features (from Yomichan)~ diff --git a/language/readme.md b/search/readme.md index 99a7d69..400c8ce 100644 --- a/language/readme.md +++ b/search/readme.md @@ -1,7 +1,7 @@ -# Language +# Search This directory contains files that provide an abstracted interface with the -database for looking up sentences ~and words~. +database for looking up sentences and words. ## Tags diff --git a/search/search.ts b/search/search.ts new file mode 100644 index 0000000..0a50773 --- /dev/null +++ b/search/search.ts @@ -0,0 +1,141 @@ +import { Tag, TagGroup } from "./tags.ts"; +import { SearchSentenceProps, SearchSentenceResult, SearchTermResult, SearchWord } from "./types.ts"; +import DB from "../db/db.ts"; +import "../util/array.ts"; +import "../util/set.ts"; +import { DeepPartial } from "../util/types.ts"; + +/** @summary main Search class */ +export default class Search { + db: DB; + ready: Promise<void>; + + constructor() { + this.db = new DB(); + + this.ready = new Promise<void>(async resolve => { + await this.db.ready; + resolve(); + }); + } + + /** @summary find possible terms at start of string by deconjugating */ + public async terms(term: string): Promise<Array<SearchTermResult>> { + await this.ready; + + var results = await this.db.findTerm(term); + + // skip filtering valid results if there are none + if (results.length == 0) return []; + + // filter invalid deconjugations/results + results = results.filter(result => { + // ignore ignored by user terms + if (result.sort < 0) return false; + + // deconjugated words + if (result.depth > 0) { + // check if this word can be conjugated at all + if (!result.tags.anyOf(TagGroup.Conjugable as string[])) return false; + + // ignore other wrong deconjugations + if (result.tags.includes(Tag.Class.Verb.U) && + !result.tags.includes(Tag.Inflection.Reason.U)) return false; + if (result.tags.includes(Tag.Class.Verb.Ru) && + !result.tags.includes(Tag.Inflection.Reason.Ru)) return false; + if (result.tags.includes(Tag.Class.Verb.Suru) && + !result.tags.includes(Tag.Inflection.Reason.Suru)) return false; + if (result.tags.includes(Tag.Class.Adjective.I) && + !result.tags.includes(Tag.Inflection.Reason.Adjective.I)) return false; + if (result.tags.includes(Tag.Class.Adjective.Na) && + !result.tags.includes(Tag.Inflection.Reason.Adjective.Na)) return false; + } + + // all other results should be valid + return true; + }); + + return results.map(result => ({ + id: result.id, + writing: result.expression, + reading: result.reading, + tags: result.tags, + source: result.original, + sort: result.sort, + depth: result.depth, + match: { + reading: result.match.reading, + writing: result.match.writing, + }, + })); + } + + /** @summary parse sentence into terms with readings */ + public async sentence(sentence: string, optional?: DeepPartial<SearchSentenceProps>): Promise<SearchSentenceResult> { + await this.ready; + + var props: SearchSentenceProps = { + lookahead: optional?.lookahead ?? 15, + priorityMod: { + high: optional?.priorityMod?.high ?? 10, + low: optional?.priorityMod?.low ?? -10, + }, + breaks: optional?.breaks ?? [], + } + + var parseResult: SearchSentenceResult = { + input: sentence, + words: [], + }; + + for (let start = 0; start < sentence.length; start++) { + var lookahead = props.lookahead; // TODO: stop at next delimiter (optimization) + var term = sentence.substring(start, start + lookahead); + var results = (await this.terms(term)).map(term => { + var word = term as SearchWord; + word.start = start; + return word; + }); + + // current starting point did not yield results, try again at next character or until end of input + if (results.length == 0) continue; + + // bias search results by modifying sort value + results = results.map(result => { + // true if last token was a name else false + const lastTokenName = parseResult.words.peek()?.tags.anyOf(Object.values(Tag.Name)); + + // give higher priority to suffixes when last token was a name, else lower priority + if (result.tags.includes(Tag.Class.Suffix)) + result.sort += lastTokenName ? props.priorityMod.high : props.priorityMod.low; + + // give lower priority to terms matched only by their readings, and are + // usually written in kanji + if (!result.tags.includes(Tag.Auxiliary.UsuallyKana) && !result.match.writing) + result.sort += props.priorityMod.low; + + return result; + }); + + results.sort((a, b) => { + // sort by original string length (long to short) + if (a.source.length != b.source.length) return b.source.length - a.source.length; + // then by sort index (high to low) + if (a.sort != b.sort) return b.sort - a.sort; + // then by depth (high to low) + if (a.depth != b.depth) return b.depth - a.depth; + // else keep current order (random) + return 0; + }); + + // pick top result + const result = results[0]; + + parseResult.words.push(result); + start += result.source.length - 1; // -1 because loop already increments start + continue; // extra verbose end of iteration + } + return parseResult; + } +}; + diff --git a/language/tags.ts b/search/tags.ts index 72840fe..92279c5 100644 --- a/language/tags.ts +++ b/search/tags.ts @@ -200,6 +200,10 @@ export function parseTags(input: string) { if (tag == Tag.Inflection.Desirable.Volitional && filteredTags.anyOf([Tag.Inflection.Attempt.ToSuru])) continue; + // skip conditional 〜ば if used for obligatory inflection + if (tag == Tag.Inflection.Conditional.Ba && + filteredTags.anyOf([Tag.Inflection.Obligatory])) continue; + // normalize multiple Inflection.Negative to single Inflection.Affirmative or Inflection.Negative if (tag == Tag.Inflection.Negative) { negationCount++; diff --git a/search/types.ts b/search/types.ts new file mode 100644 index 0000000..d90afd6 --- /dev/null +++ b/search/types.ts @@ -0,0 +1,60 @@ +import { TokenTags } from "./tags.ts"; + +export interface SearchGlossaryDefinition { + +}; + +export interface SearchGlossary { + id: number; + definitions: SearchGlossaryDefinition[]; +}; + +export interface SearchTermResult { + /** @property dictionary term id */ + id: number; + /** @property (preferably) kanji writing of term */ + writing: string; + /** @property kana-only reading of term */ + reading: string; + /** @property word tags including deconjugation tags */ + tags: TokenTags; + /** @property original conjugated string */ + source: string; + /** @property numeric sorting value for term */ + sort: number; + /** @property amount of steps that were needed to deconjugate */ + depth: number; + /** @property matching results */ + match: { + /** @property term matched by writing */ + writing: boolean; + /** @property term matched by reading */ + reading: boolean; + } +}; + +export interface SearchWord extends SearchTermResult { + /** @property starting index of word in sentence */ + start: number; +}; + +export interface SearchSentenceResult { + words: SearchWord[]; + input: string; +}; + +/** @summary options for Search.sentence() */ +export interface SearchSentenceProps { + /** @prop max amount of characters to look ahead when attempting to deconjugate words */ + lookahead: number; + /** @prop search bias values */ + priorityMod: { + /** @prop offset for negative bias */ + low: number; + /** @prop offset for positive bias */ + high: number; + }; + /** @prop list of breaks treated as delimiter */ + breaks: Array<number>; +}; + diff --git a/test/deinflection/cases.ts b/test/deinflection/cases.ts index 08517d4..c29bdf1 100644 --- a/test/deinflection/cases.ts +++ b/test/deinflection/cases.ts @@ -1,11 +1,11 @@ -import { TokenTags, Tag } from "../../language/tags.ts"; +import { TokenTags, Tag } from "../../search/tags.ts"; const { Inflection } = Tag; interface Test { input: string; mustHave: TokenTags; mustNotHave: TokenTags; - forceID?: number; + force?: { reading: string, writing: string }; }; export default [ @@ -32,7 +32,7 @@ export default [ { input: "取るな", mustHave: [ Inflection.Negative, Inflection.Command ], mustNotHave: [], }, // other tests { input: "取ったり", mustHave: [ Inflection.Suffix.Tari ], mustNotHave: [ Inflection.Tense.Past ], }, - { input: "早く", mustHave: [ Inflection.Adverb ], mustNotHave: [], }, + { input: "早く", force: { reading: "はやい", writing: "早い" }, mustHave: [ Inflection.Adverb ], mustNotHave: [], }, { input: "遊んだり", mustHave: [ Inflection.Suffix.Tari ], mustNotHave: [ Inflection.Tense.Past ], }, { input: "聞け", mustHave: [ Inflection.Command ], mustNotHave: [], }, { input: "食べさせる", mustHave: [ Inflection.Causative ], mustNotHave: [], }, @@ -51,13 +51,12 @@ export default [ { input: "しなくてはいけなかった", mustHave: [ Inflection.Obligatory, Inflection.Affirmative ], mustNotHave: [ Inflection.Negative ], }, { input: "行かないとだめ", mustHave: [ Inflection.Obligatory, Inflection.Affirmative ], mustNotHave: [ Inflection.Negative ], }, { input: "しないといけない", mustHave: [ Inflection.Obligatory, Inflection.Affirmative ], mustNotHave: [ Inflection.Negative ], }, - { input: "行かなければいけません", mustHave: [ Inflection.Obligatory, Inflection.Affirmative ], mustNotHave: [ Inflection.Negative ], }, - { input: "しなければだめ", mustHave: [ Inflection.Obligatory, Inflection.Affirmative ], mustNotHave: [ Inflection.Negative ], }, + { input: "行かなければいけません", mustHave: [ Inflection.Obligatory, Inflection.Affirmative ], mustNotHave: [ Inflection.Negative, Inflection.Conditional.Ba ], }, + { input: "しなければだめ", mustHave: [ Inflection.Obligatory, Inflection.Affirmative ], mustNotHave: [ Inflection.Negative, Inflection.Conditional.Ba ], }, { input: "行かないと", mustHave: [ Inflection.Obligatory, Inflection.Affirmative ], mustNotHave: [ Inflection.Negative ], }, { input: "買いたい", mustHave: [ Inflection.Desirable.Itai ], mustNotHave: [], }, { input: "寝よう", mustHave: [ Inflection.Desirable.Volitional ], mustNotHave: [], }, - // TODO: for this test to work, a parseSentencePart function needs to be made that returns all possible words (currently clipped) - // { input: "しましょう", forceID: 17327, mustHave: [ Inflection.Desirable.Volitional, Inflection.Polite.Masu ], mustNotHave: [], }, + { input: "しましょう", force: { reading: "する", writing: "為る" }, mustHave: [ Inflection.Desirable.Volitional, Inflection.Polite.Masu ], mustNotHave: [], }, { input: "きましょう", mustHave: [ Inflection.Desirable.Volitional, Inflection.Polite.Masu ], mustNotHave: [], }, { input: "寝ましょう", mustHave: [ Inflection.Desirable.Volitional, Inflection.Polite.Masu ], mustNotHave: [], }, { input: "行きましょう", mustHave: [ Inflection.Desirable.Volitional, Inflection.Polite.Masu ], mustNotHave: [], }, @@ -76,9 +75,10 @@ export default [ { input: "聞きなさい", mustHave: [ Inflection.Polite.Nasai ], mustNotHave: [], }, { input: "座りなさい", mustHave: [ Inflection.Polite.Nasai ], mustNotHave: [], }, { input: "食べさせられる", mustHave: [ Inflection.Passive, Inflection.Causative ], mustNotHave: [], }, + { input: "見極めなければならない", mustHave: [ Inflection.Obligatory, Inflection.Affirmative ], mustNotHave: [ Inflection.Conditional.Ba, Inflection.Negative ] } // TODO: りゃ for いることは // TODO: じゃ for では // TODO: なきゃ + なくちゃ // and more! -] as Test[]; +] satisfies Test[] as Test[]; diff --git a/test/deinflection/test.ts b/test/deinflection/test.ts index 017a5c7..1d2f172 100644 --- a/test/deinflection/test.ts +++ b/test/deinflection/test.ts @@ -1,17 +1,18 @@ import cases from "./cases.ts"; import { core } from '../base.ts'; -import { TokenTag } from '../../language/tags.ts'; +import { TokenTag } from "../../search/tags.ts"; -cases.forEach(({ input, mustHave, mustNotHave, forceID }) => { +cases.forEach(({ input, mustHave, mustNotHave, force }) => { Deno.test(`deinflection - ${input}`, async () => { - var { tokens } = await core.parseSentence(input); + var terms = await core.search.terms(input); - if (tokens.length == 0) - throw new Error("No parsed tokens for input"); + if (terms.length == 0) + throw new Error("No parsed terms for input"); - // console.log(tokens); - var result = tokens.find(t => t.source == input); - if (forceID) result = tokens.find(t => t.term_id == forceID); + // console.log(terms); + var result = terms.find(t => t.source == input); + if (force) + result = terms.find(t => t.reading == force.reading && t.writing == force.writing); if (!result) throw new Error("No deconjugation found for input"); diff --git a/util/string.ts b/util/string.ts index b362f06..4704d03 100644 --- a/util/string.ts +++ b/util/string.ts @@ -1,4 +1,4 @@ -import { TokenTags, parseTags } from "../language/tags.ts"; +import { TokenTags, parseTags } from "../search/tags.ts"; import { Wrapper } from "./wrap.ts"; declare global { |