From a3a81530a0a30ba02b5253b762e2ccd77d3b01fc Mon Sep 17 00:00:00 2001 From: lonkaars Date: Mon, 10 Jul 2023 16:26:13 +0200 Subject: small restructuring + all deinflection tests working --- api/sentence.ts | 16 ++-- api/word.ts | 8 +- api/yomikun.ts | 1 - core/api.ts | 35 ++++++- core/http/client.ts | 41 +++++--- core/http/types.ts | 35 ++++--- core/raw/api.ts | 30 ++++-- db/db.ts | 6 +- db/dict/deinflections.sql | 53 +++++------ db/find.sql | 2 +- language/parser.ts | 144 ---------------------------- language/readme.md | 53 ----------- language/tags.ts | 228 -------------------------------------------- language/types.ts | 49 ---------- makefile | 9 +- readme.md | 2 + search/readme.md | 53 +++++++++++ search/search.ts | 141 +++++++++++++++++++++++++++ search/tags.ts | 232 +++++++++++++++++++++++++++++++++++++++++++++ search/types.ts | 60 ++++++++++++ test/deinflection/cases.ts | 16 ++-- test/deinflection/test.ts | 17 ++-- util/string.ts | 2 +- 23 files changed, 652 insertions(+), 581 deletions(-) delete mode 100644 language/parser.ts delete mode 100644 language/readme.md delete mode 100644 language/tags.ts delete mode 100644 language/types.ts create mode 100644 search/readme.md create mode 100644 search/search.ts create mode 100644 search/tags.ts create mode 100644 search/types.ts diff --git a/api/sentence.ts b/api/sentence.ts index cde66a5..1d22be3 100644 --- a/api/sentence.ts +++ b/api/sentence.ts @@ -1,11 +1,11 @@ -import { ParseResult } from "../language/types.ts"; +import { SearchSentenceResult } from "../search/types.ts"; import APIBase from "./base.ts"; import { JapaneseFormatter } from "./japanese.ts"; import Word from "./word.ts"; export default class Sentence extends APIBase { public words: Array = []; - protected query?: ParseResult; + protected query?: SearchSentenceResult; protected original: string = ""; public ready: Promise; @@ -23,7 +23,7 @@ export default class Sentence extends APIBase { private async fetch(input: string) { this.original = input; 
- this.query = await (await this.api)["core"].parseSentence(input); + this.query = await (await this.api)["core"].search.sentence(input); await this.updateWords(); this._resolveReady(); } @@ -33,15 +33,15 @@ export default class Sentence extends APIBase { let token = 0; let i = 0; while (i < this.original.length) { - this.words.push(new Word(this.query!.tokens[token]).withParent(await this.api)); + this.words.push(new Word(this.query!.words[token]).withParent(await this.api)); - i += this.query!.tokens[token].source.length; + i += this.query!.words[token].source.length; if (i == this.original.length) break; token++; - // continue if there are no unrecognized gaps between tokens - if (this.query!.tokens[token]?.start == i) continue; - var remainder = this.original.substring(i, this.query!.tokens[token]?.start); + // continue if there are no unrecognized gaps between words + if (this.query!.words[token]?.start == i) continue; + var remainder = this.original.substring(i, this.query!.words[token]?.start); this.words.push(new Word(remainder).withParent(await this.api)); i += remainder.length; diff --git a/api/word.ts b/api/word.ts index b7fc3e6..4dad4a3 100644 --- a/api/word.ts +++ b/api/word.ts @@ -1,10 +1,10 @@ import Glossary from "./glossary.ts"; import APIBase from "./base.ts"; -import { ParseToken } from "../language/types.ts"; import Japanese, { JapaneseFormatter } from "./japanese.ts"; import "../util/string.ts"; -import { Tag, TagGroup } from "../language/tags.ts"; +import { TagGroup } from "../search/tags.ts"; +import { SearchWord } from "../search/types.ts"; export default class Word extends APIBase { /** @prop dictionary form of verb if this word is a verb */ @@ -16,7 +16,7 @@ export default class Word extends APIBase { /** @prop this word represents an unrecognized sentence part between recognized terms */ protected filler: boolean; - constructor(input: string | ParseToken) { + constructor(input: string | SearchWord) { super(); if (typeof input === 
"string") { this.filler = true; @@ -26,7 +26,7 @@ export default class Word extends APIBase { this.outputKanji = false; } else { this.filler = false; - input = input as ParseToken; + input = input as SearchWord; this.base = new Japanese(input.writing, input.reading); if (input.tags.anyOf(TagGroup.Conjugable as string[])) { var writingCommon = input.writing.cmpLen(input.source); diff --git a/api/yomikun.ts b/api/yomikun.ts index a7f214e..696361f 100644 --- a/api/yomikun.ts +++ b/api/yomikun.ts @@ -1,6 +1,5 @@ import Core from "../core/api.ts"; import RemoteCoreClient from "../core/http/client.ts"; -import { ParseResult } from "../language/types.ts"; import Sentence from "./sentence.ts"; export default class Yomikun { diff --git a/core/api.ts b/core/api.ts index 0720c8b..77195b2 100644 --- a/core/api.ts +++ b/core/api.ts @@ -1,6 +1,33 @@ -import { InputSentenceProps, ParseResult } from "../language/types.ts"; +import { SearchSentenceProps, SearchSentenceResult, SearchTermResult } from "../search/types.ts"; import { DeepPartial } from "../util/types.ts"; +/** @interface serach-related functions */ +export interface CoreSearch { + terms(term: string): Promise>; + sentence(sentence: string, optional?: DeepPartial): Promise; + // glossary: (input: string) => Promise; +}; + +/** @interface user management */ +export interface CoreUser { + // TODO: list + // TODO: add + // TODO: remove + // TODO: get info +}; + +/** @interface dictionary/user data import functions */ +export interface CoreImport { + // TODO: import dictionary + // TODO: import user preferences +}; + +/** @interface dictionary/user data export functions */ +export interface CoreExport { + // TODO: export dictionary + // TODO: export user preferences +}; + /** * @summary Core interface * @@ -12,7 +39,9 @@ export default abstract class Core { /** @summary resolved when ready */ abstract ready: Promise; - /** @summary parse sentence */ - abstract parseSentence(input: string, options?: DeepPartial): Promise; + 
abstract search: CoreSearch; + abstract user: CoreUser; + abstract import: CoreImport; + abstract export: CoreExport; }; diff --git a/core/http/client.ts b/core/http/client.ts index 6b4e1a3..80f77b3 100644 --- a/core/http/client.ts +++ b/core/http/client.ts @@ -1,10 +1,8 @@ -import { InputSentenceProps } from "../../language/types.ts"; import "../../util/array.ts"; -import { DeepPartial } from "../../util/types.ts"; -import Core from "../api.ts"; +import Core, { CoreExport, CoreImport, CoreSearch, CoreUser } from "../api.ts"; import { ConnectionProps, ConnectionPropsDefault } from "./props.ts"; -import { CoreRequest, CoreRequestParseSentence, CoreResponseParseSentence } from "./types.ts"; +import { CoreRequest, CoreRequestSearchSentence, CoreRequestSearchTerms, CoreResponseSearchSentence, CoreResponseSearchTerms } from "./types.ts"; /** * @summary HTTP Core client @@ -13,8 +11,9 @@ import { CoreRequest, CoreRequestParseSentence, CoreResponseParseSentence } from * (de)serialization automatically. */ export default class RemoteCoreClient implements Core { + public ready: Promise = Promise.resolve(); + private props: ConnectionProps; - ready: Promise = Promise.resolve(); constructor(options?: ConnectionProps) { this.props = { ...ConnectionPropsDefault, ...options }; @@ -32,13 +31,29 @@ export default class RemoteCoreClient implements Core { return response.json(); } - async parseSentence(input: string, options?: DeepPartial) { - var request: CoreRequestParseSentence = { - command: "parseSentence", - options: { input, options, }, - }; - var { response } = await this.request(request) as CoreResponseParseSentence; - return response; - } + public search: CoreSearch = { + terms: async term => { + var request: CoreRequestSearchTerms = { + command: "search.terms", + options: { term, }, + }; + var { response } = await this.request(request) as CoreResponseSearchTerms; + return response; + }, + sentence: async (sentence, optional?) 
=> { + var request: CoreRequestSearchSentence = { + command: "search.sentence", + options: { sentence, optional, }, + }; + var { response } = await this.request(request) as CoreResponseSearchSentence; + return response; + }, + }; + + public user: CoreUser = {}; + + public import: CoreImport = {}; + + public export: CoreExport = {}; } diff --git a/core/http/types.ts b/core/http/types.ts index 3d55a98..51c221a 100644 --- a/core/http/types.ts +++ b/core/http/types.ts @@ -1,4 +1,4 @@ -import { InputSentenceProps, ParseResult } from "../../language/types.ts"; +import { SearchTermResult, SearchSentenceResult, SearchSentenceProps } from "../../search/types.ts"; import { DeepPartial } from "../../util/types.ts"; export interface CoreRequest { @@ -6,22 +6,33 @@ export interface CoreRequest { options: any; }; -export interface CoreRequestParseSentence extends CoreRequest { - command: "parseSentence"; - options: { - input: string; - options?: DeepPartial; - }; -}; - export interface CoreResponse { command: string; response: any; // final: boolean; }; -export interface CoreResponseParseSentence extends CoreResponse { - command: "parseSentence"; - response: ParseResult; +export interface CoreRequestSearchSentence extends CoreRequest { + command: "search.sentence"; + options: { + sentence: string; + optional?: DeepPartial; + }; }; +export interface CoreResponseSearchSentence extends CoreResponse { + command: "search.sentence"; + response: SearchSentenceResult; +}; + +export interface CoreRequestSearchTerms extends CoreRequest { + command: "search.terms"; + options: { + term: string; + }; +}; + +export interface CoreResponseSearchTerms extends CoreResponse { + command: "search.terms"; + response: Array; +}; diff --git a/core/raw/api.ts b/core/raw/api.ts index 593b932..6046a26 100644 --- a/core/raw/api.ts +++ b/core/raw/api.ts @@ -1,29 +1,39 @@ -import Core from "../api.ts"; -import Parser from "../../language/parser.ts"; +import Core, { CoreExport, CoreImport, CoreSearch, 
CoreUser } from "../api.ts"; import YomikunError from "../../util/error.ts"; -import { DeepPartial } from "../../util/types.ts"; -import { InputSentenceProps } from "../../language/types.ts"; +import Search from "../../search/search.ts"; /** @summary internal Core (DO NOT USE DIRECTLY) */ export default class RawCore implements Core { - private parser: Parser; public ready: Promise; + private _search: Search; + constructor() { if (this.constructor === RawCore) { throw new YomikunError("RawCore instantiated! Use DirectCoreClient instead!"); } - this.parser = new Parser(); + this._search = new Search(); this.ready = new Promise(async resolve => { - await this.parser.ready; + await this._search.ready; resolve(); }) } - async parseSentence(input: string, options?: DeepPartial) { - return await this.parser.parse(input, options); - } + public search: CoreSearch = { + terms: async term => { + return await this._search.terms(term); + }, + sentence: async (sentence, optional?) => { + return await this._search.sentence(sentence, optional); + }, + }; + + public user: CoreUser = {}; + + public import: CoreImport = {}; + + public export: CoreExport = {}; }; diff --git a/db/db.ts b/db/db.ts index 5605f40..7bb315c 100644 --- a/db/db.ts +++ b/db/db.ts @@ -1,7 +1,7 @@ import { Database, Statement } from "https://deno.land/x/sqlite3@0.9.1/mod.ts"; import * as path from 'https://deno.land/std@0.102.0/path/mod.ts'; -import { TokenTags } from "../language/tags.ts"; +import { TokenTags } from "../search/tags.ts"; import "../util/string.ts"; export interface DBDictInfo { @@ -20,7 +20,7 @@ export interface FindResult { depth: number; original: string; match: { - kanji: boolean; + writing: boolean; reading: boolean; }; } @@ -97,7 +97,7 @@ export default class DB { depth: term.depth, original: term.original, match: { - kanji: term.expression == term.deinflected, + writing: term.expression == term.deinflected, reading: term.reading == term.deinflected, }, }; diff --git 
a/db/dict/deinflections.sql b/db/dict/deinflections.sql index a79fff0..e768d33 100644 --- a/db/dict/deinflections.sql +++ b/db/dict/deinflections.sql @@ -126,20 +126,20 @@ insert into deinflection_temp values ('infl:tense:cont', 'いる', '', 'a', 'a'), -- potential form - ('infl:potential', 'られる', 'る', 'a', 'ru'), - ('infl:potential', 'える', 'う', 'a', 'u'), - ('infl:potential', 'ける', 'く', 'a', 'u'), - ('infl:potential', 'げる', 'ぐ', 'a', 'u'), - ('infl:potential', 'せる', 'す', 'a', 'u'), - ('infl:potential', 'てる', 'つ', 'a', 'u'), - ('infl:potential', 'ねる', 'ぬ', 'a', 'u'), - ('infl:potential', 'べる', 'ぶ', 'a', 'u'), - ('infl:potential', 'める', 'む', 'a', 'u'), - ('infl:potential', 'れる', 'る', 'a', 'u'), - ('infl:potential', 'できる', 'する', 'a', 's'), - ('infl:potential', 'こられる', 'くる', 'a', 'k'), - ('infl:potential', 'ありうる', 'ある', 'a', ''), -- exception - ('infl:potential', 'ありえる', 'ある', 'a', ''), -- exception + ('infl:potential', 'られる', 'る', 'ru', 'ru'), + ('infl:potential', 'える', 'う', 'ru', 'u'), + ('infl:potential', 'ける', 'く', 'ru', 'u'), + ('infl:potential', 'げる', 'ぐ', 'ru', 'u'), + ('infl:potential', 'せる', 'す', 'ru', 'u'), + ('infl:potential', 'てる', 'つ', 'ru', 'u'), + ('infl:potential', 'ねる', 'ぬ', 'ru', 'u'), + ('infl:potential', 'べる', 'ぶ', 'ru', 'u'), + ('infl:potential', 'める', 'む', 'ru', 'u'), + ('infl:potential', 'れる', 'る', 'ru', 'u'), + ('infl:potential', 'できる', 'する', 'ru', 's'), + ('infl:potential', 'こられる', 'くる', 'ru', 'k'), + ('infl:potential', 'ありうる', 'ある', 'ru', ''), -- exception + ('infl:potential', 'ありえる', 'ある', 'ru', ''), -- exception -- conditionals ('infl:cond:ba', 'えば', 'う', 'nt', 'u'), @@ -271,19 +271,18 @@ insert into deinflection_temp values ('infl:causative', '来さす', '来る', 'a', 'k'), -- passive - ('infl:passive', 'られる', 'る', 'a', 'ru'), - ('infl:passive', 'われる', 'う', 'a', 'u'), - ('infl:passive', 'かれる', 'く', 'a', 'u'), - ('infl:passive', 'がれる', 'ぐ', 'a', 'u'), - ('infl:passive', 'される', 'す', 'a', 'u'), - ('infl:passive', 'たれる', 'つ', 'a', 'u'), - 
('infl:passive', 'なれる', 'ぬ', 'a', 'u'), - ('infl:passive', 'ばれる', 'ぶ', 'a', 'u'), - ('infl:passive', 'まれる', 'む', 'a', 'u'), - ('infl:passive', 'られる', 'る', 'a', 'u'), - ('infl:passive', 'される', 'する', 'a', 's'), - ('infl:passive', 'こられる', 'くる', 'a', 'k'), - ('infl:passive', '来られる', '来る', 'a', 'k'), + ('infl:passive', 'られる', 'る', 'ru', 'ru u'), + ('infl:passive', 'われる', 'う', 'ru', 'u'), + ('infl:passive', 'かれる', 'く', 'ru', 'u'), + ('infl:passive', 'がれる', 'ぐ', 'ru', 'u'), + ('infl:passive', 'される', 'す', 'ru', 'u'), + ('infl:passive', 'たれる', 'つ', 'ru', 'u'), + ('infl:passive', 'なれる', 'ぬ', 'ru', 'u'), + ('infl:passive', 'ばれる', 'ぶ', 'ru', 'u'), + ('infl:passive', 'まれる', 'む', 'ru', 'u'), + ('infl:passive', 'される', 'する', 'ru', 's'), + ('infl:passive', 'こられる', 'くる', 'ru', 'k'), + ('infl:passive', '来られる', '来る', 'ru', 'k'), -- auxiliary rules ('class:verb:suru-included', 'する', '', 's', ''); -- deconjugate suru verbs into stem diff --git a/db/find.sql b/db/find.sql index dd6a011..e2d6ad8 100644 --- a/db/find.sql +++ b/db/find.sql @@ -47,7 +47,7 @@ with results(id, expression, reading, tags, depth, rules, original, deinflected) (substr(term, length(term) - length(kana_in) + 1) = kana_in) and -- can't deconjugate to length <1 (length(term) > 0) - limit 50 -- failsafe to catch any infinite loops + limit 100 -- failsafe to catch any infinite loops ) select term, tags, depth, substr(:term, 1, deinflect.length), rules from deinflect diff --git a/language/parser.ts b/language/parser.ts deleted file mode 100644 index 7fd3981..0000000 --- a/language/parser.ts +++ /dev/null @@ -1,144 +0,0 @@ -import { Tag, TagGroup } from "./tags.ts"; -import { ParseResult, InputSentenceProps, ParseDepth } from "./types.ts"; -import DB from "../db/db.ts"; -import "../util/array.ts"; -import "../util/set.ts"; -import { DeepPartial } from "../util/types.ts"; - -// TODO: rename Parser to Search -/** @summary main Parser class */ -export default class Parser { - db: DB; - ready: Promise; - - constructor() { - 
this.db = new DB(); - - this.ready = new Promise(async resolve => { - await this.db.ready; - resolve(); - }); - } - - // Search.sentence() - async parse(sentence: string, optional?: DeepPartial): Promise { - await this.ready; - - // initialize default options - var props: InputSentenceProps = { - lookahead: optional?.lookahead ?? 15, - depth: optional?.depth ?? ParseDepth.Term, - priorityMod: { - high: optional?.priorityMod?.high ?? 10, - low: optional?.priorityMod?.low ?? -10, - }, - breaks: optional?.breaks ?? [], - } - - let parseResult = await this.parseTerms(sentence, props); - if (props.depth <= ParseDepth.Term) return parseResult; - - parseResult = await this.addGlossary(parseResult, props); - if (props.depth <= ParseDepth.Term) return parseResult; - - return parseResult; - } - - /** @summary parse sentence into terms with readings */ - private async parseTerms(sentence: string, options: InputSentenceProps): Promise { - var parseResult: ParseResult = { - tokens: [], - depth: ParseDepth.Term, - input: sentence, - }; - - for (let start = 0; start < sentence.length; start++) { - var lookahead = options.lookahead; - - var results = await this.db.findTerm(sentence.substring(start, start + lookahead)); - // current starting point did not yield results, try again at next character or until end of input - if (results.length == 0) continue; - - results = results.filter(result => { - // ignore ignored by user terms - if (result.sort < 0) return false; - - // deconjugated words - if (result.depth > 0) { - // check if this word can be conjugated at all - if (!result.tags.anyOf(TagGroup.Conjugable as string[])) return false; - - // ignore other wrong deconjugations - if (result.tags.includes(Tag.Class.Verb.U) && - !result.tags.includes(Tag.Inflection.Reason.U)) return false; - if (result.tags.includes(Tag.Class.Verb.Ru) && - !result.tags.includes(Tag.Inflection.Reason.Ru)) return false; - if (result.tags.includes(Tag.Class.Verb.Suru) && - 
!result.tags.includes(Tag.Inflection.Reason.Suru)) return false; - if (result.tags.includes(Tag.Class.Adjective.I) && - !result.tags.includes(Tag.Inflection.Reason.Adjective.I)) return false; - if (result.tags.includes(Tag.Class.Adjective.Na) && - !result.tags.includes(Tag.Inflection.Reason.Adjective.Na)) return false; - } - - // all other results should be valid grammatically - return true; - }); - - // no valid results left after filter, try again at next character or until end of input - if (results.length == 0) continue; - - // bias search results by modifying sort value - results = results.map(result => { - // true if last token was a name else false - const lastTokenName = parseResult.tokens.peek()?.tags.anyOf(Object.values(Tag.Name)); - - // give higher priority to suffixes when last token was a name, else lower priority - if (result.tags.includes(Tag.Class.Suffix)) - result.sort += lastTokenName ? options.priorityMod.high : options.priorityMod.low; - - // give lower priority to terms matched only by their readings, and are - // usually written in kanji - if (!result.tags.includes(Tag.Auxiliary.UsuallyKana) && !result.match.kanji) - result.sort += options.priorityMod.low; - - return result; - }); - - - results.sort((a, b) => { - // sort by original string length (long to short) - if (a.original.length != b.original.length) return b.original.length - a.original.length; - // then by sort index (high to low) - if (a.sort != b.sort) return b.sort - a.sort; - // then by depth (high to low) - if (a.depth != b.depth) return b.depth - a.depth; - // else keep current order (random) - return 0; - }); - - // pick top result - const result = results[0]; - - parseResult.tokens.push({ - writing: result.expression, - reading: result.reading, - tags: result.tags, - term_id: result.id, - source: result.original, - start: start, - }); - - start += result.original.length - 1; // -1 because loop already increments start - continue; // extra verbose end of iteration - } - return 
parseResult; - } - - private async addGlossary(input: ParseResult, options: InputSentenceProps): Promise { - // TODO: annotate input with glossaries from DB - options; // prevent unused warning - return input; - } -}; - diff --git a/language/readme.md b/language/readme.md deleted file mode 100644 index 99a7d69..0000000 --- a/language/readme.md +++ /dev/null @@ -1,53 +0,0 @@ -# Language - -This directory contains files that provide an abstracted interface with the -database for looking up sentences ~and words~. - -## Tags - -All dictionary entries have tags. Tags are combined from term info, dictionary -info, and glossary info. Tags can have subcategories separated by `:`. A -separate tags table handles displaying tags for different display languages, -including abbreviated versions. - -Tags that may alter behavior are stored as constants in [tags.ts](./tags.ts). -Dictionary importers should map the dictionary-specific version of these tags -to Yomikun's tags for compatibility. Other tags include: - -|tag|description| -|-|-| -|`series:*`|abbreviated series name, e.g. "The Legend of Zelda" is `series:zelda`, and "Tears of the Kingdom" is `series:totk`. series with multiple entries should split the series and entry into separate tags, e.g. `series:zelda series:totk` instead of `series:zelda_totk`. -|`dict:*`|dictionary tag. e.g. `dict:jmdict_dutch` or `dict:daijisen`| -|`pitch:*`|`pitch:0` for 平板, `pitch:1` for 頭高, etc. -|`aux:*`|used for other tags (joyo kanji, commonly used term, usually kana, etc.) - -### Behavior-altering tags - -Some tag classes impact the parser's behavior. For example, the input text -「完了しました」 will be parsed as just 「完了」, but with the -`class:verb:suru-included` tag added by the parser. This is because the word -「完了」 has the tag `class:verb:suru` in the database, which allows the parser -to deconjugate a noun with the verb 「する」 back into the stem. 
- -Other uses of this behavior include more accurate automatic kanji reading -generation, for example 「城」 being read as 「じょう」 in 「ハイラル城」 -because 「ハイラル」 has the tag `name:place` in the database, and -「城(じょう)」 has `class:suffix`, while 「城(しろ)」 has `class:noun`. - -Yomikun encourages homebrew dictionary sharing, and encourages using -behavior-altering tags for fixing readings for cases like the above examples. -As another example of this, it is encouraged that a dictionary for (for -example) Zelda add 「トト」 as a term with tags `class:noun` and `name:place`, -instead of 「トト湖(こ)」 as an expression to fix the reading of the kanji -「湖(みずうみ)」. - -If Yomikun doesn't generate the correct reading, and the reading isn't based on -natural language context (=a computer *could* accurately decide which reading -is correct based on other words/tags in the sentence), please submit a pull -request with the sentence and its (expected) reading. An example of a -non-deterministic reading is 「何」 in the sentence 「何できた?」 which can be -read as both 「なん」 in which case 「何で」 turns into a single word, or -「なに」 where 「何」 is a regular word and 「で」 is particle. - -[taekim]: https://guidetojapanese.org/learn/ - diff --git a/language/tags.ts b/language/tags.ts deleted file mode 100644 index 72840fe..0000000 --- a/language/tags.ts +++ /dev/null @@ -1,228 +0,0 @@ -import "../util/array.ts"; - -/** @constant Tags that have significant meaning to the parser */ -export const Tag = { - /** @constant grammatical classes */ - Class: { - /** @constant verb subgroup */ - Verb: { - /** @constant noun that can be conjugated into a verb by adding する and する itself */ - Suru: "class:verb:suru", - /** - * @constant verb stored as conjugated noun in database (nominal verb) - * - * @deprecated The use of conjugated forms in dictionaries is discouraged. - * - * This tag is added by the deconjugation code to check for a legal - * deconjugation if する has been deconjugated away for a word marked - * suru-verb. 
- */ - SuruIncluded: "class:verb:suru-included", - /** @constant 〜う verbs in [taekim] (godan) */ - U: "class:verb:u", - /** @constant 〜る verbs in [taekim] (ichidan) */ - Ru: "class:verb:ru", - /** @constant kuru (来る) */ - Kuru: "class:verb:kuru", - }, - Adjective: { - /** @constant adjectives that end in 〜い */ - I: "class:adj:i", - /** @constant adjectives that need to be conjugated using な */ - Na: "class:adj:na", - }, - /** @constant regular nouns or words that can be treated as nouns */ - Noun: "class:noun", - /** @constant terms that are read differently when used as a suffix */ - Suffix: "class:suffix", // TODO: specify place, honorific, counter suffix types - /** @constant grammatical particles (e.g. の, と, は, を, etc.) */ - Particle: "class:part", - /** @constant expressions and idioms - * - * Can also be used for longer strings that are read in a special way, but - * is discouraged. - * - * @see ./readme.md#behavior-altering-tags - */ - Expression: "class:expr", - /** @constant adverbs (e.g. 早く) */ - Adverb: "class:adverb", - }, - /** @constant types of names */ - Name: { - /** @constant name of a place/location. allows suffixes */ - Place: "name:place", - /** @constant feminine name. allows suffixes and honorifics */ - Female: "name:female", - /** @constant masculine name. allows suffixes and honorifics */ - Male: "name:male", - }, - /** - * @constant added to a word when deconjugated by the deinflection table - * - * Some inflections are used as steps in others, like the -tari suffix which - * is conjugated after the past tense. In this case, the past tense tag would - * be removed when it comes after the -tari tag. (see ../util/string.ts) - * - * e.g. 来ない -> 来る [infl:negative] - */ - Inflection: { - /** - * @constant affirmative conjugations - * - * This conjugation should not be added by any deconjugation rules, but is - * calculated based on the amount of negations. 
Even counts of negative - * inflections (including 0) add this tag, while odd counts don't add this - * tag. - */ - Affirmative: "infl:affirmative", - /** @constant negative conjugations */ - Negative: "infl:negative", - /** @constant time-related conjugations */ - Tense: { - /** @constant past tense (e.g. 叩いた) */ - Past: "infl:tense:past", - /** @constant continuous tense (e.g. 喋っている) */ - Continuous: "infl:tense:cont", - }, - /** @constant adverbs (e.g. 早く) */ - Adverb: "infl:adverb", - /** @constant polite conjugations */ - Polite: { - /** @constant 丁寧語 〜ます conjugations (e.g. 食べました) */ - Masu: "infl:polite:masu", - /** @constant 〜なさい conjugations (e.g. 座りなさい) */ - Nasai: "infl:polite:nasai", - }, - /** @constant common ending conjugations */ - Suffix: { - /** @constant -te ending (e.g. 売って) */ - Te: "infl:suffix:te", - /** @constant -tari ending (e.g. 遊んだり) */ - Tari: "infl:suffix:tari", - }, - /** @constant internal deinflection rules */ - Reason: { - /** @constant applied if word was deconjugated as -ru (ichidan) verb */ - Ru: "infl:reason:ru", - /** @constant applied if word was deconjugated as -u (godan) verb */ - U: "infl:reason:u", - /** @constant applied if word was deconjugated as suru verb */ - Suru: "infl:reason:suru", - /** @constant applied if word was deconjugated as kuru verb */ - Kuru: "infl:reason:kuru", - /** @constant applied if word was deconjugated as i-adjective */ - Adjective: { - I: "infl:reason:adj:i", - Na: "infl:reason:adj:na", - }, - }, - /** @constant makes a verb usable without specifying who carries it out (e.g. 言われる) */ - Passive: "infl:passive", - /** @constant indicates that a verb *can* happen (e.g. 落ちられる) */ - Potential: "infl:potential", - /** @constant indicates that someone makes a verb happen (e.g. ⾷べさせる) */ - Causative: "infl:causative", - /** @constant imperative form (e.g. 聞け) */ - Command: "infl:command", - /** @constant conditional forms */ - Conditional: { - /** @constant -ba ending (e.g. 
泳げれば) */ - Ba: "infl:cond:ba", - /** @constant -ra ending (e.g. 取ったら) */ - Ra: "infl:cond:ra", - }, - /** @constant makes a verb obligatory (e.g. 入ってはいけない) */ - Obligatory: "infl:must", - /** @constant verbs that someone wants to do / be done */ - Desirable: { - /** @constant 〜たい endings (e.g. 買いたい) */ - Itai: "infl:desire:itai", - /** @constant 〜おう endings (e.g. 寝よう) */ - Volitional: "infl:desire:volitional", - }, - /** @constant makes a verb an attempt */ - Attempt: { - /** @constant 〜みる to try something out (e.g. 飲んでみた) */ - Miru: "infl:attempt:miru", - /** @constant 〜とする attempts (e.g. 入ろうとしている) */ - ToSuru: "infl:attempt:tosuru", - }, - /** @constant temporary tags (removed by parseTags) */ - Temporary: { - /** @constant particle of obligatory conjugation (e.g. 行かない*と*だめ), or colloquial abbreviation */ - ObligatoryParticle: "infl:tmp:must:prt", - /** @constant resulting action part of obligatory conjugation (e.g. 行かないと*だめ*) */ - ObligatoryResult: "infl:tmp:must:res", - }, - }, - /** @constant uncategorized tags */ - Auxiliary: { - /** @constant word usually written using only kana (but also has kanji) */ - UsuallyKana: "aux:uk", - }, -} as const; - -export const TagGroup = { - /** @constant array that contains all tags of word classes that can be conjugated */ - Conjugable: [ - ...Object.values(Tag.Class.Verb), - ...Object.values(Tag.Class.Adjective), - ], -} as const; - -export type TokenTag = string; // no way around it - -export type TokenTags = Array; - -/** @summary parse concatenated tag string to TokenTags */ -export function parseTags(input: string) { - var tags = input.replaceAll(/ +/g, " ").trim().split(" ") as TokenTag[]; - var filteredTags: TokenTag[] = []; - var negationCount = 0; - for (var tag of tags) { - // conjugations that are used as "stepping stones" for others should be - // filtered in this loop. 
checking if a combination of tags is valid should - // be done in ./parser.ts - - // skip past tense tag if used as step for -te and -tari inflection - if (tag == Tag.Inflection.Tense.Past && - filteredTags.anyOf([Tag.Inflection.Suffix.Te, Tag.Inflection.Suffix.Tari])) continue; - - // skip -te suffix tag if used for - if (tag == Tag.Inflection.Suffix.Te && filteredTags.anyOf([ - Tag.Inflection.Tense.Continuous, // base for continuous tense - Tag.Inflection.Obligatory, // base for obligatory inflection - Tag.Inflection.Attempt.Miru, // base for 〜みる attempt - ])) continue; - - // skip volitional tag if used for 〜とする attempt - if (tag == Tag.Inflection.Desirable.Volitional && - filteredTags.anyOf([Tag.Inflection.Attempt.ToSuru])) continue; - - // normalize multiple Inflection.Negative to single Inflection.Affirmative or Inflection.Negative - if (tag == Tag.Inflection.Negative) { - negationCount++; - continue; - } - - filteredTags.push(tag); - } - - // negative + と without resulting action = implicit affirmative obligatory - if (filteredTags.includes(Tag.Inflection.Temporary.ObligatoryParticle) && - !filteredTags.includes(Tag.Inflection.Temporary.ObligatoryResult)) { - negationCount = 0; // -> make resulting tags affirmative - } - - // normalize affirmative/negative - filteredTags.push(negationCount % 2 == 0 ? 
Tag.Inflection.Affirmative : Tag.Inflection.Negative); - - // filter any remaining temporary tags - type tempTag = typeof Tag.Inflection.Temporary[keyof typeof Tag.Inflection.Temporary]; - filteredTags = filteredTags.filter(t => !Object.values(Tag.Inflection.Temporary).includes(t as tempTag)); - - // filter any duplicates - return filteredTags.set().arr() as TokenTags; -} - diff --git a/language/types.ts b/language/types.ts deleted file mode 100644 index d3585f8..0000000 --- a/language/types.ts +++ /dev/null @@ -1,49 +0,0 @@ -import { TokenTags } from "./tags.ts"; - -export enum ParseDepth { - Term, - Glossary, -}; - -export interface GlossaryDefinition { - -}; - -export interface Glossary { - id: number; - definitions: GlossaryDefinition[]; -}; - -export interface ParseToken { - writing: string; - reading: string; - tags: TokenTags; - glossary?: Glossary; - term_id: number; - source: string; - start: number; -}; - -export interface ParseResult { - depth: ParseDepth; - tokens: ParseToken[]; - input: string; -}; - -/** @summary option struct for Parser */ -export interface InputSentenceProps { - /** @prop max amount of characters to look ahead when attempting to deconjugate */ - lookahead: number; - /** @prop amount of detail to return in search results */ - depth: ParseDepth; - /** @prop search bias multipliers */ - priorityMod: { - /** @prop multiplier for negative bias */ - low: number; - /** @prop multiplier for positive bias */ - high: number; - }; - /** @prop list of breaks treated as delimiter */ - breaks: Array; -}; - diff --git a/makefile b/makefile index 6136ad6..f32cbcf 100644 --- a/makefile +++ b/makefile @@ -1,14 +1,7 @@ TARGET = yomikun +# TODO: figure out distribution SRCS += ./main.ts -SRCS += ./core/yomikun.ts -SRCS += ./db/db.ts -SRCS += ./language/tags.ts -SRCS += ./language/translator.ts -SRCS += ./language/types.ts -SRCS += ./util/array.ts -SRCS += ./util/error.ts -SRCS += ./util/string.ts DENO_FLAGS += --unstable DENO_FLAGS += --allow-ffi diff 
--git a/readme.md b/readme.md index 60f1eda..f27e247 100644 --- a/readme.md +++ b/readme.md @@ -29,6 +29,8 @@ scope is larger than Yomichan, it's still focused on Japanese only.** - [ ] create primitive search page ui - [ ] add code formatter config - [ ] complete documentation +- [ ] remove makefiles for database initialization +- [ ] replace .sql script files with typescript sql query generation library ## ~New features (from Yomichan)~ diff --git a/search/readme.md b/search/readme.md new file mode 100644 index 0000000..400c8ce --- /dev/null +++ b/search/readme.md @@ -0,0 +1,53 @@ +# Search + +This directory contains files that provide an abstracted interface with the +database for looking up sentences and words. + +## Tags + +All dictionary entries have tags. Tags are combined from term info, dictionary +info, and glossary info. Tags can have subcategories separated by `:`. A +separate tags table handles displaying tags for different display languages, +including abbreviated versions. + +Tags that may alter behavior are stored as constants in [tags.ts](./tags.ts). +Dictionary importers should map the dictionary-specific version of these tags +to Yomikun's tags for compatibility. Other tags include: + +|tag|description| +|-|-| +|`series:*`|abbreviated series name, e.g. "The Legend of Zelda" is `series:zelda`, and "Tears of the Kingdom" is `series:totk`. series with multiple entries should split the series and entry into separate tags, e.g. `series:zelda series:totk` instead of `series:zelda_totk`. +|`dict:*`|dictionary tag. e.g. `dict:jmdict_dutch` or `dict:daijisen`| +|`pitch:*`|`pitch:0` for 平板, `pitch:1` for 頭高, etc. +|`aux:*`|used for other tags (joyo kanji, commonly used term, usually kana, etc.) + +### Behavior-altering tags + +Some tag classes impact the parser's behavior. For example, the input text +「完了しました」 will be parsed as just 「完了」, but with the +`class:verb:suru-included` tag added by the parser. 
This is because the word +「完了」 has the tag `class:verb:suru` in the database, which allows the parser +to deconjugate a noun with the verb 「する」 back into the stem. + +Other uses of this behavior include more accurate automatic kanji reading +generation, for example 「城」 being read as 「じょう」 in 「ハイラル城」 +because 「ハイラル」 has the tag `name:place` in the database, and +「城(じょう)」 has `class:suffix`, while 「城(しろ)」 has `class:noun`. + +Yomikun encourages homebrew dictionary sharing, and encourages using +behavior-altering tags for fixing readings for cases like the above examples. +As another example, a dictionary for a series such as Zelda is encouraged to +add 「トト」 as a term with tags `class:noun` and `name:place`, +instead of 「トト湖(こ)」 as an expression to fix the reading of the kanji +「湖(みずうみ)」. + +If Yomikun doesn't generate the correct reading, and the reading isn't based on +natural language context (i.e. a computer *could* accurately decide which reading +is correct based on other words/tags in the sentence), please submit a pull +request with the sentence and its (expected) reading. An example of a +non-deterministic reading is 「何」 in the sentence 「何できた?」, which can be +read either as 「なん」, in which case 「何で」 becomes a single word, or as +「なに」, where 「何」 is a regular word and 「で」 is a particle. 
/**
 * @summary main Search class
 *
 * Wraps the dictionary database and exposes two lookups: `terms()` returns the
 * (deconjugated) dictionary terms found at the start of a string, and
 * `sentence()` repeatedly calls `terms()` to split a sentence into words.
 */
export default class Search {
  db: DB;
  /** @property settles once the database is ready; awaited by every lookup */
  ready: Promise<void>;

  /**
   * Word class -> deinflection reason pairs. A deconjugated result carrying
   * the class tag is only legal if it also carries the matching reason tag.
   *
   * NOTE(review): Tag.Class.Verb.Kuru / Tag.Inflection.Reason.Kuru has no
   * entry here, unlike its sibling classes -- confirm whether that is
   * intentional before adding it.
   */
  private static readonly reasonForClass: ReadonlyArray<readonly [string, string]> = [
    [Tag.Class.Verb.U, Tag.Inflection.Reason.U],
    [Tag.Class.Verb.Ru, Tag.Inflection.Reason.Ru],
    [Tag.Class.Verb.Suru, Tag.Inflection.Reason.Suru],
    [Tag.Class.Adjective.I, Tag.Inflection.Reason.Adjective.I],
    [Tag.Class.Adjective.Na, Tag.Inflection.Reason.Adjective.Na],
  ];

  constructor() {
    this.db = new DB();

    // An async function (instead of `new Promise(async resolve => ...)`) makes
    // a failed database initialisation reject `ready` rather than leaving it
    // pending forever.
    this.ready = (async () => {
      await this.db.ready;
    })();
  }

  /**
   * @summary find possible terms at start of string by deconjugating
   *
   * @param term input string; candidates are matched from its first character
   * @returns every database hit that is not ignored by the user and whose
   * deconjugation is legal for its word class (empty array if none)
   */
  public async terms(term: string): Promise<Array<SearchTermResult>> {
    await this.ready;

    const results = await this.db.findTerm(term);

    return results
      .filter(result => {
        // ignore ignored by user terms
        if (result.sort < 0) return false;

        // results that were not deconjugated need no further validation
        if (result.depth <= 0) return true;

        // check if this word can be conjugated at all
        if (!result.tags.anyOf(TagGroup.Conjugable as string[])) return false;

        // ignore deconjugations made with a rule of the wrong word class
        return Search.reasonForClass.every(([wordClass, reason]) =>
          !result.tags.includes(wordClass) || result.tags.includes(reason));
      })
      .map(result => ({
        id: result.id,
        writing: result.expression,
        reading: result.reading,
        tags: result.tags,
        source: result.original,
        sort: result.sort,
        depth: result.depth,
        match: {
          reading: result.match.reading,
          writing: result.match.writing,
        },
      }));
  }

  /**
   * @summary parse sentence into terms with readings
   *
   * Scans the sentence left to right. At every position the best candidate
   * from `terms()` is picked (longest source first, then highest sort value,
   * then highest depth) and the scan continues after it. Positions without any
   * candidate are skipped one character at a time.
   *
   * @param sentence input text
   * @param optional overrides for the default search properties
   * @returns the input together with the words found, in order of appearance
   */
  public async sentence(sentence: string, optional?: DeepPartial<SearchSentenceProps>): Promise<SearchSentenceResult> {
    await this.ready;

    const props: SearchSentenceProps = {
      lookahead: optional?.lookahead ?? 15,
      priorityMod: {
        high: optional?.priorityMod?.high ?? 10,
        low: optional?.priorityMod?.low ?? -10,
      },
      breaks: optional?.breaks ?? [], // currently unused, see TODO below
    };

    const parseResult: SearchSentenceResult = {
      input: sentence,
      words: [],
    };

    let start = 0;
    while (start < sentence.length) {
      // TODO: stop at next delimiter (optimization)
      const term = sentence.substring(start, start + props.lookahead);
      const results: SearchWord[] = (await this.terms(term)).map(found => ({ ...found, start }));

      // current starting point did not yield results, try again at next
      // character or until end of input
      if (results.length == 0) {
        start++;
        continue;
      }

      // true if last token was a name (same for every candidate at this position)
      const lastTokenName = parseResult.words.peek()?.tags.anyOf(Object.values(Tag.Name));

      // bias search results by modifying sort value
      for (const result of results) {
        // give higher priority to suffixes when last token was a name, else lower priority
        if (result.tags.includes(Tag.Class.Suffix))
          result.sort += lastTokenName ? props.priorityMod.high : props.priorityMod.low;

        // give lower priority to terms matched only by their readings, and are
        // usually written in kanji
        if (!result.tags.includes(Tag.Auxiliary.UsuallyKana) && !result.match.writing)
          result.sort += props.priorityMod.low;
      }

      results.sort((a, b) => {
        // sort by original string length (long to short)
        if (a.source.length != b.source.length) return b.source.length - a.source.length;
        // then by sort index (high to low)
        if (a.sort != b.sort) return b.sort - a.sort;
        // then by depth (high to low)
        if (a.depth != b.depth) return b.depth - a.depth;
        // else keep the order the database returned
        return 0;
      });

      // pick top result
      const best = results[0];
      parseResult.words.push(best);

      // always advance by at least one character so a zero-length source can
      // never stall the scan
      start += Math.max(best.source.length, 1);
    }

    return parseResult;
  }
}
+ */ + SuruIncluded: "class:verb:suru-included", + /** @constant 〜う verbs in [taekim] (godan) */ + U: "class:verb:u", + /** @constant 〜る verbs in [taekim] (ichidan) */ + Ru: "class:verb:ru", + /** @constant kuru (来る) */ + Kuru: "class:verb:kuru", + }, + Adjective: { + /** @constant adjectives that end in 〜い */ + I: "class:adj:i", + /** @constant adjectives that need to be conjugated using な */ + Na: "class:adj:na", + }, + /** @constant regular nouns or words that can be treated as nouns */ + Noun: "class:noun", + /** @constant terms that are read differently when used as a suffix */ + Suffix: "class:suffix", // TODO: specify place, honorific, counter suffix types + /** @constant grammatical particles (e.g. の, と, は, を, etc.) */ + Particle: "class:part", + /** @constant expressions and idioms + * + * Can also be used for longer strings that are read in a special way, but + * is discouraged. + * + * @see ./readme.md#behavior-altering-tags + */ + Expression: "class:expr", + /** @constant adverbs (e.g. 早く) */ + Adverb: "class:adverb", + }, + /** @constant types of names */ + Name: { + /** @constant name of a place/location. allows suffixes */ + Place: "name:place", + /** @constant feminine name. allows suffixes and honorifics */ + Female: "name:female", + /** @constant masculine name. allows suffixes and honorifics */ + Male: "name:male", + }, + /** + * @constant added to a word when deconjugated by the deinflection table + * + * Some inflections are used as steps in others, like the -tari suffix which + * is conjugated after the past tense. In this case, the past tense tag would + * be removed when it comes after the -tari tag. (see ../util/string.ts) + * + * e.g. 来ない -> 来る [infl:negative] + */ + Inflection: { + /** + * @constant affirmative conjugations + * + * This conjugation should not be added by any deconjugation rules, but is + * calculated based on the amount of negations. 
Even counts of negative + * inflections (including 0) add this tag, while odd counts don't add this + * tag. + */ + Affirmative: "infl:affirmative", + /** @constant negative conjugations */ + Negative: "infl:negative", + /** @constant time-related conjugations */ + Tense: { + /** @constant past tense (e.g. 叩いた) */ + Past: "infl:tense:past", + /** @constant continuous tense (e.g. 喋っている) */ + Continuous: "infl:tense:cont", + }, + /** @constant adverbs (e.g. 早く) */ + Adverb: "infl:adverb", + /** @constant polite conjugations */ + Polite: { + /** @constant 丁寧語 〜ます conjugations (e.g. 食べました) */ + Masu: "infl:polite:masu", + /** @constant 〜なさい conjugations (e.g. 座りなさい) */ + Nasai: "infl:polite:nasai", + }, + /** @constant common ending conjugations */ + Suffix: { + /** @constant -te ending (e.g. 売って) */ + Te: "infl:suffix:te", + /** @constant -tari ending (e.g. 遊んだり) */ + Tari: "infl:suffix:tari", + }, + /** @constant internal deinflection rules */ + Reason: { + /** @constant applied if word was deconjugated as -ru (ichidan) verb */ + Ru: "infl:reason:ru", + /** @constant applied if word was deconjugated as -u (godan) verb */ + U: "infl:reason:u", + /** @constant applied if word was deconjugated as suru verb */ + Suru: "infl:reason:suru", + /** @constant applied if word was deconjugated as kuru verb */ + Kuru: "infl:reason:kuru", + /** @constant applied if word was deconjugated as i-adjective */ + Adjective: { + I: "infl:reason:adj:i", + Na: "infl:reason:adj:na", + }, + }, + /** @constant makes a verb usable without specifying who carries it out (e.g. 言われる) */ + Passive: "infl:passive", + /** @constant indicates that a verb *can* happen (e.g. 落ちられる) */ + Potential: "infl:potential", + /** @constant indicates that someone makes a verb happen (e.g. ⾷べさせる) */ + Causative: "infl:causative", + /** @constant imperative form (e.g. 聞け) */ + Command: "infl:command", + /** @constant conditional forms */ + Conditional: { + /** @constant -ba ending (e.g. 
/**
 * @summary parse concatenated tag string to TokenTags
 *
 * Splits a space separated tag string and normalizes inflection tags:
 * - inflections that were only used as a "stepping stone" for an inflection
 *   that is already in the result are dropped
 * - all `Tag.Inflection.Negative` tags are folded into exactly one
 *   `Tag.Inflection.Affirmative` (even count) or `Tag.Inflection.Negative`
 *   (odd count) tag, appended after the other tags
 * - `Tag.Inflection.Temporary` tags and duplicates are removed
 *
 * @param input tags separated by one or more spaces; may be empty
 * @returns normalized tags in order of first appearance
 */
export function parseTags(input: string): TokenTags {
  // drop empty fragments so repeated spaces or an empty input never yield an
  // empty-string tag
  const tags: TokenTag[] = input.trim().split(" ").filter(tag => tag.length > 0);
  const filteredTags: TokenTag[] = [];
  let negationCount = 0;

  /** true if any of the candidates was already accepted into filteredTags */
  const alreadyHas = (...candidates: TokenTag[]) =>
    candidates.some(candidate => filteredTags.includes(candidate));

  for (const tag of tags) {
    // conjugations that are used as "stepping stones" for others should be
    // filtered in this loop. checking if a combination of tags is valid should
    // be done in ./search.ts

    // skip past tense tag if used as step for -te and -tari inflection
    if (tag == Tag.Inflection.Tense.Past &&
        alreadyHas(Tag.Inflection.Suffix.Te, Tag.Inflection.Suffix.Tari)) continue;

    // skip -te suffix tag if used as base for another inflection
    if (tag == Tag.Inflection.Suffix.Te && alreadyHas(
      Tag.Inflection.Tense.Continuous, // base for continuous tense
      Tag.Inflection.Obligatory, // base for obligatory inflection
      Tag.Inflection.Attempt.Miru, // base for 〜みる attempt
    )) continue;

    // skip volitional tag if used for 〜とする attempt
    if (tag == Tag.Inflection.Desirable.Volitional &&
        alreadyHas(Tag.Inflection.Attempt.ToSuru)) continue;

    // skip conditional 〜ば if used for obligatory inflection
    if (tag == Tag.Inflection.Conditional.Ba &&
        alreadyHas(Tag.Inflection.Obligatory)) continue;

    // normalize multiple Inflection.Negative to single Inflection.Affirmative or Inflection.Negative
    if (tag == Tag.Inflection.Negative) {
      negationCount++;
      continue;
    }

    filteredTags.push(tag);
  }

  // negative + と without resulting action = implicit affirmative obligatory
  if (filteredTags.includes(Tag.Inflection.Temporary.ObligatoryParticle) &&
      !filteredTags.includes(Tag.Inflection.Temporary.ObligatoryResult)) {
    negationCount = 0; // -> make resulting tags affirmative
  }

  // normalize affirmative/negative
  filteredTags.push(negationCount % 2 == 0 ? Tag.Inflection.Affirmative : Tag.Inflection.Negative);

  // filter any remaining temporary tags
  const temporaryTags: readonly string[] = Object.values(Tag.Inflection.Temporary);
  const permanentTags = filteredTags.filter(tag => !temporaryTags.includes(tag));

  // filter any duplicates (a Set keeps the order of first insertion)
  return [...new Set(permanentTags)];
}
+ /** @prop list of breaks treated as delimiter */ + breaks: Array; +}; + diff --git a/test/deinflection/cases.ts b/test/deinflection/cases.ts index 08517d4..c29bdf1 100644 --- a/test/deinflection/cases.ts +++ b/test/deinflection/cases.ts @@ -1,11 +1,11 @@ -import { TokenTags, Tag } from "../../language/tags.ts"; +import { TokenTags, Tag } from "../../search/tags.ts"; const { Inflection } = Tag; interface Test { input: string; mustHave: TokenTags; mustNotHave: TokenTags; - forceID?: number; + force?: { reading: string, writing: string }; }; export default [ @@ -32,7 +32,7 @@ export default [ { input: "取るな", mustHave: [ Inflection.Negative, Inflection.Command ], mustNotHave: [], }, // other tests { input: "取ったり", mustHave: [ Inflection.Suffix.Tari ], mustNotHave: [ Inflection.Tense.Past ], }, - { input: "早く", mustHave: [ Inflection.Adverb ], mustNotHave: [], }, + { input: "早く", force: { reading: "はやい", writing: "早い" }, mustHave: [ Inflection.Adverb ], mustNotHave: [], }, { input: "遊んだり", mustHave: [ Inflection.Suffix.Tari ], mustNotHave: [ Inflection.Tense.Past ], }, { input: "聞け", mustHave: [ Inflection.Command ], mustNotHave: [], }, { input: "食べさせる", mustHave: [ Inflection.Causative ], mustNotHave: [], }, @@ -51,13 +51,12 @@ export default [ { input: "しなくてはいけなかった", mustHave: [ Inflection.Obligatory, Inflection.Affirmative ], mustNotHave: [ Inflection.Negative ], }, { input: "行かないとだめ", mustHave: [ Inflection.Obligatory, Inflection.Affirmative ], mustNotHave: [ Inflection.Negative ], }, { input: "しないといけない", mustHave: [ Inflection.Obligatory, Inflection.Affirmative ], mustNotHave: [ Inflection.Negative ], }, - { input: "行かなければいけません", mustHave: [ Inflection.Obligatory, Inflection.Affirmative ], mustNotHave: [ Inflection.Negative ], }, - { input: "しなければだめ", mustHave: [ Inflection.Obligatory, Inflection.Affirmative ], mustNotHave: [ Inflection.Negative ], }, + { input: "行かなければいけません", mustHave: [ Inflection.Obligatory, Inflection.Affirmative ], mustNotHave: [ 
Inflection.Negative, Inflection.Conditional.Ba ], }, + { input: "しなければだめ", mustHave: [ Inflection.Obligatory, Inflection.Affirmative ], mustNotHave: [ Inflection.Negative, Inflection.Conditional.Ba ], }, { input: "行かないと", mustHave: [ Inflection.Obligatory, Inflection.Affirmative ], mustNotHave: [ Inflection.Negative ], }, { input: "買いたい", mustHave: [ Inflection.Desirable.Itai ], mustNotHave: [], }, { input: "寝よう", mustHave: [ Inflection.Desirable.Volitional ], mustNotHave: [], }, - // TODO: for this test to work, a parseSentencePart function needs to be made that returns all possible words (currently clipped) - // { input: "しましょう", forceID: 17327, mustHave: [ Inflection.Desirable.Volitional, Inflection.Polite.Masu ], mustNotHave: [], }, + { input: "しましょう", force: { reading: "する", writing: "為る" }, mustHave: [ Inflection.Desirable.Volitional, Inflection.Polite.Masu ], mustNotHave: [], }, { input: "きましょう", mustHave: [ Inflection.Desirable.Volitional, Inflection.Polite.Masu ], mustNotHave: [], }, { input: "寝ましょう", mustHave: [ Inflection.Desirable.Volitional, Inflection.Polite.Masu ], mustNotHave: [], }, { input: "行きましょう", mustHave: [ Inflection.Desirable.Volitional, Inflection.Polite.Masu ], mustNotHave: [], }, @@ -76,9 +75,10 @@ export default [ { input: "聞きなさい", mustHave: [ Inflection.Polite.Nasai ], mustNotHave: [], }, { input: "座りなさい", mustHave: [ Inflection.Polite.Nasai ], mustNotHave: [], }, { input: "食べさせられる", mustHave: [ Inflection.Passive, Inflection.Causative ], mustNotHave: [], }, + { input: "見極めなければならない", mustHave: [ Inflection.Obligatory, Inflection.Affirmative ], mustNotHave: [ Inflection.Conditional.Ba, Inflection.Negative ] } // TODO: りゃ for いることは // TODO: じゃ for では // TODO: なきゃ + なくちゃ // and more! 
-] as Test[]; +] satisfies Test[] as Test[]; diff --git a/test/deinflection/test.ts b/test/deinflection/test.ts index 017a5c7..1d2f172 100644 --- a/test/deinflection/test.ts +++ b/test/deinflection/test.ts @@ -1,17 +1,18 @@ import cases from "./cases.ts"; import { core } from '../base.ts'; -import { TokenTag } from '../../language/tags.ts'; +import { TokenTag } from "../../search/tags.ts"; -cases.forEach(({ input, mustHave, mustNotHave, forceID }) => { +cases.forEach(({ input, mustHave, mustNotHave, force }) => { Deno.test(`deinflection - ${input}`, async () => { - var { tokens } = await core.parseSentence(input); + var terms = await core.search.terms(input); - if (tokens.length == 0) - throw new Error("No parsed tokens for input"); + if (terms.length == 0) + throw new Error("No parsed terms for input"); - // console.log(tokens); - var result = tokens.find(t => t.source == input); - if (forceID) result = tokens.find(t => t.term_id == forceID); + // console.log(terms); + var result = terms.find(t => t.source == input); + if (force) + result = terms.find(t => t.reading == force.reading && t.writing == force.writing); if (!result) throw new Error("No deconjugation found for input"); diff --git a/util/string.ts b/util/string.ts index b362f06..4704d03 100644 --- a/util/string.ts +++ b/util/string.ts @@ -1,4 +1,4 @@ -import { TokenTags, parseTags } from "../language/tags.ts"; +import { TokenTags, parseTags } from "../search/tags.ts"; import { Wrapper } from "./wrap.ts"; declare global { -- cgit v1.2.3