From a3a81530a0a30ba02b5253b762e2ccd77d3b01fc Mon Sep 17 00:00:00 2001 From: lonkaars Date: Mon, 10 Jul 2023 16:26:13 +0200 Subject: small restructuring + all deinflection tests working --- search/readme.md | 53 +++++++++++++ search/search.ts | 141 +++++++++++++++++++++++++++++++++ search/tags.ts | 232 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ search/types.ts | 60 ++++++++++++++ 4 files changed, 486 insertions(+) create mode 100644 search/readme.md create mode 100644 search/search.ts create mode 100644 search/tags.ts create mode 100644 search/types.ts (limited to 'search') diff --git a/search/readme.md b/search/readme.md new file mode 100644 index 0000000..400c8ce --- /dev/null +++ b/search/readme.md @@ -0,0 +1,53 @@ +# Search + +This directory contains files that provide an abstracted interface with the +database for looking up sentences and words. + +## Tags + +All dictionary entries have tags. Tags are combined from term info, dictionary +info, and glossary info. Tags can have subcategories separated by `:`. A +separate tags table handles displaying tags for different display languages, +including abbreviated versions. + +Tags that may alter behavior are stored as constants in [tags.ts](./tags.ts). +Dictionary importers should map the dictionary-specific version of these tags +to Yomikun's tags for compatibility. Other tags include: + +|tag|description| +|-|-| +|`series:*`|abbreviated series name, e.g. "The Legend of Zelda" is `series:zelda`, and "Tears of the Kingdom" is `series:totk`. series with multiple entries should split the series and entry into separate tags, e.g. `series:zelda series:totk` instead of `series:zelda_totk`. +|`dict:*`|dictionary tag. e.g. `dict:jmdict_dutch` or `dict:daijisen`| +|`pitch:*`|`pitch:0` for 平板, `pitch:1` for 頭高, etc. +|`aux:*`|used for other tags (joyo kanji, commonly used term, usually kana, etc.) + +### Behavior-altering tags + +Some tag classes impact the parser's behavior. 
For example, the input text +「完了しました」 will be parsed as just 「完了」, but with the +`class:verb:suru-included` tag added by the parser. This is because the word +「完了」 has the tag `class:verb:suru` in the database, which allows the parser +to deconjugate a noun with the verb 「する」 back into the stem. + +Other uses of this behavior include more accurate automatic kanji reading +generation, for example 「城」 being read as 「じょう」 in 「ハイラル城」 +because 「ハイラル」 has the tag `name:place` in the database, and +「城(じょう)」 has `class:suffix`, while 「城(しろ)」 has `class:noun`. + +Yomikun encourages homebrew dictionary sharing, and encourages using +behavior-altering tags to fix readings in cases like the above examples. +As another example of this, a dictionary for a series such as Zelda is +encouraged to add 「トト」 as a term with tags `class:noun` and `name:place`, +instead of 「トト湖(こ)」 as an expression to fix the reading of the kanji +「湖(みずうみ)」. + +If Yomikun doesn't generate the correct reading, and the reading isn't based on +natural language context (i.e. a computer *could* accurately decide which reading +is correct based on other words/tags in the sentence), please submit a pull +request with the sentence and its (expected) reading. An example of a +non-deterministic reading is 「何」 in the sentence 「何できた?」 which can be +read either as 「なん」, in which case 「何で」 turns into a single word, or as +「なに」, where 「何」 is a regular word and 「で」 is a particle. 
+ +[taekim]: https://guidetojapanese.org/learn/ + diff --git a/search/search.ts b/search/search.ts new file mode 100644 index 0000000..0a50773 --- /dev/null +++ b/search/search.ts @@ -0,0 +1,141 @@ +import { Tag, TagGroup } from "./tags.ts"; +import { SearchSentenceProps, SearchSentenceResult, SearchTermResult, SearchWord } from "./types.ts"; +import DB from "../db/db.ts"; +import "../util/array.ts"; +import "../util/set.ts"; +import { DeepPartial } from "../util/types.ts"; + +/** @summary database-backed term lookup and sentence parsing */ +export default class Search { + db: DB; + ready: Promise; + + constructor() { + this.db = new DB(); + + this.ready = new Promise(async resolve => { + await this.db.ready; + resolve(); + }); + } + + /** @summary find possible terms at start of string by deconjugating */ + public async terms(term: string): Promise> { + await this.ready; + + var results = await this.db.findTerm(term); + + // skip filtering valid results if there are none + if (results.length == 0) return []; + + // filter invalid deconjugations/results + results = results.filter(result => { + // ignore terms ignored by the user (negative sort value) + if (result.sort < 0) return false; + + // deconjugated words + if (result.depth > 0) { + // check if this word can be conjugated at all + if (!result.tags.anyOf(TagGroup.Conjugable as string[])) return false; + + // ignore other wrong deconjugations + if (result.tags.includes(Tag.Class.Verb.U) && + !result.tags.includes(Tag.Inflection.Reason.U)) return false; + if (result.tags.includes(Tag.Class.Verb.Ru) && + !result.tags.includes(Tag.Inflection.Reason.Ru)) return false; + if (result.tags.includes(Tag.Class.Verb.Suru) && + !result.tags.includes(Tag.Inflection.Reason.Suru)) return false; + if (result.tags.includes(Tag.Class.Adjective.I) && + !result.tags.includes(Tag.Inflection.Reason.Adjective.I)) return false; + if (result.tags.includes(Tag.Class.Adjective.Na) && + !result.tags.includes(Tag.Inflection.Reason.Adjective.Na)) return false; + } + + // all other results should 
be valid + return true; + }); + + return results.map(result => ({ + id: result.id, + writing: result.expression, + reading: result.reading, + tags: result.tags, + source: result.original, + sort: result.sort, + depth: result.depth, + match: { + reading: result.match.reading, + writing: result.match.writing, + }, + })); + } + + /** @summary parse sentence into terms with readings */ + public async sentence(sentence: string, optional?: DeepPartial): Promise { + await this.ready; + + var props: SearchSentenceProps = { + lookahead: optional?.lookahead ?? 15, + priorityMod: { + high: optional?.priorityMod?.high ?? 10, + low: optional?.priorityMod?.low ?? -10, + }, + breaks: optional?.breaks ?? [], + } + + var parseResult: SearchSentenceResult = { + input: sentence, + words: [], + }; + + for (let start = 0; start < sentence.length; start++) { + var lookahead = props.lookahead; // TODO: stop at next delimiter (optimization) + var term = sentence.substring(start, start + lookahead); + var results = (await this.terms(term)).map(term => { + var word = term as SearchWord; + word.start = start; + return word; + }); + + // current starting point did not yield results, try again at next character or until end of input + if (results.length == 0) continue; + + // bias search results by modifying sort value + results = results.map(result => { + // true if last token was a name else false + const lastTokenName = parseResult.words.peek()?.tags.anyOf(Object.values(Tag.Name)); + + // give higher priority to suffixes when last token was a name, else lower priority + if (result.tags.includes(Tag.Class.Suffix)) + result.sort += lastTokenName ? 
props.priorityMod.high : props.priorityMod.low; + + // give lower priority to terms matched only by their readings, and are + // usually written in kanji + if (!result.tags.includes(Tag.Auxiliary.UsuallyKana) && !result.match.writing) + result.sort += props.priorityMod.low; + + return result; + }); + + results.sort((a, b) => { + // sort by original string length (long to short) + if (a.source.length != b.source.length) return b.source.length - a.source.length; + // then by sort index (high to low) + if (a.sort != b.sort) return b.sort - a.sort; + // then by depth (high to low) + if (a.depth != b.depth) return b.depth - a.depth; + // else keep current order (random) + return 0; + }); + + // pick top result + const result = results[0]; + + parseResult.words.push(result); + start += result.source.length - 1; // -1 because loop already increments start + continue; // extra verbose end of iteration + } + return parseResult; + } +}; + diff --git a/search/tags.ts b/search/tags.ts new file mode 100644 index 0000000..92279c5 --- /dev/null +++ b/search/tags.ts @@ -0,0 +1,232 @@ +import "../util/array.ts"; + +/** @constant Tags that have significant meaning to the parser */ +export const Tag = { + /** @constant grammatical classes */ + Class: { + /** @constant verb subgroup */ + Verb: { + /** @constant noun that can be conjugated into a verb by adding する, and する itself */ + Suru: "class:verb:suru", + /** + * @constant verb stored as conjugated noun in database (nominal verb) + * + * @deprecated The use of conjugated forms in dictionaries is discouraged. + * + * This tag is added by the deconjugation code to check for a legal + * deconjugation if する has been deconjugated away for a word marked + * suru-verb. 
+ */ + SuruIncluded: "class:verb:suru-included", + /** @constant 〜う verbs in [taekim] (godan) */ + U: "class:verb:u", + /** @constant 〜る verbs in [taekim] (ichidan) */ + Ru: "class:verb:ru", + /** @constant kuru (来る) */ + Kuru: "class:verb:kuru", + }, + Adjective: { + /** @constant adjectives that end in 〜い */ + I: "class:adj:i", + /** @constant adjectives that need to be conjugated using な */ + Na: "class:adj:na", + }, + /** @constant regular nouns or words that can be treated as nouns */ + Noun: "class:noun", + /** @constant terms that are read differently when used as a suffix */ + Suffix: "class:suffix", // TODO: specify place, honorific, counter suffix types + /** @constant grammatical particles (e.g. の, と, は, を, etc.) */ + Particle: "class:part", + /** @constant expressions and idioms + * + * Can also be used for longer strings that are read in a special way, but + * is discouraged. + * + * @see ./readme.md#behavior-altering-tags + */ + Expression: "class:expr", + /** @constant adverbs (e.g. 早く) */ + Adverb: "class:adverb", + }, + /** @constant types of names */ + Name: { + /** @constant name of a place/location. allows suffixes */ + Place: "name:place", + /** @constant feminine name. allows suffixes and honorifics */ + Female: "name:female", + /** @constant masculine name. allows suffixes and honorifics */ + Male: "name:male", + }, + /** + * @constant added to a word when deconjugated by the deinflection table + * + * Some inflections are used as steps in others, like the -tari suffix which + * is conjugated after the past tense. In this case, the past tense tag would + * be removed when it comes after the -tari tag. (see ../util/string.ts) + * + * e.g. 来ない -> 来る [infl:negative] + */ + Inflection: { + /** + * @constant affirmative conjugations + * + * This conjugation should not be added by any deconjugation rules, but is + * calculated based on the amount of negations. 
Even counts of negative + * inflections (including 0) add this tag, while odd counts don't add this + * tag. + */ + Affirmative: "infl:affirmative", + /** @constant negative conjugations */ + Negative: "infl:negative", + /** @constant time-related conjugations */ + Tense: { + /** @constant past tense (e.g. 叩いた) */ + Past: "infl:tense:past", + /** @constant continuous tense (e.g. 喋っている) */ + Continuous: "infl:tense:cont", + }, + /** @constant adverbial conjugations (e.g. 早く) */ + Adverb: "infl:adverb", + /** @constant polite conjugations */ + Polite: { + /** @constant 丁寧語 〜ます conjugations (e.g. 食べました) */ + Masu: "infl:polite:masu", + /** @constant 〜なさい conjugations (e.g. 座りなさい) */ + Nasai: "infl:polite:nasai", + }, + /** @constant common ending conjugations */ + Suffix: { + /** @constant -te ending (e.g. 売って) */ + Te: "infl:suffix:te", + /** @constant -tari ending (e.g. 遊んだり) */ + Tari: "infl:suffix:tari", + }, + /** @constant internal deinflection rules */ + Reason: { + /** @constant applied if word was deconjugated as -ru (ichidan) verb */ + Ru: "infl:reason:ru", + /** @constant applied if word was deconjugated as -u (godan) verb */ + U: "infl:reason:u", + /** @constant applied if word was deconjugated as suru verb */ + Suru: "infl:reason:suru", + /** @constant applied if word was deconjugated as kuru verb */ + Kuru: "infl:reason:kuru", + /** @constant applied if word was deconjugated as an adjective (i- or na-) */ + Adjective: { + I: "infl:reason:adj:i", + Na: "infl:reason:adj:na", + }, + }, + /** @constant makes a verb usable without specifying who carries it out (e.g. 言われる) */ + Passive: "infl:passive", + /** @constant indicates that a verb *can* happen (e.g. 落ちられる) */ + Potential: "infl:potential", + /** @constant indicates that someone makes a verb happen (e.g. ⾷べさせる) */ + Causative: "infl:causative", + /** @constant imperative form (e.g. 聞け) */ + Command: "infl:command", + /** @constant conditional forms */ + Conditional: { + /** @constant -ba ending (e.g. 
泳げれば) */ + Ba: "infl:cond:ba", + /** @constant -ra ending (e.g. 取ったら) */ + Ra: "infl:cond:ra", + }, + /** @constant makes a verb obligatory (e.g. 入ってはいけない) */ + Obligatory: "infl:must", + /** @constant verbs that someone wants to do / be done */ + Desirable: { + /** @constant 〜たい endings (e.g. 買いたい) */ + Itai: "infl:desire:itai", + /** @constant 〜おう endings (e.g. 寝よう) */ + Volitional: "infl:desire:volitional", + }, + /** @constant makes a verb an attempt */ + Attempt: { + /** @constant 〜みる to try something out (e.g. 飲んでみた) */ + Miru: "infl:attempt:miru", + /** @constant 〜とする attempts (e.g. 入ろうとしている) */ + ToSuru: "infl:attempt:tosuru", + }, + /** @constant temporary tags (removed by parseTags) */ + Temporary: { + /** @constant particle of obligatory conjugation (e.g. 行かない*と*だめ), or colloquial abbreviation */ + ObligatoryParticle: "infl:tmp:must:prt", + /** @constant resulting action part of obligatory conjugation (e.g. 行かないと*だめ*) */ + ObligatoryResult: "infl:tmp:must:res", + }, + }, + /** @constant uncategorized tags */ + Auxiliary: { + /** @constant word usually written using only kana (but also has kanji) */ + UsuallyKana: "aux:uk", + }, +} as const; + +export const TagGroup = { + /** @constant array that contains all tags of word classes that can be conjugated */ + Conjugable: [ + ...Object.values(Tag.Class.Verb), + ...Object.values(Tag.Class.Adjective), + ], +} as const; + +export type TokenTag = string; // no way around it + +export type TokenTags = Array; + +/** @summary parse concatenated tag string to TokenTags */ +export function parseTags(input: string) { + var tags = input.replaceAll(/ +/g, " ").trim().split(" ") as TokenTag[]; + var filteredTags: TokenTag[] = []; + var negationCount = 0; + for (var tag of tags) { + // conjugations that are used as "stepping stones" for others should be + // filtered in this loop. 
checking if a combination of tags is valid should + // be done in ./parser.ts + + // skip past tense tag if used as step for -te and -tari inflection + if (tag == Tag.Inflection.Tense.Past && + filteredTags.anyOf([Tag.Inflection.Suffix.Te, Tag.Inflection.Suffix.Tari])) continue; + + // skip -te suffix tag if it only serves as the base for one of: + if (tag == Tag.Inflection.Suffix.Te && filteredTags.anyOf([ + Tag.Inflection.Tense.Continuous, // base for continuous tense + Tag.Inflection.Obligatory, // base for obligatory inflection + Tag.Inflection.Attempt.Miru, // base for 〜みる attempt + ])) continue; + + // skip volitional tag if used for 〜とする attempt + if (tag == Tag.Inflection.Desirable.Volitional && + filteredTags.anyOf([Tag.Inflection.Attempt.ToSuru])) continue; + + // skip conditional 〜ば if used for obligatory inflection + if (tag == Tag.Inflection.Conditional.Ba && + filteredTags.anyOf([Tag.Inflection.Obligatory])) continue; + + // normalize multiple Inflection.Negative to single Inflection.Affirmative or Inflection.Negative + if (tag == Tag.Inflection.Negative) { + negationCount++; + continue; + } + + filteredTags.push(tag); + } + + // negative + と without resulting action = implicit affirmative obligatory + if (filteredTags.includes(Tag.Inflection.Temporary.ObligatoryParticle) && + !filteredTags.includes(Tag.Inflection.Temporary.ObligatoryResult)) { + negationCount = 0; // -> make resulting tags affirmative + } + + // normalize affirmative/negative + filteredTags.push(negationCount % 2 == 0 ? 
Tag.Inflection.Affirmative : Tag.Inflection.Negative); + + // filter any remaining temporary tags + type tempTag = typeof Tag.Inflection.Temporary[keyof typeof Tag.Inflection.Temporary]; + filteredTags = filteredTags.filter(t => !Object.values(Tag.Inflection.Temporary).includes(t as tempTag)); + + // filter any duplicates + return filteredTags.set().arr() as TokenTags; +} + diff --git a/search/types.ts b/search/types.ts new file mode 100644 index 0000000..d90afd6 --- /dev/null +++ b/search/types.ts @@ -0,0 +1,60 @@ +import { TokenTags } from "./tags.ts"; + +export interface SearchGlossaryDefinition { + +}; + +export interface SearchGlossary { + id: number; + definitions: SearchGlossaryDefinition[]; +}; + +export interface SearchTermResult { + /** @property dictionary term id */ + id: number; + /** @property (preferably) kanji writing of term */ + writing: string; + /** @property kana-only reading of term */ + reading: string; + /** @property word tags including deconjugation tags */ + tags: TokenTags; + /** @property original conjugated string */ + source: string; + /** @property numeric sorting value for term */ + sort: number; + /** @property amount of steps that were needed to deconjugate */ + depth: number; + /** @property matching results */ + match: { + /** @property term matched by writing */ + writing: boolean; + /** @property term matched by reading */ + reading: boolean; + } +}; + +export interface SearchWord extends SearchTermResult { + /** @property starting index of word in sentence */ + start: number; +}; + +export interface SearchSentenceResult { + words: SearchWord[]; + input: string; +}; + +/** @summary options for Search.sentence() */ +export interface SearchSentenceProps { + /** @prop max amount of characters to look ahead when attempting to deconjugate words */ + lookahead: number; + /** @prop search bias values */ + priorityMod: { + /** @prop offset for negative bias */ + low: number; + /** @prop offset for positive bias */ + high: number; + }; 
+ /** @prop list of breaks treated as delimiter */ + breaks: Array; +}; + -- cgit v1.2.3