diff options
author | lonkaars <loek@pipeframe.xyz> | 2023-07-10 16:26:13 +0200 |
---|---|---|
committer | lonkaars <loek@pipeframe.xyz> | 2023-07-10 16:26:13 +0200 |
commit | a3a81530a0a30ba02b5253b762e2ccd77d3b01fc (patch) | |
tree | afad1bae0c2f7cb9d4a11b6c1c56bc8bae2f14e5 /language | |
parent | e430d8cb4a30640298b7fae3c93bc6329e2a0382 (diff) |
small restructuring + all deinflection tests working
Diffstat (limited to 'language')
-rw-r--r-- | language/parser.ts | 144 | ||||
-rw-r--r-- | language/readme.md | 53 | ||||
-rw-r--r-- | language/tags.ts | 228 | ||||
-rw-r--r-- | language/types.ts | 49 |
4 files changed, 0 insertions, 474 deletions
diff --git a/language/parser.ts b/language/parser.ts deleted file mode 100644 index 7fd3981..0000000 --- a/language/parser.ts +++ /dev/null @@ -1,144 +0,0 @@ -import { Tag, TagGroup } from "./tags.ts"; -import { ParseResult, InputSentenceProps, ParseDepth } from "./types.ts"; -import DB from "../db/db.ts"; -import "../util/array.ts"; -import "../util/set.ts"; -import { DeepPartial } from "../util/types.ts"; - -// TODO: rename Parser to Search -/** @summary main Parser class */ -export default class Parser { - db: DB; - ready: Promise<void>; - - constructor() { - this.db = new DB(); - - this.ready = new Promise<void>(async resolve => { - await this.db.ready; - resolve(); - }); - } - - // Search.sentence() - async parse(sentence: string, optional?: DeepPartial<InputSentenceProps>): Promise<ParseResult> { - await this.ready; - - // initialize default options - var props: InputSentenceProps = { - lookahead: optional?.lookahead ?? 15, - depth: optional?.depth ?? ParseDepth.Term, - priorityMod: { - high: optional?.priorityMod?.high ?? 10, - low: optional?.priorityMod?.low ?? -10, - }, - breaks: optional?.breaks ?? [], - } - - let parseResult = await this.parseTerms(sentence, props); - if (props.depth <= ParseDepth.Term) return parseResult; - - parseResult = await this.addGlossary(parseResult, props); - if (props.depth <= ParseDepth.Term) return parseResult; - - return parseResult; - } - - /** @summary parse sentence into terms with readings */ - private async parseTerms(sentence: string, options: InputSentenceProps): Promise<ParseResult> { - var parseResult: ParseResult = { - tokens: [], - depth: ParseDepth.Term, - input: sentence, - }; - - for (let start = 0; start < sentence.length; start++) { - var lookahead = options.lookahead; - - var results = await this.db.findTerm(sentence.substring(start, start + lookahead)); - // current starting point did not yield results, try again at next character or until end of input - if (results.length == 0) continue; - - results = results.filter(result => { - // ignore ignored by user terms - if (result.sort < 0) return false; - - // deconjugated words - if (result.depth > 0) { - // check if this word can be conjugated at all - if (!result.tags.anyOf(TagGroup.Conjugable as string[])) return false; - - // ignore other wrong deconjugations - if (result.tags.includes(Tag.Class.Verb.U) && - !result.tags.includes(Tag.Inflection.Reason.U)) return false; - if (result.tags.includes(Tag.Class.Verb.Ru) && - !result.tags.includes(Tag.Inflection.Reason.Ru)) return false; - if (result.tags.includes(Tag.Class.Verb.Suru) && - !result.tags.includes(Tag.Inflection.Reason.Suru)) return false; - if (result.tags.includes(Tag.Class.Adjective.I) && - !result.tags.includes(Tag.Inflection.Reason.Adjective.I)) return false; - if (result.tags.includes(Tag.Class.Adjective.Na) && - !result.tags.includes(Tag.Inflection.Reason.Adjective.Na)) return false; - } - - // all other results should be valid grammatically - return true; - }); - - // no valid results left after filter, try again at next character or until end of input - if (results.length == 0) continue; - - // bias search results by modifying sort value - results = results.map(result => { - // true if last token was a name else false - const lastTokenName = parseResult.tokens.peek()?.tags.anyOf(Object.values(Tag.Name)); - - // give higher priority to suffixes when last token was a name, else lower priority - if (result.tags.includes(Tag.Class.Suffix)) - result.sort += lastTokenName ? options.priorityMod.high : options.priorityMod.low; - - // give lower priority to terms matched only by their readings, and are - // usually written in kanji - if (!result.tags.includes(Tag.Auxiliary.UsuallyKana) && !result.match.kanji) - result.sort += options.priorityMod.low; - - return result; - }); - - - results.sort((a, b) => { - // sort by original string length (long to short) - if (a.original.length != b.original.length) return b.original.length - a.original.length; - // then by sort index (high to low) - if (a.sort != b.sort) return b.sort - a.sort; - // then by depth (high to low) - if (a.depth != b.depth) return b.depth - a.depth; - // else keep current order (random) - return 0; - }); - - // pick top result - const result = results[0]; - - parseResult.tokens.push({ - writing: result.expression, - reading: result.reading, - tags: result.tags, - term_id: result.id, - source: result.original, - start: start, - }); - - start += result.original.length - 1; // -1 because loop already increments start - continue; // extra verbose end of iteration - } - return parseResult; - } - - private async addGlossary(input: ParseResult, options: InputSentenceProps): Promise<ParseResult> { - // TODO: annotate input with glossaries from DB - options; // prevent unused warning - return input; - } -}; - diff --git a/language/readme.md b/language/readme.md deleted file mode 100644 index 99a7d69..0000000 --- a/language/readme.md +++ /dev/null @@ -1,53 +0,0 @@ -# Language - -This directory contains files that provide an abstracted interface with the -database for looking up sentences ~and words~. - -## Tags - -All dictionary entries have tags. Tags are combined from term info, dictionary -info, and glossary info. Tags can have subcategories separated by `:`. A -separate tags table handles displaying tags for different display languages, -including abbreviated versions. - -Tags that may alter behavior are stored as constants in [tags.ts](./tags.ts). -Dictionary importers should map the dictionary-specific version of these tags -to Yomikun's tags for compatibility. Other tags include: - -|tag|description| -|-|-| -|`series:*`|abbreviated series name, e.g. "The Legend of Zelda" is `series:zelda`, and "Tears of the Kingdom" is `series:totk`. series with multiple entries should split the series and entry into separate tags, e.g. `series:zelda series:totk` instead of `series:zelda_totk`. -|`dict:*`|dictionary tag. e.g. `dict:jmdict_dutch` or `dict:daijisen`| -|`pitch:*`|`pitch:0` for 平板, `pitch:1` for 頭高, etc. -|`aux:*`|used for other tags (joyo kanji, commonly used term, usually kana, etc.) - -### Behavior-altering tags - -Some tag classes impact the parser's behavior. For example, the input text -「完了しました」 will be parsed as just 「完了」, but with the -`class:verb:suru-included` tag added by the parser. This is because the word -「完了」 has the tag `class:verb:suru` in the database, which allows the parser -to deconjugate a noun with the verb 「する」 back into the stem. - -Other uses of this behavior include more accurate automatic kanji reading -generation, for example 「城」 being read as 「じょう」 in 「ハイラル城」 -because 「ハイラル」 has the tag `name:place` in the database, and -「城(じょう)」 has `class:suffix`, while 「城(しろ)」 has `class:noun`. - -Yomikun encourages homebrew dictionary sharing, and encourages using -behavior-altering tags for fixing readings for cases like the above examples. -As another example of this, it is encouraged that a dictionary for (for -example) Zelda add 「トト」 as a term with tags `class:noun` and `name:place`, -instead of 「トト湖(こ)」 as an expression to fix the reading of the kanji -「湖(みずうみ)」. - -If Yomikun doesn't generate the correct reading, and the reading isn't based on -natural language context (=a computer *could* accurately decide which reading -is correct based on other words/tags in the sentence), please submit a pull -request with the sentence and its (expected) reading. An example of a -non-deterministic reading is 「何」 in the sentence 「何できた?」 which can be -read as both 「なん」 in which case 「何で」 turns into a single word, or -「なに」 where 「何」 is a regular word and 「で」 is particle. - -[taekim]: https://guidetojapanese.org/learn/ - diff --git a/language/tags.ts b/language/tags.ts deleted file mode 100644 index 72840fe..0000000 --- a/language/tags.ts +++ /dev/null @@ -1,228 +0,0 @@ -import "../util/array.ts"; - -/** @constant Tags that have significant meaning to the parser */ -export const Tag = { - /** @constant grammatical classes */ - Class: { - /** @constant verb subgroup */ - Verb: { - /** @constant noun that can be conjugated into a verb by adding する and する itself */ - Suru: "class:verb:suru", - /** - * @constant verb stored as conjugated noun in database (nominal verb) - * - * @deprecated The use of conjugated forms in dictionaries is discouraged. - * - * This tag is added by the deconjugation code to check for a legal - * deconjugation if する has been deconjugated away for a word marked - * suru-verb. - */ - SuruIncluded: "class:verb:suru-included", - /** @constant 〜う verbs in [taekim] (godan) */ - U: "class:verb:u", - /** @constant 〜る verbs in [taekim] (ichidan) */ - Ru: "class:verb:ru", - /** @constant kuru (来る) */ - Kuru: "class:verb:kuru", - }, - Adjective: { - /** @constant adjectives that end in 〜い */ - I: "class:adj:i", - /** @constant adjectives that need to be conjugated using な */ - Na: "class:adj:na", - }, - /** @constant regular nouns or words that can be treated as nouns */ - Noun: "class:noun", - /** @constant terms that are read differently when used as a suffix */ - Suffix: "class:suffix", // TODO: specify place, honorific, counter suffix types - /** @constant grammatical particles (e.g. の, と, は, を, etc.) */ - Particle: "class:part", - /** @constant expressions and idioms - * - * Can also be used for longer strings that are read in a special way, but - * is discouraged. - * - * @see ./readme.md#behavior-altering-tags - */ - Expression: "class:expr", - /** @constant adverbs (e.g. 早く) */ - Adverb: "class:adverb", - }, - /** @constant types of names */ - Name: { - /** @constant name of a place/location. allows suffixes */ - Place: "name:place", - /** @constant feminine name. allows suffixes and honorifics */ - Female: "name:female", - /** @constant masculine name. allows suffixes and honorifics */ - Male: "name:male", - }, - /** - * @constant added to a word when deconjugated by the deinflection table - * - * Some inflections are used as steps in others, like the -tari suffix which - * is conjugated after the past tense. In this case, the past tense tag would - * be removed when it comes after the -tari tag. (see ../util/string.ts) - * - * e.g. 来ない -> 来る [infl:negative] - */ - Inflection: { - /** - * @constant affirmative conjugations - * - * This conjugation should not be added by any deconjugation rules, but is - * calculated based on the amount of negations. Even counts of negative - * inflections (including 0) add this tag, while odd counts don't add this - * tag. - */ - Affirmative: "infl:affirmative", - /** @constant negative conjugations */ - Negative: "infl:negative", - /** @constant time-related conjugations */ - Tense: { - /** @constant past tense (e.g. 叩いた) */ - Past: "infl:tense:past", - /** @constant continuous tense (e.g. 喋っている) */ - Continuous: "infl:tense:cont", - }, - /** @constant adverbs (e.g. 早く) */ - Adverb: "infl:adverb", - /** @constant polite conjugations */ - Polite: { - /** @constant 丁寧語 〜ます conjugations (e.g. 食べました) */ - Masu: "infl:polite:masu", - /** @constant 〜なさい conjugations (e.g. 座りなさい) */ - Nasai: "infl:polite:nasai", - }, - /** @constant common ending conjugations */ - Suffix: { - /** @constant -te ending (e.g. 売って) */ - Te: "infl:suffix:te", - /** @constant -tari ending (e.g. 遊んだり) */ - Tari: "infl:suffix:tari", - }, - /** @constant internal deinflection rules */ - Reason: { - /** @constant applied if word was deconjugated as -ru (ichidan) verb */ - Ru: "infl:reason:ru", - /** @constant applied if word was deconjugated as -u (godan) verb */ - U: "infl:reason:u", - /** @constant applied if word was deconjugated as suru verb */ - Suru: "infl:reason:suru", - /** @constant applied if word was deconjugated as kuru verb */ - Kuru: "infl:reason:kuru", - /** @constant applied if word was deconjugated as i-adjective */ - Adjective: { - I: "infl:reason:adj:i", - Na: "infl:reason:adj:na", - }, - }, - /** @constant makes a verb usable without specifying who carries it out (e.g. 言われる) */ - Passive: "infl:passive", - /** @constant indicates that a verb *can* happen (e.g. 落ちられる) */ - Potential: "infl:potential", - /** @constant indicates that someone makes a verb happen (e.g. ⾷べさせる) */ - Causative: "infl:causative", - /** @constant imperative form (e.g. 聞け) */ - Command: "infl:command", - /** @constant conditional forms */ - Conditional: { - /** @constant -ba ending (e.g. 泳げれば) */ - Ba: "infl:cond:ba", - /** @constant -ra ending (e.g. 取ったら) */ - Ra: "infl:cond:ra", - }, - /** @constant makes a verb obligatory (e.g. 入ってはいけない) */ - Obligatory: "infl:must", - /** @constant verbs that someone wants to do / be done */ - Desirable: { - /** @constant 〜たい endings (e.g. 買いたい) */ - Itai: "infl:desire:itai", - /** @constant 〜おう endings (e.g. 寝よう) */ - Volitional: "infl:desire:volitional", - }, - /** @constant makes a verb an attempt */ - Attempt: { - /** @constant 〜みる to try something out (e.g. 飲んでみた) */ - Miru: "infl:attempt:miru", - /** @constant 〜とする attempts (e.g. 入ろうとしている) */ - ToSuru: "infl:attempt:tosuru", - }, - /** @constant temporary tags (removed by parseTags) */ - Temporary: { - /** @constant particle of obligatory conjugation (e.g. 行かない*と*だめ), or colloquial abbreviation */ - ObligatoryParticle: "infl:tmp:must:prt", - /** @constant resulting action part of obligatory conjugation (e.g. 行かないと*だめ*) */ - ObligatoryResult: "infl:tmp:must:res", - }, - }, - /** @constant uncategorized tags */ - Auxiliary: { - /** @constant word usually written using only kana (but also has kanji) */ - UsuallyKana: "aux:uk", - }, -} as const; - -export const TagGroup = { - /** @constant array that contains all tags of word classes that can be conjugated */ - Conjugable: [ - ...Object.values(Tag.Class.Verb), - ...Object.values(Tag.Class.Adjective), - ], -} as const; - -export type TokenTag = string; // no way around it - -export type TokenTags = Array<TokenTag>; - -/** @summary parse concatenated tag string to TokenTags */ -export function parseTags(input: string) { - var tags = input.replaceAll(/ +/g, " ").trim().split(" ") as TokenTag[]; - var filteredTags: TokenTag[] = []; - var negationCount = 0; - for (var tag of tags) { - // conjugations that are used as "stepping stones" for others should be - // filtered in this loop. checking if a combination of tags is valid should - // be done in ./parser.ts - - // skip past tense tag if used as step for -te and -tari inflection - if (tag == Tag.Inflection.Tense.Past && - filteredTags.anyOf([Tag.Inflection.Suffix.Te, Tag.Inflection.Suffix.Tari])) continue; - - // skip -te suffix tag if used for - if (tag == Tag.Inflection.Suffix.Te && filteredTags.anyOf([ - Tag.Inflection.Tense.Continuous, // base for continuous tense - Tag.Inflection.Obligatory, // base for obligatory inflection - Tag.Inflection.Attempt.Miru, // base for 〜みる attempt - ])) continue; - - // skip volitional tag if used for 〜とする attempt - if (tag == Tag.Inflection.Desirable.Volitional && - filteredTags.anyOf([Tag.Inflection.Attempt.ToSuru])) continue; - - // normalize multiple Inflection.Negative to single Inflection.Affirmative or Inflection.Negative - if (tag == Tag.Inflection.Negative) { - negationCount++; - continue; - } - - filteredTags.push(tag); - } - - // negative + と without resulting action = implicit affirmative obligatory - if (filteredTags.includes(Tag.Inflection.Temporary.ObligatoryParticle) && - !filteredTags.includes(Tag.Inflection.Temporary.ObligatoryResult)) { - negationCount = 0; // -> make resulting tags affirmative - } - - // normalize affirmative/negative - filteredTags.push(negationCount % 2 == 0 ? Tag.Inflection.Affirmative : Tag.Inflection.Negative); - - // filter any remaining temporary tags - type tempTag = typeof Tag.Inflection.Temporary[keyof typeof Tag.Inflection.Temporary]; - filteredTags = filteredTags.filter(t => !Object.values(Tag.Inflection.Temporary).includes(t as tempTag)); - - // filter any duplicates - return filteredTags.set().arr() as TokenTags; -} - diff --git a/language/types.ts b/language/types.ts deleted file mode 100644 index d3585f8..0000000 --- a/language/types.ts +++ /dev/null @@ -1,49 +0,0 @@ -import { TokenTags } from "./tags.ts"; - -export enum ParseDepth { - Term, - Glossary, -}; - -export interface GlossaryDefinition { - -}; - -export interface Glossary { - id: number; - definitions: GlossaryDefinition[]; -}; - -export interface ParseToken { - writing: string; - reading: string; - tags: TokenTags; - glossary?: Glossary; - term_id: number; - source: string; - start: number; -}; - -export interface ParseResult { - depth: ParseDepth; - tokens: ParseToken[]; - input: string; -}; - -/** @summary option struct for Parser */ -export interface InputSentenceProps { - /** @prop max amount of characters to look ahead when attempting to deconjugate */ - lookahead: number; - /** @prop amount of detail to return in search results */ - depth: ParseDepth; - /** @prop search bias multipliers */ - priorityMod: { - /** @prop multiplier for negative bias */ - low: number; - /** @prop multiplier for positive bias */ - high: number; - }; - /** @prop list of breaks treated as delimiter */ - breaks: Array<number>; -}; - |