aboutsummaryrefslogtreecommitdiff
path: root/language
diff options
context:
space:
mode:
authorlonkaars <loek@pipeframe.xyz>2023-07-10 16:26:13 +0200
committerlonkaars <loek@pipeframe.xyz>2023-07-10 16:26:13 +0200
commita3a81530a0a30ba02b5253b762e2ccd77d3b01fc (patch)
treeafad1bae0c2f7cb9d4a11b6c1c56bc8bae2f14e5 /language
parente430d8cb4a30640298b7fae3c93bc6329e2a0382 (diff)
small restructuring + all deinflection tests working
Diffstat (limited to 'language')
-rw-r--r--language/parser.ts144
-rw-r--r--language/readme.md53
-rw-r--r--language/tags.ts228
-rw-r--r--language/types.ts49
4 files changed, 0 insertions, 474 deletions
diff --git a/language/parser.ts b/language/parser.ts
deleted file mode 100644
index 7fd3981..0000000
--- a/language/parser.ts
+++ /dev/null
@@ -1,144 +0,0 @@
-import { Tag, TagGroup } from "./tags.ts";
-import { ParseResult, InputSentenceProps, ParseDepth } from "./types.ts";
-import DB from "../db/db.ts";
-import "../util/array.ts";
-import "../util/set.ts";
-import { DeepPartial } from "../util/types.ts";
-
-// TODO: rename Parser to Search
-/**
- * @summary main Parser class
- *
- * Splits a sentence into dictionary terms by looking up substrings in the
- * database and keeping the best-ranked match at every position.
- */
-export default class Parser {
-  db: DB;
-  /** @prop follows `db.ready`; `parse()` awaits this before any lookup */
-  ready: Promise<void>;
-
-  constructor() {
-    this.db = new DB();
-
-    // Chain onto the database promise directly. Wrapping it in
-    // `new Promise(async resolve => ...)` loses a rejection of `db.ready`
-    // inside the async executor, which leaves `ready` pending forever.
-    this.ready = Promise.resolve(this.db.ready).then(() => {});
-  }
-
-  // Search.sentence()
-  /**
-   * @summary parse a sentence up to the requested depth
-   *
-   * @param sentence  input text
-   * @param optional  partial options; anything omitted falls back to the
-   *                  defaults set below
-   * @returns tokens found in `sentence`
-   */
-  async parse(sentence: string, optional?: DeepPartial<InputSentenceProps>): Promise<ParseResult> {
-    await this.ready;
-
-    // initialize default options
-    const props: InputSentenceProps = {
-      lookahead: optional?.lookahead ?? 15,
-      depth: optional?.depth ?? ParseDepth.Term,
-      priorityMod: {
-        high: optional?.priorityMod?.high ?? 10,
-        low: optional?.priorityMod?.low ?? -10,
-      },
-      breaks: optional?.breaks ?? [],
-    };
-
-    let parseResult = await this.parseTerms(sentence, props);
-    if (props.depth <= ParseDepth.Term) return parseResult;
-
-    parseResult = await this.addGlossary(parseResult, props);
-    // this stage ends at Glossary depth (was a copy of the Term check above)
-    if (props.depth <= ParseDepth.Glossary) return parseResult;
-
-    return parseResult;
-  }
-
-  /**
-   * @summary parse sentence into terms with readings
-   *
-   * Walks the sentence from left to right. At each position at most
-   * `options.lookahead` characters are looked up in the database; positions
-   * that yield no usable result are skipped without emitting a token.
-   */
-  private async parseTerms(sentence: string, options: InputSentenceProps): Promise<ParseResult> {
-    const parseResult: ParseResult = {
-      tokens: [],
-      depth: ParseDepth.Term,
-      input: sentence,
-    };
-
-    // word class -> deinflection reason a deconjugated result of that class
-    // must carry
-    // NOTE(review): there is no pair for Tag.Class.Verb.Kuru /
-    // Tag.Inflection.Reason.Kuru -- confirm whether that is intentional
-    const reasonForClass: ReadonlyArray<readonly [string, string]> = [
-      [Tag.Class.Verb.U, Tag.Inflection.Reason.U],
-      [Tag.Class.Verb.Ru, Tag.Inflection.Reason.Ru],
-      [Tag.Class.Verb.Suru, Tag.Inflection.Reason.Suru],
-      [Tag.Class.Adjective.I, Tag.Inflection.Reason.Adjective.I],
-      [Tag.Class.Adjective.Na, Tag.Inflection.Reason.Adjective.Na],
-    ];
-
-    for (let start = 0; start < sentence.length; start++) {
-      let results = await this.db.findTerm(sentence.substring(start, start + options.lookahead));
-      // current starting point did not yield results, try again at next character or until end of input
-      if (results.length === 0) continue;
-
-      results = results.filter(result => {
-        // skip terms ignored by the user
-        if (result.sort < 0) return false;
-
-        // deconjugated words
-        if (result.depth > 0) {
-          // check if this word can be conjugated at all
-          if (!result.tags.anyOf(TagGroup.Conjugable as string[])) return false;
-
-          // ignore deconjugations whose reason does not match the word class
-          const wrongReason = reasonForClass.some(([wordClass, reason]) =>
-            result.tags.includes(wordClass) && !result.tags.includes(reason));
-          if (wrongReason) return false;
-        }
-
-        // all other results should be valid grammatically
-        return true;
-      });
-
-      // no valid results left after filter, try again at next character or until end of input
-      if (results.length === 0) continue;
-
-      // true if last token was a name else false (same for every result at
-      // this position, so computed once)
-      const lastTokenName = parseResult.tokens.peek()?.tags.anyOf(Object.values(Tag.Name));
-
-      // bias search results by modifying sort value
-      for (const result of results) {
-        // give higher priority to suffixes when last token was a name, else lower priority
-        if (result.tags.includes(Tag.Class.Suffix))
-          result.sort += lastTokenName ? options.priorityMod.high : options.priorityMod.low;
-
-        // give lower priority to terms that were matched only by their reading
-        // and are not marked as usually written in kana
-        if (!result.tags.includes(Tag.Auxiliary.UsuallyKana) && !result.match.kanji)
-          result.sort += options.priorityMod.low;
-      }
-
-      results.sort((a, b) => {
-        // sort by original string length (long to short)
-        if (a.original.length != b.original.length) return b.original.length - a.original.length;
-        // then by sort index (high to low)
-        if (a.sort != b.sort) return b.sort - a.sort;
-        // then by depth (high to low)
-        if (a.depth != b.depth) return b.depth - a.depth;
-        // else keep current order
-        return 0;
-      });
-
-      // pick top result
-      const result = results[0];
-
-      parseResult.tokens.push({
-        writing: result.expression,
-        reading: result.reading,
-        tags: result.tags,
-        term_id: result.id,
-        source: result.original,
-        start: start,
-      });
-
-      // -1 because the loop already increments start; the Math.max makes sure
-      // a match with an empty `original` can not stall the loop
-      start += Math.max(result.original.length, 1) - 1;
-    }
-    return parseResult;
-  }
-
-  /** @summary placeholder: currently returns `input` unchanged */
-  private async addGlossary(input: ParseResult, options: InputSentenceProps): Promise<ParseResult> {
-    // TODO: annotate input with glossaries from DB
-    options; // prevent unused warning
-    return input;
-  }
-};
-
diff --git a/language/readme.md b/language/readme.md
deleted file mode 100644
index 99a7d69..0000000
--- a/language/readme.md
+++ /dev/null
@@ -1,53 +0,0 @@
-# Language
-
-This directory contains files that provide an abstracted interface with the
-database for looking up sentences ~and words~.
-
-## Tags
-
-All dictionary entries have tags. Tags are combined from term info, dictionary
-info, and glossary info. Tags can have subcategories separated by `:`. A
-separate tags table handles displaying tags for different display languages,
-including abbreviated versions.
-
-Tags that may alter behavior are stored as constants in [tags.ts](./tags.ts).
-Dictionary importers should map the dictionary-specific version of these tags
-to Yomikun's tags for compatibility. Other tags include:
-
-|tag|description|
-|-|-|
-|`series:*`|abbreviated series name, e.g. "The Legend of Zelda" is `series:zelda`, and "Tears of the Kingdom" is `series:totk`. Series with multiple entries should split the series and entry into separate tags, e.g. `series:zelda series:totk` instead of `series:zelda_totk`.|
-|`dict:*`|dictionary tag. e.g. `dict:jmdict_dutch` or `dict:daijisen`|
-|`pitch:*`|`pitch:0` for 平板, `pitch:1` for 頭高, etc.|
-|`aux:*`|used for other tags (joyo kanji, commonly used term, usually kana, etc.)|
-
-### Behavior-altering tags
-
-Some tag classes impact the parser's behavior. For example, the input text
-「完了しました」 will be parsed as just 「完了」, but with the
-`class:verb:suru-included` tag added by the parser. This is because the word
-「完了」 has the tag `class:verb:suru` in the database, which allows the parser
-to deconjugate a noun with the verb 「する」 back into the stem.
-
-Other uses of this behavior include more accurate automatic kanji reading
-generation, for example 「城」 being read as 「じょう」 in 「ハイラル城」
-because 「ハイラル」 has the tag `name:place` in the database, and
-「城(じょう)」 has `class:suffix`, while 「城(しろ)」 has `class:noun`.
-
-Yomikun encourages homebrew dictionary sharing, and encourages using
-behavior-altering tags for fixing readings in cases like the examples above.
-As another example, a dictionary for a specific series (e.g. Zelda) is
-encouraged to add 「トト」 as a term with tags `class:noun` and `name:place`,
-instead of adding 「トト湖(こ)」 as an expression to fix the reading of the
-kanji 「湖(みずうみ)」.
-
-If Yomikun doesn't generate the correct reading, and the reading isn't based on
-natural language context (i.e. a computer *could* accurately decide which
-reading is correct based on other words/tags in the sentence), please submit a
-pull request with the sentence and its (expected) reading. An example of a
-non-deterministic reading is 「何」 in the sentence 「何できた?」, which can be
-read either as 「なん」, in which case 「何で」 turns into a single word, or as
-「なに」, where 「何」 is a regular word and 「で」 is a particle.
-
-[taekim]: https://guidetojapanese.org/learn/
-
diff --git a/language/tags.ts b/language/tags.ts
deleted file mode 100644
index 72840fe..0000000
--- a/language/tags.ts
+++ /dev/null
@@ -1,228 +0,0 @@
-import "../util/array.ts";
-
-/** @constant Tags that have significant meaning to the parser */
-export const Tag = {
- /** @constant grammatical classes */
- Class: {
- /** @constant verb subgroup */
- Verb: {
- /** @constant noun that can be conjugated into a verb by adding する and する itself */
- Suru: "class:verb:suru",
- /**
- * @constant verb stored as conjugated noun in database (nominal verb)
- *
- * @deprecated The use of conjugated forms in dictionaries is discouraged.
- *
- * This tag is added by the deconjugation code to check for a legal
- * deconjugation if する has been deconjugated away for a word marked
- * suru-verb.
- */
- SuruIncluded: "class:verb:suru-included",
- /** @constant 〜う verbs in [taekim] (godan) */
- U: "class:verb:u",
- /** @constant 〜る verbs in [taekim] (ichidan) */
- Ru: "class:verb:ru",
- /** @constant kuru (来る) */
- Kuru: "class:verb:kuru",
- },
- Adjective: { // adjective subgroup
- /** @constant adjectives that end in 〜い */
- I: "class:adj:i",
- /** @constant adjectives that need to be conjugated using な */
- Na: "class:adj:na",
- },
- /** @constant regular nouns or words that can be treated as nouns */
- Noun: "class:noun",
- /** @constant terms that are read differently when used as a suffix */
- Suffix: "class:suffix", // TODO: specify place, honorific, counter suffix types
- /** @constant grammatical particles (e.g. の, と, は, を, etc.) */
- Particle: "class:part",
- /** @constant expressions and idioms
- *
- * Can also be used for longer strings that are read in a special way, but
- * is discouraged.
- *
- * @see ./readme.md#behavior-altering-tags
- */
- Expression: "class:expr",
- /** @constant adverbs (e.g. 早く) */
- Adverb: "class:adverb",
- },
- /** @constant types of names */
- Name: {
- /** @constant name of a place/location. allows suffixes */
- Place: "name:place",
- /** @constant feminine name. allows suffixes and honorifics */
- Female: "name:female",
- /** @constant masculine name. allows suffixes and honorifics */
- Male: "name:male",
- },
- /**
- * @constant added to a word when deconjugated by the deinflection table
- *
- * Some inflections are used as steps in others, like the -tari suffix which
- * is conjugated after the past tense. In this case, the past tense tag would
- * be removed when it comes after the -tari tag. (see parseTags, and
- * ../util/string.ts)
- *
- * e.g. 来ない -> 来る [infl:negative]
- */
- Inflection: {
- /**
- * @constant affirmative conjugations
- *
- * This conjugation should not be added by any deconjugation rules, but is
- * calculated based on the amount of negations. Even counts of negative
- * inflections (including 0) add this tag, while odd counts don't add this
- * tag.
- */
- Affirmative: "infl:affirmative",
- /** @constant negative conjugations */
- Negative: "infl:negative",
- /** @constant time-related conjugations */
- Tense: {
- /** @constant past tense (e.g. 叩いた) */
- Past: "infl:tense:past",
- /** @constant continuous tense (e.g. 喋っている) */
- Continuous: "infl:tense:cont",
- },
- /** @constant adverbs (e.g. 早く) */
- Adverb: "infl:adverb",
- /** @constant polite conjugations */
- Polite: {
- /** @constant 丁寧語 〜ます conjugations (e.g. 食べました) */
- Masu: "infl:polite:masu",
- /** @constant 〜なさい conjugations (e.g. 座りなさい) */
- Nasai: "infl:polite:nasai",
- },
- /** @constant common ending conjugations */
- Suffix: {
- /** @constant -te ending (e.g. 売って) */
- Te: "infl:suffix:te",
- /** @constant -tari ending (e.g. 遊んだり) */
- Tari: "infl:suffix:tari",
- },
- /** @constant internal deinflection rules */
- Reason: {
- /** @constant applied if word was deconjugated as -ru (ichidan) verb */
- Ru: "infl:reason:ru",
- /** @constant applied if word was deconjugated as -u (godan) verb */
- U: "infl:reason:u",
- /** @constant applied if word was deconjugated as suru verb */
- Suru: "infl:reason:suru",
- /** @constant applied if word was deconjugated as kuru verb */
- Kuru: "infl:reason:kuru",
- /** @constant applied if word was deconjugated as an adjective */
- Adjective: {
- I: "infl:reason:adj:i", // deconjugated as i-adjective
- Na: "infl:reason:adj:na", // deconjugated as na-adjective
- },
- },
- /** @constant makes a verb usable without specifying who carries it out (e.g. 言われる) */
- Passive: "infl:passive",
- /** @constant indicates that a verb *can* happen (e.g. 落ちられる) */
- Potential: "infl:potential",
- /** @constant indicates that someone makes a verb happen (e.g. 食べさせる) */
- Causative: "infl:causative",
- /** @constant imperative form (e.g. 聞け) */
- Command: "infl:command",
- /** @constant conditional forms */
- Conditional: {
- /** @constant -ba ending (e.g. 泳げれば) */
- Ba: "infl:cond:ba",
- /** @constant -ra ending (e.g. 取ったら) */
- Ra: "infl:cond:ra",
- },
- /** @constant makes a verb obligatory (e.g. 入ってはいけない) */
- Obligatory: "infl:must",
- /** @constant verbs that someone wants to do / be done */
- Desirable: {
- /** @constant 〜たい endings (e.g. 買いたい) */
- Itai: "infl:desire:itai",
- /** @constant 〜おう endings (e.g. 寝よう) */
- Volitional: "infl:desire:volitional",
- },
- /** @constant makes a verb an attempt */
- Attempt: {
- /** @constant 〜みる to try something out (e.g. 飲んでみた) */
- Miru: "infl:attempt:miru",
- /** @constant 〜とする attempts (e.g. 入ろうとしている) */
- ToSuru: "infl:attempt:tosuru",
- },
- /** @constant temporary tags (removed by parseTags) */
- Temporary: {
- /** @constant particle of obligatory conjugation (e.g. 行かない*と*だめ), or colloquial abbreviation */
- ObligatoryParticle: "infl:tmp:must:prt",
- /** @constant resulting action part of obligatory conjugation (e.g. 行かないと*だめ*) */
- ObligatoryResult: "infl:tmp:must:res",
- },
- },
- /** @constant uncategorized tags */
- Auxiliary: {
- /** @constant word usually written using only kana (but also has kanji) */
- UsuallyKana: "aux:uk",
- },
-} as const;
-
-export const TagGroup = { // named collections built from the constants in Tag
- /** @constant array that contains all tags of word classes that can be conjugated */
- Conjugable: [
- ...Object.values(Tag.Class.Verb), // every verb class tag
- ...Object.values(Tag.Class.Adjective), // every adjective class tag
- ],
-} as const;
-
-export type TokenTag = string; // plain string, no way around it: tags outside Tag (e.g. `dict:*`, `series:*`) are listed in the readme too
-
-export type TokenTags = Array<TokenTag>; // list of tags, as produced by parseTags
-
-/**
- * @summary parse concatenated tag string to TokenTags
- *
- * Besides splitting the string, this removes inflection tags that were only
- * used as a step towards another inflection, collapses all negations into a
- * single affirmative or negative tag, removes temporary tags and duplicates.
- *
- * @param input  tags separated by one or more spaces
- * @returns normalized tags; always contains exactly one of
- *          Tag.Inflection.Affirmative or Tag.Inflection.Negative
- */
-export function parseTags(input: string) {
-  // the filter drops the single "" that splitting an empty or blank input
-  // gives, which used to end up in the result as an empty tag
-  const tags = input.trim().split(/ +/).filter(tag => tag !== "") as TokenTag[];
-  let filteredTags: TokenTag[] = [];
-  let negationCount = 0;
-  for (const tag of tags) {
-    // conjugations that are used as "stepping stones" for others should be
-    // filtered in this loop. checking if a combination of tags is valid should
-    // be done in ./parser.ts
-
-    // skip past tense tag if used as step for -te and -tari inflection
-    if (tag == Tag.Inflection.Tense.Past &&
-      filteredTags.anyOf([Tag.Inflection.Suffix.Te, Tag.Inflection.Suffix.Tari])) continue;
-
-    // skip -te suffix tag if used as the base for one of these inflections
-    if (tag == Tag.Inflection.Suffix.Te && filteredTags.anyOf([
-      Tag.Inflection.Tense.Continuous, // base for continuous tense
-      Tag.Inflection.Obligatory, // base for obligatory inflection
-      Tag.Inflection.Attempt.Miru, // base for 〜みる attempt
-    ])) continue;
-
-    // skip volitional tag if used for 〜とする attempt
-    if (tag == Tag.Inflection.Desirable.Volitional &&
-      filteredTags.anyOf([Tag.Inflection.Attempt.ToSuru])) continue;
-
-    // normalize multiple Inflection.Negative to single Inflection.Affirmative or Inflection.Negative
-    if (tag == Tag.Inflection.Negative) {
-      negationCount++;
-      continue;
-    }
-
-    filteredTags.push(tag);
-  }
-
-  // negative + と without resulting action = implicit affirmative obligatory
-  if (filteredTags.includes(Tag.Inflection.Temporary.ObligatoryParticle) &&
-    !filteredTags.includes(Tag.Inflection.Temporary.ObligatoryResult)) {
-    negationCount = 0; // -> make resulting tags affirmative
-  }
-
-  // normalize affirmative/negative
-  filteredTags.push(negationCount % 2 == 0 ? Tag.Inflection.Affirmative : Tag.Inflection.Negative);
-
-  // filter any remaining temporary tags (list built once, not per element)
-  const temporaryTags: readonly string[] = Object.values(Tag.Inflection.Temporary);
-  filteredTags = filteredTags.filter(t => !temporaryTags.includes(t));
-
-  // filter any duplicates
-  return filteredTags.set().arr() as TokenTags;
-}
-
diff --git a/language/types.ts b/language/types.ts
deleted file mode 100644
index d3585f8..0000000
--- a/language/types.ts
+++ /dev/null
@@ -1,49 +0,0 @@
-import { TokenTags } from "./tags.ts";
-
-export enum ParseDepth { // how much detail a parse should return
- Term, // terms only; Parser.parse uses this when no depth is given
- Glossary, // terms plus glossary step (Parser.addGlossary, still a TODO there)
-};
-
-export interface GlossaryDefinition { // placeholder, no fields declared yet
-
-};
-
-export interface Glossary { // glossary that can be attached to a ParseToken
- id: number;
- definitions: GlossaryDefinition[];
-};
-
-export interface ParseToken { // one term found in the input sentence
- writing: string; // `expression` of the matched term
- reading: string; // `reading` of the matched term
- tags: TokenTags; // tags of the matched term
- glossary?: Glossary; // optional glossary annotation
- term_id: number; // `id` of the matched term
- source: string; // `original` of the match; its length is how far the parser advances
- start: number; // index in the input at which the lookup for this token started
-};
-
-export interface ParseResult { // result of parsing one sentence
- depth: ParseDepth; // set to ParseDepth.Term by the term parsing step
- tokens: ParseToken[]; // tokens in the order they were found
- input: string; // the sentence that was parsed
-};
-
-/** @summary option struct for Parser */
-export interface InputSentenceProps {
- /** @prop max amount of characters to look ahead when attempting to deconjugate */
- lookahead: number;
- /** @prop amount of detail to return in search results */
- depth: ParseDepth;
- /** @prop search bias modifiers (added to a result's sort value by the parser) */
- priorityMod: {
- /** @prop value added for negative bias */
- low: number;
- /** @prop value added for positive bias */
- high: number;
- };
- /** @prop list of breaks treated as delimiter */
- breaks: Array<number>;
-};
-