small restructuring + all deinflection tests working

author: lonkaars <loek@pipeframe.xyz> 2023-07-10 16:26:13 +0200
committer: lonkaars <loek@pipeframe.xyz> 2023-07-10 16:26:13 +0200
commit: a3a81530a0a30ba02b5253b762e2ccd77d3b01fc (patch)
tree: afad1bae0c2f7cb9d4a11b6c1c56bc8bae2f14e5 /language
parent: e430d8cb4a30640298b7fae3c93bc6329e2a0382 (diff)
4 files changed, 0 insertions, 474 deletions
diff --git a/language/parser.ts b/language/parser.ts
deleted file mode 100644
index 7fd3981..0000000
--- a/language/parser.ts
+++ /dev/null
@@ -1,144 +0,0 @@
-import { Tag, TagGroup } from "./tags.ts";
-import { ParseResult, InputSentenceProps, ParseDepth } from "./types.ts";
-import DB from "../db/db.ts";
-import "../util/array.ts";
-import "../util/set.ts";
-import { DeepPartial } from "../util/types.ts";
-
-// TODO: rename Parser to Search
-/** @summary main Parser class */
-export default class Parser {
-	db: DB;
-	ready: Promise<void>;
-
-	constructor() {
-		this.db = new DB();
-
-		this.ready = new Promise<void>(async resolve => {
-			await this.db.ready;
-			resolve();
-		});
-	}
-
-	// Search.sentence()
-	async parse(sentence: string, optional?: DeepPartial<InputSentenceProps>): Promise<ParseResult> {
-		await this.ready;
-
-		// initialize default options
-		var props: InputSentenceProps = {
-			lookahead: optional?.lookahead ?? 15,
-			depth: optional?.depth ?? ParseDepth.Term,
-			priorityMod: {
-				high: optional?.priorityMod?.high ?? 10,
-				low: optional?.priorityMod?.low ?? -10,
-			},
-			breaks: optional?.breaks ?? [],
-		}
-
-		let parseResult = await this.parseTerms(sentence, props);
-		if (props.depth <= ParseDepth.Term) return parseResult;
-
-		parseResult = await this.addGlossary(parseResult, props);
-		if (props.depth <= ParseDepth.Term) return parseResult;
-
-		return parseResult;
-	}
-
-	/** @summary parse sentence into terms with readings */
-	private async parseTerms(sentence: string, options: InputSentenceProps): Promise<ParseResult> {
-		var parseResult: ParseResult = {
-			tokens: [],
-			depth: ParseDepth.Term,
-			input: sentence,
-		};
-
-		for (let start = 0; start < sentence.length; start++) {
-			var lookahead = options.lookahead;
-
-			var results = await this.db.findTerm(sentence.substring(start, start + lookahead));
-			// current starting point did not yield results, try again at next character or until end of input
-			if (results.length == 0) continue;
-
-			results = results.filter(result => {
-				// ignore ignored by user terms
-				if (result.sort < 0) return false;
-
-				// deconjugated words
-				if (result.depth > 0) {
-					// check if this word can be conjugated at all
-					if (!result.tags.anyOf(TagGroup.Conjugable as string[])) return false;
-
-					// ignore other wrong deconjugations
-					if (result.tags.includes(Tag.Class.Verb.U) &&
-							!result.tags.includes(Tag.Inflection.Reason.U)) return false;
-					if (result.tags.includes(Tag.Class.Verb.Ru) &&
-							!result.tags.includes(Tag.Inflection.Reason.Ru)) return false;
-					if (result.tags.includes(Tag.Class.Verb.Suru) &&
-							!result.tags.includes(Tag.Inflection.Reason.Suru)) return false;
-					if (result.tags.includes(Tag.Class.Adjective.I) &&
-						  !result.tags.includes(Tag.Inflection.Reason.Adjective.I)) return false;
-					if (result.tags.includes(Tag.Class.Adjective.Na) &&
-						  !result.tags.includes(Tag.Inflection.Reason.Adjective.Na)) return false;
-				}
-
-				// all other results should be valid grammatically
-				return true;
-			});
-
-			// no valid results left after filter, try again at next character or until end of input
-			if (results.length == 0) continue;
-	
-			// bias search results by modifying sort value
-			results = results.map(result => {
-				// true if last token was a name else false
-				const lastTokenName = parseResult.tokens.peek()?.tags.anyOf(Object.values(Tag.Name));
-
-				// give higher priority to suffixes when last token was a name, else lower priority
-				if (result.tags.includes(Tag.Class.Suffix))
-					result.sort += lastTokenName ? options.priorityMod.high : options.priorityMod.low;
-
-				// give lower priority to terms that were matched only by their reading
-				// and are usually written in kanji
-				if (!result.tags.includes(Tag.Auxiliary.UsuallyKana) && !result.match.kanji)
-					result.sort += options.priorityMod.low;
-
-				return result;
-			});
-
-
-			results.sort((a, b) => {
-				// sort by original string length (long to short)
-				if (a.original.length != b.original.length) return b.original.length - a.original.length;
-				// then by sort index (high to low)
-				if (a.sort != b.sort) return b.sort - a.sort;
-				// then by depth (high to low)
-				if (a.depth != b.depth) return b.depth - a.depth;
-				// else keep current order (random)
-				return 0;
-			});
-
-			// pick top result
-			const result = results[0];
-
-			parseResult.tokens.push({
-				writing: result.expression,
-				reading: result.reading,
-				tags: result.tags,
-				term_id: result.id,
-				source: result.original,
-				start: start,
-			});
-
-			start += result.original.length - 1; // -1 because loop already increments start
-			continue; // extra verbose end of iteration
-		}
-		return parseResult;
-	}
-
-	private async addGlossary(input: ParseResult, options: InputSentenceProps): Promise<ParseResult> {
-		// TODO: annotate input with glossaries from DB
-		options; // prevent unused warning
-		return input;
-	}
-};
-
diff --git a/language/readme.md b/language/readme.md
deleted file mode 100644
index 99a7d69..0000000
--- a/language/readme.md
+++ /dev/null
@@ -1,53 +0,0 @@
-# Language
-
-This directory contains files that provide an abstracted interface with the
-database for looking up sentences ~and words~.
-
-## Tags
-
-All dictionary entries have tags. Tags are combined from term info, dictionary
-info, and glossary info. Tags can have subcategories separated by `:`. A
-separate tags table handles displaying tags for different display languages,
-including abbreviated versions.
-
-Tags that may alter behavior are stored as constants in [tags.ts](./tags.ts).
-Dictionary importers should map the dictionary-specific version of these tags
-to Yomikun's tags for compatibility. Other tags include:
-
-|tag|description|
-|-|-|
-|`series:*`|abbreviated series name, e.g. "The Legend of Zelda" is `series:zelda`, and "Tears of the Kingdom" is `series:totk`. series with multiple entries should split the series and entry into separate tags, e.g. `series:zelda series:totk` instead of `series:zelda_totk`.|
-|`dict:*`|dictionary tag. e.g. `dict:jmdict_dutch` or `dict:daijisen`|
-|`pitch:*`|`pitch:0` for 平板, `pitch:1` for 頭高, etc.|
-|`aux:*`|used for other tags (joyo kanji, commonly used term, usually kana, etc.)|
-
-### Behavior-altering tags
-
-Some tag classes impact the parser's behavior. For example, the input text
-「完了しました」 will be parsed as just 「完了」, but with the
-`class:verb:suru-included` tag added by the parser. This is because the word
-「完了」 has the tag `class:verb:suru` in the database, which allows the parser
-to deconjugate a noun with the verb 「する」 back into the stem.
-
-Other uses of this behavior include more accurate automatic kanji reading
-generation, for example 「城」 being read as 「じょう」 in 「ハイラル城」
-because 「ハイラル」 has the tag `name:place` in the database, and
-「城(じょう)」 has `class:suffix`, while 「城(しろ)」 has `class:noun`.
-
-Yomikun encourages homebrew dictionary sharing, and encourages using
-behavior-altering tags for fixing readings for cases like the above examples.
-As another example, a dictionary for a specific series such as Zelda is
-encouraged to add 「トト」 as a term with tags `class:noun` and `name:place`,
-instead of 「トト湖(こ)」 as an expression to fix the reading of the kanji
-「湖(みずうみ)」.
-
-If Yomikun doesn't generate the correct reading, and the reading isn't based on
-natural language context (i.e. a computer *could* accurately decide which
-reading is correct based on other words/tags in the sentence), please submit a
-pull request with the sentence and its (expected) reading. An example of a
-non-deterministic reading is 「何」 in the sentence 「何できた？」, which can be
-read as either 「なん」, in which case 「何で」 turns into a single word, or
-「なに」, where 「何」 is a regular word and 「で」 is a particle.
-
-[taekim]: https://guidetojapanese.org/learn/
-
diff --git a/language/tags.ts b/language/tags.ts
deleted file mode 100644
index 72840fe..0000000
--- a/language/tags.ts
+++ /dev/null
@@ -1,228 +0,0 @@
-import "../util/array.ts";
-
-/** @constant Tags that have significant meaning to the parser */
-export const Tag = {
-	/** @constant grammatical classes */
-	Class: {
-		/** @constant verb subgroup */
-		Verb: {
-			/** @constant noun that can be conjugated into a verb by adding する and する itself */
-			Suru: "class:verb:suru",
-			/**
-			 * @constant verb stored as conjugated noun in database (nominal verb)
-			 *
-			 * @deprecated The use of conjugated forms in dictionaries is discouraged.
-			 *
-			 * This tag is added by the deconjugation code to check for a legal
-			 * deconjugation if する has been deconjugated away for a word marked
-			 * suru-verb.
-			 */
-			SuruIncluded: "class:verb:suru-included",
-			/** @constant 〜う verbs in [taekim] (godan) */
-			U: "class:verb:u",
-			/** @constant 〜る verbs in [taekim] (ichidan) */
-			Ru: "class:verb:ru",
-			/** @constant kuru (来る) */
-			Kuru: "class:verb:kuru",
-		},
-		Adjective: {
-			/** @constant adjectives that end in 〜い */
-			I: "class:adj:i",
-			/** @constant adjectives that need to be conjugated using な */
-			Na: "class:adj:na",
-		},
-		/** @constant regular nouns or words that can be treated as nouns */
-		Noun: "class:noun",
-		/** @constant terms that are read differently when used as a suffix */
-		Suffix: "class:suffix", // TODO: specify place, honorific, counter suffix types
-		/** @constant grammatical particles (e.g. の, と, は, を, etc.) */
-		Particle: "class:part",
-		/** @constant expressions and idioms
-		 *
-		 * Can also be used for longer strings that are read in a special way, but
-		 * is discouraged.
-		 *
-		 * @see ./readme.md#behavior-altering-tags
-		 */
-		Expression: "class:expr",
-		/** @constant adverbs (e.g. 早く) */
-		Adverb: "class:adverb",
-	},
-	/** @constant types of names */
-	Name: {
-		/** @constant name of a place/location. allows suffixes */
-		Place: "name:place",
-		/** @constant feminine name. allows suffixes and honorifics */
-		Female: "name:female",
-		/** @constant masculine name. allows suffixes and honorifics */
-		Male: "name:male",
-	},
-	/**
-	 * @constant added to a word when deconjugated by the deinflection table
-	 *
-	 * Some inflections are used as steps in others, like the -tari suffix which
-	 * is conjugated after the past tense. In this case, the past tense tag would
-	 * be removed when it comes after the -tari tag. (see ../util/string.ts)
-	 *
-	 * e.g. 来ない -> 来る [infl:negative]
-	 */
-	Inflection: {
-		/**
-		 * @constant affirmative conjugations
-		 *
-		 * This conjugation should not be added by any deconjugation rules, but is
-		 * calculated based on the amount of negations. Even counts of negative
-		 * inflections (including 0) add this tag, while odd counts don't add this
-		 * tag.
-		 */
-		Affirmative: "infl:affirmative",
-		/** @constant negative conjugations */
-		Negative: "infl:negative",
-		/** @constant time-related conjugations */
-		Tense: {
-			/** @constant past tense (e.g. 叩いた) */
-			Past: "infl:tense:past",
-			/** @constant continuous tense (e.g. 喋っている) */
-			Continuous: "infl:tense:cont",
-		},
-		/** @constant adverbs (e.g. 早く) */
-		Adverb: "infl:adverb",
-		/** @constant polite conjugations */
-		Polite: {
-			/** @constant 丁寧語 〜ます conjugations (e.g. 食べました) */
-			Masu: "infl:polite:masu",
-			/** @constant 〜なさい conjugations (e.g. 座りなさい) */
-			Nasai: "infl:polite:nasai",
-		},
-		/** @constant common ending conjugations */
-		Suffix: {
-			/** @constant -te ending (e.g. 売って) */
-			Te: "infl:suffix:te",
-			/** @constant -tari ending (e.g. 遊んだり) */
-			Tari: "infl:suffix:tari",
-		},
-		/** @constant internal deinflection rules */
-		Reason: {
-			/** @constant applied if word was deconjugated as -ru (ichidan) verb */
-			Ru: "infl:reason:ru",
-			/** @constant applied if word was deconjugated as -u (godan) verb */
-			U: "infl:reason:u",
-			/** @constant applied if word was deconjugated as suru verb */
-			Suru: "infl:reason:suru",
-			/** @constant applied if word was deconjugated as kuru verb */
-			Kuru: "infl:reason:kuru",
-			/** @constant applied if word was deconjugated as an adjective (i- or na-adjective) */
-			Adjective: {
-				I: "infl:reason:adj:i",
-				Na: "infl:reason:adj:na",
-			},
-		},
-		/** @constant makes a verb usable without specifying who carries it out (e.g. 言われる) */
-		Passive: "infl:passive",
-		/** @constant indicates that a verb *can* happen (e.g. 落ちられる) */
-		Potential: "infl:potential",
-		/** @constant indicates that someone makes a verb happen (e.g. 食べさせる) */
-		Causative: "infl:causative",
-		/** @constant imperative form (e.g. 聞け) */
-		Command: "infl:command",
-		/** @constant conditional forms */
-		Conditional: {
-			/** @constant -ba ending (e.g. 泳げれば) */
-			Ba: "infl:cond:ba",
-			/** @constant -ra ending (e.g. 取ったら) */
-			Ra: "infl:cond:ra",
-		},
-		/** @constant makes a verb obligatory (e.g. 入ってはいけない) */
-		Obligatory: "infl:must",
-		/** @constant verbs that someone wants to do / be done */
-		Desirable: {
-			/** @constant 〜たい endings (e.g. 買いたい) */
-			Itai: "infl:desire:itai",
-			/** @constant 〜おう endings (e.g. 寝よう) */
-			Volitional: "infl:desire:volitional",
-		},
-		/** @constant makes a verb an attempt */
-		Attempt: {
-			/** @constant 〜みる to try something out (e.g. 飲んでみた) */
-			Miru: "infl:attempt:miru",
-			/** @constant 〜とする attempts (e.g. 入ろうとしている) */
-			ToSuru: "infl:attempt:tosuru",
-		},
-		/** @constant temporary tags (removed by parseTags) */
-		Temporary: {
-			/** @constant particle of obligatory conjugation (e.g. 行かない*と*だめ), or colloquial abbreviation */
-			ObligatoryParticle: "infl:tmp:must:prt",
-			/** @constant resulting action part of obligatory conjugation (e.g. 行かないと*だめ*) */
-			ObligatoryResult: "infl:tmp:must:res",
-		},
-	},
-	/** @constant uncategorized tags */
-	Auxiliary: {
-		/** @constant word usually written using only kana (but also has kanji) */
-		UsuallyKana: "aux:uk",
-	},
-} as const;
-
-export const TagGroup = {
-	/** @constant array that contains all tags of word classes that can be conjugated */
-	Conjugable: [
-		...Object.values(Tag.Class.Verb),
-		...Object.values(Tag.Class.Adjective),
-	],
-} as const;
-
-export type TokenTag = string; // no way around it
-
-export type TokenTags = Array<TokenTag>;
-
-/** @summary parse concatenated tag string to TokenTags */
-export function parseTags(input: string) {
-	var tags = input.replaceAll(/ +/g, " ").trim().split(" ") as TokenTag[];
-	var filteredTags: TokenTag[] = [];
-	var negationCount = 0;
-	for (var tag of tags) {
-		// conjugations that are used as "stepping stones" for others should be
-		// filtered in this loop. checking if a combination of tags is valid should
-		// be done in ./parser.ts
-
-		// skip past tense tag if used as step for -te and -tari inflection
-		if (tag == Tag.Inflection.Tense.Past &&
-				filteredTags.anyOf([Tag.Inflection.Suffix.Te, Tag.Inflection.Suffix.Tari])) continue;
-
-		// skip -te suffix tag if it was only used as a step for one of the following:
-		if (tag == Tag.Inflection.Suffix.Te && filteredTags.anyOf([
-			Tag.Inflection.Tense.Continuous, // base for continuous tense
-			Tag.Inflection.Obligatory, // base for obligatory inflection
-			Tag.Inflection.Attempt.Miru, // base for 〜みる attempt
-		])) continue;
-	
-		// skip volitional tag if used for 〜とする attempt
-		if (tag == Tag.Inflection.Desirable.Volitional &&
-				filteredTags.anyOf([Tag.Inflection.Attempt.ToSuru])) continue;
-
-		// normalize multiple Inflection.Negative to single Inflection.Affirmative or Inflection.Negative
-		if (tag == Tag.Inflection.Negative) {
-			negationCount++;
-			continue;
-		}
-
-		filteredTags.push(tag);
-	}
-
-	// negative + と without resulting action = implicit affirmative obligatory
-	if (filteredTags.includes(Tag.Inflection.Temporary.ObligatoryParticle) &&
-			!filteredTags.includes(Tag.Inflection.Temporary.ObligatoryResult)) {
-		negationCount = 0; // -> make resulting tags affirmative
-	}
-
-	// normalize affirmative/negative
-	filteredTags.push(negationCount % 2 == 0 ? Tag.Inflection.Affirmative : Tag.Inflection.Negative);
-
-	// filter any remaining temporary tags
-	type tempTag = typeof Tag.Inflection.Temporary[keyof typeof Tag.Inflection.Temporary];
-	filteredTags = filteredTags.filter(t => !Object.values(Tag.Inflection.Temporary).includes(t as tempTag));
-
-	// filter any duplicates
-	return filteredTags.set().arr() as TokenTags;
-}
-
diff --git a/language/types.ts b/language/types.ts
deleted file mode 100644
index d3585f8..0000000
--- a/language/types.ts
+++ /dev/null
@@ -1,49 +0,0 @@
-import { TokenTags } from "./tags.ts";
-
-export enum ParseDepth {
-	Term,
-	Glossary,
-};
-
-export interface GlossaryDefinition {
-	
-};
-
-export interface Glossary {
-	id: number;
-	definitions: GlossaryDefinition[];
-};
-
-export interface ParseToken {
-	writing: string;
-	reading: string;
-	tags: TokenTags;
-	glossary?: Glossary;
-	term_id: number;
-	source: string;
-	start: number;
-};
-
-export interface ParseResult {
-	depth: ParseDepth;
-	tokens: ParseToken[];
-	input: string;
-};
-
-/** @summary option struct for Parser */
-export interface InputSentenceProps {
-	/** @prop max amount of characters to look ahead when attempting to deconjugate */
-	lookahead: number;
-	/** @prop amount of detail to return in search results */
-	depth: ParseDepth;
-	/** @prop search bias modifiers (added to a result's sort value) */
-	priorityMod: {
-		/** @prop modifier added for negative bias */
-		low: number;
-		/** @prop modifier added for positive bias */
-		high: number;
-	};
-	/** @prop list of breaks treated as delimiter */
-	breaks: Array<number>;
-};
-
author	lonkaars <loek@pipeframe.xyz>	2023-07-10 16:26:13 +0200
committer	lonkaars <loek@pipeframe.xyz>	2023-07-10 16:26:13 +0200
commit	a3a81530a0a30ba02b5253b762e2ccd77d3b01fc (patch)
tree	afad1bae0c2f7cb9d4a11b6c1c56bc8bae2f14e5 /language
parent	e430d8cb4a30640298b7fae3c93bc6329e2a0382 (diff)