small restructuring + all deinflection tests working

author: lonkaars <loek@pipeframe.xyz> 2023-07-10 16:26:13 +0200
committer: lonkaars <loek@pipeframe.xyz> 2023-07-10 16:26:13 +0200
commit: a3a81530a0a30ba02b5253b762e2ccd77d3b01fc (patch)
tree: afad1bae0c2f7cb9d4a11b6c1c56bc8bae2f14e5 /search
parent: e430d8cb4a30640298b7fae3c93bc6329e2a0382 (diff)
4 files changed, 486 insertions, 0 deletions
diff --git a/search/readme.md b/search/readme.md
new file mode 100644
index 0000000..400c8ce
--- /dev/null
+++ b/search/readme.md
@@ -0,0 +1,53 @@
+# Search
+
+This directory contains files that provide an abstracted interface with the
+database for looking up sentences and words.
+
+## Tags
+
+All dictionary entries have tags. Tags are combined from term info, dictionary
+info, and glossary info. Tags can have subcategories separated by `:`. A
+separate tags table handles displaying tags for different display languages,
+including abbreviated versions.
+
+Tags that may alter behavior are stored as constants in [tags.ts](./tags.ts).
+Dictionary importers should map the dictionary-specific version of these tags
+to Yomikun's tags for compatibility. Other tags include:
+
+|tag|description|
+|-|-|
+|`series:*`|abbreviated series name, e.g. "The Legend of Zelda" is `series:zelda`, and "Tears of the Kingdom" is `series:totk`. Series with multiple entries should split the series and entry into separate tags, e.g. `series:zelda series:totk` instead of `series:zelda_totk`.|
+|`dict:*`|dictionary tag. e.g. `dict:jmdict_dutch` or `dict:daijisen`|
+|`pitch:*`|`pitch:0` for 平板, `pitch:1` for 頭高, etc.|
+|`aux:*`|used for other tags (joyo kanji, commonly used term, usually kana, etc.)|
+
+### Behavior-altering tags
+
+Some tag classes impact the parser's behavior. For example, the input text
+「完了しました」 will be parsed as just 「完了」, but with the
+`class:verb:suru-included` tag added by the parser. This is because the word
+「完了」 has the tag `class:verb:suru` in the database, which allows the parser
+to deconjugate a noun with the verb 「する」 back into the stem.
+
+Other uses of this behavior include more accurate automatic kanji reading
+generation, for example 「城」 being read as 「じょう」 in 「ハイラル城」
+because 「ハイラル」 has the tag `name:place` in the database, and
+「城(じょう)」 has `class:suffix`, while 「城(しろ)」 has `class:noun`.
+
+Yomikun encourages homebrew dictionary sharing, and encourages using
+behavior-altering tags for fixing readings for cases like the above examples.
+As another example, a dictionary for a specific series (e.g. Zelda) is
+encouraged to add 「トト」 as a term with tags `class:noun` and `name:place`,
+instead of adding 「トト湖(こ)」 as an expression, to fix the reading of the
+kanji 「湖(みずうみ)」.
+
+If Yomikun doesn't generate the correct reading, and the reading isn't based on
+natural language context (=a computer *could* accurately decide which reading
+is correct based on other words/tags in the sentence), please submit a pull
+request with the sentence and its (expected) reading. An example of a
+non-deterministic reading is 「何」 in the sentence 「何できた？」, which can be
+read either as 「なん」, in which case 「何で」 turns into a single word, or as
+「なに」, where 「何」 is a regular word and 「で」 is a particle.
+
+[taekim]: https://guidetojapanese.org/learn/
+
diff --git a/search/search.ts b/search/search.ts
new file mode 100644
index 0000000..0a50773
--- /dev/null
+++ b/search/search.ts
@@ -0,0 +1,141 @@
+import { Tag, TagGroup } from "./tags.ts";
+import { SearchSentenceProps, SearchSentenceResult, SearchTermResult, SearchWord } from "./types.ts";
+import DB from "../db/db.ts";
+import "../util/array.ts";
+import "../util/set.ts";
+import { DeepPartial } from "../util/types.ts";
+
+/** @summary main Search class: owns the database handle and exposes term and sentence lookup */
+export default class Search {
+	db: DB; // database handle used by all lookups
+	ready: Promise<void>; // resolves after this.db.ready resolves; awaited by every public method
+
+	constructor() {
+		this.db = new DB();
+		// NOTE(review): async promise executor -- if this.db.ready rejects, `ready` never settles (rejection is not forwarded)
+		this.ready = new Promise<void>(async resolve => {
+			await this.db.ready;
+			resolve();
+		});
+	}
+
+  /** @summary find possible terms at start of string by deconjugating (lookup is delegated to db.findTerm), then drop invalid results */
+  public async terms(term: string): Promise<Array<SearchTermResult>> {
+    await this.ready;
+
+    var results = await this.db.findTerm(term);
+
+    // skip filtering valid results if there are none
+    if (results.length == 0) return [];
+
+    // filter invalid deconjugations/results
+    results = results.filter(result => {
+      // drop terms ignored by the user (marked by a negative sort value)
+      if (result.sort < 0) return false;
+
+      // results that needed at least one deconjugation step
+      if (result.depth > 0) {
+        // check if this word can be conjugated at all
+        if (!result.tags.anyOf(TagGroup.Conjugable as string[])) return false;
+
+        // reject results whose word class lacks the matching deinflection reason tag (NOTE(review): Verb.Kuru has no such check)
+        if (result.tags.includes(Tag.Class.Verb.U) &&
+            !result.tags.includes(Tag.Inflection.Reason.U)) return false;
+        if (result.tags.includes(Tag.Class.Verb.Ru) &&
+            !result.tags.includes(Tag.Inflection.Reason.Ru)) return false;
+        if (result.tags.includes(Tag.Class.Verb.Suru) &&
+            !result.tags.includes(Tag.Inflection.Reason.Suru)) return false;
+        if (result.tags.includes(Tag.Class.Adjective.I) &&
+            !result.tags.includes(Tag.Inflection.Reason.Adjective.I)) return false;
+        if (result.tags.includes(Tag.Class.Adjective.Na) &&
+            !result.tags.includes(Tag.Inflection.Reason.Adjective.Na)) return false;
+      }
+
+      // all other results should be valid
+      return true;
+    });
+
+    return results.map(result => ({ // map database result fields onto the SearchTermResult shape
+      id: result.id,
+      writing: result.expression,
+      reading: result.reading,
+      tags: result.tags,
+      source: result.original,
+      sort: result.sort,
+      depth: result.depth,
+      match: {
+        reading: result.match.reading,
+        writing: result.match.writing,
+      },
+    }));
+  }
+
+	/** @summary parse sentence into terms with readings; `optional` overrides the default SearchSentenceProps */
+	public async sentence(sentence: string, optional?: DeepPartial<SearchSentenceProps>): Promise<SearchSentenceResult> {
+		await this.ready;
+
+		var props: SearchSentenceProps = { // fill in a default for every option that was not supplied
+			lookahead: optional?.lookahead ?? 15,
+			priorityMod: {
+				high: optional?.priorityMod?.high ?? 10,
+				low: optional?.priorityMod?.low ?? -10,
+			},
+			breaks: optional?.breaks ?? [], // NOTE(review): not read anywhere else in this method
+		}
+
+		var parseResult: SearchSentenceResult = {
+			input: sentence,
+			words: [],
+		};
+
+		for (let start = 0; start < sentence.length; start++) {
+			var lookahead = props.lookahead; // TODO: stop at next delimiter (optimization)
+      var term = sentence.substring(start, start + lookahead);
+      var results = (await this.terms(term)).map(term => { // attach the start index (the cast does not copy, so the result object is mutated)
+        var word = term as SearchWord;
+        word.start = start;
+        return word;
+      });
+
+			// current starting point did not yield results, try again at next character or until end of input
+			if (results.length == 0) continue;
+	
+			// bias search results by modifying sort value
+			results = results.map(result => {
+				// true if the last parsed word has any Tag.Name tag, falsy otherwise (including when there is no last word)
+				const lastTokenName = parseResult.words.peek()?.tags.anyOf(Object.values(Tag.Name));
+
+				// give higher priority to suffixes when last token was a name, else lower priority
+				if (result.tags.includes(Tag.Class.Suffix))
+					result.sort += lastTokenName ? props.priorityMod.high : props.priorityMod.low;
+
+				// give lower priority to terms that were not matched by their writing
+				// and are not tagged as usually written in kana
+				if (!result.tags.includes(Tag.Auxiliary.UsuallyKana) && !result.match.writing)
+					result.sort += props.priorityMod.low;
+
+				return result;
+			});
+
+			results.sort((a, b) => {
+				// sort by original string length (long to short)
+				if (a.source.length != b.source.length) return b.source.length - a.source.length;
+				// then by sort index (high to low)
+				if (a.sort != b.sort) return b.sort - a.sort;
+				// then by depth (high to low)
+				if (a.depth != b.depth) return b.depth - a.depth;
+				// else leave the relative order unchanged (arbitrary)
+				return 0;
+			});
+
+			// pick top result
+			const result = results[0];
+
+			parseResult.words.push(result);
+			start += result.source.length - 1; // -1 because loop already increments start (NOTE(review): a zero-length source would stall the loop)
+			continue; // extra verbose end of iteration
+		}
+		return parseResult;
+	}
+};
+
diff --git a/search/tags.ts b/search/tags.ts
new file mode 100644
index 0000000..92279c5
--- /dev/null
+++ b/search/tags.ts
@@ -0,0 +1,232 @@
+import "../util/array.ts";
+
+/** @constant Tags that have significant meaning to the parser */
+export const Tag = {
+	/** @constant grammatical classes */
+	Class: {
+		/** @constant verb subgroup */
+		Verb: {
+			/** @constant nouns that become verbs by adding する, and する itself */
+			Suru: "class:verb:suru",
+			/**
+			 * @constant verb stored as conjugated noun in database (nominal verb)
+			 *
+			 * @deprecated The use of conjugated forms in dictionaries is discouraged.
+			 *
+			 * This tag is added by the deconjugation code to check for a legal
+			 * deconjugation if する has been deconjugated away for a word marked
+			 * suru-verb.
+			 */
+			SuruIncluded: "class:verb:suru-included",
+			/** @constant 〜う verbs in [taekim] (godan) */
+			U: "class:verb:u",
+			/** @constant 〜る verbs in [taekim] (ichidan) */
+			Ru: "class:verb:ru",
+			/** @constant kuru (来る) */
+			Kuru: "class:verb:kuru",
+		},
+		Adjective: { // adjective subgroup
+			/** @constant adjectives that end in 〜い */
+			I: "class:adj:i",
+			/** @constant adjectives that need to be conjugated using な */
+			Na: "class:adj:na",
+		},
+		/** @constant regular nouns or words that can be treated as nouns */
+		Noun: "class:noun",
+		/** @constant terms that are read differently when used as a suffix */
+		Suffix: "class:suffix", // TODO: specify place, honorific, counter suffix types
+		/** @constant grammatical particles (e.g. の, と, は, を, etc.) */
+		Particle: "class:part",
+		/** @constant expressions and idioms
+		 *
+		 * Can also be used for longer strings that are read in a special way, but
+		 * is discouraged.
+		 *
+		 * @see ./readme.md#behavior-altering-tags
+		 */
+		Expression: "class:expr",
+		/** @constant adverbs (e.g. 早く) */
+		Adverb: "class:adverb",
+	},
+	/** @constant types of names */
+	Name: {
+		/** @constant name of a place/location. allows suffixes */
+		Place: "name:place",
+		/** @constant feminine name. allows suffixes and honorifics */
+		Female: "name:female",
+		/** @constant masculine name. allows suffixes and honorifics */
+		Male: "name:male",
+	},
+	/**
+	 * @constant added to a word when deconjugated by the deinflection table
+	 *
+	 * Some inflections are used as steps in others, like the -tari suffix which
+	 * is conjugated after the past tense. In this case, the past tense tag would
+	 * be removed when it comes after the -tari tag. (see parseTags, ../util/string.ts)
+	 *
+	 * e.g. 来ない -> 来る [infl:negative]
+	 */
+	Inflection: {
+		/**
+		 * @constant affirmative conjugations
+		 *
+		 * This conjugation should not be added by any deconjugation rules, but is
+		 * calculated based on the amount of negations. Even counts of negative
+		 * inflections (including 0) add this tag, while odd counts don't add this
+		 * tag.
+		 */
+		Affirmative: "infl:affirmative",
+		/** @constant negative conjugations */
+		Negative: "infl:negative",
+		/** @constant time-related conjugations */
+		Tense: {
+			/** @constant past tense (e.g. 叩いた) */
+			Past: "infl:tense:past",
+			/** @constant continuous tense (e.g. 喋っている) */
+			Continuous: "infl:tense:cont",
+		},
+		/** @constant adverbial form (e.g. 早く) */
+		Adverb: "infl:adverb",
+		/** @constant polite conjugations */
+		Polite: {
+			/** @constant 丁寧語 〜ます conjugations (e.g. 食べました) */
+			Masu: "infl:polite:masu",
+			/** @constant 〜なさい conjugations (e.g. 座りなさい) */
+			Nasai: "infl:polite:nasai",
+		},
+		/** @constant common ending conjugations */
+		Suffix: {
+			/** @constant -te ending (e.g. 売って) */
+			Te: "infl:suffix:te",
+			/** @constant -tari ending (e.g. 遊んだり) */
+			Tari: "infl:suffix:tari",
+		},
+		/** @constant internal deinflection rules */
+		Reason: {
+			/** @constant applied if word was deconjugated as -ru (ichidan) verb */
+			Ru: "infl:reason:ru",
+			/** @constant applied if word was deconjugated as -u (godan) verb */
+			U: "infl:reason:u",
+			/** @constant applied if word was deconjugated as suru verb */
+			Suru: "infl:reason:suru",
+			/** @constant applied if word was deconjugated as kuru verb */
+			Kuru: "infl:reason:kuru",
+			/** @constant applied if word was deconjugated as an adjective */
+			Adjective: {
+				I: "infl:reason:adj:i", // i-adjective
+				Na: "infl:reason:adj:na", // na-adjective
+			},
+		},
+		/** @constant makes a verb usable without specifying who carries it out (e.g. 言われる) */
+		Passive: "infl:passive",
+		/** @constant indicates that a verb *can* happen (e.g. 落ちられる) */
+		Potential: "infl:potential",
+		/** @constant indicates that someone makes a verb happen (e.g. 食べさせる) */
+		Causative: "infl:causative",
+		/** @constant imperative form (e.g. 聞け) */
+		Command: "infl:command",
+		/** @constant conditional forms */
+		Conditional: {
+			/** @constant -ba ending (e.g. 泳げれば) */
+			Ba: "infl:cond:ba",
+			/** @constant -ra ending (e.g. 取ったら) */
+			Ra: "infl:cond:ra",
+		},
+		/** @constant makes a verb obligatory (e.g. 入ってはいけない) */
+		Obligatory: "infl:must",
+		/** @constant verbs that someone wants to do / be done */
+		Desirable: {
+			/** @constant 〜たい endings (e.g. 買いたい) */
+			Itai: "infl:desire:itai",
+			/** @constant 〜おう endings (e.g. 寝よう) */
+			Volitional: "infl:desire:volitional",
+		},
+		/** @constant makes a verb an attempt */
+		Attempt: {
+			/** @constant 〜みる to try something out (e.g. 飲んでみた) */
+			Miru: "infl:attempt:miru",
+			/** @constant 〜とする attempts (e.g. 入ろうとしている) */
+			ToSuru: "infl:attempt:tosuru",
+		},
+		/** @constant temporary tags (removed by parseTags) */
+		Temporary: {
+			/** @constant particle of obligatory conjugation (e.g. 行かない*と*だめ), or colloquial abbreviation */
+			ObligatoryParticle: "infl:tmp:must:prt",
+			/** @constant resulting action part of obligatory conjugation (e.g. 行かないと*だめ*) */
+			ObligatoryResult: "infl:tmp:must:res",
+		},
+	},
+	/** @constant uncategorized tags */
+	Auxiliary: {
+		/** @constant word usually written using only kana (but also has kanji) */
+		UsuallyKana: "aux:uk",
+	},
+} as const;
+
+export const TagGroup = { // named collections of tags taken from Tag
+	/** @constant array of every verb and adjective class tag, i.e. all word classes that can be conjugated */
+	Conjugable: [
+		...Object.values(Tag.Class.Verb),
+		...Object.values(Tag.Class.Adjective),
+	],
+} as const;
+
+export type TokenTag = string; // plain string, no way around it (presumably because tags like `series:*` are open-ended)
+
+export type TokenTags = Array<TokenTag>; // list of tags, e.g. as returned by parseTags
+
+/** @summary parse space-separated tag string into normalized, de-duplicated TokenTags */
+export function parseTags(input: string) {
+	var tags = input.replaceAll(/ +/g, " ").trim().split(" ") as TokenTag[];
+	var filteredTags: TokenTag[] = []; // NOTE(review): for empty input, tags is [""] and "" is kept as a tag -- confirm intended
+	var negationCount = 0;
+	for (var tag of tags) {
+		// conjugations that are used as "stepping stones" for others are filtered
+		// in this loop, based on the tags already accepted before them. checking
+		// if a combination of tags is valid should be done in ./parser.ts
+
+		// skip past tense tag if used as step for -te and -tari inflection
+		if (tag == Tag.Inflection.Tense.Past &&
+				filteredTags.anyOf([Tag.Inflection.Suffix.Te, Tag.Inflection.Suffix.Tari])) continue;
+
+		// skip -te suffix tag if used as a base for one of the following inflections
+		if (tag == Tag.Inflection.Suffix.Te && filteredTags.anyOf([
+			Tag.Inflection.Tense.Continuous, // base for continuous tense
+			Tag.Inflection.Obligatory, // base for obligatory inflection
+			Tag.Inflection.Attempt.Miru, // base for 〜みる attempt
+		])) continue;
+	
+		// skip volitional tag if used for 〜とする attempt
+		if (tag == Tag.Inflection.Desirable.Volitional &&
+				filteredTags.anyOf([Tag.Inflection.Attempt.ToSuru])) continue;
+
+		// skip conditional 〜ば if used for obligatory inflection
+		if (tag == Tag.Inflection.Conditional.Ba &&
+				filteredTags.anyOf([Tag.Inflection.Obligatory])) continue;
+
+		// only count negations here; a single Inflection.Affirmative or Inflection.Negative is appended after the loop
+		if (tag == Tag.Inflection.Negative) {
+			negationCount++;
+			continue;
+		}
+
+		filteredTags.push(tag);
+	}
+
+	// negative + と without resulting action = implicit affirmative obligatory
+	if (filteredTags.includes(Tag.Inflection.Temporary.ObligatoryParticle) &&
+			!filteredTags.includes(Tag.Inflection.Temporary.ObligatoryResult)) {
+		negationCount = 0; // -> make resulting tags affirmative
+	}
+
+	// normalize affirmative/negative: even negation count (including 0) is affirmative, odd is negative
+	filteredTags.push(negationCount % 2 == 0 ? Tag.Inflection.Affirmative : Tag.Inflection.Negative);
+
+	// filter any remaining temporary tags
+	type tempTag = typeof Tag.Inflection.Temporary[keyof typeof Tag.Inflection.Temporary];
+	filteredTags = filteredTags.filter(t => !Object.values(Tag.Inflection.Temporary).includes(t as tempTag));
+
+	// filter any duplicates (set() and arr() are project helpers that are not visible here)
+	return filteredTags.set().arr() as TokenTags;
+}
+
diff --git a/search/types.ts b/search/types.ts
new file mode 100644
index 0000000..d90afd6
--- /dev/null
+++ b/search/types.ts
@@ -0,0 +1,60 @@
+import { TokenTags } from "./tags.ts";
+
+export interface SearchGlossaryDefinition { // placeholder: no fields are declared yet
+	
+};
+
+export interface SearchGlossary { // a numeric id together with its list of definitions
+	id: number; // NOTE(review): presumably the glossary's database id -- confirm
+	definitions: SearchGlossaryDefinition[];
+};
+
+export interface SearchTermResult { // single dictionary hit as returned by Search.terms()
+  /** @property dictionary term id */
+	id: number;
+  /** @property (preferably) kanji writing of term */
+	writing: string;
+  /** @property kana-only reading of term */
+	reading: string;
+  /** @property word tags including deconjugation tags */
+	tags: TokenTags;
+  /** @property original conjugated string */
+	source: string;
+  /** @property numeric sorting value for term (higher sorts first; negative values are dropped by Search.terms) */
+  sort: number;
+  /** @property amount of steps that were needed to deconjugate (0 = not deconjugated) */
+  depth: number;
+  /** @property how the term was matched */
+  match: {
+    /** @property term matched by writing */
+    writing: boolean;
+    /** @property term matched by reading */
+    reading: boolean;
+  }
+};
+
+export interface SearchWord extends SearchTermResult { // term result positioned inside a parsed sentence
+  /** @property starting index of word in sentence */
+	start: number;
+};
+
+export interface SearchSentenceResult { // return value of Search.sentence()
+	words: SearchWord[]; // words picked by the parser, in order of appearance
+	input: string; // the input sentence as passed to Search.sentence()
+};
+
+/** @summary options for Search.sentence() (every field is optional there and falls back to a default) */
+export interface SearchSentenceProps {
+	/** @prop max amount of characters to look ahead when attempting to deconjugate words (default 15) */
+	lookahead: number;
+	/** @prop search bias values, added to a result's sort value */
+	priorityMod: {
+		/** @prop offset for negative bias (default -10) */
+		low: number;
+		/** @prop offset for positive bias (default 10) */
+		high: number;
+	};
+	/** @prop list of breaks treated as delimiter (default: empty list) */
+	breaks: Array<number>;
+};
+
author	lonkaars <loek@pipeframe.xyz>	2023-07-10 16:26:13 +0200
committer	lonkaars <loek@pipeframe.xyz>	2023-07-10 16:26:13 +0200
commit	a3a81530a0a30ba02b5253b762e2ccd77d3b01fc (patch)
tree	afad1bae0c2f7cb9d4a11b6c1c56bc8bae2f14e5 /search
parent	e430d8cb4a30640298b7fae3c93bc6329e2a0382 (diff)