aboutsummaryrefslogtreecommitdiff
path: root/search
diff options
context:
space:
mode:
authorlonkaars <loek@pipeframe.xyz>2023-07-10 16:26:13 +0200
committerlonkaars <loek@pipeframe.xyz>2023-07-10 16:26:13 +0200
commita3a81530a0a30ba02b5253b762e2ccd77d3b01fc (patch)
treeafad1bae0c2f7cb9d4a11b6c1c56bc8bae2f14e5 /search
parente430d8cb4a30640298b7fae3c93bc6329e2a0382 (diff)
small restructuring + all deinflection tests working
Diffstat (limited to 'search')
-rw-r--r--search/readme.md53
-rw-r--r--search/search.ts141
-rw-r--r--search/tags.ts232
-rw-r--r--search/types.ts60
4 files changed, 486 insertions, 0 deletions
diff --git a/search/readme.md b/search/readme.md
new file mode 100644
index 0000000..400c8ce
--- /dev/null
+++ b/search/readme.md
@@ -0,0 +1,53 @@
+# Search
+
+This directory contains files that provide an abstracted interface with the
+database for looking up sentences and words.
+
+## Tags
+
+All dictionary entries have tags. Tags are combined from term info, dictionary
+info, and glossary info. Tags can have subcategories separated by `:`. A
+separate tags table handles displaying tags for different display languages,
+including abbreviated versions.
+
+Tags that may alter behavior are stored as constants in [tags.ts](./tags.ts).
+Dictionary importers should map the dictionary-specific version of these tags
+to Yomikun's tags for compatibility. Other tags include:
+
+|tag|description|
+|-|-|
+|`series:*`|Abbreviated series name, e.g. "The Legend of Zelda" is `series:zelda`, and "Tears of the Kingdom" is `series:totk`. Series with multiple entries should split the series and entry into separate tags, e.g. `series:zelda series:totk` instead of `series:zelda_totk`.|
+|`dict:*`|Dictionary tag, e.g. `dict:jmdict_dutch` or `dict:daijisen`.|
+|`pitch:*`|Pitch accent, e.g. `pitch:0` for 平板, `pitch:1` for 頭高, etc.|
+|`aux:*`|Used for other tags (joyo kanji, commonly used term, usually kana, etc.).|
+
+### Behavior-altering tags
+
+Some tag classes impact the parser's behavior. For example, the input text
+「完了しました」 will be parsed as just 「完了」, but with the
+`class:verb:suru-included` tag added by the parser. This is because the word
+「完了」 has the tag `class:verb:suru` in the database, which allows the parser
+to deconjugate a noun with the verb 「する」 back into the stem.
+
+Other uses of this behavior include more accurate automatic kanji reading
+generation, for example 「城」 being read as 「じょう」 in 「ハイラル城」
+because 「ハイラル」 has the tag `name:place` in the database, and
+「城(じょう)」 has `class:suffix`, while 「城(しろ)」 has `class:noun`.
+
+Yomikun encourages homebrew dictionary sharing, and encourages using
+behavior-altering tags for fixing readings for cases like the above examples.
+As another example, a dictionary for a series such as Zelda is encouraged to
+add 「トト」 as a term with tags `class:noun` and `name:place`, instead of
+adding 「トト湖(こ)」 as an expression just to fix the reading of the kanji
+「湖(みずうみ)」.
+
+If Yomikun doesn't generate the correct reading, and the reading isn't based on
+natural language context (=a computer *could* accurately decide which reading
+is correct based on other words/tags in the sentence), please submit a pull
+request with the sentence and its (expected) reading. An example of a
+non-deterministic reading is 「何」 in the sentence 「何できた?」, which can be
+read either as 「なん」, in which case 「何で」 turns into a single word, or as
+「なに」, where 「何」 is a regular word and 「で」 is a particle.
+
+[taekim]: https://guidetojapanese.org/learn/
+
diff --git a/search/search.ts b/search/search.ts
new file mode 100644
index 0000000..0a50773
--- /dev/null
+++ b/search/search.ts
@@ -0,0 +1,141 @@
+import { Tag, TagGroup } from "./tags.ts";
+import { SearchSentenceProps, SearchSentenceResult, SearchTermResult, SearchWord } from "./types.ts";
+import DB from "../db/db.ts";
+import "../util/array.ts";
+import "../util/set.ts";
+import { DeepPartial } from "../util/types.ts";
+
+/**
+ * @summary main Search class
+ *
+ * Owns a database handle and offers two lookups on top of it: `terms()` for
+ * the possible dictionary terms at the start of a string, and `sentence()`
+ * for splitting a whole sentence into words.
+ */
+export default class Search {
+  /** @property database handle used for every term lookup */
+  db: DB;
+  /** @property settles together with `db.ready`; awaited by all public methods */
+  ready: Promise<void>;
+
+  /**
+   * @constant word class tag -> deinflection reason tag that has to accompany it
+   *
+   * NOTE(review): Tag.Class.Verb.Kuru has no entry here, so kuru
+   * deconjugations are not validated against Tag.Inflection.Reason.Kuru --
+   * confirm this is intentional.
+   */
+  private static readonly requiredReason: ReadonlyArray<readonly [string, string]> = [
+    [Tag.Class.Verb.U, Tag.Inflection.Reason.U],
+    [Tag.Class.Verb.Ru, Tag.Inflection.Reason.Ru],
+    [Tag.Class.Verb.Suru, Tag.Inflection.Reason.Suru],
+    [Tag.Class.Adjective.I, Tag.Inflection.Reason.Adjective.I],
+    [Tag.Class.Adjective.Na, Tag.Inflection.Reason.Adjective.Na],
+  ];
+
+  constructor() {
+    this.db = new DB();
+
+    // Chain on the database promise instead of wrapping it in
+    // `new Promise(async ...)`: with an async executor a rejection of
+    // `db.ready` is lost and `ready` would stay pending forever.
+    this.ready = Promise.resolve(this.db.ready).then(() => undefined);
+  }
+
+  /**
+   * @summary find possible terms at start of string by deconjugating
+   *
+   * @param term text to look up; matching is delegated to `DB.findTerm`
+   * @returns every database result that survives validation, mapped to
+   * SearchTermResult (empty array if nothing matched)
+   */
+  public async terms(term: string): Promise<Array<SearchTermResult>> {
+    await this.ready;
+
+    const candidates = await this.db.findTerm(term);
+
+    // skip filtering valid results if there are none
+    if (candidates.length == 0) return [];
+
+    // filter invalid deconjugations/results
+    const valid = candidates.filter(result => {
+      // ignore terms ignored by the user (negative sort value)
+      if (result.sort < 0) return false;
+
+      // deconjugated words
+      if (result.depth > 0) {
+        // check if this word can be conjugated at all
+        if (!result.tags.anyOf(TagGroup.Conjugable as string[])) return false;
+
+        // a word of a given class is only valid if it was deconjugated by
+        // the rule set belonging to that class
+        const reasonsMatch = Search.requiredReason.every(([wordClass, reason]) =>
+          !result.tags.includes(wordClass) || result.tags.includes(reason));
+        if (!reasonsMatch) return false;
+      }
+
+      // all other results should be valid
+      return true;
+    });
+
+    return valid.map(result => ({
+      id: result.id,
+      writing: result.expression,
+      reading: result.reading,
+      tags: result.tags,
+      source: result.original,
+      sort: result.sort,
+      depth: result.depth,
+      match: {
+        reading: result.match.reading,
+        writing: result.match.writing,
+      },
+    }));
+  }
+
+  /**
+   * @summary parse sentence into terms with readings
+   *
+   * Walks the sentence from left to right. At every position the next
+   * `lookahead` characters are looked up with `terms()`; the best candidate
+   * is stored and the position jumps past the text it consumed. Positions
+   * without any candidate are skipped one character at a time.
+   *
+   * @param sentence text to parse
+   * @param optional overrides for the default SearchSentenceProps
+   * @returns the input together with the words that were picked, in order
+   */
+  public async sentence(sentence: string, optional?: DeepPartial<SearchSentenceProps>): Promise<SearchSentenceResult> {
+    await this.ready;
+
+    const props: SearchSentenceProps = {
+      lookahead: optional?.lookahead ?? 15,
+      priorityMod: {
+        high: optional?.priorityMod?.high ?? 10,
+        low: optional?.priorityMod?.low ?? -10,
+      },
+      breaks: optional?.breaks ?? [],
+    };
+
+    const parseResult: SearchSentenceResult = {
+      input: sentence,
+      words: [],
+    };
+
+    for (let start = 0; start < sentence.length; start++) {
+      // TODO: stop at next delimiter (optimization)
+      const term = sentence.substring(start, start + props.lookahead);
+      const results: SearchWord[] = (await this.terms(term)).map(found => ({ ...found, start }));
+
+      // current starting point did not yield results, try again at next character or until end of input
+      if (results.length == 0) continue;
+
+      // true if last token was a name; identical for every candidate at this
+      // position, so it is looked up once
+      const lastTokenName = parseResult.words.peek()?.tags.anyOf(Object.values(Tag.Name));
+
+      // bias search results by modifying sort value
+      for (const result of results) {
+        // give higher priority to suffixes when last token was a name, else lower priority
+        if (result.tags.includes(Tag.Class.Suffix))
+          result.sort += lastTokenName ? props.priorityMod.high : props.priorityMod.low;
+
+        // give lower priority to terms matched only by their readings, and are
+        // usually written in kanji
+        if (!result.tags.includes(Tag.Auxiliary.UsuallyKana) && !result.match.writing)
+          result.sort += props.priorityMod.low;
+      }
+
+      results.sort((a, b) => {
+        // sort by original string length (long to short)
+        if (a.source.length != b.source.length) return b.source.length - a.source.length;
+        // then by sort index (high to low)
+        if (a.sort != b.sort) return b.sort - a.sort;
+        // then by depth (high to low)
+        if (a.depth != b.depth) return b.depth - a.depth;
+        // else keep current order
+        return 0;
+      });
+
+      // pick top result
+      const best = results[0];
+      parseResult.words.push(best);
+
+      // skip the consumed text; -1 because the loop already increments start.
+      // A zero-length source must not move start backwards, or the loop
+      // would never terminate.
+      start += Math.max(best.source.length, 1) - 1;
+    }
+    return parseResult;
+  }
+}
+
diff --git a/search/tags.ts b/search/tags.ts
new file mode 100644
index 0000000..92279c5
--- /dev/null
+++ b/search/tags.ts
@@ -0,0 +1,232 @@
+import "../util/array.ts";
+
+/**
+ * @constant Tags that have significant meaning to the parser
+ *
+ * Nested constant table; every leaf is the literal tag string. Subcategories
+ * inside a tag string are separated by `:`.
+ */
+export const Tag = {
+ /** @constant grammatical classes */
+ Class: {
+ /** @constant verb subgroup */
+ Verb: {
+ /** @constant noun that can be conjugated into a verb by adding する and する itself */
+ Suru: "class:verb:suru",
+ /**
+ * @constant verb stored as conjugated noun in database (nominal verb)
+ *
+ * @deprecated The use of conjugated forms in dictionaries is discouraged.
+ *
+ * This tag is added by the deconjugation code to check for a legal
+ * deconjugation if する has been deconjugated away for a word marked
+ * suru-verb.
+ */
+ SuruIncluded: "class:verb:suru-included",
+ /** @constant 〜う verbs in [taekim] (godan) */
+ U: "class:verb:u",
+ /** @constant 〜る verbs in [taekim] (ichidan) */
+ Ru: "class:verb:ru",
+ /** @constant kuru (来る) */
+ Kuru: "class:verb:kuru",
+ },
+ /** @constant adjective subgroup */
+ Adjective: {
+ /** @constant adjectives that end in 〜い */
+ I: "class:adj:i",
+ /** @constant adjectives that need to be conjugated using な */
+ Na: "class:adj:na",
+ },
+ /** @constant regular nouns or words that can be treated as nouns */
+ Noun: "class:noun",
+ /** @constant terms that are read differently when used as a suffix */
+ Suffix: "class:suffix", // TODO: specify place, honorific, counter suffix types
+ /** @constant grammatical particles (e.g. の, と, は, を, etc.) */
+ Particle: "class:part",
+ /** @constant expressions and idioms
+ *
+ * Can also be used for longer strings that are read in a special way, but
+ * is discouraged.
+ *
+ * @see ./readme.md#behavior-altering-tags
+ */
+ Expression: "class:expr",
+ /** @constant adverbs (e.g. 早く) */
+ Adverb: "class:adverb",
+ },
+ /** @constant types of names */
+ Name: {
+ /** @constant name of a place/location. allows suffixes */
+ Place: "name:place",
+ /** @constant feminine name. allows suffixes and honorifics */
+ Female: "name:female",
+ /** @constant masculine name. allows suffixes and honorifics */
+ Male: "name:male",
+ },
+ /**
+ * @constant added to a word when deconjugated by the deinflection table
+ *
+ * Some inflections are used as steps in others, like the -tari suffix which
+ * is conjugated after the past tense. In this case, the past tense tag would
+ * be removed when it comes after the -tari tag. (see ../util/string.ts)
+ *
+ * e.g. 来ない -> 来る [infl:negative]
+ */
+ Inflection: {
+ /**
+ * @constant affirmative conjugations
+ *
+ * This conjugation should not be added by any deconjugation rules, but is
+ * calculated based on the amount of negations. Even counts of negative
+ * inflections (including 0) add this tag, while odd counts don't add this
+ * tag.
+ */
+ Affirmative: "infl:affirmative",
+ /** @constant negative conjugations */
+ Negative: "infl:negative",
+ /** @constant time-related conjugations */
+ Tense: {
+ /** @constant past tense (e.g. 叩いた) */
+ Past: "infl:tense:past",
+ /** @constant continuous tense (e.g. 喋っている) */
+ Continuous: "infl:tense:cont",
+ },
+ /** @constant adverbial conjugation (e.g. 早く) */
+ Adverb: "infl:adverb",
+ /** @constant polite conjugations */
+ Polite: {
+ /** @constant 丁寧語 〜ます conjugations (e.g. 食べました) */
+ Masu: "infl:polite:masu",
+ /** @constant 〜なさい conjugations (e.g. 座りなさい) */
+ Nasai: "infl:polite:nasai",
+ },
+ /** @constant common ending conjugations */
+ Suffix: {
+ /** @constant -te ending (e.g. 売って) */
+ Te: "infl:suffix:te",
+ /** @constant -tari ending (e.g. 遊んだり) */
+ Tari: "infl:suffix:tari",
+ },
+ /** @constant internal deinflection rules */
+ Reason: {
+ /** @constant applied if word was deconjugated as -ru (ichidan) verb */
+ Ru: "infl:reason:ru",
+ /** @constant applied if word was deconjugated as -u (godan) verb */
+ U: "infl:reason:u",
+ /** @constant applied if word was deconjugated as suru verb */
+ Suru: "infl:reason:suru",
+ /** @constant applied if word was deconjugated as kuru verb */
+ Kuru: "infl:reason:kuru",
+ /** @constant applied if word was deconjugated as an adjective */
+ Adjective: {
+ /** @constant applied if word was deconjugated as i-adjective */
+ I: "infl:reason:adj:i",
+ /** @constant applied if word was deconjugated as na-adjective */
+ Na: "infl:reason:adj:na",
+ },
+ },
+ /** @constant makes a verb usable without specifying who carries it out (e.g. 言われる) */
+ Passive: "infl:passive",
+ /** @constant indicates that a verb *can* happen (e.g. 落ちられる) */
+ Potential: "infl:potential",
+ /** @constant indicates that someone makes a verb happen (e.g. 食べさせる) */
+ Causative: "infl:causative",
+ /** @constant imperative form (e.g. 聞け) */
+ Command: "infl:command",
+ /** @constant conditional forms */
+ Conditional: {
+ /** @constant -ba ending (e.g. 泳げれば) */
+ Ba: "infl:cond:ba",
+ /** @constant -ra ending (e.g. 取ったら) */
+ Ra: "infl:cond:ra",
+ },
+ /** @constant makes a verb obligatory (e.g. 入ってはいけない) */
+ Obligatory: "infl:must",
+ /** @constant verbs that someone wants to do / be done */
+ Desirable: {
+ /** @constant 〜たい endings (e.g. 買いたい) */
+ Itai: "infl:desire:itai",
+ /** @constant 〜おう endings (e.g. 寝よう) */
+ Volitional: "infl:desire:volitional",
+ },
+ /** @constant makes a verb an attempt */
+ Attempt: {
+ /** @constant 〜みる to try something out (e.g. 飲んでみた) */
+ Miru: "infl:attempt:miru",
+ /** @constant 〜とする attempts (e.g. 入ろうとしている) */
+ ToSuru: "infl:attempt:tosuru",
+ },
+ /** @constant temporary tags (removed by parseTags) */
+ Temporary: {
+ /** @constant particle of obligatory conjugation (e.g. 行かない*と*だめ), or colloquial abbreviation */
+ ObligatoryParticle: "infl:tmp:must:prt",
+ /** @constant resulting action part of obligatory conjugation (e.g. 行かないと*だめ*) */
+ ObligatoryResult: "infl:tmp:must:res",
+ },
+ },
+ /** @constant uncategorized tags */
+ Auxiliary: {
+ /** @constant word usually written using only kana (but also has kanji) */
+ UsuallyKana: "aux:uk",
+ },
+} as const;
+
+/** @constant named collections of tags from Tag that are checked together */
+export const TagGroup = {
+ /**
+ * @constant array that contains all tags of word classes that can be conjugated
+ *
+ * Built from every value in Tag.Class.Verb followed by every value in
+ * Tag.Class.Adjective.
+ */
+ Conjugable: [
+ ...Object.values(Tag.Class.Verb),
+ ...Object.values(Tag.Class.Adjective),
+ ],
+} as const;
+
+/**
+ * @summary a single tag
+ *
+ * Plain string rather than a union of the constants in Tag: no way around it
+ */
+export type TokenTag = string; // no way around it
+
+/** @summary list of tags attached to a term */
+export type TokenTags = Array<TokenTag>;
+
+/**
+ * @summary parse concatenated tag string to TokenTags
+ *
+ * Splits a space-separated tag string, drops inflection tags that were only
+ * intermediate steps towards another inflection, collapses all
+ * Tag.Inflection.Negative tags into a single polarity tag, removes the
+ * Tag.Inflection.Temporary tags and removes duplicates.
+ *
+ * A step tag is only dropped when the tag that supersedes it occurs earlier
+ * in `input`, so the order of the tags matters.
+ *
+ * @param input tags separated by one or more spaces; may be empty
+ * @returns the remaining tags plus Tag.Inflection.Affirmative or
+ * Tag.Inflection.Negative, derived from the number of negations
+ */
+export function parseTags(input: string): TokenTags {
+  // Split on runs of spaces and drop empty fragments: "".split() yields [""],
+  // which would otherwise end up in the result as an empty tag.
+  const tags = input.trim().split(/ +/).filter(tag => tag.length > 0) as TokenTag[];
+  let filteredTags: TokenTag[] = [];
+  let negationCount = 0;
+
+  for (const tag of tags) {
+    // conjugations that are used as "stepping stones" for others are
+    // filtered in this loop. checking if a combination of tags is valid is
+    // not done here
+
+    // skip past tense tag if used as step for -te and -tari inflection
+    if (tag == Tag.Inflection.Tense.Past &&
+        filteredTags.anyOf([Tag.Inflection.Suffix.Te, Tag.Inflection.Suffix.Tari])) continue;
+
+    // skip -te suffix tag if it was only the base for one of these
+    if (tag == Tag.Inflection.Suffix.Te && filteredTags.anyOf([
+      Tag.Inflection.Tense.Continuous, // base for continuous tense
+      Tag.Inflection.Obligatory, // base for obligatory inflection
+      Tag.Inflection.Attempt.Miru, // base for 〜みる attempt
+    ])) continue;
+
+    // skip volitional tag if used for 〜とする attempt
+    if (tag == Tag.Inflection.Desirable.Volitional &&
+        filteredTags.anyOf([Tag.Inflection.Attempt.ToSuru])) continue;
+
+    // skip conditional 〜ば if used for obligatory inflection
+    if (tag == Tag.Inflection.Conditional.Ba &&
+        filteredTags.anyOf([Tag.Inflection.Obligatory])) continue;
+
+    // negations are only counted here; the single resulting polarity tag is
+    // appended after the loop
+    if (tag == Tag.Inflection.Negative) {
+      negationCount++;
+      continue;
+    }
+
+    filteredTags.push(tag);
+  }
+
+  // negative + と without resulting action = implicit affirmative obligatory
+  if (filteredTags.includes(Tag.Inflection.Temporary.ObligatoryParticle) &&
+      !filteredTags.includes(Tag.Inflection.Temporary.ObligatoryResult)) {
+    negationCount = 0; // -> make resulting tags affirmative
+  }
+
+  // normalize affirmative/negative: an even number of negations cancels out
+  filteredTags.push(negationCount % 2 == 0 ? Tag.Inflection.Affirmative : Tag.Inflection.Negative);
+
+  // filter any remaining temporary tags
+  const temporaryTags: readonly string[] = Object.values(Tag.Inflection.Temporary);
+  filteredTags = filteredTags.filter(tag => !temporaryTags.includes(tag));
+
+  // filter any duplicates
+  return filteredTags.set().arr() as TokenTags;
+}
+
diff --git a/search/types.ts b/search/types.ts
new file mode 100644
index 0000000..d90afd6
--- /dev/null
+++ b/search/types.ts
@@ -0,0 +1,60 @@
+import { TokenTags } from "./tags.ts";
+
+/** @summary single definition inside a glossary (placeholder, no fields declared yet) */
+export interface SearchGlossaryDefinition {
+
+};
+
+/** @summary glossary with its list of definitions */
+export interface SearchGlossary {
+ /** @property glossary id -- presumably the database id, TODO confirm */
+ id: number;
+ /** @property definitions that belong to this glossary */
+ definitions: SearchGlossaryDefinition[];
+};
+
+/** @summary single dictionary term found by a term lookup */
+export interface SearchTermResult {
+ /** @property dictionary term id */
+ id: number;
+ /** @property (preferably) kanji writing of term */
+ writing: string;
+ /** @property kana-only reading of term */
+ reading: string;
+ /** @property word tags including deconjugation tags */
+ tags: TokenTags;
+ /** @property original conjugated string */
+ source: string;
+ /** @property numeric sorting value for term */
+ sort: number;
+ /** @property amount of steps that were needed to deconjugate */
+ depth: number;
+ /** @property matching results */
+ match: {
+ /** @property term matched by writing */
+ writing: boolean;
+ /** @property term matched by reading */
+ reading: boolean;
+ }
+};
+
+/** @summary term result that also records where in a sentence it was found */
+export interface SearchWord extends SearchTermResult {
+ /** @property starting index of word in sentence */
+ start: number;
+};
+
+/** @summary result of parsing a sentence into words */
+export interface SearchSentenceResult {
+ /** @property words found in the sentence */
+ words: SearchWord[];
+ /** @property the sentence that was parsed */
+ input: string;
+};
+
+/** @summary options for Search.sentence() */
+export interface SearchSentenceProps {
+ /** @prop max amount of characters to look ahead when attempting to deconjugate words */
+ lookahead: number;
+ /** @prop search bias values (added to a term's sort value) */
+ priorityMod: {
+ /** @prop offset for negative bias */
+ low: number;
+ /** @prop offset for positive bias */
+ high: number;
+ };
+ /** @prop list of breaks treated as delimiter */
+ breaks: Array<number>;
+};
+