// language/parser.ts

import { Tag, TagGroup } from "./tags.ts";
import { ParseResult, InputSentenceProps, ParseDepth } from "./types.ts";
import DB from "../db/db.ts";
import "../util/array.ts";
import "../util/set.ts";
import { DeepPartial } from "../util/types.ts";

// TODO: rename Parser to Search
/**
 * @summary main Parser class
 *
 * Splits an input sentence into tokens by looking up substrings of the
 * sentence in the term database and picking the best match at each position.
 */
export default class Parser {
	/** term database used for all lookups */
	db: DB;
	/** resolves once the database is ready; rejects if the database fails to become ready */
	ready: Promise<void>;

	constructor() {
		this.db = new DB();

		// Use an async function instead of `new Promise(async resolve => ...)`:
		// with an async executor a rejection of `db.ready` is lost and `ready`
		// stays pending forever, so every `parse()` call would hang.
		this.ready = (async () => {
			await this.db.ready;
		})();
	}

	// Search.sentence()
	/**
	 * Parse a sentence up to the requested depth.
	 *
	 * @param sentence input text to split into tokens
	 * @param optional overrides for the defaults (lookahead 15, depth
	 *   `ParseDepth.Term`, priorityMod high 10 / low 0.1, no breaks)
	 * @returns the parse result for `sentence`
	 */
	async parse(sentence: string, optional?: DeepPartial<InputSentenceProps>): Promise<ParseResult> {
		await this.ready;

		// initialize default options
		const props: InputSentenceProps = {
			lookahead: optional?.lookahead ?? 15,
			depth: optional?.depth ?? ParseDepth.Term,
			priorityMod: {
				high: optional?.priorityMod?.high ?? 10,
				low: optional?.priorityMod?.low ?? 0.1,
			},
			breaks: optional?.breaks ?? [],
		};

		const parseResult = await this.parseTerms(sentence, props);
		if (props.depth <= ParseDepth.Term) return parseResult;

		// deeper than term level: annotate with glossary
		return this.addGlossary(parseResult, props);
	}

	/**
	 * @summary parse sentence into terms with readings
	 *
	 * Walks the sentence from left to right. At each position the next
	 * `options.lookahead` characters are looked up in the database; the
	 * candidates are filtered, re-weighted and sorted, and the top candidate
	 * becomes a token. Positions without any valid candidate are skipped.
	 *
	 * @param sentence input text
	 * @param options fully populated parse options
	 * @returns parse result whose tokens are in order of appearance
	 */
	private async parseTerms(sentence: string, options: InputSentenceProps): Promise<ParseResult> {
		const parseResult: ParseResult = {
			tokens: [],
			depth: ParseDepth.Term,
			input: sentence,
		};

		const lookahead = options.lookahead;

		for (let start = 0; start < sentence.length; start++) {
			const found = await this.db.findTerm(sentence.substring(start, start + lookahead));
			// current starting point did not yield results, try again at next character or until end of input
			if (found.length === 0) continue;

			const results = found.filter(result => {
				// ignore ignored by user terms
				if (result.sort < 0) return false;

				// deconjugated words
				if (result.depth > 0) {
					// check if this word can be conjugated at all
					if (!result.tags.anyOf(TagGroup.Conjugable as string[])) return false;

					// ignore other wrong deconjugations: the word class must be
					// accompanied by the matching inflection reason
					if (result.tags.includes(Tag.Class.Verb.U) &&
							!result.tags.includes(Tag.Inflection.Reason.U)) return false;
					if (result.tags.includes(Tag.Class.Verb.Ru) &&
							!result.tags.includes(Tag.Inflection.Reason.Ru)) return false;
					if (result.tags.includes(Tag.Class.Verb.Suru) &&
							!result.tags.includes(Tag.Inflection.Reason.Suru)) return false;
					if (result.tags.includes(Tag.Class.Adjective.I) &&
							!result.tags.includes(Tag.Inflection.Reason.Adjective.I)) return false;
					if (result.tags.includes(Tag.Class.Adjective.Na) &&
							!result.tags.includes(Tag.Inflection.Reason.Adjective.Na)) return false;
				}

				// all other results should be valid grammatically
				return true;
			});

			// no valid results left after filter, try again at next character or until end of input
			if (results.length === 0) continue;

			// true if last token was a name else false/undefined; the token list does
			// not change while candidates are weighted, so compute it once per position
			const lastTokenName = parseResult.tokens.peek()?.tags.anyOf(Object.values(Tag.Name));

			// bias search results by modifying sort value
			for (const result of results) {
				// give higher priority to suffixes when last token was a name, else lower priority
				if (result.tags.includes(Tag.Class.Suffix))
					result.sort *= lastTokenName ? options.priorityMod.high : options.priorityMod.low;

				// give lower priority to terms matched only by their readings, and are
				// usually written in kanji
				if (!result.tags.includes(Tag.Auxiliary.UsuallyKana) && !result.match.kanji)
					result.sort *= options.priorityMod.low;
			}

			results.sort((a, b) => {
				// sort by original string length (long to short)
				if (a.original.length !== b.original.length) return b.original.length - a.original.length;
				// then by sort index (high to low)
				if (a.sort !== b.sort) return b.sort - a.sort;
				// then by depth (high to low)
				if (a.depth !== b.depth) return b.depth - a.depth;
				// else keep current order (random)
				return 0;
			});

			// pick top result
			const best = results[0];

			parseResult.tokens.push({
				writing: best.expression,
				reading: best.reading,
				tags: best.tags,
				term_id: best.id,
				source: best.original,
				start: start,
			});

			// skip past the matched text; -1 because loop already increments start.
			// Always advance by at least one character so an empty match cannot
			// stall the loop on the same position.
			start += Math.max(best.original.length, 1) - 1;
		}

		return parseResult;
	}

	/**
	 * Glossary annotation stage; currently a placeholder that returns its input unchanged.
	 *
	 * @param input parse result produced by the term stage
	 * @param options parse options (not used yet)
	 * @returns `input`, unmodified
	 */
	private async addGlossary(input: ParseResult, options: InputSentenceProps): Promise<ParseResult> {
		// TODO: annotate input with glossaries from DB
		void options; // prevent unused warning
		return input;
	}
}