1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
|
import { Tag, TagGroup } from "./tags.ts";
import { ParseResult, InputSentenceProps, ParseDepth } from "./types.ts";
import DB from "../db/db.ts";
import "../util/array.ts";
import "../util/set.ts";
import { DeepPartial } from "../util/types.ts";
// TODO: rename Parser to Search
/**
 * @summary main Parser class
 *
 * Splits an input sentence into tokens by looking up substrings of the
 * sentence in the database, and (eventually) annotates those tokens with
 * glossary data.
 */
export default class Parser {
  /** Database handle used for all term lookups. */
  db: DB;
  /**
   * Settles once the database reports that it is ready. Rejects if the
   * database's own `ready` rejects.
   */
  ready: Promise<void>;

  constructor() {
    this.db = new DB();
    // An async IIFE (rather than `new Promise(async resolve => ...)`) so that
    // a rejected `db.ready` rejects `ready` instead of leaving it pending
    // forever, which would make every `parse()` call hang.
    this.ready = (async () => {
      await this.db.ready;
    })();
  }

  // Search.sentence()
  /**
   * @summary parse a sentence up to the requested depth
   * @param sentence input text to parse
   * @param optional overrides for the default options; defaults are
   *   `lookahead` 15, `depth` `ParseDepth.Term`, `priorityMod.high` 10,
   *   `priorityMod.low` 0.1 and `breaks` `[]`
   * @returns the parse result; glossary annotation is only attempted when the
   *   requested depth is greater than `ParseDepth.Term`
   */
  async parse(sentence: string, optional?: DeepPartial<InputSentenceProps>): Promise<ParseResult> {
    await this.ready;

    // initialize default options
    const props: InputSentenceProps = {
      lookahead: optional?.lookahead ?? 15,
      depth: optional?.depth ?? ParseDepth.Term,
      priorityMod: {
        high: optional?.priorityMod?.high ?? 10,
        low: optional?.priorityMod?.low ?? 0.1,
      },
      breaks: optional?.breaks ?? [],
    };

    const terms = await this.parseTerms(sentence, props);
    if (props.depth <= ParseDepth.Term) return terms;

    return this.addGlossary(terms, props);
  }

  /**
   * @summary parse sentence into terms with readings
   * @param sentence input text to tokenize
   * @param options `lookahead` limits how many characters are handed to the
   *   database per lookup; `priorityMod` supplies the multipliers used to bias
   *   the sort value of candidates
   * @returns a result whose `tokens` are listed in order of their `start`
   *   offset; positions that yield no usable candidate produce no token
   */
  private async parseTerms(sentence: string, options: InputSentenceProps): Promise<ParseResult> {
    const parseResult: ParseResult = {
      tokens: [],
      depth: ParseDepth.Term,
      input: sentence,
    };
    const lookahead = options.lookahead;

    // Lookups are awaited one at a time because the next start position
    // depends on the length of the match chosen at the current one.
    for (let start = 0; start < sentence.length; start++) {
      const found = await this.db.findTerm(sentence.substring(start, start + lookahead));
      // current starting point did not yield results, try again at next character or until end of input
      if (found.length === 0) continue;

      const results = found.filter(result => {
        // ignore ignored by user terms
        if (result.sort < 0) return false;
        // deconjugated words
        if (result.depth > 0) {
          // check if this word can be conjugated at all
          if (!result.tags.anyOf(TagGroup.Conjugable as string[])) return false;
          // ignore other wrong deconjugations: the word class must be paired
          // with the inflection reason of the same class
          if (result.tags.includes(Tag.Class.Verb.U) &&
            !result.tags.includes(Tag.Inflection.Reason.U)) return false;
          if (result.tags.includes(Tag.Class.Verb.Ru) &&
            !result.tags.includes(Tag.Inflection.Reason.Ru)) return false;
          if (result.tags.includes(Tag.Class.Verb.Suru) &&
            !result.tags.includes(Tag.Inflection.Reason.Suru)) return false;
          if (result.tags.includes(Tag.Class.Adjective.I) &&
            !result.tags.includes(Tag.Inflection.Reason.Adjective.I)) return false;
          if (result.tags.includes(Tag.Class.Adjective.Na) &&
            !result.tags.includes(Tag.Inflection.Reason.Adjective.Na)) return false;
        }
        // all other results should be valid grammatically
        return true;
      });
      // no valid results left after filter, try again at next character or until end of input
      if (results.length === 0) continue;

      // true if last token was a name else false; computed once per position
      // because the token list does not change while candidates are ranked
      const lastTokenName = parseResult.tokens.peek()?.tags.anyOf(Object.values(Tag.Name));

      // bias search results by modifying sort value
      for (const result of results) {
        // give higher priority to suffixes when last token was a name, else lower priority
        if (result.tags.includes(Tag.Class.Suffix))
          result.sort *= lastTokenName ? options.priorityMod.high : options.priorityMod.low;
        // give lower priority to terms matched only by their readings, and are
        // usually written in kanji
        if (!result.tags.includes(Tag.Auxiliary.UsuallyKana) && !result.match.kanji)
          result.sort *= options.priorityMod.low;
      }

      results.sort((a, b) => {
        // sort by original string length (long to short)
        if (a.original.length !== b.original.length) return b.original.length - a.original.length;
        // then by sort index (high to low)
        if (a.sort !== b.sort) return b.sort - a.sort;
        // then by depth (high to low)
        if (a.depth !== b.depth) return b.depth - a.depth;
        // else leave the relative order to the sort implementation
        return 0;
      });

      // pick top result
      const result = results[0];
      parseResult.tokens.push({
        writing: result.expression,
        reading: result.reading,
        tags: result.tags,
        term_id: result.id,
        source: result.original,
        start: start,
      });

      // Skip past the matched text. -1 because the loop already increments
      // start; the max() guard keeps the loop moving forward even if a match
      // were ever to report an empty `original`.
      start += Math.max(result.original.length, 1) - 1;
    }

    return parseResult;
  }

  /**
   * @summary placeholder for glossary annotation
   * @param input parse result to annotate
   * @param options parse options (currently unused)
   * @returns `input`, unchanged, until glossary lookup is implemented
   */
  private async addGlossary(input: ParseResult, options: InputSentenceProps): Promise<ParseResult> {
    // TODO: annotate input with glossaries from DB
    void options; // prevent unused warning
    return input;
  }
}
|