import { Tag, TagGroup } from "./tags.ts";
import { ParseResult, InputSentenceProps, ParseDepth } from "./types.ts";
import DB from "../db/db.ts";
import "../util/array.ts";
import "../util/set.ts";
import { DeepPartial } from "../util/types.ts";

// TODO: rename Parser to Search
/**
 * @summary main Parser class
 *
 * Splits an input sentence into tokens by repeatedly looking up the text at
 * the current position in the term database and keeping the best candidate.
 *
 * NOTE(review): `peek()` and `anyOf()` are not standard Array methods; they are
 * presumably installed by the side-effect imports of `../util/array.ts` and
 * `../util/set.ts` -- confirm before removing either import.
 */
export default class Parser {
  db: DB;
  /** Settles with the outcome of `db.ready`; awaited by `parse()` before any lookup. */
  ready: Promise<void>;

  constructor() {
    this.db = new DB();
    // An async function (instead of `new Promise(async resolve => ...)`) lets a
    // failure of `db.ready` reject `ready` rather than leaving it pending forever.
    this.ready = (async () => {
      await this.db.ready;
    })();
  }

  // Search.sentence()
  /**
   * @summary parse a sentence into tokens
   * @param sentence text to parse
   * @param optional overrides for the default options; every omitted field
   *   falls back to the default assigned below
   * @returns the parse result; only terms are parsed when the requested depth
   *   is `ParseDepth.Term` or lower, otherwise the glossary step runs as well
   */
  async parse(sentence: string, optional?: DeepPartial<InputSentenceProps>): Promise<ParseResult> {
    await this.ready;

    // initialize default options
    const props: InputSentenceProps = {
      lookahead: optional?.lookahead ?? 15,
      depth: optional?.depth ?? ParseDepth.Term,
      priorityMod: {
        high: optional?.priorityMod?.high ?? 10,
        low: optional?.priorityMod?.low ?? -10,
      },
      breaks: optional?.breaks ?? [],
    };

    const parseResult = await this.parseTerms(sentence, props);
    if (props.depth <= ParseDepth.Term) return parseResult;

    return this.addGlossary(parseResult, props);
  }

  /**
   * @summary parse sentence into terms with readings
   * @param sentence text to tokenize
   * @param options fully populated options (`lookahead` and `priorityMod` are used here)
   * @returns a result with one token per accepted match; positions that yield
   *   no valid match are skipped one character at a time
   */
  private async parseTerms(sentence: string, options: InputSentenceProps): Promise<ParseResult> {
    const parseResult: ParseResult = {
      tokens: [],
      depth: ParseDepth.Term,
      input: sentence,
    };

    for (let start = 0; start < sentence.length; start++) {
      let results = await this.db.findTerm(sentence.substring(start, start + options.lookahead));

      // current starting point did not yield results, try again at next character or until end of input
      if (results.length === 0) continue;

      results = results.filter(result => {
        // ignore ignored by user terms
        if (result.sort < 0) return false;

        // deconjugated words
        if (result.depth > 0) {
          // check if this word can be conjugated at all
          if (!result.tags.anyOf(TagGroup.Conjugable as string[])) return false;

          // ignore other wrong deconjugations
          if (result.tags.includes(Tag.Class.Verb.U) && !result.tags.includes(Tag.Inflection.Reason.U)) return false;
          if (result.tags.includes(Tag.Class.Verb.Ru) && !result.tags.includes(Tag.Inflection.Reason.Ru)) return false;
          if (result.tags.includes(Tag.Class.Verb.Suru) && !result.tags.includes(Tag.Inflection.Reason.Suru)) return false;
          if (result.tags.includes(Tag.Class.Adjective.I) && !result.tags.includes(Tag.Inflection.Reason.Adjective.I)) return false;
          if (result.tags.includes(Tag.Class.Adjective.Na) && !result.tags.includes(Tag.Inflection.Reason.Adjective.Na)) return false;
        }

        // all other results should be valid grammatically
        return true;
      });

      // no valid results left after filter, try again at next character or until end of input
      if (results.length === 0) continue;

      // true if last token was a name else false; the token list does not
      // change while candidates are scored, so look it up once per position
      const lastTokenName = parseResult.tokens.peek()?.tags.anyOf(Object.values(Tag.Name));

      // bias search results by modifying sort value
      for (const result of results) {
        // give higher priority to suffixes when last token was a name, else lower priority
        if (result.tags.includes(Tag.Class.Suffix))
          result.sort += lastTokenName ? options.priorityMod.high : options.priorityMod.low;

        // give lower priority to terms matched only by their readings, and are
        // usually written in kanji
        if (!result.tags.includes(Tag.Auxiliary.UsuallyKana) && !result.match.kanji)
          result.sort += options.priorityMod.low;
      }

      results.sort((a, b) => {
        // sort by original string length (long to short)
        if (a.original.length !== b.original.length) return b.original.length - a.original.length;
        // then by sort index (high to low)
        if (a.sort !== b.sort) return b.sort - a.sort;
        // then by depth (high to low)
        if (a.depth !== b.depth) return b.depth - a.depth;
        // else keep current order (random)
        return 0;
      });

      // pick top result
      const result = results[0];

      parseResult.tokens.push({
        writing: result.expression,
        reading: result.reading,
        tags: result.tags,
        term_id: result.id,
        source: result.original,
        start: start,
      });

      // -1 because loop already increments start; consume at least one
      // character so an empty match can never stall the loop
      start += Math.max(result.original.length, 1) - 1;
    }

    return parseResult;
  }

  /**
   * @summary placeholder for the glossary step; currently returns `input` unchanged
   * @param input parse result produced by `parseTerms`
   * @param options parse options (not used yet)
   */
  private async addGlossary(input: ParseResult, options: InputSentenceProps): Promise<ParseResult> {
    // TODO: annotate input with glossaries from DB
    void options; // prevent unused warning
    return input;
  }
}