import { Tag } from "./tags.ts";
import { ParseResult, InputSentenceProps, ParseDepth } from "./types.ts";
import DB from "../db/db.ts";
import "../util/array.ts";
import "../util/set.ts";

/**
 * @summary main Parser class
 *
 * Splits an input sentence into tokens by repeatedly looking up the text at
 * the current position in the term database and keeping the best candidate.
 * `prepare()` must be awaited before `parse()` so the database is ready.
 */
export default class Parser {
  db: DB;

  constructor() {
    this.db = new DB();
  }

  /**
   * @summary prepare every resource the parser depends on
   *
   * Currently this only awaits `db.prepare()`; the `Promise.all` wrapper is
   * kept so further resources can be prepared concurrently.
   */
  async prepare(): Promise<void> {
    await Promise.all([
      this.db.prepare(),
    ]);
  }

  /**
   * @summary parse a sentence up to the requested depth
   *
   * @param sentence - raw input text
   * @param options - optional parser settings; `options.depth` defaults to
   *   `ParseDepth.Term` when absent
   * @returns the term-level parse when the requested depth is at most
   *   `ParseDepth.Term`, otherwise that parse passed through `addGlossary`
   */
  parse(sentence: string, options?: InputSentenceProps): ParseResult {
    const depth = options?.depth ?? ParseDepth.Term;

    const parseResult = this.parseTerms(sentence, options);
    if (depth <= ParseDepth.Term) return parseResult;

    // The original repeated the depth check above after this step; that
    // second check could never be true here, so it has been removed.
    return this.addGlossary(parseResult, options);
  }

  /**
   * @summary parse sentence into terms with readings
   *
   * Walks the sentence left to right. At each position up to
   * `options.lookahead` (default 15) characters are looked up with
   * `db.findTerm`; invalid candidates are dropped, the `sort` value of the
   * remaining ones is biased, and the best candidate becomes the next token.
   * Positions that yield no valid candidate are skipped one character at a
   * time and produce no token.
   *
   * @param sentence - raw input text
   * @param options - optional `lookahead` and `priorityMod.high` (default 10)
   *   / `priorityMod.low` (default 0.1) multipliers
   * @returns parse result with `depth` set to `ParseDepth.Term`
   */
  private parseTerms(sentence: string, options?: InputSentenceProps): ParseResult {
    const MAX_LOOKAHEAD = options?.lookahead ?? 15;
    const PRIORITY_MOD_HIGHER = options?.priorityMod?.high ?? 10;
    const PRIORITY_MOD_LOWER = options?.priorityMod?.low ?? 0.1;

    // tag lists that do not change between iterations
    const verbClasses = Object.values(Tag.Class.Verb);
    const nameTags = Object.values(Tag.Name);

    const parseResult: ParseResult = {
      tokens: [],
      depth: ParseDepth.Term,
    };

    for (let start = 0; start < sentence.length; start++) {
      const candidates = this.db
        .findTerm(sentence.substring(start, start + MAX_LOOKAHEAD))
        .filter(result => {
          // ignore ignored by user terms
          if (result.sort < 0) return false;

          // deconjugated words
          if (result.depth > 0) {
            // can't be conjugated at all
            if (!result.tags.anyOf(verbClasses)) return false;

            // ignore other wrong deconjugations
            if (result.tags.includes(Tag.Class.Verb.U) && !result.tags.includes(Tag.Inflection.Reason.U)) return false;
            if (result.tags.includes(Tag.Class.Verb.Ru) && !result.tags.includes(Tag.Inflection.Reason.Ru)) return false;
            if (result.tags.includes(Tag.Class.Verb.Suru) && !result.tags.includes(Tag.Inflection.Reason.Suru)) return false;
          }

          // all other results should be valid grammatically
          return true;
        });

      // nothing found (or nothing valid left after the filter), try again at
      // next character or until end of input
      if (candidates.length === 0) continue;

      // true if last token was a name, else false/undefined; identical for
      // every candidate of this iteration, so it is computed once
      const lastTokenName = parseResult.tokens.peek()?.tags.anyOf(nameTags);

      // bias search results by modifying sort value
      // NOTE(review): this mutates the objects returned by db.findTerm in
      // place (as the original did) — confirm findTerm hands out fresh objects.
      candidates.forEach(result => {
        // give higher priority to suffixes when last token was a name, else lower priority
        if (result.tags.includes(Tag.Class.Suffix))
          result.sort *= lastTokenName ? PRIORITY_MOD_HIGHER : PRIORITY_MOD_LOWER;

        // give lower priority to terms matched only by their readings, and are
        // usually written in kanji
        if (!result.tags.includes(Tag.Auxiliary.UsuallyKana) && !result.match.kanji)
          result.sort *= PRIORITY_MOD_LOWER;
      });

      candidates.sort((a, b) => {
        // sort by original string length (long to short)
        if (a.original.length !== b.original.length) return b.original.length - a.original.length;
        // then by sort index (high to low)
        if (a.sort !== b.sort) return b.sort - a.sort;
        // then by depth (high to low)
        if (a.depth !== b.depth) return b.depth - a.depth;
        // else keep relative order (Array.prototype.sort is stable)
        return 0;
      });

      // pick top result
      const result = candidates[0];

      parseResult.tokens.push({
        reading: [ {"text": result.expression, "ruby": result.reading} ], // TODO: source to reading + separate kanji/kana
        tags: result.tags,
        term_id: result.id,
        source: result.original,
      });

      // Skip past the matched text; -1 because the loop already increments
      // start. Clamped to at least one character so a match with an empty
      // `original` cannot keep the loop on the same position forever.
      start += Math.max(result.original.length, 1) - 1;
    }

    return parseResult;
  }

  /**
   * @summary annotate a parse result with glossaries (not implemented yet)
   *
   * @param input - term-level parse result
   * @param options - currently unused
   * @returns `input`, unchanged
   */
  private addGlossary(input: ParseResult, options?: InputSentenceProps): ParseResult {
    // TODO: annotate input with glossaries from DB
    void options; // prevent unused warning
    return input;
  }
}