import { TokenTags, TokenTag, Tag } from "../language/tags.ts";
import JapaneseString from "../language/japanese.ts";

declare global {
  /** @summary extended String prototype functions */
  interface String {
    range(): UnicodeRange;
    rangeTally(): RangeTally;
    jp(): JapaneseString;
    parseTags(): TokenTags;
  }
}

/** @summary coarse character classes returned by String.prototype.range */
export enum UnicodeRange {
  BasicLatin = "latin",
  Whitespace = "any-whitespace",
  Punctuation = "any-punctuation",
  Unknown = "any-unknown",
  JapanesePunctuation = "jp-punctuation",
  JapaneseHiragana = "jp-hiragana",
  JapaneseKatakana = "jp-katakana",
  JapaneseFWLatinHWKatakana = "jp-full-width-latin-half-width-katakana",
  JapaneseKanji = "jp-kanji",
}

/** @summary number of characters seen per UnicodeRange (keyed by the enum's string values) */
type RangeTally = Record<UnicodeRange, number>;

/**
 * @summary get UnicodeRange for character at index 0
 * @returns the range of the first UTF-16 code unit; UnicodeRange.Unknown for
 *          an empty string or for any code unit outside the listed ranges
 */
String.prototype.range = function (): UnicodeRange {
  // NaN for an empty string: every comparison below is false, so Unknown is returned.
  const code = this.charCodeAt(0);

  // Specific ASCII characters are classified before the generic Basic Latin range.
  if (code === 0x09) return UnicodeRange.Whitespace; // tab
  if (code === 0x20) return UnicodeRange.Whitespace; // space
  if (code === 0x21) return UnicodeRange.Punctuation; // exclamation mark
  if (code === 0x2e) return UnicodeRange.Punctuation; // full stop
  if (code === 0x3f) return UnicodeRange.Punctuation; // question mark

  if (0x0000 <= code && code <= 0x007f) return UnicodeRange.BasicLatin;
  if (0x3000 <= code && code <= 0x303f) return UnicodeRange.JapanesePunctuation;
  if (0x3040 <= code && code <= 0x309f) return UnicodeRange.JapaneseHiragana;
  if (0x30a0 <= code && code <= 0x30ff) return UnicodeRange.JapaneseKatakana;
  if (0xff00 <= code && code <= 0xffef) return UnicodeRange.JapaneseFWLatinHWKatakana;
  if (0x4e00 <= code && code <= 0x9faf) return UnicodeRange.JapaneseKanji;

  return UnicodeRange.Unknown;
};

/**
 * @summary create a RangeTally object for counting used unicode ranges in string
 * @returns an object with one counter per UnicodeRange value; ranges that do
 *          not occur in the string are present with a count of 0
 */
String.prototype.rangeTally = function (): RangeTally {
  // Seed the counters from the enum VALUES: range() returns values such as
  // "jp-kanji", not member names such as "JapaneseKanji". Seeding from the
  // member names left every real counter undefined, so incrementing gave NaN.
  const tally = Object.fromEntries(
    Object.values(UnicodeRange).map((range) => [range, 0]),
  ) as RangeTally;

  // for...of walks the string by code point; each piece is classified by its
  // first UTF-16 code unit.
  for (const char of this) tally[char.range()]++;

  return tally;
};

/** @summary get JapaneseString from this string */
String.prototype.jp = function () {
  return new JapaneseString(this);
};

/**
 * @summary parse concatenated tag string to TokenTags
 * @returns the set of space-separated tags in this string; an empty or
 *          whitespace-only string yields an empty set
 */
String.prototype.parseTags = function (): TokenTags {
  // Tags are separated by one or more spaces. Empty tokens are dropped so that
  // an empty input does not produce a bogus "" tag.
  const tags = this.trim()
    .split(/ +/)
    .filter((tag) => tag.length > 0) as TokenTag[];

  const filteredTags: TokenTag[] = [];
  for (const tag of tags) {
    // skip past tense tags after -te and -tari deinflection
    // NOTE(review): anyOf is not a built-in Array method; presumably an Array
    // prototype extension declared elsewhere in the project — confirm.
    if (
      tag === Tag.Inflection.Tense.Past &&
      filteredTags.anyOf([Tag.Inflection.Suffix.Te, Tag.Inflection.Suffix.Tari])
    ) {
      continue;
    }
    filteredTags.push(tag);
  }

  return new Set(filteredTags) as TokenTags;
};