1 files changed, 74 insertions, 0 deletions
diff --git a/util/string.ts b/util/string.ts
new file mode 100644
index 0000000..d94f5a3
--- /dev/null
+++ b/util/string.ts
@@ -0,0 +1,74 @@
+import { TokenTags, TokenTag, Tag } from "../language/tags.ts";
+import JapaneseString from "../language/japanese.ts";
+
+declare global {
+	/** @summary methods this module installs on String.prototype (the implementations are assigned further down in this file) */
+	interface String {
+		jp(): JapaneseString; // wraps this string in a JapaneseString
+
+		parseTags(): TokenTags; // space-separated tag string -> TokenTags
+
+		rangeTally(): RangeTally; // per-UnicodeRange character counts for the whole string
+		range(): UnicodeRange; // UnicodeRange of the character at index 0
+	}
+}
+
+export enum UnicodeRange { // character classes reported by String.prototype.range(); the string values are also the keys of RangeTally
+	BasicLatin = "latin", // U+0000-U+007F, minus the code points reported as Whitespace/Punctuation
+	Whitespace = "any-whitespace", // tab (U+0009) and space (U+0020) only
+	Punctuation = "any-punctuation", // "!" (U+0021), "." (U+002E) and "?" (U+003F) only
+	Unknown = "any-unknown", // fallback when no other range matches
+	JapanesePunctuation = "jp-punctuation", // U+3000-U+303F
+	JapaneseHiragana = "jp-hiragana", // U+3040-U+309F
+	JapaneseKatakana = "jp-katakana", // U+30A0-U+30FF
+	JapaneseFWLatinHWKatakana = "jp-full-width-latin-half-width-katakana", // U+FF00-U+FFEF
+	JapaneseKanji = "jp-kanji", // U+4E00-U+9FAF
+}
+
+type RangeTally = Record<UnicodeRange, number>; // one numeric counter per UnicodeRange value
+
+/** @summary classify the character at index 0 into a UnicodeRange (Unknown when nothing matches, which includes the empty string) */
+String.prototype.range = function() {
+	const unit = this.charCodeAt(0); // UTF-16 code unit; NaN for an empty string, which matches no case or range below
+	const within = (lo: number, hi: number) => lo <= unit && unit <= hi; // inclusive on both ends
+
+	// single code points carved out of the Basic Latin block; they must be tested before the block itself
+	switch (unit) {
+		case 0x09: case 0x20: return UnicodeRange.Whitespace; // tab, space
+		case 0x21: case 0x2e: case 0x3f: return UnicodeRange.Punctuation; // exclamation mark, full stop, question mark
+	}
+
+	if (within(0x0000, 0x007f)) return UnicodeRange.BasicLatin;
+	if (within(0x3000, 0x303f)) return UnicodeRange.JapanesePunctuation;
+	if (within(0x3040, 0x309f)) return UnicodeRange.JapaneseHiragana;
+	if (within(0x30a0, 0x30ff)) return UnicodeRange.JapaneseKatakana;
+	if (within(0xff00, 0xffef)) return UnicodeRange.JapaneseFWLatinHWKatakana;
+	return within(0x4e00, 0x9faf) ? UnicodeRange.JapaneseKanji : UnicodeRange.Unknown;
+}
+
+/** @summary count how many characters of this string fall in each UnicodeRange; every range is present in the result, starting from 0 */
+String.prototype.rangeTally = function() {
+	const tally = Object.values(UnicodeRange).reduce((acc, range) => (acc[range] = 0, acc), {} as RangeTally); // seed by enum VALUE ("latin", ...): that is what range() returns; seeding by member name ("BasicLatin", ...) left those keys unset and made every ++ below yield NaN
+	for (const char of this) tally[char.range()]++; // for...of walks the string by code point; range() classifies index 0 of each piece
+	return tally;
+};
+
+/** @summary wrap this string in a JapaneseString (class imported from ../language/japanese.ts) */
+String.prototype.jp = function() {
+	return new JapaneseString(this); // the receiver string is the sole constructor argument
+}
+
+/** @summary parse a space-separated tag string into TokenTags (a Set, so repeated tags collapse); an empty or whitespace-only string gives an empty set */
+String.prototype.parseTags = function() {
+	// trim, split on runs of spaces, then drop empty pieces: "".split(...) yields [""], which previously became a set holding one empty tag
+	const tags = this.trim().split(/ +/).filter(tag => tag !== "") as TokenTag[];
+	const filteredTags: TokenTag[] = [];
+	for (const tag of tags) {
+		// skip past tense tags after -te and -tari deinflection (i.e. once either suffix tag has already been kept)
+		if (tag === Tag.Inflection.Tense.Past &&
+				filteredTags.anyOf([Tag.Inflection.Suffix.Te, Tag.Inflection.Suffix.Tari])) continue;
+		filteredTags.push(tag); // order of first appearance is preserved by the Set built below
+	}
+	return new Set(filteredTags) as TokenTags;
+}
+