diff options
Diffstat (limited to 'util/string.ts')
-rw-r--r-- | util/string.ts | 74 |
1 files changed, 74 insertions, 0 deletions
import { TokenTags, TokenTag, Tag } from "../language/tags.ts";
import JapaneseString from "../language/japanese.ts";

declare global {
    /** @summary extended String prototype functions */
    interface String {
        /** Classify the character at index 0 into a {@link UnicodeRange}. */
        range(): UnicodeRange;
        /** Count how many characters of this string fall into each {@link UnicodeRange}. */
        rangeTally(): RangeTally;

        /** Wrap this string in a JapaneseString. */
        jp(): JapaneseString;

        /** Parse a space-separated tag string into TokenTags. */
        parseTags(): TokenTags;
    }
}

/**
 * Character classes returned by `String.prototype.range`.
 * The string values (not the member names) are the keys of a `RangeTally`.
 */
export enum UnicodeRange {
    BasicLatin = "latin",
    Whitespace = "any-whitespace",
    Punctuation = "any-punctuation",
    Unknown = "any-unknown",
    JapanesePunctuation = "jp-punctuation",
    JapaneseHiragana = "jp-hiragana",
    JapaneseKatakana = "jp-katakana",
    JapaneseFWLatinHWKatakana = "jp-full-width-latin-half-width-katakana",
    JapaneseKanji = "jp-kanji",
}

/** Per-range character counters, keyed by the UnicodeRange enum values. */
type RangeTally = Record<UnicodeRange, number>;

/**
 * @summary get UnicodeRange for character at index 0
 *
 * Only the first UTF-16 code unit is inspected. For an empty string
 * `charCodeAt(0)` is NaN, every comparison below fails and the result is
 * `UnicodeRange.Unknown`.
 *
 * @returns the range the first code unit belongs to
 */
String.prototype.range = function () {
    const code = this.charCodeAt(0);

    // Specific ASCII characters are classified before the generic Basic Latin
    // range below; all other ASCII characters are reported as BasicLatin.
    if (code === 0x09) return UnicodeRange.Whitespace; // tab
    if (code === 0x20) return UnicodeRange.Whitespace; // space
    if (code === 0x21) return UnicodeRange.Punctuation; // exclamation mark
    if (code === 0x2e) return UnicodeRange.Punctuation; // full stop
    if (code === 0x3f) return UnicodeRange.Punctuation; // question mark

    if (0x0000 <= code && code <= 0x007f) return UnicodeRange.BasicLatin;
    if (0x3000 <= code && code <= 0x303f) return UnicodeRange.JapanesePunctuation;
    if (0x3040 <= code && code <= 0x309f) return UnicodeRange.JapaneseHiragana;
    if (0x30a0 <= code && code <= 0x30ff) return UnicodeRange.JapaneseKatakana;
    if (0xff00 <= code && code <= 0xffef) return UnicodeRange.JapaneseFWLatinHWKatakana;
    if (0x4e00 <= code && code <= 0x9faf) return UnicodeRange.JapaneseKanji;
    return UnicodeRange.Unknown;
};

/**
 * @summary create a RangeTally object for counting used unicode ranges in string
 *
 * @returns an object with one counter per UnicodeRange value; ranges that do
 *          not occur in the string are present with a count of 0
 */
String.prototype.rangeTally = function () {
    // Seed the counters from the enum VALUES: `range()` returns values such as
    // "latin", whereas Object.keys() would yield member names such as
    // "BasicLatin", leaving the real counters undefined (and `++` producing NaN).
    const tally = {} as RangeTally;
    for (const range of Object.values(UnicodeRange)) tally[range] = 0;

    // String iteration yields one element per code point.
    for (const char of this) tally[char.range()]++;
    return tally;
};

/** @summary get JapaneseString from this string */
String.prototype.jp = function () {
    return new JapaneseString(this);
};

/**
 * @summary parse concatenated tag string to TokenTags
 *
 * Runs of spaces are collapsed, the string is trimmed and split on single
 * spaces; duplicates disappear because the result is a Set.
 *
 * NOTE(review): an empty or whitespace-only string splits into `[""]`, so the
 * returned set then contains one empty tag - confirm whether callers expect that.
 *
 * @returns the set of tags found in the string
 */
String.prototype.parseTags = function () {
    const tags = this.replaceAll(/ +/g, " ").trim().split(" ") as TokenTag[];
    const filteredTags: TokenTag[] = [];
    for (const tag of tags) {
        // skip past tense tags after -te and -tari deinflection
        if (tag == Tag.Inflection.Tense.Past &&
            filteredTags.anyOf([Tag.Inflection.Suffix.Te, Tag.Inflection.Suffix.Tari])) continue;

        filteredTags.push(tag);
    }
    return new Set(filteredTags) as TokenTags;
};