diff options
author | StefanVukovic99 <stefanvukovic44@gmail.com> | 2024-02-17 02:45:24 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-02-17 01:45:24 +0000 |
commit | 4aaa9f15d97668203741c1731f15e710ae8b8294 (patch) | |
tree | d1885f7fbd7d1510a71176597169d6847ae26572 /ext/js/language | |
parent | 4e77741d22778bd09b772fc53f1cbd64107e3d24 (diff) |
add language select, abstract text transformations (#584)
* Copy functions from JapaneseUtil
* Remove JapaneseUtil
* Update usages of JapaneseUtil functions
* part1
* frontend done?
* fix tests
* offscreen and type complications
* add tests
* start fixing tests
* keep fixing tests
* fix tests
* Copy functions from JapaneseUtil
* Remove JapaneseUtil
* Update usages of JapaneseUtil functions
* delete pt
* renames
* add tests
* kebab-case filenames
* lint
* minor fixes
* merge
* fixes
* fix part of comments
* fix more comments
* delete unused types
* comment
* comment
* do backend
* other files
* move fetch utils to own file
* remove extra line
* add extra line
* remove unnecessary export
* simplify folder structure
* remove redundant async
* fix param type in api
* fix language index
* undo changes to cssStyleApplier
* undo changes to utilities.js
* undo changes to utilities.js
* simplify language util
* lint
* undo phantom changes to anki integration
* require textTransformations options
* explicit locale in localeCompare
* punctuate notes
* prefer early exit
* rename LanguageOptionsObjectMap
* rename to textPreprocessor
* tuple with names instead of boolean array
* safe data setting
* optional chaining
* simplify LanguageOptions
* encapsulate languages
* delete language util
* nullable language in text preprocessors controller
* rename transform to process
* remove settings
* make translation advanced again
* remove unused getTextTransformations api call
* comments
* change language types
* RIP flags
* comments
* fix tests
* lint
* Text preprocessor type changes (#10)
* Add types
* Update types
* Simplify type check
* Refactor typing and structuring of language definitions
* lint
* update translator benchmark
* undo markdown changes
* undo markdown changes
* undo markdown changes
* more merge
* simplify language controller
---------
Co-authored-by: toasted-nutbread <toasted-nutbread@users.noreply.github.com>
Co-authored-by: Darius Jahandarie <djahandarie@gmail.com>
Diffstat (limited to 'ext/js/language')
-rw-r--r-- | ext/js/language/en/language-english.js | 29 | ||||
-rw-r--r-- | ext/js/language/ja/language-japanese.js | 77 | ||||
-rwxr-xr-x | ext/js/language/languages.js | 61 | ||||
-rwxr-xr-x | ext/js/language/text-preprocessors.js | 35 | ||||
-rw-r--r-- | ext/js/language/translator.js | 134 |
5 files changed, 259 insertions, 77 deletions
diff --git a/ext/js/language/en/language-english.js b/ext/js/language/en/language-english.js new file mode 100644 index 00000000..8268653f --- /dev/null +++ b/ext/js/language/en/language-english.js @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +import {capitalizeFirstLetter, decapitalize} from '../text-preprocessors.js'; + +/** @type {import('language-english').EnglishLanguageDescriptor} */ +export const descriptor = { + name: 'English', + iso: 'en', + exampleText: 'read', + textPreprocessors: { + capitalizeFirstLetter, + decapitalize + } +}; diff --git a/ext/js/language/ja/language-japanese.js b/ext/js/language/ja/language-japanese.js new file mode 100644 index 00000000..ced34bcd --- /dev/null +++ b/ext/js/language/ja/language-japanese.js @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +import {basicTextPreprocessorOptions} from '../text-preprocessors.js'; +import {convertAlphabeticToKana} from './japanese-wanakana.js'; +import {collapseEmphaticSequences, convertHalfWidthKanaToFullWidth, convertHiraganaToKatakana, convertKatakanaToHiragana, convertNumericToFullWidth} from './japanese.js'; + +/** @type {import('language-japanese').JapaneseLanguageDescriptor} */ +export const descriptor = { + name: 'Japanese', + iso: 'ja', + exampleText: '読め', + textPreprocessors: { + convertHalfWidthCharacters: { + name: 'Convert half width characters to full width', + description: 'ヨミチャン → ヨミチャン', + options: basicTextPreprocessorOptions, + /** @type {import('language').TextPreprocessorFunction<boolean>} */ + process: (str, setting, sourceMap) => (setting ? convertHalfWidthKanaToFullWidth(str, sourceMap) : str) + }, + convertNumericCharacters: { + name: 'Convert numeric characters to full width', + description: '1234 → 1234', + options: basicTextPreprocessorOptions, + /** @type {import('language').TextPreprocessorFunction<boolean>} */ + process: (str, setting) => (setting ? convertNumericToFullWidth(str) : str) + }, + convertAlphabeticCharacters: { + name: 'Convert alphabetic characters to hiragana', + description: 'yomichan → よみちゃん', + options: basicTextPreprocessorOptions, + /** @type {import('language').TextPreprocessorFunction<boolean>} */ + process: (str, setting, sourceMap) => (setting ? convertAlphabeticToKana(str, sourceMap) : str) + }, + convertHiraganaToKatakana: { + name: 'Convert hiragana to katakana', + description: 'よみちゃん → ヨミチャン', + options: basicTextPreprocessorOptions, + /** @type {import('language').TextPreprocessorFunction<boolean>} */ + process: (str, setting) => (setting ? 
convertHiraganaToKatakana(str) : str) + }, + convertKatakanaToHiragana: { + name: 'Convert katakana to hiragana', + description: 'ヨミチャン → よみちゃん', + options: basicTextPreprocessorOptions, + /** @type {import('language').TextPreprocessorFunction<boolean>} */ + process: (str, setting) => (setting ? convertKatakanaToHiragana(str) : str) + }, + collapseEmphaticSequences: { + name: 'Collapse emphatic character sequences', + description: 'すっっごーーい → すっごーい / すごい', + options: [[false, false], [true, false], [true, true]], + /** @type {import('language').TextPreprocessorFunction<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>} */ + process: (str, setting, sourceMap) => { + const [collapseEmphatic, collapseEmphaticFull] = setting; + if (collapseEmphatic) { + str = collapseEmphaticSequences(str, collapseEmphaticFull, sourceMap); + } + return str; + } + } + } +}; diff --git a/ext/js/language/languages.js b/ext/js/language/languages.js new file mode 100755 index 00000000..f51ca163 --- /dev/null +++ b/ext/js/language/languages.js @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. 
+ */ + +import {descriptor as descriptorEnglish} from './en/language-english.js'; +import {descriptor as descriptorJapanese} from './ja/language-japanese.js'; + +const languageDescriptors = [ + descriptorEnglish, + descriptorJapanese +]; + +/** @type {Map<string, typeof languageDescriptors[0]>} */ +const languageDescriptorMap = new Map(); +for (const languageDescriptor of languageDescriptors) { + languageDescriptorMap.set(languageDescriptor.iso, languageDescriptor); +} + +/** + * @returns {import('language').LanguageSummary[]} + */ +export function getLanguageSummaries() { + const results = []; + for (const {name, iso, exampleText} of languageDescriptorMap.values()) { + results.push({name, iso, exampleText}); + } + return results; +} + +/** + * @returns {import('language').LanguageAndPreprocessors[]} + * @throws {Error} + */ +export function getAllLanguageTextPreprocessors() { + const results = []; + for (const {iso, textPreprocessors} of languageDescriptorMap.values()) { + /** @type {import('language').TextPreprocessorWithId<unknown>[]} */ + const textPreprocessorsArray = []; + for (const [id, textPreprocessor] of Object.entries(textPreprocessors)) { + textPreprocessorsArray.push({ + id, + textPreprocessor: /** @type {import('language').TextPreprocessor<unknown>} */ (textPreprocessor) + }); + } + results.push({iso, textPreprocessors: textPreprocessorsArray}); + } + return results; +} diff --git a/ext/js/language/text-preprocessors.js b/ext/js/language/text-preprocessors.js new file mode 100755 index 00000000..12b3d1b6 --- /dev/null +++ b/ext/js/language/text-preprocessors.js @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +/** @type {import('language').TextPreprocessorOptions<boolean>} */ +export const basicTextPreprocessorOptions = [false, true]; + +/** @type {import('language').TextPreprocessor<boolean>} */ +export const decapitalize = { + name: 'Decapitalize text', + description: 'CAPITALIZED TEXT → capitalized text', + options: basicTextPreprocessorOptions, + process: (str, setting) => (setting ? str.toLowerCase() : str) +}; + +/** @type {import('language').TextPreprocessor<boolean>} */ +export const capitalizeFirstLetter = { + name: 'Capitalize first letter', + description: 'lowercase text → Lowercase text', + options: basicTextPreprocessorOptions, + process: (str, setting) => (setting ? str.charAt(0).toUpperCase() + str.slice(1) : str) +}; diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js index b2342e8d..4f9304b5 100644 --- a/ext/js/language/translator.js +++ b/ext/js/language/translator.js @@ -18,9 +18,9 @@ import {applyTextReplacement} from '../general/regex-util.js'; import {TextSourceMap} from '../general/text-source-map.js'; -import {convertAlphabeticToKana} from './ja/japanese-wanakana.js'; -import {collapseEmphaticSequences, convertHalfWidthKanaToFullWidth, convertHiraganaToKatakana, convertKatakanaToHiragana, convertNumericToFullWidth, isCodePointJapanese} from './ja/japanese.js'; +import {isCodePointJapanese} from './ja/japanese.js'; import {LanguageTransformer} from './language-transformer.js'; +import {getAllLanguageTextPreprocessors} from './languages.js'; /** * Class which finds term and kanji dictionary entries for text. 
@@ -41,6 +41,8 @@ export class Translator { this._stringComparer = new Intl.Collator('en-US'); // Invariant locale /** @type {RegExp} */ this._numberRegex = /[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?/; + /** @type {Map<string, {textPreprocessors: import('language').TextPreprocessorWithId<unknown>[], optionSpace: import('translation-internal').PreprocessorOptionsSpace}>} */ + this._textPreprocessors = new Map(); } /** @@ -49,6 +51,14 @@ export class Translator { */ prepare(descriptor) { this._languageTransformer.addDescriptor(descriptor); + for (const {iso, textPreprocessors} of getAllLanguageTextPreprocessors()) { + /** @type {Map<string, import('language').TextPreprocessorOptions<unknown>>} */ + const optionSpace = new Map(); + for (const {id, textPreprocessor} of textPreprocessors) { + optionSpace.set(id, textPreprocessor.options); + } + this._textPreprocessors.set(iso, {textPreprocessors, optionSpace}); + } } /** @@ -415,51 +425,45 @@ export class Translator { } } - // Deinflections and text transformations + // Deinflections and text preprocessing /** * @param {string} text * @param {import('translation').FindTermsOptions} options * @returns {import('translation-internal').DatabaseDeinflection[]} + * @throws {Error} */ _getAlgorithmDeinflections(text, options) { - /** @type {import('translation-internal').TextDeinflectionOptionsArrays} */ - const textOptionVariantArray = [ - this._getTextReplacementsVariants(options), - this._getTextOptionEntryVariants(options.convertHalfWidthCharacters), - this._getTextOptionEntryVariants(options.convertNumericCharacters), - this._getTextOptionEntryVariants(options.convertAlphabeticCharacters), - this._getTextOptionEntryVariants(options.convertHiraganaToKatakana), - this._getTextOptionEntryVariants(options.convertKatakanaToHiragana), - this._getCollapseEmphaticOptions(options) - ]; + const {language} = options; + const info = this._textPreprocessors.get(language); + if (typeof info === 'undefined') { throw new Error(`Unsupported 
language: ${language}`); } + const {textPreprocessors, optionSpace: textPreprocessorOptionsSpace} = info; + + /** @type {Map<string, import('language').TextPreprocessorOptions<unknown>>} */ + const variantSpace = new Map(); + variantSpace.set('textReplacements', this._getTextReplacementsVariants(options)); + for (const [key, value] of textPreprocessorOptionsSpace) { + variantSpace.set(key, value); + } /** @type {import('translation-internal').DatabaseDeinflection[]} */ const deinflections = []; const used = new Set(); - for (const [textReplacements, halfWidth, numeric, alphabetic, katakana, hiragana, [collapseEmphatic, collapseEmphaticFull]] of /** @type {Generator<import('translation-internal').TextDeinflectionOptions, void, unknown>} */ (this._getArrayVariants(textOptionVariantArray))) { + + for (const arrayVariant of this._generateArrayVariants(variantSpace)) { + const textReplacements = /** @type {import('translation').FindTermsTextReplacement[] | null} */ (arrayVariant.get('textReplacements')); + let text2 = text; const sourceMap = new TextSourceMap(text2); + if (textReplacements !== null) { text2 = this._applyTextReplacements(text2, sourceMap, textReplacements); } - if (halfWidth) { - text2 = convertHalfWidthKanaToFullWidth(text2, sourceMap); - } - if (numeric) { - text2 = convertNumericToFullWidth(text2); - } - if (alphabetic) { - text2 = convertAlphabeticToKana(text2, sourceMap); - } - if (katakana) { - text2 = convertHiraganaToKatakana(text2); - } - if (hiragana) { - text2 = convertKatakanaToHiragana(text2); - } - if (collapseEmphatic) { - text2 = collapseEmphaticSequences(text2, collapseEmphaticFull, sourceMap); + + for (const preprocessor of textPreprocessors.values()) { + const {id, textPreprocessor} = preprocessor; + const setting = arrayVariant.get(id); + text2 = textPreprocessor.process(text2, setting, sourceMap); } for ( @@ -527,36 +531,6 @@ export class Translator { } /** - * @param {import('translation').FindTermsVariantMode} value - * @returns 
{boolean[]} - */ - _getTextOptionEntryVariants(value) { - switch (value) { - case 'true': return [true]; - case 'variant': return [false, true]; - default: return [false]; - } - } - - /** - * @param {import('translation').FindTermsOptions} options - * @returns {[collapseEmphatic: boolean, collapseEmphaticFull: boolean][]} - */ - _getCollapseEmphaticOptions(options) { - /** @type {[collapseEmphatic: boolean, collapseEmphaticFull: boolean][]} */ - const collapseEmphaticOptions = [[false, false]]; - switch (options.collapseEmphaticSequences) { - case 'true': - collapseEmphaticOptions.push([true, false]); - break; - case 'full': - collapseEmphaticOptions.push([true, false], [true, true]); - break; - } - return collapseEmphaticOptions; - } - - /** * @param {import('translation').FindTermsOptions} options * @returns {(import('translation').FindTermsTextReplacement[] | null)[]} */ @@ -1343,26 +1317,32 @@ export class Translator { } /** - * @param {[...args: unknown[][]]} arrayVariants - * @yields {[...args: unknown[]]} - * @returns {Generator<unknown[], void, unknown>} + * @param {Map<string, unknown[]>} arrayVariants + * @yields {Map<string, unknown>} + * @returns {Generator<Map<string, unknown>, void, void>} */ - *_getArrayVariants(arrayVariants) { - const ii = arrayVariants.length; - - let total = 1; - for (let i = 0; i < ii; ++i) { - total *= arrayVariants[i].length; + *_generateArrayVariants(arrayVariants) { + const variantKeys = [...arrayVariants.keys()]; + const entryVariantLengths = []; + for (const key of variantKeys) { + const entryVariants = /** @type {unknown[]} */ (arrayVariants.get(key)); + entryVariantLengths.push(entryVariants.length); } + const totalVariants = entryVariantLengths.reduce((acc, length) => acc * length, 1); + + for (let variantIndex = 0; variantIndex < totalVariants; ++variantIndex) { + /** @type {Map<string, unknown>} */ + const variant = new Map(); + let remainingIndex = variantIndex; - for (let a = 0; a < total; ++a) { - const variant = 
[]; - let index = a; - for (let i = 0; i < ii; ++i) { - const entryVariants = arrayVariants[i]; - variant.push(entryVariants[index % entryVariants.length]); - index = Math.floor(index / entryVariants.length); + for (let keyIndex = 0; keyIndex < variantKeys.length; ++keyIndex) { + const key = variantKeys[keyIndex]; + const entryVariants = /** @type {unknown[]} */ (arrayVariants.get(key)); + const entryIndex = remainingIndex % entryVariants.length; + variant.set(key, entryVariants[entryIndex]); + remainingIndex = Math.floor(remainingIndex / entryVariants.length); } + yield variant; } } |