aboutsummaryrefslogtreecommitdiff
path: root/ext/js/language
diff options
context:
space:
mode:
authorStefanVukovic99 <stefanvukovic44@gmail.com>2024-02-17 02:45:24 +0100
committerGitHub <noreply@github.com>2024-02-17 01:45:24 +0000
commit4aaa9f15d97668203741c1731f15e710ae8b8294 (patch)
treed1885f7fbd7d1510a71176597169d6847ae26572 /ext/js/language
parent4e77741d22778bd09b772fc53f1cbd64107e3d24 (diff)
add language select, abstract text transformations (#584)
* Copy functions from JapaneseUtil * Remove JapaneseUtil * Update usages of JapaneseUtil functions * part1 * frotend done? * fix tests * offscreen and type complications * add tests * start fixing tests * keep fixing tests * fix tests * Copy functions from JapaneseUtil * Remove JapaneseUtil * Update usages of JapaneseUtil functions * delete pt * renames * add tests * kebab-case filenames * lint * minor fixes * merge * fixes * fix part of comments * fix more comments * delete unused types * comment * comment * do backend * other files * move fetch utils to own file * remove extra line * add extra line * remove unnecessary export * simplify folder structure * remove redundant async * fix param type in api * fix language index * undo changes to cssStyleApplier * undo changes to utilities.js * undo changes to utilities.js * simplify language util * lint * undo phantom changes to anki integration * require textTransformations options * explicit locale in localeCompare * punctuate notes * prefer early exit * rename LanguageOptionsObjectMap * rename to textPreprocessor * tuple with names instead of boolean array * safe data setting * optional chaining * simplify LanguageOptions * encapsulate languages * delete language util * nullable language in text preprocessors controller * rename transform to process * remove settings * make translation advanced again * remove unused getTextTransformations api call * comments * change language types * RIP flags * comments * fix tests * lint * Text preprocessor type changes (#10) * Add types * Update types * Simplify type check * Refactor typing and structuring of language definitions * lint * update translator benchmark * undo markdown changes * undo markdown changes * undo markdown changes * more merge * simplify language controller --------- Co-authored-by: toasted-nutbread <toasted-nutbread@users.noreply.github.com> Co-authored-by: Darius Jahandarie <djahandarie@gmail.com>
Diffstat (limited to 'ext/js/language')
-rw-r--r--ext/js/language/en/language-english.js29
-rw-r--r--ext/js/language/ja/language-japanese.js77
-rwxr-xr-xext/js/language/languages.js61
-rwxr-xr-xext/js/language/text-preprocessors.js35
-rw-r--r--ext/js/language/translator.js134
5 files changed, 259 insertions, 77 deletions
diff --git a/ext/js/language/en/language-english.js b/ext/js/language/en/language-english.js
new file mode 100644
index 00000000..8268653f
--- /dev/null
+++ b/ext/js/language/en/language-english.js
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) 2024 Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+import {capitalizeFirstLetter, decapitalize} from '../text-preprocessors.js';
+
+/**
+ * Language descriptor for English: display name, `iso` code, an example text, and the
+ * text preprocessors that apply to this language.
+ * @type {import('language-english').EnglishLanguageDescriptor}
+ */
+export const descriptor = {
+ name: 'English',
+ iso: 'en',
+ exampleText: 'read',
+ // Both preprocessors are the shared definitions imported from '../text-preprocessors.js'.
+ // Consumers enumerate this object with Object.entries (see languages.js), so key order is preserved there.
+ textPreprocessors: {
+ capitalizeFirstLetter,
+ decapitalize
+ }
+};
diff --git a/ext/js/language/ja/language-japanese.js b/ext/js/language/ja/language-japanese.js
new file mode 100644
index 00000000..ced34bcd
--- /dev/null
+++ b/ext/js/language/ja/language-japanese.js
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2024 Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+import {basicTextPreprocessorOptions} from '../text-preprocessors.js';
+import {convertAlphabeticToKana} from './japanese-wanakana.js';
+import {collapseEmphaticSequences, convertHalfWidthKanaToFullWidth, convertHiraganaToKatakana, convertKatakanaToHiragana, convertNumericToFullWidth} from './japanese.js';
+
+/**
+ * Language descriptor for Japanese: display name, `iso` code, an example text, and the
+ * text preprocessors that apply to this language.
+ *
+ * Each preprocessor lists the settings it accepts in `options` and implements
+ * `process(str, setting, sourceMap)`, which returns `str` unchanged when the setting is off.
+ * Only the conversions that are handed `sourceMap` forward it to their helper.
+ * @type {import('language-japanese').JapaneseLanguageDescriptor}
+ */
+export const descriptor = {
+    name: 'Japanese',
+    iso: 'ja',
+    exampleText: '読め',
+    textPreprocessors: {
+        convertHalfWidthCharacters: {
+            name: 'Convert half width characters to full width',
+            // The left-hand side of the example must be half-width kana, otherwise the example shows no change.
+            description: 'ﾖﾐﾁｬﾝ → ヨミチャン',
+            options: basicTextPreprocessorOptions,
+            /** @type {import('language').TextPreprocessorFunction<boolean>} */
+            process: (str, setting, sourceMap) => (setting ? convertHalfWidthKanaToFullWidth(str, sourceMap) : str)
+        },
+        convertNumericCharacters: {
+            name: 'Convert numeric characters to full width',
+            // The right-hand side of the example must be full-width digits, otherwise the example shows no change.
+            description: '1234 → １２３４',
+            options: basicTextPreprocessorOptions,
+            /** @type {import('language').TextPreprocessorFunction<boolean>} */
+            process: (str, setting) => (setting ? convertNumericToFullWidth(str) : str)
+        },
+        convertAlphabeticCharacters: {
+            name: 'Convert alphabetic characters to hiragana',
+            description: 'yomichan → よみちゃん',
+            options: basicTextPreprocessorOptions,
+            /** @type {import('language').TextPreprocessorFunction<boolean>} */
+            process: (str, setting, sourceMap) => (setting ? convertAlphabeticToKana(str, sourceMap) : str)
+        },
+        convertHiraganaToKatakana: {
+            name: 'Convert hiragana to katakana',
+            description: 'よみちゃん → ヨミチャン',
+            options: basicTextPreprocessorOptions,
+            /** @type {import('language').TextPreprocessorFunction<boolean>} */
+            process: (str, setting) => (setting ? convertHiraganaToKatakana(str) : str)
+        },
+        convertKatakanaToHiragana: {
+            name: 'Convert katakana to hiragana',
+            description: 'ヨミチャン → よみちゃん',
+            options: basicTextPreprocessorOptions,
+            /** @type {import('language').TextPreprocessorFunction<boolean>} */
+            process: (str, setting) => (setting ? convertKatakanaToHiragana(str) : str)
+        },
+        collapseEmphaticSequences: {
+            name: 'Collapse emphatic character sequences',
+            description: 'すっっごーーい → すっごーい / すごい',
+            // Three settings: off, collapse, collapse fully. [false, true] is not listed because
+            // the second flag is only read when the first one is true.
+            options: [[false, false], [true, false], [true, true]],
+            /** @type {import('language').TextPreprocessorFunction<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>} */
+            process: (str, setting, sourceMap) => {
+                const [collapseEmphatic, collapseEmphaticFull] = setting;
+                if (collapseEmphatic) {
+                    str = collapseEmphaticSequences(str, collapseEmphaticFull, sourceMap);
+                }
+                return str;
+            }
+        }
+    }
+};
diff --git a/ext/js/language/languages.js b/ext/js/language/languages.js
new file mode 100755
index 00000000..f51ca163
--- /dev/null
+++ b/ext/js/language/languages.js
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2024 Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+import {descriptor as descriptorEnglish} from './en/language-english.js';
+import {descriptor as descriptorJapanese} from './ja/language-japanese.js';
+
+// Every supported language; array order is the order in which languages are reported.
+const languageDescriptors = [descriptorEnglish, descriptorJapanese];
+
+/**
+ * Descriptors keyed by their `iso` code, inserted in the order of `languageDescriptors`.
+ * @type {Map<string, typeof languageDescriptors[0]>}
+ */
+const languageDescriptorMap = new Map();
+languageDescriptors.forEach((entry) => {
+    languageDescriptorMap.set(entry.iso, entry);
+});
+
+/**
+ * Lists the `name`, `iso` and `exampleText` of every registered language,
+ * in the iteration order of the descriptor map.
+ * @returns {import('language').LanguageSummary[]}
+ */
+export function getLanguageSummaries() {
+    return Array.from(
+        languageDescriptorMap.values(),
+        ({name, iso, exampleText}) => ({name, iso, exampleText})
+    );
+}
+
+/**
+ * For every registered language, flattens its `textPreprocessors` object into an array of
+ * `{id, textPreprocessor}` pairs, where `id` is the property key, and pairs that array with the language's `iso`.
+ * @returns {import('language').LanguageAndPreprocessors[]}
+ */
+export function getAllLanguageTextPreprocessors() {
+    return [...languageDescriptorMap.values()].map(({iso, textPreprocessors}) => {
+        /** @type {import('language').TextPreprocessorWithId<unknown>[]} */
+        const textPreprocessorsArray = Object.entries(textPreprocessors).map(([id, textPreprocessor]) => ({
+            id,
+            textPreprocessor: /** @type {import('language').TextPreprocessor<unknown>} */ (textPreprocessor)
+        }));
+        return {iso, textPreprocessors: textPreprocessorsArray};
+    });
+}
diff --git a/ext/js/language/text-preprocessors.js b/ext/js/language/text-preprocessors.js
new file mode 100755
index 00000000..12b3d1b6
--- /dev/null
+++ b/ext/js/language/text-preprocessors.js
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2024 Yomitan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/**
+ * Settings accepted by a simple on/off preprocessor: `false` (leave the text unchanged)
+ * and `true` (apply the transformation).
+ * @type {import('language').TextPreprocessorOptions<boolean>}
+ */
+export const basicTextPreprocessorOptions = [false, true];
+
+/**
+ * Preprocessor that lower-cases the whole text when its setting is `true`
+ * and returns the text unchanged otherwise.
+ * @type {import('language').TextPreprocessor<boolean>}
+ */
+export const decapitalize = {
+    name: 'Decapitalize text',
+    description: 'CAPITALIZED TEXT → capitalized text',
+    options: basicTextPreprocessorOptions,
+    process: (str, setting) => {
+        if (!setting) { return str; }
+        return str.toLowerCase();
+    }
+};
+
+/**
+ * Preprocessor that upper-cases the first character of the text when its setting is `true`
+ * and returns the text unchanged otherwise.
+ *
+ * The first character is taken as a whole Unicode code point rather than a single UTF-16 code unit,
+ * so a leading cased letter outside the Basic Multilingual Plane is capitalized instead of being
+ * left untouched as a split surrogate pair.
+ * @type {import('language').TextPreprocessor<boolean>}
+ */
+export const capitalizeFirstLetter = {
+    name: 'Capitalize first letter',
+    description: 'lowercase text → Lowercase text',
+    options: basicTextPreprocessorOptions,
+    process: (str, setting) => {
+        if (!setting) { return str; }
+        const firstCodePoint = str.codePointAt(0);
+        // Empty input has no first character; return it unchanged.
+        if (typeof firstCodePoint === 'undefined') { return str; }
+        const firstCharacter = String.fromCodePoint(firstCodePoint);
+        // Slice by the character's UTF-16 length (1 or 2) so the remainder is not corrupted.
+        return firstCharacter.toUpperCase() + str.slice(firstCharacter.length);
+    }
+};
diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js
index b2342e8d..4f9304b5 100644
--- a/ext/js/language/translator.js
+++ b/ext/js/language/translator.js
@@ -18,9 +18,9 @@
import {applyTextReplacement} from '../general/regex-util.js';
import {TextSourceMap} from '../general/text-source-map.js';
-import {convertAlphabeticToKana} from './ja/japanese-wanakana.js';
-import {collapseEmphaticSequences, convertHalfWidthKanaToFullWidth, convertHiraganaToKatakana, convertKatakanaToHiragana, convertNumericToFullWidth, isCodePointJapanese} from './ja/japanese.js';
+import {isCodePointJapanese} from './ja/japanese.js';
import {LanguageTransformer} from './language-transformer.js';
+import {getAllLanguageTextPreprocessors} from './languages.js';
/**
* Class which finds term and kanji dictionary entries for text.
@@ -41,6 +41,8 @@ export class Translator {
this._stringComparer = new Intl.Collator('en-US'); // Invariant locale
/** @type {RegExp} */
this._numberRegex = /[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?/;
+ /** @type {Map<string, {textPreprocessors: import('language').TextPreprocessorWithId<unknown>[], optionSpace: import('translation-internal').PreprocessorOptionsSpace}>} */
+ this._textPreprocessors = new Map();
}
/**
@@ -49,6 +51,14 @@ export class Translator {
*/
prepare(descriptor) {
this._languageTransformer.addDescriptor(descriptor);
+ for (const {iso, textPreprocessors} of getAllLanguageTextPreprocessors()) {
+ /** @type {Map<string, import('language').TextPreprocessorOptions<unknown>>} */
+ const optionSpace = new Map();
+ for (const {id, textPreprocessor} of textPreprocessors) {
+ optionSpace.set(id, textPreprocessor.options);
+ }
+ this._textPreprocessors.set(iso, {textPreprocessors, optionSpace});
+ }
}
/**
@@ -415,51 +425,45 @@ export class Translator {
}
}
- // Deinflections and text transformations
+ // Deinflections and text preprocessing
/**
* @param {string} text
* @param {import('translation').FindTermsOptions} options
* @returns {import('translation-internal').DatabaseDeinflection[]}
+ * @throws {Error}
*/
_getAlgorithmDeinflections(text, options) {
- /** @type {import('translation-internal').TextDeinflectionOptionsArrays} */
- const textOptionVariantArray = [
- this._getTextReplacementsVariants(options),
- this._getTextOptionEntryVariants(options.convertHalfWidthCharacters),
- this._getTextOptionEntryVariants(options.convertNumericCharacters),
- this._getTextOptionEntryVariants(options.convertAlphabeticCharacters),
- this._getTextOptionEntryVariants(options.convertHiraganaToKatakana),
- this._getTextOptionEntryVariants(options.convertKatakanaToHiragana),
- this._getCollapseEmphaticOptions(options)
- ];
+ const {language} = options;
+ const info = this._textPreprocessors.get(language);
+ if (typeof info === 'undefined') { throw new Error(`Unsupported language: ${language}`); }
+ const {textPreprocessors, optionSpace: textPreprocessorOptionsSpace} = info;
+
+ /** @type {Map<string, import('language').TextPreprocessorOptions<unknown>>} */
+ const variantSpace = new Map();
+ variantSpace.set('textReplacements', this._getTextReplacementsVariants(options));
+ for (const [key, value] of textPreprocessorOptionsSpace) {
+ variantSpace.set(key, value);
+ }
/** @type {import('translation-internal').DatabaseDeinflection[]} */
const deinflections = [];
const used = new Set();
- for (const [textReplacements, halfWidth, numeric, alphabetic, katakana, hiragana, [collapseEmphatic, collapseEmphaticFull]] of /** @type {Generator<import('translation-internal').TextDeinflectionOptions, void, unknown>} */ (this._getArrayVariants(textOptionVariantArray))) {
+
+ for (const arrayVariant of this._generateArrayVariants(variantSpace)) {
+ const textReplacements = /** @type {import('translation').FindTermsTextReplacement[] | null} */ (arrayVariant.get('textReplacements'));
+
let text2 = text;
const sourceMap = new TextSourceMap(text2);
+
if (textReplacements !== null) {
text2 = this._applyTextReplacements(text2, sourceMap, textReplacements);
}
- if (halfWidth) {
- text2 = convertHalfWidthKanaToFullWidth(text2, sourceMap);
- }
- if (numeric) {
- text2 = convertNumericToFullWidth(text2);
- }
- if (alphabetic) {
- text2 = convertAlphabeticToKana(text2, sourceMap);
- }
- if (katakana) {
- text2 = convertHiraganaToKatakana(text2);
- }
- if (hiragana) {
- text2 = convertKatakanaToHiragana(text2);
- }
- if (collapseEmphatic) {
- text2 = collapseEmphaticSequences(text2, collapseEmphaticFull, sourceMap);
+
+ for (const preprocessor of textPreprocessors.values()) {
+ const {id, textPreprocessor} = preprocessor;
+ const setting = arrayVariant.get(id);
+ text2 = textPreprocessor.process(text2, setting, sourceMap);
}
for (
@@ -527,36 +531,6 @@ export class Translator {
}
/**
- * @param {import('translation').FindTermsVariantMode} value
- * @returns {boolean[]}
- */
- _getTextOptionEntryVariants(value) {
- switch (value) {
- case 'true': return [true];
- case 'variant': return [false, true];
- default: return [false];
- }
- }
-
- /**
- * @param {import('translation').FindTermsOptions} options
- * @returns {[collapseEmphatic: boolean, collapseEmphaticFull: boolean][]}
- */
- _getCollapseEmphaticOptions(options) {
- /** @type {[collapseEmphatic: boolean, collapseEmphaticFull: boolean][]} */
- const collapseEmphaticOptions = [[false, false]];
- switch (options.collapseEmphaticSequences) {
- case 'true':
- collapseEmphaticOptions.push([true, false]);
- break;
- case 'full':
- collapseEmphaticOptions.push([true, false], [true, true]);
- break;
- }
- return collapseEmphaticOptions;
- }
-
- /**
* @param {import('translation').FindTermsOptions} options
* @returns {(import('translation').FindTermsTextReplacement[] | null)[]}
*/
@@ -1343,26 +1317,32 @@ export class Translator {
}
 /**
+ * Yields every combination obtained by picking one value from each key's array of variants.
+ * Combinations are enumerated in mixed-radix order: the value of the first key changes fastest.
+ * If any array is empty nothing is yielded; an empty map yields a single empty combination.
- * @param {[...args: unknown[][]]} arrayVariants
- * @yields {[...args: unknown[]]}
- * @returns {Generator<unknown[], void, unknown>}
+ * @param {Map<string, unknown[]>} arrayVariants
+ * @yields {Map<string, unknown>}
+ * @returns {Generator<Map<string, unknown>, void, void>}
 */
- *_getArrayVariants(arrayVariants) {
- const ii = arrayVariants.length;
-
- let total = 1;
- for (let i = 0; i < ii; ++i) {
- total *= arrayVariants[i].length;
+ *_generateArrayVariants(arrayVariants) {
+ const variantKeys = [...arrayVariants.keys()];
+ const entryVariantLengths = [];
+ for (const key of variantKeys) {
+ const entryVariants = /** @type {unknown[]} */ (arrayVariants.get(key));
+ entryVariantLengths.push(entryVariants.length);
 }
+ // The number of combinations is the product of the per-key variant counts.
+ const totalVariants = entryVariantLengths.reduce((acc, length) => acc * length, 1);
+
+ for (let variantIndex = 0; variantIndex < totalVariants; ++variantIndex) {
+ /** @type {Map<string, unknown>} */
+ const variant = new Map();
+ // Decode variantIndex as a mixed-radix number: one digit per key, least-significant digit first.
+ let remainingIndex = variantIndex;
- for (let a = 0; a < total; ++a) {
- const variant = [];
- let index = a;
- for (let i = 0; i < ii; ++i) {
- const entryVariants = arrayVariants[i];
- variant.push(entryVariants[index % entryVariants.length]);
- index = Math.floor(index / entryVariants.length);
+ for (let keyIndex = 0; keyIndex < variantKeys.length; ++keyIndex) {
+ const key = variantKeys[keyIndex];
+ const entryVariants = /** @type {unknown[]} */ (arrayVariants.get(key));
+ const entryIndex = remainingIndex % entryVariants.length;
+ variant.set(key, entryVariants[entryIndex]);
+ remainingIndex = Math.floor(remainingIndex / entryVariants.length);
 }
+
 yield variant;
 }
 }