author     StefanVukovic99 <stefanvukovic44@gmail.com>    2024-04-21 17:15:08 +0200
committer  GitHub <noreply@github.com>                    2024-04-21 15:15:08 +0000
commit     07258ecc35c1a05aa1581a54c9f47a40ce3d76c9 (patch)
tree       0a73bc6c1224710906ef3cded2a19399fc626f12
parent     22904d166d5ea33667458ccd0fde36e77d0ff65d (diff)

rework text processors (#793)    (tag: 24.4.21.0)
* rework text processors
* rename text-preprocessors file
* Fix search header left margins on small screens (#839)
* Refocuses search input on backspace (#840)

  Fixes #775. Note that this behavior gets overridden if backspace is set as a shortcut action.

* Change hotkey triggering condition to account for IME usage (#837)

  _isKeyCharacterInput only worked when no IME was in use: when a keydown event fires inside an IME, the key is reported as "Process", whose key.length is not 1. As a result, hotkeys were triggered while typing, which this commit fixes.

---------

Co-authored-by: James Maa <jmaa@berkeley.edu>
Co-authored-by: Kuuuube <61125188+Kuuuube@users.noreply.github.com>
Co-authored-by: Andrew Thomas Sartor <andrew@sartor.net>
-rw-r--r--   .eslintrc.json                                          3
-rw-r--r--   ext/js/general/regex-util.js                           12
-rw-r--r--   ext/js/general/text-source-map.js                     153
-rw-r--r--   ext/js/language/ar/arabic-text-preprocessors.js         6
-rw-r--r--   ext/js/language/ja/japanese-text-preprocessors.js      24
-rw-r--r--   ext/js/language/ja/japanese-wanakana.js                52
-rw-r--r--   ext/js/language/ja/japanese.js                         15
-rw-r--r--   ext/js/language/language-descriptors.js                11
-rwxr-xr-x   ext/js/language/languages.js                           20
-rw-r--r--   ext/js/language/ru/russian-text-preprocessors.js       10
-rwxr-xr-x   ext/js/language/text-processors.js (renamed from ext/js/language/text-preprocessors.js)   16
-rw-r--r--   ext/js/language/translator.js                         147
-rw-r--r--   test/data/anki-note-builder-test-results.json          16
-rw-r--r--   test/data/translator-test-inputs.json                   4
-rw-r--r--   test/data/translator-test-results-note-data1.json       4
-rw-r--r--   test/data/translator-test-results.json                  8
-rw-r--r--   test/japanese-util.test.js                            136
-rw-r--r--   test/text-source-map.test.js                          223
-rw-r--r--   types/ext/language-descriptors.d.ts                   145
-rw-r--r--   types/ext/language.d.ts                                26
-rw-r--r--   types/ext/translation-internal.d.ts                    12
21 files changed, 349 insertions, 694 deletions
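Before the file-by-file diff: the heart of the rework is that text processors no longer receive or update a TextSourceMap; a processor is now a plain (text, setting) -> text transform. A minimal sketch of the new shape, modeled on the decapitalize/basicTextProcessorOptions pattern in this commit (the processor name here is hypothetical):

    // Old shape: process(str, setting, sourceMap) threaded a TextSourceMap through every call.
    // New shape: a pure string transform selected by one of the processor's declared options.

    /** @type {import('language').TextProcessor<boolean>} */
    const exampleProcessor = {
        name: 'Example processor',      // hypothetical, for illustration only
        description: 'ABC → abc',
        options: [false, true],         // i.e. basicTextProcessorOptions
        process: (str, setting) => (setting ? str.toLowerCase() : str)
    };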
diff --git a/.eslintrc.json b/.eslintrc.json
index 8b08e827..e347c978 100644
--- a/.eslintrc.json
+++ b/.eslintrc.json
@@ -639,7 +639,6 @@
"ext/js/general/cache-map.js",
"ext/js/general/object-property-accessor.js",
"ext/js/general/regex-util.js",
- "ext/js/general/text-source-map.js",
"ext/js/language/ar/arabic-text-preprocessors.js",
"ext/js/language/de/german-text-preprocessors.js",
"ext/js/language/de/german-transforms.js",
@@ -656,7 +655,7 @@
"ext/js/language/ru/russian-text-preprocessors.js",
"ext/js/language/sga/old-irish-transforms.js",
"ext/js/language/sq/albanian-transforms.js",
- "ext/js/language/text-preprocessors.js",
+ "ext/js/language/text-processors.js",
"ext/js/language/translator.js",
"ext/js/media/audio-downloader.js",
"ext/js/media/media-util.js",
diff --git a/ext/js/general/regex-util.js b/ext/js/general/regex-util.js
index e0982154..c633ec06 100644
--- a/ext/js/general/regex-util.js
+++ b/ext/js/general/regex-util.js
@@ -23,13 +23,12 @@ const matchReplacementPattern = /\$(?:\$|&|`|'|(\d\d?)|<([^>]*)>)/g;
* Applies string.replace using a regular expression and replacement string as arguments.
* A source map of the changes is also maintained.
* @param {string} text A string of the text to replace.
- * @param {import('./text-source-map.js').TextSourceMap} sourceMap An instance of `TextSourceMap` which corresponds to `text`.
* @param {RegExp} pattern A regular expression to use as the replacement.
* @param {string} replacement A replacement string that follows the format of the standard
* JavaScript regular expression replacement string.
* @returns {string} A new string with the pattern replacements applied and the source map updated.
*/
-export function applyTextReplacement(text, sourceMap, pattern, replacement) {
+export function applyTextReplacement(text, pattern, replacement) {
const isGlobal = pattern.global;
if (isGlobal) { pattern.lastIndex = 0; }
for (let loop = true; loop; loop = isGlobal) {
@@ -44,15 +43,6 @@ export function applyTextReplacement(text, sourceMap, pattern, replacement) {
text = `${text.substring(0, index)}${actualReplacement}${text.substring(index + matchText.length)}`;
pattern.lastIndex += delta;
-
- if (actualReplacementLength > 0) {
- /** @type {number[]} */
- const zeroes = new Array(actualReplacementLength).fill(0);
- sourceMap.insert(index, ...zeroes);
- sourceMap.combine(index - 1 + actualReplacementLength, matchText.length);
- } else {
- sourceMap.combine(index, matchText.length);
- }
}
return text;
}
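With the source map parameter removed, applyTextReplacement is a plain string-in/string-out helper. A hedged usage sketch (the input and pattern are illustrative; the pattern mirrors the one used in the translator test inputs further down, and the import path is as seen from ext/js/language/):

    import {applyTextReplacement} from '../general/regex-util.js';

    // Strip a parenthesized reading, e.g. '打(う)' -> '打'.
    const pattern = /\(([^)]*)(?:\))/g;
    const stripped = applyTextReplacement('打(う)', pattern, '');
    // stripped === '打'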
diff --git a/ext/js/general/text-source-map.js b/ext/js/general/text-source-map.js
deleted file mode 100644
index 527c232b..00000000
--- a/ext/js/general/text-source-map.js
+++ /dev/null
@@ -1,153 +0,0 @@
-/*
- * Copyright (C) 2023-2024 Yomitan Authors
- * Copyright (C) 2020-2022 Yomichan Authors
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <https://www.gnu.org/licenses/>.
- */
-
-export class TextSourceMap {
- /**
- * @param {string} source
- * @param {number[]|null} [mapping=null]
- */
- constructor(source, mapping = null) {
- /** @type {string} */
- this._source = source;
- /** @type {?number[]} */
- this._mapping = (mapping !== null ? TextSourceMap.normalizeMapping(mapping) : null);
- }
-
- /** @type {string} */
- get source() {
- return this._source;
- }
-
- /**
- * @param {unknown} other
- * @returns {boolean}
- */
- equals(other) {
- if (this === other) {
- return true;
- }
-
- const source = this._source;
- if (!(other instanceof TextSourceMap && source === other.source)) {
- return false;
- }
-
- let mapping = this._mapping;
- let otherMapping = other.getMappingCopy();
- if (mapping === null) {
- if (otherMapping === null) {
- return true;
- }
- mapping = TextSourceMap.createMapping(source);
- } else if (otherMapping === null) {
- otherMapping = TextSourceMap.createMapping(source);
- }
-
- const mappingLength = mapping.length;
- if (mappingLength !== otherMapping.length) {
- return false;
- }
-
- for (let i = 0; i < mappingLength; ++i) {
- if (mapping[i] !== otherMapping[i]) {
- return false;
- }
- }
-
- return true;
- }
-
- /**
- * @param {number} finalLength
- * @returns {number}
- */
- getSourceLength(finalLength) {
- const mapping = this._mapping;
- if (mapping === null) {
- return finalLength;
- }
-
- let sourceLength = 0;
- for (let i = 0; i < finalLength; ++i) {
- sourceLength += mapping[i];
- }
- return sourceLength;
- }
-
- /**
- * @param {number} index
- * @param {number} count
- */
- combine(index, count) {
- if (count <= 0) { return; }
-
- if (this._mapping === null) {
- this._mapping = TextSourceMap.createMapping(this._source);
- }
-
- let sum = this._mapping[index];
- const parts = this._mapping.splice(index + 1, count);
- for (const part of parts) {
- sum += part;
- }
- this._mapping[index] = sum;
- }
-
- /**
- * @param {number} index
- * @param {number[]} items
- */
- insert(index, ...items) {
- if (this._mapping === null) {
- this._mapping = TextSourceMap.createMapping(this._source);
- }
-
- this._mapping.splice(index, 0, ...items);
- }
-
- /**
- * @returns {?number[]}
- */
- getMappingCopy() {
- return this._mapping !== null ? [...this._mapping] : null;
- }
-
- /**
- * @param {string} text
- * @returns {number[]}
- */
- static createMapping(text) {
- return new Array(text.length).fill(1);
- }
-
- /**
- * @param {number[]} mapping
- * @returns {number[]}
- */
- static normalizeMapping(mapping) {
- const result = [];
- for (const value of mapping) {
- result.push(
- (typeof value === 'number' && Number.isFinite(value)) ?
- Math.floor(value) :
- 0
- );
- }
- return result;
- }
-}
diff --git a/ext/js/language/ar/arabic-text-preprocessors.js b/ext/js/language/ar/arabic-text-preprocessors.js
index 6007d770..91535ccd 100644
--- a/ext/js/language/ar/arabic-text-preprocessors.js
+++ b/ext/js/language/ar/arabic-text-preprocessors.js
@@ -15,7 +15,7 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
-import {basicTextPreprocessorOptions} from '../text-preprocessors.js';
+import {basicTextProcessorOptions} from '../text-processors.js';
const optionalDiacritics = [
'\u0618', // Small Fatha
@@ -38,11 +38,11 @@ const optionalDiacritics = [
const diacriticsRegex = new RegExp(`[${optionalDiacritics.join('')}]`, 'g');
-/** @type {import('language').TextPreprocessor<boolean>} */
+/** @type {import('language').TextProcessor<boolean>} */
export const removeArabicScriptDiacritics = {
name: 'Remove diacritics',
description: 'وَلَدَ ⬅️ ولد',
- options: basicTextPreprocessorOptions,
+ options: basicTextProcessorOptions,
process: (text, setting) => {
return setting ? text.replace(diacriticsRegex, '') : text;
}
diff --git a/ext/js/language/ja/japanese-text-preprocessors.js b/ext/js/language/ja/japanese-text-preprocessors.js
index 06f944c1..b3d50817 100644
--- a/ext/js/language/ja/japanese-text-preprocessors.js
+++ b/ext/js/language/ja/japanese-text-preprocessors.js
@@ -15,7 +15,7 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
-import {basicTextPreprocessorOptions} from '../text-preprocessors.js';
+import {basicTextProcessorOptions} from '../text-processors.js';
import {convertAlphabeticToKana} from './japanese-wanakana.js';
import {
collapseEmphaticSequences as collapseEmphaticSequencesFunction,
@@ -25,28 +25,28 @@ import {
convertNumericToFullWidth
} from './japanese.js';
-/** @type {import('language').TextPreprocessor<boolean>} */
+/** @type {import('language').TextProcessor<boolean>} */
export const convertHalfWidthCharacters = {
name: 'Convert half width characters to full width',
description: 'ヨミチャン → ヨミチャン',
- options: basicTextPreprocessorOptions,
- process: (str, setting, sourceMap) => (setting ? convertHalfWidthKanaToFullWidth(str, sourceMap) : str)
+ options: basicTextProcessorOptions,
+ process: (str, setting) => (setting ? convertHalfWidthKanaToFullWidth(str) : str)
};
-/** @type {import('language').TextPreprocessor<boolean>} */
+/** @type {import('language').TextProcessor<boolean>} */
export const convertNumericCharacters = {
name: 'Convert numeric characters to full width',
description: '1234 → 1234',
- options: basicTextPreprocessorOptions,
+ options: basicTextProcessorOptions,
process: (str, setting) => (setting ? convertNumericToFullWidth(str) : str)
};
-/** @type {import('language').TextPreprocessor<boolean>} */
+/** @type {import('language').TextProcessor<boolean>} */
export const convertAlphabeticCharacters = {
name: 'Convert alphabetic characters to hiragana',
description: 'yomichan → よみちゃん',
- options: basicTextPreprocessorOptions,
- process: (str, setting, sourceMap) => (setting ? convertAlphabeticToKana(str, sourceMap) : str)
+ options: basicTextProcessorOptions,
+ process: (str, setting) => (setting ? convertAlphabeticToKana(str) : str)
};
/** @type {import('language').BidirectionalConversionPreprocessor} */
@@ -66,15 +66,15 @@ export const convertHiraganaToKatakana = {
}
};
-/** @type {import('language').TextPreprocessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>} */
+/** @type {import('language').TextProcessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>} */
export const collapseEmphaticSequences = {
name: 'Collapse emphatic character sequences',
description: 'すっっごーーい → すっごーい / すごい',
options: [[false, false], [true, false], [true, true]],
- process: (str, setting, sourceMap) => {
+ process: (str, setting) => {
const [collapseEmphatic, collapseEmphaticFull] = setting;
if (collapseEmphatic) {
- str = collapseEmphaticSequencesFunction(str, collapseEmphaticFull, sourceMap);
+ str = collapseEmphaticSequencesFunction(str, collapseEmphaticFull);
}
return str;
}
diff --git a/ext/js/language/ja/japanese-wanakana.js b/ext/js/language/ja/japanese-wanakana.js
index 32260489..a87db6b7 100644
--- a/ext/js/language/ja/japanese-wanakana.js
+++ b/ext/js/language/ja/japanese-wanakana.js
@@ -19,51 +19,10 @@ import * as wanakana from '../../../lib/wanakana.js';
/**
* @param {string} text
- * @param {?import('../../general/text-source-map.js').TextSourceMap} sourceMap
- * @param {number} sourceMapStart
* @returns {string}
*/
-function convertAlphabeticPartToKana(text, sourceMap, sourceMapStart) {
- const result = wanakana.toHiragana(text);
-
- // Generate source mapping
- if (sourceMap !== null) {
- let i = 0;
- let resultPos = 0;
- const ii = text.length;
- while (i < ii) {
- // Find smallest matching substring
- let iNext = i + 1;
- let resultPosNext = result.length;
- while (iNext < ii) {
- const t = wanakana.toHiragana(text.substring(0, iNext));
- if (t === result.substring(0, t.length)) {
- resultPosNext = t.length;
- break;
- }
- ++iNext;
- }
-
- // Merge characters
- const removals = iNext - i - 1;
- if (removals > 0) {
- sourceMap.combine(sourceMapStart, removals);
- }
- ++sourceMapStart;
-
- // Empty elements
- const additions = resultPosNext - resultPos - 1;
- for (let j = 0; j < additions; ++j) {
- sourceMap.insert(sourceMapStart, 0);
- ++sourceMapStart;
- }
-
- i = iNext;
- resultPos = resultPosNext;
- }
- }
-
- return result;
+function convertAlphabeticPartToKana(text) {
+ return wanakana.toHiragana(text);
}
/**
@@ -84,10 +43,9 @@ export function convertToRomaji(text) {
/**
* @param {string} text
- * @param {?import('../../general/text-source-map.js').TextSourceMap} sourceMap
* @returns {string}
*/
-export function convertAlphabeticToKana(text, sourceMap = null) {
+export function convertAlphabeticToKana(text) {
let part = '';
let result = '';
@@ -106,7 +64,7 @@ export function convertAlphabeticToKana(text, sourceMap = null) {
c = 0x2d; // '-'
} else {
if (part.length > 0) {
- result += convertAlphabeticPartToKana(part, sourceMap, result.length);
+ result += convertAlphabeticPartToKana(part);
part = '';
}
result += char;
@@ -116,7 +74,7 @@ export function convertAlphabeticToKana(text, sourceMap = null) {
}
if (part.length > 0) {
- result += convertAlphabeticPartToKana(part, sourceMap, result.length);
+ result += convertAlphabeticPartToKana(part);
}
return result;
}
diff --git a/ext/js/language/ja/japanese.js b/ext/js/language/ja/japanese.js
index 2c9a1f7f..3507e5df 100644
--- a/ext/js/language/ja/japanese.js
+++ b/ext/js/language/ja/japanese.js
@@ -539,10 +539,9 @@ export function convertNumericToFullWidth(text) {
/**
* @param {string} text
- * @param {?import('../../general/text-source-map.js').TextSourceMap} [sourceMap]
* @returns {string}
*/
-export function convertHalfWidthKanaToFullWidth(text, sourceMap = null) {
+export function convertHalfWidthKanaToFullWidth(text) {
let result = '';
// This function is safe to use charCodeAt instead of codePointAt, since all
@@ -575,9 +574,6 @@ export function convertHalfWidthKanaToFullWidth(text, sourceMap = null) {
}
}
- if (sourceMap !== null && index > 0) {
- sourceMap.combine(result.length, 1);
- }
result += c2;
}
@@ -705,13 +701,11 @@ export function distributeFuriganaInflected(term, reading, source) {
/**
* @param {string} text
* @param {boolean} fullCollapse
- * @param {?import('../../general/text-source-map.js').TextSourceMap} [sourceMap]
* @returns {string}
*/
-export function collapseEmphaticSequences(text, fullCollapse, sourceMap = null) {
+export function collapseEmphaticSequences(text, fullCollapse) {
let result = '';
let collapseCodePoint = -1;
- const hasSourceMap = (sourceMap !== null);
for (const char of text) {
const c = char.codePointAt(0);
if (
@@ -729,11 +723,6 @@ export function collapseEmphaticSequences(text, fullCollapse, sourceMap = null)
} else {
collapseCodePoint = -1;
result += char;
- continue;
- }
-
- if (hasSourceMap) {
- sourceMap.combine(Math.max(0, result.length - 1), 1);
}
}
return result;
diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js
index c5c3e01e..2df2f794 100644
--- a/ext/js/language/language-descriptors.js
+++ b/ext/js/language/language-descriptors.js
@@ -25,7 +25,7 @@ import {isStringPartiallyJapanese} from './ja/japanese.js';
import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js';
import {oldIrishTransforms} from './sga/old-irish-transforms.js';
import {albanianTransforms} from './sq/albanian-transforms.js';
-import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-preprocessors.js';
+import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js';
const capitalizationPreprocessors = {
decapitalize,
@@ -138,8 +138,7 @@ const languageDescriptors = [
{
iso: 'km',
name: 'Khmer',
- exampleText: 'អាន',
- textPreprocessors: {}
+ exampleText: 'អាន'
},
{
iso: 'pl',
@@ -201,8 +200,7 @@ const languageDescriptors = [
{
iso: 'th',
name: 'Thai',
- exampleText: 'อ่าน',
- textPreprocessors: {}
+ exampleText: 'อ่าน'
},
{
iso: 'tr',
@@ -219,8 +217,7 @@ const languageDescriptors = [
{
iso: 'zh',
name: 'Chinese',
- exampleText: '读',
- textPreprocessors: {}
+ exampleText: '读'
}
];
diff --git a/ext/js/language/languages.js b/ext/js/language/languages.js
index 4b196c2c..b3890aa6 100755
--- a/ext/js/language/languages.js
+++ b/ext/js/language/languages.js
@@ -29,21 +29,29 @@ export function getLanguageSummaries() {
}
/**
- * @returns {import('language').LanguageAndPreprocessors[]}
+ * @returns {import('language').LanguageAndProcessors[]}
* @throws {Error}
*/
-export function getAllLanguageTextPreprocessors() {
+export function getAllLanguageTextProcessors() {
const results = [];
- for (const {iso, textPreprocessors} of languageDescriptorMap.values()) {
- /** @type {import('language').TextPreprocessorWithId<unknown>[]} */
+ for (const {iso, textPreprocessors = {}, textPostprocessors = {}} of languageDescriptorMap.values()) {
+ /** @type {import('language').TextProcessorWithId<unknown>[]} */
const textPreprocessorsArray = [];
for (const [id, textPreprocessor] of Object.entries(textPreprocessors)) {
textPreprocessorsArray.push({
id,
- textPreprocessor: /** @type {import('language').TextPreprocessor<unknown>} */ (textPreprocessor)
+ textProcessor: /** @type {import('language').TextProcessor<unknown>} */ (textPreprocessor)
});
}
- results.push({iso, textPreprocessors: textPreprocessorsArray});
+ /** @type {import('language').TextProcessorWithId<unknown>[]} */
+ const textPostprocessorsArray = [];
+ for (const [id, textPostprocessor] of Object.entries(textPostprocessors)) {
+ textPostprocessorsArray.push({
+ id,
+ textProcessor: /** @type {import('language').TextProcessor<unknown>} */ (textPostprocessor)
+ });
+ }
+ results.push({iso, textPreprocessors: textPreprocessorsArray, textPostprocessors: textPostprocessorsArray});
}
return results;
}
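For orientation, a sketch of what getAllLanguageTextProcessors now returns: one entry per language descriptor, with each processor paired with the descriptor key it was registered under (the logging is illustrative only):

    import {getAllLanguageTextProcessors} from './languages.js';

    for (const {iso, textPreprocessors, textPostprocessors} of getAllLanguageTextProcessors()) {
        // Each array element has the shape {id, textProcessor: {name, description, options, process}}.
        console.log(iso, textPreprocessors.map(({id}) => id), textPostprocessors.map(({id}) => id));
    }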
diff --git a/ext/js/language/ru/russian-text-preprocessors.js b/ext/js/language/ru/russian-text-preprocessors.js
index fc4472e9..fbda38c7 100644
--- a/ext/js/language/ru/russian-text-preprocessors.js
+++ b/ext/js/language/ru/russian-text-preprocessors.js
@@ -15,23 +15,23 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
-import {basicTextPreprocessorOptions} from '../text-preprocessors.js';
+import {basicTextProcessorOptions} from '../text-processors.js';
-/** @type {import('language').TextPreprocessor<boolean>} */
+/** @type {import('language').TextProcessor<boolean>} */
export const removeRussianDiacritics = {
name: 'Remove diacritics',
description: 'A\u0301 → A, a\u0301 → a',
- options: basicTextPreprocessorOptions,
+ options: basicTextProcessorOptions,
process: (str, setting) => {
return setting ? str.replace(/\u0301/g, '') : str;
}
};
-/** @type {import('language').TextPreprocessor<boolean>} */
+/** @type {import('language').TextProcessor<boolean>} */
export const yoToE = {
name: 'Yo to E',
description: 'ё → е, Ё → Е',
- options: basicTextPreprocessorOptions,
+ options: basicTextProcessorOptions,
process: (str, setting) => {
return setting ? str.replace(/ё/g, 'е').replace(/Ё/g, 'Е') : str;
}
diff --git a/ext/js/language/text-preprocessors.js b/ext/js/language/text-processors.js
index e33fccda..e7855df2 100755
--- a/ext/js/language/text-preprocessors.js
+++ b/ext/js/language/text-processors.js
@@ -15,22 +15,22 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
-/** @type {import('language').TextPreprocessorOptions<boolean>} */
-export const basicTextPreprocessorOptions = [false, true];
+/** @type {import('language').TextProcessorOptions<boolean>} */
+export const basicTextProcessorOptions = [false, true];
-/** @type {import('language').TextPreprocessor<boolean>} */
+/** @type {import('language').TextProcessor<boolean>} */
export const decapitalize = {
name: 'Decapitalize text',
description: 'CAPITALIZED TEXT → capitalized text',
- options: basicTextPreprocessorOptions,
+ options: basicTextProcessorOptions,
process: (str, setting) => (setting ? str.toLowerCase() : str)
};
-/** @type {import('language').TextPreprocessor<boolean>} */
+/** @type {import('language').TextProcessor<boolean>} */
export const capitalizeFirstLetter = {
name: 'Capitalize first letter',
description: 'lowercase text → Lowercase text',
- options: basicTextPreprocessorOptions,
+ options: basicTextProcessorOptions,
process: (str, setting) => (setting ? str.charAt(0).toUpperCase() + str.slice(1) : str)
};
@@ -39,11 +39,11 @@ export const capitalizeFirstLetter = {
* as it can result in undesirable normalization:
* - '\u9038'.normalize('NFD') => '\u9038' (逸)
* - '\ufa67'.normalize('NFD') => '\u9038' (逸 => 逸)
- * @type {import('language').TextPreprocessor<boolean>}
+ * @type {import('language').TextProcessor<boolean>}
*/
export const removeAlphabeticDiacritics = {
name: 'Remove Alphabetic Diacritics',
description: 'ἄήé -> αηe',
- options: basicTextPreprocessorOptions,
+ options: basicTextProcessorOptions,
process: (str, setting) => (setting ? str.normalize('NFD').replace(/[\u0300-\u036f]/g, '') : str)
};
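The comment above removeAlphabeticDiacritics is why the NFD-based stripping stays behind an opt-in option: NFD also canonically decomposes CJK compatibility ideographs. A quick illustration of both effects, assuming a JS runtime with full Unicode normalization:

    // Intended effect: strip combining marks from alphabetic text.
    'ἄήé'.normalize('NFD').replace(/[\u0300-\u036f]/g, '');   // -> 'αηe'
    // Side effect called out in the comment: U+FA67 normalizes to U+9038.
    '\ufa67'.normalize('NFD') === '\u9038';                   // -> true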
diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js
index 6132ee82..845d53d5 100644
--- a/ext/js/language/translator.js
+++ b/ext/js/language/translator.js
@@ -17,10 +17,9 @@
*/
import {applyTextReplacement} from '../general/regex-util.js';
-import {TextSourceMap} from '../general/text-source-map.js';
import {isCodePointJapanese} from './ja/japanese.js';
import {LanguageTransformer} from './language-transformer.js';
-import {getAllLanguageTextPreprocessors} from './languages.js';
+import {getAllLanguageTextProcessors} from './languages.js';
import {MultiLanguageTransformer} from './multi-language-transformer.js';
/**
@@ -41,8 +40,8 @@ export class Translator {
this._stringComparer = new Intl.Collator('en-US'); // Invariant locale
/** @type {RegExp} */
this._numberRegex = /[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?/;
- /** @type {Map<string, {textPreprocessors: import('language').TextPreprocessorWithId<unknown>[], optionSpace: import('translation-internal').PreprocessorOptionsSpace}>} */
- this._textPreprocessors = new Map();
+ /** @type {import('translation-internal').TextProcessorMap} */
+ this._textProcessors = new Map();
}
/**
@@ -50,13 +49,19 @@ export class Translator {
*/
prepare() {
this._multiLanguageTransformer.prepare();
- for (const {iso, textPreprocessors} of getAllLanguageTextPreprocessors()) {
- /** @type {Map<string, import('language').TextPreprocessorOptions<unknown>>} */
- const optionSpace = new Map();
- for (const {id, textPreprocessor} of textPreprocessors) {
- optionSpace.set(id, textPreprocessor.options);
+ for (const {iso, textPreprocessors = [], textPostprocessors = []} of getAllLanguageTextProcessors()) {
+ /** @type {import('translation-internal').TextProcessorOptionsSpace} */
+ const preprocessorOptionsSpace = new Map();
+ /** @type {import('translation-internal').TextProcessorOptionsSpace} */
+ const postprocessorOptionsSpace = new Map();
+
+ for (const {id, textProcessor} of textPreprocessors) {
+ preprocessorOptionsSpace.set(id, textProcessor.options);
}
- this._textPreprocessors.set(iso, {textPreprocessors, optionSpace});
+ for (const {id, textProcessor} of textPostprocessors) {
+ postprocessorOptionsSpace.set(id, textProcessor.options);
+ }
+ this._textProcessors.set(iso, {textPreprocessors, preprocessorOptionsSpace, textPostprocessors, postprocessorOptionsSpace});
}
}
@@ -428,7 +433,7 @@ export class Translator {
}
}
- // Deinflections and text preprocessing
+ // Deinflections and text processing
/**
* @param {string} text
@@ -438,57 +443,90 @@ export class Translator {
*/
_getAlgorithmDeinflections(text, options) {
const {language} = options;
- const info = this._textPreprocessors.get(language);
+ const info = this._textProcessors.get(language);
if (typeof info === 'undefined') { throw new Error(`Unsupported language: ${language}`); }
- const {textPreprocessors, optionSpace: textPreprocessorOptionsSpace} = info;
+ const {textPreprocessors, preprocessorOptionsSpace, textPostprocessors, postprocessorOptionsSpace} = info;
- /** @type {Map<string, import('language').TextPreprocessorOptions<unknown>>} */
- const variantSpace = new Map();
- variantSpace.set('textReplacements', this._getTextReplacementsVariants(options));
- for (const [key, value] of textPreprocessorOptionsSpace) {
- variantSpace.set(key, value);
- }
+ const preprocessorVariantSpace = new Map(preprocessorOptionsSpace);
+ preprocessorVariantSpace.set('textReplacements', this._getTextReplacementsVariants(options));
+ const preprocessorVariants = this._getArrayVariants(preprocessorVariantSpace);
+ const postprocessorVariants = this._getArrayVariants(postprocessorOptionsSpace);
/** @type {import('translation-internal').DatabaseDeinflection[]} */
const deinflections = [];
const used = new Set();
+ /** @type {Map<string, import('core').SafeAny>} */
+ const sourceCache = new Map(); // For reusing text processors' outputs
+
+ for (
+ let i = text.length;
+ i > 0;
+ i = this._getNextSubstringLength(options.searchResolution, i, text)
+ ) {
+ const rawSource = text.substring(0, i);
+
+ for (const preprocessorVariant of preprocessorVariants) {
+ let source = rawSource;
+
+ const textReplacements = /** @type {import('translation').FindTermsTextReplacement[] | null} */ (preprocessorVariant.get('textReplacements'));
+ if (textReplacements !== null) {
+ source = this._applyTextReplacements(source, textReplacements);
+ }
- for (const arrayVariant of this._generateArrayVariants(variantSpace)) {
- const textReplacements = /** @type {import('translation').FindTermsTextReplacement[] | null} */ (arrayVariant.get('textReplacements'));
+ source = this._applyTextProcessors(textPreprocessors, preprocessorVariant, source, sourceCache);
- let text2 = text;
- const sourceMap = new TextSourceMap(text2);
+ if (used.has(source)) { continue; }
+ used.add(source);
+ for (const deinflection of this._multiLanguageTransformer.transform(language, source)) {
+ const {trace, conditions} = deinflection;
+ for (const postprocessorVariant of postprocessorVariants) {
+ let {text: transformedText} = deinflection;
+ transformedText = this._applyTextProcessors(textPostprocessors, postprocessorVariant, transformedText, sourceCache);
+
+ /** @type {import('dictionary').InflectionRuleChainCandidate} */
+ const inflectionRuleChainCandidate = {
+ source: 'algorithm',
+ inflectionRules: trace.map((frame) => frame.transform)
+ };
+ deinflections.push(this._createDeinflection(rawSource, source, transformedText, conditions, [inflectionRuleChainCandidate]));
+ }
+ }
+ }
+ }
+ return deinflections;
+ }
- if (textReplacements !== null) {
- text2 = this._applyTextReplacements(text2, sourceMap, textReplacements);
+ /**
+ * @param {import('language').TextProcessorWithId<unknown>[]} textProcessors
+ * @param {Map<string, unknown>} processorVariant
+ * @param {string} text
+ * @param {Map<string, import('core').SafeAny>} textCache
+ * @returns {string}
+ */
+ _applyTextProcessors(textProcessors, processorVariant, text, textCache) {
+ for (const {id, textProcessor: {process}} of textProcessors) {
+ const setting = processorVariant.get(id);
+ let level1 = textCache.get(text);
+ if (!level1) {
+ level1 = new Map();
+ textCache.set(text, level1);
}
- for (const preprocessor of textPreprocessors.values()) {
- const {id, textPreprocessor} = preprocessor;
- const setting = arrayVariant.get(id);
- text2 = textPreprocessor.process(text2, setting, sourceMap);
+ let level2 = level1.get(id);
+ if (!level2) {
+ level2 = new Map();
+ level1.set(id, level2);
}
- for (
- let source = text2, i = text2.length;
- i > 0;
- i = this._getNextSubstringLength(options.searchResolution, i, source)
- ) {
- source = text2.substring(0, i);
- if (used.has(source)) { break; }
- used.add(source);
- const rawSource = sourceMap.source.substring(0, sourceMap.getSourceLength(i));
- for (const {text: transformedText, conditions, trace} of this._multiLanguageTransformer.transform(language, source)) {
- /** @type {import('dictionary').InflectionRuleChainCandidate} */
- const inflectionRuleChainCandidate = {
- source: 'algorithm',
- inflectionRules: trace.map((frame) => frame.transform)
- };
- deinflections.push(this._createDeinflection(rawSource, source, transformedText, conditions, [inflectionRuleChainCandidate]));
- }
+ if (!level2.has(setting)) {
+ text = process(text, setting);
+ level2.set(setting, text);
+ } else {
+ text = level2.get(setting);
}
}
- return deinflections;
+
+ return text;
}
/**
@@ -507,13 +545,12 @@ export class Translator {
/**
* @param {string} text
- * @param {TextSourceMap} sourceMap
* @param {import('translation').FindTermsTextReplacement[]} replacements
* @returns {string}
*/
- _applyTextReplacements(text, sourceMap, replacements) {
+ _applyTextReplacements(text, replacements) {
for (const {pattern, replacement} of replacements) {
- text = applyTextReplacement(text, sourceMap, pattern, replacement);
+ text = applyTextReplacement(text, pattern, replacement);
}
return text;
}
@@ -1325,10 +1362,11 @@ export class Translator {
/**
* @param {Map<string, unknown[]>} arrayVariants
- * @yields {Map<string, unknown>}
- * @returns {Generator<Map<string, unknown>, void, void>}
+ * @returns {Map<string, unknown>[]}
*/
- *_generateArrayVariants(arrayVariants) {
+ _getArrayVariants(arrayVariants) {
+ /** @type {Map<string, unknown>[]} */
+ const results = [];
const variantKeys = [...arrayVariants.keys()];
const entryVariantLengths = [];
for (const key of variantKeys) {
@@ -1350,8 +1388,9 @@ export class Translator {
remainingIndex = Math.floor(remainingIndex / entryVariants.length);
}
- yield variant;
+ results.push(variant);
}
+ return results;
}
/**
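The sourceCache threaded through _applyTextProcessors memoizes each processor's output per (input text, processor id, setting), so the many preprocessor/postprocessor variants over the same substring don't redo the same work. A standalone sketch of that three-level lookup (the function name is hypothetical):

    /** @type {Map<string, Map<string, Map<unknown, string>>>} */
    const textCache = new Map();

    /**
     * Runs `process(text, setting)` at most once per (text, id, setting) triple.
     * @param {string} text
     * @param {string} id
     * @param {unknown} setting
     * @param {(text: string, setting: unknown) => string} process
     * @returns {string}
     */
    function processWithCache(text, id, setting, process) {
        let byId = textCache.get(text);
        if (!byId) { byId = new Map(); textCache.set(text, byId); }
        let bySetting = byId.get(id);
        if (!bySetting) { bySetting = new Map(); byId.set(id, bySetting); }
        if (!bySetting.has(setting)) { bySetting.set(setting, process(text, setting)); }
        return /** @type {string} */ (bySetting.get(setting));
    }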
diff --git a/test/data/anki-note-builder-test-results.json b/test/data/anki-note-builder-test-results.json
index d8d5ab0f..162be7fe 100644
--- a/test/data/anki-note-builder-test-results.json
+++ b/test/data/anki-note-builder-test-results.json
@@ -2548,8 +2548,8 @@
"audio": "",
"clipboard-image": "",
"clipboard-text": "",
- "cloze-body": "打",
- "cloze-body-kana": "だ",
+ "cloze-body": "打(う)",
+ "cloze-body-kana": "だ(う)",
"cloze-prefix": "cloze-prefix",
"cloze-suffix": "cloze-suffix",
"conjugation": "",
@@ -2577,8 +2577,8 @@
"screenshot": "",
"search-query": "fullQuery",
"selection-text": "",
- "sentence": "cloze-prefix打cloze-suffix",
- "sentence-furigana": "cloze-prefix打cloze-suffix",
+ "sentence": "cloze-prefix打(う)cloze-suffix",
+ "sentence-furigana": "cloze-prefix打(う)cloze-suffix",
"tags": "n",
"url": "<a href=\"url:\">url:</a>"
},
@@ -2586,8 +2586,8 @@
"audio": "",
"clipboard-image": "",
"clipboard-text": "",
- "cloze-body": "打",
- "cloze-body-kana": "ダース",
+ "cloze-body": "打(う)",
+ "cloze-body-kana": "ダース(う)",
"cloze-prefix": "cloze-prefix",
"cloze-suffix": "cloze-suffix",
"conjugation": "",
@@ -2615,8 +2615,8 @@
"screenshot": "",
"search-query": "fullQuery",
"selection-text": "",
- "sentence": "cloze-prefix打cloze-suffix",
- "sentence-furigana": "cloze-prefix打cloze-suffix",
+ "sentence": "cloze-prefix打(う)cloze-suffix",
+ "sentence-furigana": "cloze-prefix打(う)cloze-suffix",
"tags": "abbr, n",
"url": "<a href=\"url:\">url:</a>"
}
diff --git a/test/data/translator-test-inputs.json b/test/data/translator-test-inputs.json
index c9047716..9e62954e 100644
--- a/test/data/translator-test-inputs.json
+++ b/test/data/translator-test-inputs.json
@@ -191,7 +191,7 @@
null,
[
{
- "pattern": "\\(([^)]*)(?:\\)|$)",
+ "pattern": "\\(([^)]*)(?:\\))",
"flags": "g",
"replacement": ""
}
@@ -214,7 +214,7 @@
null,
[
{
- "pattern": "\\(([^)]*)(?:\\)|$)",
+ "pattern": "\\(([^)]*)(?:\\))",
"flags": "g",
"replacement": "$1"
}
diff --git a/test/data/translator-test-results-note-data1.json b/test/data/translator-test-results-note-data1.json
index f580ac53..f0f32fa8 100644
--- a/test/data/translator-test-results-note-data1.json
+++ b/test/data/translator-test-results-note-data1.json
@@ -22773,7 +22773,7 @@
"type": "term",
"id": 1,
"source": "打",
- "rawSource": "打",
+ "rawSource": "打(う)",
"sourceTerm": "打",
"inflectionRuleChainCandidates": [
{
@@ -23087,7 +23087,7 @@
"type": "term",
"id": 2,
"source": "打",
- "rawSource": "打",
+ "rawSource": "打(う)",
"sourceTerm": "打",
"inflectionRuleChainCandidates": [
{
diff --git a/test/data/translator-test-results.json b/test/data/translator-test-results.json
index da2f8da2..b3574b46 100644
--- a/test/data/translator-test-results.json
+++ b/test/data/translator-test-results.json
@@ -12904,7 +12904,7 @@
"dictionaryIndex": 0,
"dictionaryPriority": 0,
"sourceTermExactMatchCount": 1,
- "maxOriginalTextLength": 1,
+ "maxOriginalTextLength": 4,
"headwords": [
{
"index": 0,
@@ -12912,7 +12912,7 @@
"reading": "だ",
"sources": [
{
- "originalText": "打",
+ "originalText": "打(う)",
"transformedText": "打",
"deinflectedText": "打",
"matchType": "exact",
@@ -13072,7 +13072,7 @@
"dictionaryIndex": 0,
"dictionaryPriority": 0,
"sourceTermExactMatchCount": 1,
- "maxOriginalTextLength": 1,
+ "maxOriginalTextLength": 4,
"headwords": [
{
"index": 0,
@@ -13080,7 +13080,7 @@
"reading": "ダース",
"sources": [
{
- "originalText": "打",
+ "originalText": "打(う)",
"transformedText": "打",
"deinflectedText": "打",
"matchType": "exact",
diff --git a/test/japanese-util.test.js b/test/japanese-util.test.js
index 5f64a714..bff51f85 100644
--- a/test/japanese-util.test.js
+++ b/test/japanese-util.test.js
@@ -17,7 +17,6 @@
*/
import {describe, expect, test} from 'vitest';
-import {TextSourceMap} from '../ext/js/general/text-source-map.js';
import * as jpw from '../ext/js/language/ja/japanese-wanakana.js';
import * as jp from '../ext/js/language/ja/japanese.js';
@@ -194,54 +193,46 @@ describe('Japanese utility functions', () => {
});
describe('convertHalfWidthKanaToFullWidth', () => {
- /** @type {[string: string, expected: string, expectedSourceMapping?: number[]][]} */
+ /** @type {[string: string, expected: string][]} */
const data = [
['0123456789', '0123456789'],
['abcdefghij', 'abcdefghij'],
['カタカナ', 'カタカナ'],
['ひらがな', 'ひらがな'],
- ['カキ', 'カキ', [1, 1]],
- ['ガキ', 'ガキ', [2, 1]],
- ['ニホン', 'ニホン', [1, 1, 1]],
- ['ニッポン', 'ニッポン', [1, 1, 2, 1]]
+ ['カキ', 'カキ'],
+ ['ガキ', 'ガキ'],
+ ['ニホン', 'ニホン'],
+ ['ニッポン', 'ニッポン']
];
- for (const [string, expected, expectedSourceMapping] of data) {
- test(`${string} -> ${expected}${typeof expectedSourceMapping !== 'undefined' ? ', ' + JSON.stringify(expectedSourceMapping) : ''}`, () => {
- const sourceMap = new TextSourceMap(string);
- const actual1 = jp.convertHalfWidthKanaToFullWidth(string, null);
- const actual2 = jp.convertHalfWidthKanaToFullWidth(string, sourceMap);
+ for (const [string, expected] of data) {
+ test(`${string} -> ${expected}`, () => {
+ const actual1 = jp.convertHalfWidthKanaToFullWidth(string);
+ const actual2 = jp.convertHalfWidthKanaToFullWidth(string);
expect(actual1).toStrictEqual(expected);
expect(actual2).toStrictEqual(expected);
- if (typeof expectedSourceMapping !== 'undefined') {
- expect(sourceMap.equals(new TextSourceMap(string, expectedSourceMapping))).toBe(true);
- }
});
}
});
describe('convertAlphabeticToKana', () => {
- /** @type {[string: string, expected: string, expectedSourceMapping?: number[]][]} */
+ /** @type {[string: string, expected: string][]} */
const data = [
['0123456789', '0123456789'],
- ['abcdefghij', 'あbcでfgひj', [1, 1, 1, 2, 1, 1, 2, 1]],
- ['ABCDEFGHIJ', 'あbcでfgひj', [1, 1, 1, 2, 1, 1, 2, 1]], // wanakana.toHiragana converts text to lower case
+ ['abcdefghij', 'あbcでfgひj'],
+ ['ABCDEFGHIJ', 'あbcでfgひj'], // wanakana.toHiragana converts text to lower case
['カタカナ', 'カタカナ'],
['ひらがな', 'ひらがな'],
- ['chikara', 'ちから', [3, 2, 2]],
- ['CHIKARA', 'ちから', [3, 2, 2]]
+ ['chikara', 'ちから'],
+ ['CHIKARA', 'ちから']
];
- for (const [string, expected, expectedSourceMapping] of data) {
- test(`${string} -> ${string}${typeof expectedSourceMapping !== 'undefined' ? ', ' + JSON.stringify(expectedSourceMapping) : ''}`, () => {
- const sourceMap = new TextSourceMap(string);
- const actual1 = jpw.convertAlphabeticToKana(string, null);
- const actual2 = jpw.convertAlphabeticToKana(string, sourceMap);
+ for (const [string, expected] of data) {
+ test(`${string} -> ${string}`, () => {
+ const actual1 = jpw.convertAlphabeticToKana(string);
+ const actual2 = jpw.convertAlphabeticToKana(string);
expect(actual1).toStrictEqual(expected);
expect(actual2).toStrictEqual(expected);
- if (typeof expectedSourceMapping !== 'undefined') {
- expect(sourceMap.equals(new TextSourceMap(string, expectedSourceMapping))).toBe(true);
- }
});
}
});
@@ -765,59 +756,54 @@ describe('Japanese utility functions', () => {
});
describe('collapseEmphaticSequences', () => {
- /** @type {[input: [text: string, fullCollapse: boolean], output: [expected: string, expectedSourceMapping: number[]]][]} */
+ /** @type {[input: [text: string, fullCollapse: boolean], output: string][]} */
const data = [
- [['かこい', false], ['かこい', [1, 1, 1]]],
- [['かこい', true], ['かこい', [1, 1, 1]]],
- [['かっこい', false], ['かっこい', [1, 1, 1, 1]]],
- [['かっこい', true], ['かこい', [2, 1, 1]]],
- [['かっっこい', false], ['かっこい', [1, 2, 1, 1]]],
- [['かっっこい', true], ['かこい', [3, 1, 1]]],
- [['かっっっこい', false], ['かっこい', [1, 3, 1, 1]]],
- [['かっっっこい', true], ['かこい', [4, 1, 1]]],
-
- [['こい', false], ['こい', [1, 1]]],
- [['こい', true], ['こい', [1, 1]]],
- [['っこい', false], ['っこい', [1, 1, 1]]],
- [['っこい', true], ['こい', [2, 1]]],
- [['っっこい', false], ['っこい', [2, 1, 1]]],
- [['っっこい', true], ['こい', [3, 1]]],
- [['っっっこい', false], ['っこい', [3, 1, 1]]],
- [['っっっこい', true], ['こい', [4, 1]]],
-
- [['すごい', false], ['すごい', [1, 1, 1]]],
- [['すごい', true], ['すごい', [1, 1, 1]]],
- [['すごーい', false], ['すごーい', [1, 1, 1, 1]]],
- [['すごーい', true], ['すごい', [1, 2, 1]]],
- [['すごーーい', false], ['すごーい', [1, 1, 2, 1]]],
- [['すごーーい', true], ['すごい', [1, 3, 1]]],
- [['すっごーい', false], ['すっごーい', [1, 1, 1, 1, 1]]],
- [['すっごーい', true], ['すごい', [2, 2, 1]]],
- [['すっっごーーい', false], ['すっごーい', [1, 2, 1, 2, 1]]],
- [['すっっごーーい', true], ['すごい', [3, 3, 1]]],
-
- [['', false], ['', []]],
- [['', true], ['', []]],
- [['っ', false], ['っ', [1]]],
- [['っ', true], ['', [1]]],
- [['っっ', false], ['っ', [2]]],
- [['っっ', true], ['', [2]]],
- [['っっっ', false], ['っ', [3]]],
- [['っっっ', true], ['', [3]]]
+ [['かこい', false], 'かこい'],
+ [['かこい', true], 'かこい'],
+ [['かっこい', false], 'かっこい'],
+ [['かっこい', true], 'かこい'],
+ [['かっっこい', false], 'かっこい'],
+ [['かっっこい', true], 'かこい'],
+ [['かっっっこい', false], 'かっこい'],
+ [['かっっっこい', true], 'かこい'],
+
+ [['こい', false], 'こい'],
+ [['こい', true], 'こい'],
+ [['っこい', false], 'っこい'],
+ [['っこい', true], 'こい'],
+ [['っっこい', false], 'っこい'],
+ [['っっこい', true], 'こい'],
+ [['っっっこい', false], 'っこい'],
+ [['っっっこい', true], 'こい'],
+
+ [['すごい', false], 'すごい'],
+ [['すごい', true], 'すごい'],
+ [['すごーい', false], 'すごーい'],
+ [['すごーい', true], 'すごい'],
+ [['すごーーい', false], 'すごーい'],
+ [['すごーーい', true], 'すごい'],
+ [['すっごーい', false], 'すっごーい'],
+ [['すっごーい', true], 'すごい'],
+ [['すっっごーーい', false], 'すっごーい'],
+ [['すっっごーーい', true], 'すごい'],
+
+ [['', false], ''],
+ [['', true], ''],
+ [['っ', false], 'っ'],
+ [['っ', true], ''],
+ [['っっ', false], 'っ'],
+ [['っっ', true], ''],
+ [['っっっ', false], 'っ'],
+ [['っっっ', true], '']
];
test.each(data)('%o -> %o', (input, output) => {
const [text, fullCollapse] = input;
- const [expected, expectedSourceMapping] = output;
-
- const sourceMap = new TextSourceMap(text);
- const actual1 = jp.collapseEmphaticSequences(text, fullCollapse, null);
- const actual2 = jp.collapseEmphaticSequences(text, fullCollapse, sourceMap);
- expect(actual1).toStrictEqual(expected);
- expect(actual2).toStrictEqual(expected);
- if (typeof expectedSourceMapping !== 'undefined') {
- expect(sourceMap.equals(new TextSourceMap(text, expectedSourceMapping))).toBe(true);
- }
+
+ const actual1 = jp.collapseEmphaticSequences(text, fullCollapse);
+ const actual2 = jp.collapseEmphaticSequences(text, fullCollapse);
+ expect(actual1).toStrictEqual(output);
+ expect(actual2).toStrictEqual(output);
});
});
diff --git a/test/text-source-map.test.js b/test/text-source-map.test.js
deleted file mode 100644
index 09341774..00000000
--- a/test/text-source-map.test.js
+++ /dev/null
@@ -1,223 +0,0 @@
-/*
- * Copyright (C) 2023-2024 Yomitan Authors
- * Copyright (C) 2020-2022 Yomichan Authors
- *
- * This program is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <https://www.gnu.org/licenses/>.
- */
-
-import {describe, expect, test} from 'vitest';
-import {TextSourceMap} from '../ext/js/general/text-source-map.js';
-
-describe('TextSourceMap', () => {
- describe('Source', () => {
- const data = [
- ['source1'],
- ['source2'],
- ['source3']
- ];
-
- test.each(data)('source-test-%#', (source) => {
- const sourceMap = new TextSourceMap(source);
- expect(source).toStrictEqual(sourceMap.source);
- });
- });
-
- describe('Equals', () => {
- /** @type {[args1: [source1: string, mapping1: ?(number[])], args2: [source2: string, mapping2: ?(number[])], expectedEquals: boolean][]} */
- const data = [
- [['source1', null], ['source1', null], true],
- [['source2', null], ['source2', null], true],
- [['source3', null], ['source3', null], true],
-
- [['source1', [1, 1, 1, 1, 1, 1, 1]], ['source1', null], true],
- [['source2', [1, 1, 1, 1, 1, 1, 1]], ['source2', null], true],
- [['source3', [1, 1, 1, 1, 1, 1, 1]], ['source3', null], true],
-
- [['source1', null], ['source1', [1, 1, 1, 1, 1, 1, 1]], true],
- [['source2', null], ['source2', [1, 1, 1, 1, 1, 1, 1]], true],
- [['source3', null], ['source3', [1, 1, 1, 1, 1, 1, 1]], true],
-
- [['source1', [1, 1, 1, 1, 1, 1, 1]], ['source1', [1, 1, 1, 1, 1, 1, 1]], true],
- [['source2', [1, 1, 1, 1, 1, 1, 1]], ['source2', [1, 1, 1, 1, 1, 1, 1]], true],
- [['source3', [1, 1, 1, 1, 1, 1, 1]], ['source3', [1, 1, 1, 1, 1, 1, 1]], true],
-
- [['source1', [1, 2, 1, 3]], ['source1', [1, 2, 1, 3]], true],
- [['source2', [1, 2, 1, 3]], ['source2', [1, 2, 1, 3]], true],
- [['source3', [1, 2, 1, 3]], ['source3', [1, 2, 1, 3]], true],
-
- [['source1', [1, 3, 1, 2]], ['source1', [1, 2, 1, 3]], false],
- [['source2', [1, 3, 1, 2]], ['source2', [1, 2, 1, 3]], false],
- [['source3', [1, 3, 1, 2]], ['source3', [1, 2, 1, 3]], false],
-
- [['source1', [1, 1, 1, 1, 1, 1, 1]], ['source4', [1, 1, 1, 1, 1, 1, 1]], false],
- [['source2', [1, 1, 1, 1, 1, 1, 1]], ['source5', [1, 1, 1, 1, 1, 1, 1]], false],
- [['source3', [1, 1, 1, 1, 1, 1, 1]], ['source6', [1, 1, 1, 1, 1, 1, 1]], false]
- ];
-
- test.each(data)('equals-test-%#', ([source1, mapping1], [source2, mapping2], expectedEquals) => {
- const sourceMap1 = new TextSourceMap(source1, mapping1);
- const sourceMap2 = new TextSourceMap(source2, mapping2);
- expect(sourceMap1.equals(sourceMap1)).toBe(true);
- expect(sourceMap2.equals(sourceMap2)).toBe(true);
- expect(sourceMap1.equals(sourceMap2)).toStrictEqual(expectedEquals);
- });
- });
-
- describe('GetSourceLength', () => {
- /** @type {[args: [source: string, mapping: number[]], finalLength: number, expectedValue: number][]} */
- const data = [
- [['source', [1, 1, 1, 1, 1, 1]], 1, 1],
- [['source', [1, 1, 1, 1, 1, 1]], 2, 2],
- [['source', [1, 1, 1, 1, 1, 1]], 3, 3],
- [['source', [1, 1, 1, 1, 1, 1]], 4, 4],
- [['source', [1, 1, 1, 1, 1, 1]], 5, 5],
- [['source', [1, 1, 1, 1, 1, 1]], 6, 6],
-
- [['source', [2, 2, 2]], 1, 2],
- [['source', [2, 2, 2]], 2, 4],
- [['source', [2, 2, 2]], 3, 6],
-
- [['source', [3, 3]], 1, 3],
- [['source', [3, 3]], 2, 6],
-
- [['source', [6, 6]], 1, 6]
- ];
-
- test.each(data)('get-source-length-test-%#', ([source, mapping], finalLength, expectedValue) => {
- const sourceMap = new TextSourceMap(source, mapping);
- expect(sourceMap.getSourceLength(finalLength)).toStrictEqual(expectedValue);
- });
- });
-
- describe('CombineInsert', () => {
- /** @type {[args: [source: string, mapping: ?(number[])], expectedArgs: [expectedSource: string, expectedMapping: ?(number[])], operations: [operation: string, arg1: number, arg2: number][]][]} */
- const data = [
- // No operations
- [
- ['source', null],
- ['source', [1, 1, 1, 1, 1, 1]],
- []
- ],
-
- // Combine
- [
- ['source', null],
- ['source', [3, 1, 1, 1]],
- [
- ['combine', 0, 2]
- ]
- ],
- [
- ['source', null],
- ['source', [1, 1, 1, 3]],
- [
- ['combine', 3, 2]
- ]
- ],
- [
- ['source', null],
- ['source', [3, 3]],
- [
- ['combine', 0, 2],
- ['combine', 1, 2]
- ]
- ],
- [
- ['source', null],
- ['source', [3, 3]],
- [
- ['combine', 3, 2],
- ['combine', 0, 2]
- ]
- ],
-
- // Insert
- [
- ['source', null],
- ['source', [0, 1, 1, 1, 1, 1, 1]],
- [
- ['insert', 0, 0]
- ]
- ],
- [
- ['source', null],
- ['source', [1, 1, 1, 1, 1, 1, 0]],
- [
- ['insert', 6, 0]
- ]
- ],
- [
- ['source', null],
- ['source', [0, 1, 1, 1, 1, 1, 1, 0]],
- [
- ['insert', 0, 0],
- ['insert', 7, 0]
- ]
- ],
- [
- ['source', null],
- ['source', [0, 1, 1, 1, 1, 1, 1, 0]],
- [
- ['insert', 6, 0],
- ['insert', 0, 0]
- ]
- ],
-
- // Mixed
- [
- ['source', null],
- ['source', [3, 0, 3]],
- [
- ['combine', 0, 2],
- ['insert', 1, 0],
- ['combine', 2, 2]
- ]
- ],
- [
- ['source', null],
- ['source', [3, 0, 3]],
- [
- ['combine', 0, 2],
- ['combine', 1, 2],
- ['insert', 1, 0]
- ]
- ],
- [
- ['source', null],
- ['source', [3, 0, 3]],
- [
- ['insert', 3, 0],
- ['combine', 0, 2],
- ['combine', 2, 2]
- ]
- ]
- ];
-
- test.each(data)('combine-insert-test-%#', ([source, mapping], [expectedSource, expectedMapping], operations) => {
- const sourceMap = new TextSourceMap(source, mapping);
- const expectedSourceMap = new TextSourceMap(expectedSource, expectedMapping);
- for (const [operation, ...args] of operations) {
- switch (operation) {
- case 'combine':
- sourceMap.combine(...args);
- break;
- case 'insert':
- sourceMap.insert(...args);
- break;
- }
- }
- expect(sourceMap.equals(expectedSourceMap)).toBe(true);
- });
- });
-});
diff --git a/types/ext/language-descriptors.d.ts b/types/ext/language-descriptors.d.ts
index 6674b28c..37da106c 100644
--- a/types/ext/language-descriptors.d.ts
+++ b/types/ext/language-descriptors.d.ts
@@ -15,13 +15,17 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
-import type {TextPreprocessor, BidirectionalConversionPreprocessor} from './language';
+import type {TextProcessor, BidirectionalConversionPreprocessor} from './language';
import type {LanguageTransformDescriptor} from './language-transformer';
import type {SafeAny} from './core';
export type IsTextLookupWorthyFunction = (text: string) => boolean;
-type LanguageDescriptor<TIso extends string, TTextPreprocessorDescriptor extends TextPreprocessorDescriptor> = {
+type LanguageDescriptor<
+ TIso extends string,
+ TTextPreprocessorDescriptor extends TextProcessorDescriptor = Record<string, never>,
+ TTextPostprocessorDescriptor extends TextProcessorDescriptor = Record<string, never>,
+> = {
iso: TIso;
name: string;
exampleText: string;
@@ -32,75 +36,126 @@ type LanguageDescriptor<TIso extends string, TTextPreprocessorDescriptor extends
* If no value is provided, `true` is assumed for all inputs.
*/
isTextLookupWorthy?: IsTextLookupWorthyFunction;
- textPreprocessors: TTextPreprocessorDescriptor;
+ textPreprocessors?: TTextPreprocessorDescriptor;
+ textPostprocessors?: TTextPostprocessorDescriptor;
languageTransforms?: LanguageTransformDescriptor;
};
-type TextPreprocessorDescriptor = {
- [key: string]: TextPreprocessor<SafeAny>;
+type TextProcessorDescriptor = {
+ [key: string]: TextProcessor<SafeAny>;
};
type LanguageDescriptorObjectMap = {
- [key in keyof AllTextPreprocessors]: LanguageDescriptor<key, AllTextPreprocessors[key]>;
+ [key in keyof AllTextProcessors]: LanguageDescriptor<
+ key,
+ AllTextProcessors[key] extends {pre: TextProcessorDescriptor} ? AllTextProcessors[key]['pre'] : Record<string, never>,
+ AllTextProcessors[key] extends {post: TextProcessorDescriptor} ? AllTextProcessors[key]['post'] : Record<string, never>
+ >;
};
export type LanguageDescriptorAny = LanguageDescriptorObjectMap[keyof LanguageDescriptorObjectMap];
type CapitalizationPreprocessors = {
- capitalizeFirstLetter: TextPreprocessor<boolean>;
- decapitalize: TextPreprocessor<boolean>;
+ capitalizeFirstLetter: TextProcessor<boolean>;
+ decapitalize: TextProcessor<boolean>;
};
/**
- * This is a mapping of the iso tag to all of the preprocessors for that language.
+ * This is a mapping of the iso tag to all of the text processors for that language.
* Any new language should be added to this object.
*/
-type AllTextPreprocessors = {
+type AllTextProcessors = {
ar: {
- removeArabicScriptDiacritics: TextPreprocessor<boolean>;
+ pre: {
+ removeArabicScriptDiacritics: TextProcessor<boolean>;
+ };
};
- de: CapitalizationPreprocessors & {
- eszettPreprocessor: BidirectionalConversionPreprocessor;
+ de: {
+ pre: CapitalizationPreprocessors & {
+ eszettPreprocessor: BidirectionalConversionPreprocessor;
+ };
+ };
+ el: {
+ pre: CapitalizationPreprocessors;
+ };
+ en: {
+ pre: CapitalizationPreprocessors;
+ };
+ es: {
+ pre: CapitalizationPreprocessors;
};
- el: CapitalizationPreprocessors;
- en: CapitalizationPreprocessors;
- es: CapitalizationPreprocessors;
fa: {
- removeArabicScriptDiacritics: TextPreprocessor<boolean>;
+ pre: {
+ removeArabicScriptDiacritics: TextProcessor<boolean>;
+ };
+ };
+ fr: {
+ pre: CapitalizationPreprocessors;
+ };
+ grc: {
+ pre: CapitalizationPreprocessors & {
+ removeAlphabeticDiacritics: TextProcessor<boolean>;
+ };
};
- fr: CapitalizationPreprocessors;
- grc: CapitalizationPreprocessors & {
- removeAlphabeticDiacritics: TextPreprocessor<boolean>;
+ hu: {
+ pre: CapitalizationPreprocessors;
};
- hu: CapitalizationPreprocessors;
- id: CapitalizationPreprocessors;
- it: CapitalizationPreprocessors;
- la: CapitalizationPreprocessors & {
- removeAlphabeticDiacritics: TextPreprocessor<boolean>;
+ id: {
+ pre: CapitalizationPreprocessors;
+ };
+ it: {
+ pre: CapitalizationPreprocessors;
+ };
+ la: {
+ pre: CapitalizationPreprocessors & {
+ removeAlphabeticDiacritics: TextProcessor<boolean>;
+ };
};
ja: {
- convertHalfWidthCharacters: TextPreprocessor<boolean>;
- convertNumericCharacters: TextPreprocessor<boolean>;
- convertAlphabeticCharacters: TextPreprocessor<boolean>;
- convertHiraganaToKatakana: BidirectionalConversionPreprocessor;
- collapseEmphaticSequences: TextPreprocessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>;
+ pre: {
+ convertHalfWidthCharacters: TextProcessor<boolean>;
+ convertNumericCharacters: TextProcessor<boolean>;
+ convertAlphabeticCharacters: TextProcessor<boolean>;
+ convertHiraganaToKatakana: BidirectionalConversionPreprocessor;
+ collapseEmphaticSequences: TextProcessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>;
+ };
};
km: Record<string, never>;
- pl: CapitalizationPreprocessors;
- pt: CapitalizationPreprocessors;
- ro: CapitalizationPreprocessors;
- ru: CapitalizationPreprocessors & {
- yoToE: TextPreprocessor<boolean>;
- removeRussianDiacritics: TextPreprocessor<boolean>;
- };
- sga: CapitalizationPreprocessors & {
- removeAlphabeticDiacritics: TextPreprocessor<boolean>;
- };
- sh: CapitalizationPreprocessors;
- sq: CapitalizationPreprocessors;
- sv: CapitalizationPreprocessors;
+ pl: {
+ pre: CapitalizationPreprocessors;
+ };
+ pt: {
+ pre: CapitalizationPreprocessors;
+ };
+ ro: {
+ pre: CapitalizationPreprocessors;
+ };
+ ru: {
+ pre: CapitalizationPreprocessors & {
+ yoToE: TextProcessor<boolean>;
+ removeRussianDiacritics: TextProcessor<boolean>;
+ };
+ };
+ sga: {
+ pre: CapitalizationPreprocessors & {
+ removeAlphabeticDiacritics: TextProcessor<boolean>;
+ };
+ };
+ sh: {
+ pre: CapitalizationPreprocessors;
+ };
+ sq: {
+ pre: CapitalizationPreprocessors;
+ };
+ sv: {
+ pre: CapitalizationPreprocessors;
+ };
th: Record<string, never>;
- tr: CapitalizationPreprocessors;
- vi: CapitalizationPreprocessors;
+ tr: {
+ pre: CapitalizationPreprocessors;
+ };
+ vi: {
+ pre: CapitalizationPreprocessors;
+ };
zh: Record<string, never>;
};
diff --git a/types/ext/language.d.ts b/types/ext/language.d.ts
index 8fa6f0e7..c708f6e7 100644
--- a/types/ext/language.d.ts
+++ b/types/ext/language.d.ts
@@ -15,32 +15,32 @@
* along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
-import type {TextSourceMap} from '../../ext/js/general/text-source-map.js';
import type {LanguageTransformDescriptor} from './language-transformer.js';
-export type TextPreprocessorOptions<T = unknown> = T[];
+export type TextProcessorOptions<T = unknown> = T[];
-export type TextPreprocessorFunction<T = unknown> = (str: string, setting: T, sourceMap: TextSourceMap) => string;
+export type TextProcessorFunction<T = unknown> = (str: string, setting: T) => string;
/**
- * Text preprocessors are used during the translation process to create alternate versions of the input text to search for.
+ * Text pre- and post-processors are used during the translation process to create alternate versions of the input text to search for.
* This is helpful when the input text doesn't exactly match the term or expression found in the database.
- * When a language has multiple preprocessors, the translator will generate variants of the text by applying all combinations of the preprocessors.
+ * When a language has multiple processors, the translator will generate variants of the text by applying all combinations of the processors.
*/
-export type TextPreprocessor<T = unknown> = {
+export type TextProcessor<T = unknown> = {
name: string;
description: string;
- options: TextPreprocessorOptions<T>;
- process: TextPreprocessorFunction<T>;
+ options: TextProcessorOptions<T>;
+ process: TextProcessorFunction<T>;
};
export type BidirectionalPreprocessorOptions = 'off' | 'direct' | 'inverse';
-export type BidirectionalConversionPreprocessor = TextPreprocessor<BidirectionalPreprocessorOptions>;
+export type BidirectionalConversionPreprocessor = TextProcessor<BidirectionalPreprocessorOptions>;
-export type LanguageAndPreprocessors = {
+export type LanguageAndProcessors = {
iso: string;
- textPreprocessors: TextPreprocessorWithId<unknown>[];
+ textPreprocessors?: TextProcessorWithId<unknown>[];
+ textPostprocessors?: TextProcessorWithId<unknown>[];
};
export type LanguageAndTransforms = {
@@ -48,9 +48,9 @@ export type LanguageAndTransforms = {
languageTransforms: LanguageTransformDescriptor;
};
-export type TextPreprocessorWithId<T = unknown> = {
+export type TextProcessorWithId<T = unknown> = {
id: string;
- textPreprocessor: TextPreprocessor<T>;
+ textProcessor: TextProcessor<T>;
};
export type LanguageSummary = {
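To make the "all combinations" wording above concrete: each processor contributes one axis whose values are its options, and the translator enumerates the Cartesian product of those axes. Below is a hedged, array-based analogue of what _getArrayVariants in translator.js does over a Map of option spaces, using two hypothetical boolean processors:

    const processors = [
        {id: 'decapitalize', options: [false, true], process: (s, on) => (on ? s.toLowerCase() : s)},
        {id: 'removeAlphabeticDiacritics', options: [false, true], process: (s, on) => (on ? s.normalize('NFD').replace(/[\u0300-\u036f]/g, '') : s)}
    ];

    /** Enumerate every assignment of settings, one option per processor (mixed-radix counting). */
    function *settingCombinations(processors) {
        const total = processors.reduce((product, {options}) => product * options.length, 1);
        for (let i = 0; i < total; ++i) {
            let remaining = i;
            const combination = [];
            for (const {options} of processors) {
                combination.push(options[remaining % options.length]);
                remaining = Math.floor(remaining / options.length);
            }
            yield combination;
        }
    }

    /** Apply every settings combination to the text and collect the distinct results. */
    function collectVariants(text, processors) {
        const variants = new Set();
        for (const combination of settingCombinations(processors)) {
            let result = text;
            for (let i = 0; i < processors.length; ++i) {
                result = processors[i].process(result, combination[i]);
            }
            variants.add(result);
        }
        return [...variants];
    }

    // collectVariants('É', processors) -> ['É', 'é', 'E', 'e']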
diff --git a/types/ext/translation-internal.d.ts b/types/ext/translation-internal.d.ts
index 7006221e..00056562 100644
--- a/types/ext/translation-internal.d.ts
+++ b/types/ext/translation-internal.d.ts
@@ -49,4 +49,14 @@ export type DatabaseDeinflection = {
databaseEntries: DictionaryDatabase.TermEntry[];
};
-export type PreprocessorOptionsSpace = Map<string, Language.TextPreprocessorOptions<unknown>>;
+export type TextProcessorOptionsSpace = Map<string, Language.TextProcessorOptions<unknown>>;
+
+export type TextProcessorMap = Map<
+ string,
+ {
+ textPreprocessors: Language.TextProcessorWithId<unknown>[];
+ preprocessorOptionsSpace: TextProcessorOptionsSpace;
+ textPostprocessors: Language.TextProcessorWithId<unknown>[];
+ postprocessorOptionsSpace: TextProcessorOptionsSpace;
+ }
+>;