21 files changed, 349 insertions, 694 deletions
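The diff below renames Yomitan's TextPreprocessor API to TextProcessor and removes the TextSourceMap plumbing from the processing pipeline. For orientation, here is a minimal TypeScript sketch of the renamed shape, condensed from types/ext/language.d.ts and ext/js/language/text-processors.js as they appear in this diff; it is illustrative only, not the actual modules.

// Processor shape after the rename: a named transform with an enumerated
// option space; the sourceMap parameter is gone from process().
type TextProcessorOptions<T> = T[];

type TextProcessor<T> = {
    name: string;
    description: string;
    options: TextProcessorOptions<T>;
    process: (str: string, setting: T) => string;
};

// Example instance, equivalent to `decapitalize` in text-processors.js.
const decapitalize: TextProcessor<boolean> = {
    name: 'Decapitalize text',
    description: 'CAPITALIZED TEXT → capitalized text',
    options: [false, true],
    process: (str, setting) => (setting ? str.toLowerCase() : str),
};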
diff --git a/.eslintrc.json b/.eslintrc.json index 8b08e827..e347c978 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -639,7 +639,6 @@ "ext/js/general/cache-map.js", "ext/js/general/object-property-accessor.js", "ext/js/general/regex-util.js", - "ext/js/general/text-source-map.js", "ext/js/language/ar/arabic-text-preprocessors.js", "ext/js/language/de/german-text-preprocessors.js", "ext/js/language/de/german-transforms.js", @@ -656,7 +655,7 @@ "ext/js/language/ru/russian-text-preprocessors.js", "ext/js/language/sga/old-irish-transforms.js", "ext/js/language/sq/albanian-transforms.js", - "ext/js/language/text-preprocessors.js", + "ext/js/language/text-processors.js", "ext/js/language/translator.js", "ext/js/media/audio-downloader.js", "ext/js/media/media-util.js", diff --git a/ext/js/general/regex-util.js b/ext/js/general/regex-util.js index e0982154..c633ec06 100644 --- a/ext/js/general/regex-util.js +++ b/ext/js/general/regex-util.js @@ -23,13 +23,12 @@ const matchReplacementPattern = /\$(?:\$|&|`|'|(\d\d?)|<([^>]*)>)/g; * Applies string.replace using a regular expression and replacement string as arguments. * A source map of the changes is also maintained. * @param {string} text A string of the text to replace. - * @param {import('./text-source-map.js').TextSourceMap} sourceMap An instance of `TextSourceMap` which corresponds to `text`. * @param {RegExp} pattern A regular expression to use as the replacement. * @param {string} replacement A replacement string that follows the format of the standard * JavaScript regular expression replacement string. * @returns {string} A new string with the pattern replacements applied and the source map updated. */ -export function applyTextReplacement(text, sourceMap, pattern, replacement) { +export function applyTextReplacement(text, pattern, replacement) { const isGlobal = pattern.global; if (isGlobal) { pattern.lastIndex = 0; } for (let loop = true; loop; loop = isGlobal) { @@ -44,15 +43,6 @@ export function applyTextReplacement(text, sourceMap, pattern, replacement) { text = `${text.substring(0, index)}${actualReplacement}${text.substring(index + matchText.length)}`; pattern.lastIndex += delta; - - if (actualReplacementLength > 0) { - /** @type {number[]} */ - const zeroes = new Array(actualReplacementLength).fill(0); - sourceMap.insert(index, ...zeroes); - sourceMap.combine(index - 1 + actualReplacementLength, matchText.length); - } else { - sourceMap.combine(index, matchText.length); - } } return text; } diff --git a/ext/js/general/text-source-map.js b/ext/js/general/text-source-map.js deleted file mode 100644 index 527c232b..00000000 --- a/ext/js/general/text-source-map.js +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright (C) 2023-2024 Yomitan Authors - * Copyright (C) 2020-2022 Yomichan Authors - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <https://www.gnu.org/licenses/>. 
- */ - -export class TextSourceMap { - /** - * @param {string} source - * @param {number[]|null} [mapping=null] - */ - constructor(source, mapping = null) { - /** @type {string} */ - this._source = source; - /** @type {?number[]} */ - this._mapping = (mapping !== null ? TextSourceMap.normalizeMapping(mapping) : null); - } - - /** @type {string} */ - get source() { - return this._source; - } - - /** - * @param {unknown} other - * @returns {boolean} - */ - equals(other) { - if (this === other) { - return true; - } - - const source = this._source; - if (!(other instanceof TextSourceMap && source === other.source)) { - return false; - } - - let mapping = this._mapping; - let otherMapping = other.getMappingCopy(); - if (mapping === null) { - if (otherMapping === null) { - return true; - } - mapping = TextSourceMap.createMapping(source); - } else if (otherMapping === null) { - otherMapping = TextSourceMap.createMapping(source); - } - - const mappingLength = mapping.length; - if (mappingLength !== otherMapping.length) { - return false; - } - - for (let i = 0; i < mappingLength; ++i) { - if (mapping[i] !== otherMapping[i]) { - return false; - } - } - - return true; - } - - /** - * @param {number} finalLength - * @returns {number} - */ - getSourceLength(finalLength) { - const mapping = this._mapping; - if (mapping === null) { - return finalLength; - } - - let sourceLength = 0; - for (let i = 0; i < finalLength; ++i) { - sourceLength += mapping[i]; - } - return sourceLength; - } - - /** - * @param {number} index - * @param {number} count - */ - combine(index, count) { - if (count <= 0) { return; } - - if (this._mapping === null) { - this._mapping = TextSourceMap.createMapping(this._source); - } - - let sum = this._mapping[index]; - const parts = this._mapping.splice(index + 1, count); - for (const part of parts) { - sum += part; - } - this._mapping[index] = sum; - } - - /** - * @param {number} index - * @param {number[]} items - */ - insert(index, ...items) { - if (this._mapping === null) { - this._mapping = TextSourceMap.createMapping(this._source); - } - - this._mapping.splice(index, 0, ...items); - } - - /** - * @returns {?number[]} - */ - getMappingCopy() { - return this._mapping !== null ? [...this._mapping] : null; - } - - /** - * @param {string} text - * @returns {number[]} - */ - static createMapping(text) { - return new Array(text.length).fill(1); - } - - /** - * @param {number[]} mapping - * @returns {number[]} - */ - static normalizeMapping(mapping) { - const result = []; - for (const value of mapping) { - result.push( - (typeof value === 'number' && Number.isFinite(value)) ? - Math.floor(value) : - 0 - ); - } - return result; - } -} diff --git a/ext/js/language/ar/arabic-text-preprocessors.js b/ext/js/language/ar/arabic-text-preprocessors.js index 6007d770..91535ccd 100644 --- a/ext/js/language/ar/arabic-text-preprocessors.js +++ b/ext/js/language/ar/arabic-text-preprocessors.js @@ -15,7 +15,7 @@ * along with this program. If not, see <https://www.gnu.org/licenses/>. 
*/ -import {basicTextPreprocessorOptions} from '../text-preprocessors.js'; +import {basicTextProcessorOptions} from '../text-processors.js'; const optionalDiacritics = [ '\u0618', // Small Fatha @@ -38,11 +38,11 @@ const optionalDiacritics = [ const diacriticsRegex = new RegExp(`[${optionalDiacritics.join('')}]`, 'g'); -/** @type {import('language').TextPreprocessor<boolean>} */ +/** @type {import('language').TextProcessor<boolean>} */ export const removeArabicScriptDiacritics = { name: 'Remove diacritics', description: 'وَلَدَ ⬅️ ولد', - options: basicTextPreprocessorOptions, + options: basicTextProcessorOptions, process: (text, setting) => { return setting ? text.replace(diacriticsRegex, '') : text; } diff --git a/ext/js/language/ja/japanese-text-preprocessors.js b/ext/js/language/ja/japanese-text-preprocessors.js index 06f944c1..b3d50817 100644 --- a/ext/js/language/ja/japanese-text-preprocessors.js +++ b/ext/js/language/ja/japanese-text-preprocessors.js @@ -15,7 +15,7 @@ * along with this program. If not, see <https://www.gnu.org/licenses/>. */ -import {basicTextPreprocessorOptions} from '../text-preprocessors.js'; +import {basicTextProcessorOptions} from '../text-processors.js'; import {convertAlphabeticToKana} from './japanese-wanakana.js'; import { collapseEmphaticSequences as collapseEmphaticSequencesFunction, @@ -25,28 +25,28 @@ import { convertNumericToFullWidth } from './japanese.js'; -/** @type {import('language').TextPreprocessor<boolean>} */ +/** @type {import('language').TextProcessor<boolean>} */ export const convertHalfWidthCharacters = { name: 'Convert half width characters to full width', description: 'ヨミチャン → ヨミチャン', - options: basicTextPreprocessorOptions, - process: (str, setting, sourceMap) => (setting ? convertHalfWidthKanaToFullWidth(str, sourceMap) : str) + options: basicTextProcessorOptions, + process: (str, setting) => (setting ? convertHalfWidthKanaToFullWidth(str) : str) }; -/** @type {import('language').TextPreprocessor<boolean>} */ +/** @type {import('language').TextProcessor<boolean>} */ export const convertNumericCharacters = { name: 'Convert numeric characters to full width', description: '1234 → 1234', - options: basicTextPreprocessorOptions, + options: basicTextProcessorOptions, process: (str, setting) => (setting ? convertNumericToFullWidth(str) : str) }; -/** @type {import('language').TextPreprocessor<boolean>} */ +/** @type {import('language').TextProcessor<boolean>} */ export const convertAlphabeticCharacters = { name: 'Convert alphabetic characters to hiragana', description: 'yomichan → よみちゃん', - options: basicTextPreprocessorOptions, - process: (str, setting, sourceMap) => (setting ? convertAlphabeticToKana(str, sourceMap) : str) + options: basicTextProcessorOptions, + process: (str, setting) => (setting ? 
convertAlphabeticToKana(str) : str) }; /** @type {import('language').BidirectionalConversionPreprocessor} */ @@ -66,15 +66,15 @@ export const convertHiraganaToKatakana = { } }; -/** @type {import('language').TextPreprocessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>} */ +/** @type {import('language').TextProcessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>} */ export const collapseEmphaticSequences = { name: 'Collapse emphatic character sequences', description: 'すっっごーーい → すっごーい / すごい', options: [[false, false], [true, false], [true, true]], - process: (str, setting, sourceMap) => { + process: (str, setting) => { const [collapseEmphatic, collapseEmphaticFull] = setting; if (collapseEmphatic) { - str = collapseEmphaticSequencesFunction(str, collapseEmphaticFull, sourceMap); + str = collapseEmphaticSequencesFunction(str, collapseEmphaticFull); } return str; } diff --git a/ext/js/language/ja/japanese-wanakana.js b/ext/js/language/ja/japanese-wanakana.js index 32260489..a87db6b7 100644 --- a/ext/js/language/ja/japanese-wanakana.js +++ b/ext/js/language/ja/japanese-wanakana.js @@ -19,51 +19,10 @@ import * as wanakana from '../../../lib/wanakana.js'; /** * @param {string} text - * @param {?import('../../general/text-source-map.js').TextSourceMap} sourceMap - * @param {number} sourceMapStart * @returns {string} */ -function convertAlphabeticPartToKana(text, sourceMap, sourceMapStart) { - const result = wanakana.toHiragana(text); - - // Generate source mapping - if (sourceMap !== null) { - let i = 0; - let resultPos = 0; - const ii = text.length; - while (i < ii) { - // Find smallest matching substring - let iNext = i + 1; - let resultPosNext = result.length; - while (iNext < ii) { - const t = wanakana.toHiragana(text.substring(0, iNext)); - if (t === result.substring(0, t.length)) { - resultPosNext = t.length; - break; - } - ++iNext; - } - - // Merge characters - const removals = iNext - i - 1; - if (removals > 0) { - sourceMap.combine(sourceMapStart, removals); - } - ++sourceMapStart; - - // Empty elements - const additions = resultPosNext - resultPos - 1; - for (let j = 0; j < additions; ++j) { - sourceMap.insert(sourceMapStart, 0); - ++sourceMapStart; - } - - i = iNext; - resultPos = resultPosNext; - } - } - - return result; +function convertAlphabeticPartToKana(text) { + return wanakana.toHiragana(text); } /** @@ -84,10 +43,9 @@ export function convertToRomaji(text) { /** * @param {string} text - * @param {?import('../../general/text-source-map.js').TextSourceMap} sourceMap * @returns {string} */ -export function convertAlphabeticToKana(text, sourceMap = null) { +export function convertAlphabeticToKana(text) { let part = ''; let result = ''; @@ -106,7 +64,7 @@ export function convertAlphabeticToKana(text, sourceMap = null) { c = 0x2d; // '-' } else { if (part.length > 0) { - result += convertAlphabeticPartToKana(part, sourceMap, result.length); + result += convertAlphabeticPartToKana(part); part = ''; } result += char; @@ -116,7 +74,7 @@ export function convertAlphabeticToKana(text, sourceMap = null) { } if (part.length > 0) { - result += convertAlphabeticPartToKana(part, sourceMap, result.length); + result += convertAlphabeticPartToKana(part); } return result; } diff --git a/ext/js/language/ja/japanese.js b/ext/js/language/ja/japanese.js index 2c9a1f7f..3507e5df 100644 --- a/ext/js/language/ja/japanese.js +++ b/ext/js/language/ja/japanese.js @@ -539,10 +539,9 @@ export function convertNumericToFullWidth(text) { /** * @param {string} text - * @param 
{?import('../../general/text-source-map.js').TextSourceMap} [sourceMap] * @returns {string} */ -export function convertHalfWidthKanaToFullWidth(text, sourceMap = null) { +export function convertHalfWidthKanaToFullWidth(text) { let result = ''; // This function is safe to use charCodeAt instead of codePointAt, since all @@ -575,9 +574,6 @@ export function convertHalfWidthKanaToFullWidth(text, sourceMap = null) { } } - if (sourceMap !== null && index > 0) { - sourceMap.combine(result.length, 1); - } result += c2; } @@ -705,13 +701,11 @@ export function distributeFuriganaInflected(term, reading, source) { /** * @param {string} text * @param {boolean} fullCollapse - * @param {?import('../../general/text-source-map.js').TextSourceMap} [sourceMap] * @returns {string} */ -export function collapseEmphaticSequences(text, fullCollapse, sourceMap = null) { +export function collapseEmphaticSequences(text, fullCollapse) { let result = ''; let collapseCodePoint = -1; - const hasSourceMap = (sourceMap !== null); for (const char of text) { const c = char.codePointAt(0); if ( @@ -729,11 +723,6 @@ export function collapseEmphaticSequences(text, fullCollapse, sourceMap = null) } else { collapseCodePoint = -1; result += char; - continue; - } - - if (hasSourceMap) { - sourceMap.combine(Math.max(0, result.length - 1), 1); } } return result; diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js index c5c3e01e..2df2f794 100644 --- a/ext/js/language/language-descriptors.js +++ b/ext/js/language/language-descriptors.js @@ -25,7 +25,7 @@ import {isStringPartiallyJapanese} from './ja/japanese.js'; import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js'; import {oldIrishTransforms} from './sga/old-irish-transforms.js'; import {albanianTransforms} from './sq/albanian-transforms.js'; -import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-preprocessors.js'; +import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js'; const capitalizationPreprocessors = { decapitalize, @@ -138,8 +138,7 @@ const languageDescriptors = [ { iso: 'km', name: 'Khmer', - exampleText: 'អាន', - textPreprocessors: {} + exampleText: 'អាន' }, { iso: 'pl', @@ -201,8 +200,7 @@ const languageDescriptors = [ { iso: 'th', name: 'Thai', - exampleText: 'อ่าน', - textPreprocessors: {} + exampleText: 'อ่าน' }, { iso: 'tr', @@ -219,8 +217,7 @@ const languageDescriptors = [ { iso: 'zh', name: 'Chinese', - exampleText: '读', - textPreprocessors: {} + exampleText: '读' } ]; diff --git a/ext/js/language/languages.js b/ext/js/language/languages.js index 4b196c2c..b3890aa6 100755 --- a/ext/js/language/languages.js +++ b/ext/js/language/languages.js @@ -29,21 +29,29 @@ export function getLanguageSummaries() { } /** - * @returns {import('language').LanguageAndPreprocessors[]} + * @returns {import('language').LanguageAndProcessors[]} * @throws {Error} */ -export function getAllLanguageTextPreprocessors() { +export function getAllLanguageTextProcessors() { const results = []; - for (const {iso, textPreprocessors} of languageDescriptorMap.values()) { - /** @type {import('language').TextPreprocessorWithId<unknown>[]} */ + for (const {iso, textPreprocessors = {}, textPostprocessors = {}} of languageDescriptorMap.values()) { + /** @type {import('language').TextProcessorWithId<unknown>[]} */ const textPreprocessorsArray = []; for (const [id, textPreprocessor] of Object.entries(textPreprocessors)) { textPreprocessorsArray.push({ id, - 
textPreprocessor: /** @type {import('language').TextPreprocessor<unknown>} */ (textPreprocessor) + textProcessor: /** @type {import('language').TextProcessor<unknown>} */ (textPreprocessor) }); } - results.push({iso, textPreprocessors: textPreprocessorsArray}); + /** @type {import('language').TextProcessorWithId<unknown>[]} */ + const textPostprocessorsArray = []; + for (const [id, textPostprocessor] of Object.entries(textPostprocessors)) { + textPostprocessorsArray.push({ + id, + textProcessor: /** @type {import('language').TextProcessor<unknown>} */ (textPostprocessor) + }); + } + results.push({iso, textPreprocessors: textPreprocessorsArray, textPostprocessors: textPostprocessorsArray}); } return results; } diff --git a/ext/js/language/ru/russian-text-preprocessors.js b/ext/js/language/ru/russian-text-preprocessors.js index fc4472e9..fbda38c7 100644 --- a/ext/js/language/ru/russian-text-preprocessors.js +++ b/ext/js/language/ru/russian-text-preprocessors.js @@ -15,23 +15,23 @@ * along with this program. If not, see <https://www.gnu.org/licenses/>. */ -import {basicTextPreprocessorOptions} from '../text-preprocessors.js'; +import {basicTextProcessorOptions} from '../text-processors.js'; -/** @type {import('language').TextPreprocessor<boolean>} */ +/** @type {import('language').TextProcessor<boolean>} */ export const removeRussianDiacritics = { name: 'Remove diacritics', description: 'A\u0301 → A, a\u0301 → a', - options: basicTextPreprocessorOptions, + options: basicTextProcessorOptions, process: (str, setting) => { return setting ? str.replace(/\u0301/g, '') : str; } }; -/** @type {import('language').TextPreprocessor<boolean>} */ +/** @type {import('language').TextProcessor<boolean>} */ export const yoToE = { name: 'Yo to E', description: 'ё → е, Ё → Е', - options: basicTextPreprocessorOptions, + options: basicTextProcessorOptions, process: (str, setting) => { return setting ? str.replace(/ё/g, 'е').replace(/Ё/g, 'Е') : str; } diff --git a/ext/js/language/text-preprocessors.js b/ext/js/language/text-processors.js index e33fccda..e7855df2 100755 --- a/ext/js/language/text-preprocessors.js +++ b/ext/js/language/text-processors.js @@ -15,22 +15,22 @@ * along with this program. If not, see <https://www.gnu.org/licenses/>. */ -/** @type {import('language').TextPreprocessorOptions<boolean>} */ -export const basicTextPreprocessorOptions = [false, true]; +/** @type {import('language').TextProcessorOptions<boolean>} */ +export const basicTextProcessorOptions = [false, true]; -/** @type {import('language').TextPreprocessor<boolean>} */ +/** @type {import('language').TextProcessor<boolean>} */ export const decapitalize = { name: 'Decapitalize text', description: 'CAPITALIZED TEXT → capitalized text', - options: basicTextPreprocessorOptions, + options: basicTextProcessorOptions, process: (str, setting) => (setting ? str.toLowerCase() : str) }; -/** @type {import('language').TextPreprocessor<boolean>} */ +/** @type {import('language').TextProcessor<boolean>} */ export const capitalizeFirstLetter = { name: 'Capitalize first letter', description: 'lowercase text → Lowercase text', - options: basicTextPreprocessorOptions, + options: basicTextProcessorOptions, process: (str, setting) => (setting ? 
str.charAt(0).toUpperCase() + str.slice(1) : str) }; @@ -39,11 +39,11 @@ export const capitalizeFirstLetter = { * as it can result in undesirable normalization: * - '\u9038'.normalize('NFD') => '\u9038' (逸) * - '\ufa67'.normalize('NFD') => '\u9038' (逸 => 逸) - * @type {import('language').TextPreprocessor<boolean>} + * @type {import('language').TextProcessor<boolean>} */ export const removeAlphabeticDiacritics = { name: 'Remove Alphabetic Diacritics', description: 'ἄήé -> αηe', - options: basicTextPreprocessorOptions, + options: basicTextProcessorOptions, process: (str, setting) => (setting ? str.normalize('NFD').replace(/[\u0300-\u036f]/g, '') : str) }; diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js index 6132ee82..845d53d5 100644 --- a/ext/js/language/translator.js +++ b/ext/js/language/translator.js @@ -17,10 +17,9 @@ */ import {applyTextReplacement} from '../general/regex-util.js'; -import {TextSourceMap} from '../general/text-source-map.js'; import {isCodePointJapanese} from './ja/japanese.js'; import {LanguageTransformer} from './language-transformer.js'; -import {getAllLanguageTextPreprocessors} from './languages.js'; +import {getAllLanguageTextProcessors} from './languages.js'; import {MultiLanguageTransformer} from './multi-language-transformer.js'; /** @@ -41,8 +40,8 @@ export class Translator { this._stringComparer = new Intl.Collator('en-US'); // Invariant locale /** @type {RegExp} */ this._numberRegex = /[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?/; - /** @type {Map<string, {textPreprocessors: import('language').TextPreprocessorWithId<unknown>[], optionSpace: import('translation-internal').PreprocessorOptionsSpace}>} */ - this._textPreprocessors = new Map(); + /** @type {import('translation-internal').TextProcessorMap} */ + this._textProcessors = new Map(); } /** @@ -50,13 +49,19 @@ export class Translator { */ prepare() { this._multiLanguageTransformer.prepare(); - for (const {iso, textPreprocessors} of getAllLanguageTextPreprocessors()) { - /** @type {Map<string, import('language').TextPreprocessorOptions<unknown>>} */ - const optionSpace = new Map(); - for (const {id, textPreprocessor} of textPreprocessors) { - optionSpace.set(id, textPreprocessor.options); + for (const {iso, textPreprocessors = [], textPostprocessors = []} of getAllLanguageTextProcessors()) { + /** @type {import('translation-internal').TextProcessorOptionsSpace}>} */ + const preprocessorOptionsSpace = new Map(); + /** @type {import('translation-internal').TextProcessorOptionsSpace}>} */ + const postprocessorOptionsSpace = new Map(); + + for (const {id, textProcessor} of textPreprocessors) { + preprocessorOptionsSpace.set(id, textProcessor.options); } - this._textPreprocessors.set(iso, {textPreprocessors, optionSpace}); + for (const {id, textProcessor} of textPostprocessors) { + postprocessorOptionsSpace.set(id, textProcessor.options); + } + this._textProcessors.set(iso, {textPreprocessors, preprocessorOptionsSpace, textPostprocessors, postprocessorOptionsSpace}); } } @@ -428,7 +433,7 @@ export class Translator { } } - // Deinflections and text preprocessing + // Deinflections and text processing /** * @param {string} text @@ -438,57 +443,90 @@ export class Translator { */ _getAlgorithmDeinflections(text, options) { const {language} = options; - const info = this._textPreprocessors.get(language); + const info = this._textProcessors.get(language); if (typeof info === 'undefined') { throw new Error(`Unsupported language: ${language}`); } - const {textPreprocessors, optionSpace: 
textPreprocessorOptionsSpace} = info; + const {textPreprocessors, preprocessorOptionsSpace, textPostprocessors, postprocessorOptionsSpace} = info; - /** @type {Map<string, import('language').TextPreprocessorOptions<unknown>>} */ - const variantSpace = new Map(); - variantSpace.set('textReplacements', this._getTextReplacementsVariants(options)); - for (const [key, value] of textPreprocessorOptionsSpace) { - variantSpace.set(key, value); - } + const preprocessorVariantSpace = new Map(preprocessorOptionsSpace); + preprocessorVariantSpace.set('textReplacements', this._getTextReplacementsVariants(options)); + const preprocessorVariants = this._getArrayVariants(preprocessorVariantSpace); + const postprocessorVariants = this._getArrayVariants(postprocessorOptionsSpace); /** @type {import('translation-internal').DatabaseDeinflection[]} */ const deinflections = []; const used = new Set(); + /** @type {Map<string, import('core').SafeAny>} */ + const sourceCache = new Map(); // For reusing text processors' outputs + + for ( + let i = text.length; + i > 0; + i = this._getNextSubstringLength(options.searchResolution, i, text) + ) { + const rawSource = text.substring(0, i); + + for (const preprocessorVariant of preprocessorVariants) { + let source = rawSource; + + const textReplacements = /** @type {import('translation').FindTermsTextReplacement[] | null} */ (preprocessorVariant.get('textReplacements')); + if (textReplacements !== null) { + source = this._applyTextReplacements(source, textReplacements); + } - for (const arrayVariant of this._generateArrayVariants(variantSpace)) { - const textReplacements = /** @type {import('translation').FindTermsTextReplacement[] | null} */ (arrayVariant.get('textReplacements')); + source = this._applyTextProcessors(textPreprocessors, preprocessorVariant, source, sourceCache); - let text2 = text; - const sourceMap = new TextSourceMap(text2); + if (used.has(source)) { continue; } + used.add(source); + for (const deinflection of this._multiLanguageTransformer.transform(language, source)) { + const {trace, conditions} = deinflection; + for (const postprocessorVariant of postprocessorVariants) { + let {text: transformedText} = deinflection; + transformedText = this._applyTextProcessors(textPostprocessors, postprocessorVariant, transformedText, sourceCache); + + /** @type {import('dictionary').InflectionRuleChainCandidate} */ + const inflectionRuleChainCandidate = { + source: 'algorithm', + inflectionRules: trace.map((frame) => frame.transform) + }; + deinflections.push(this._createDeinflection(rawSource, source, transformedText, conditions, [inflectionRuleChainCandidate])); + } + } + } + } + return deinflections; + } - if (textReplacements !== null) { - text2 = this._applyTextReplacements(text2, sourceMap, textReplacements); + /** + * @param {import('language').TextProcessorWithId<unknown>[]} textProcessors + * @param {Map<string, unknown>} processorVariant + * @param {string} text + * @param {Map<string, import('core').SafeAny>} textCache + * @returns {string} + */ + _applyTextProcessors(textProcessors, processorVariant, text, textCache) { + for (const {id, textProcessor: {process}} of textProcessors) { + const setting = processorVariant.get(id); + let level1 = textCache.get(text); + if (!level1) { + level1 = new Map(); + textCache.set(text, level1); } - for (const preprocessor of textPreprocessors.values()) { - const {id, textPreprocessor} = preprocessor; - const setting = arrayVariant.get(id); - text2 = textPreprocessor.process(text2, setting, sourceMap); + let level2 
= level1.get(id); + if (!level2) { + level2 = new Map(); + level1.set(id, level2); } - for ( - let source = text2, i = text2.length; - i > 0; - i = this._getNextSubstringLength(options.searchResolution, i, source) - ) { - source = text2.substring(0, i); - if (used.has(source)) { break; } - used.add(source); - const rawSource = sourceMap.source.substring(0, sourceMap.getSourceLength(i)); - for (const {text: transformedText, conditions, trace} of this._multiLanguageTransformer.transform(language, source)) { - /** @type {import('dictionary').InflectionRuleChainCandidate} */ - const inflectionRuleChainCandidate = { - source: 'algorithm', - inflectionRules: trace.map((frame) => frame.transform) - }; - deinflections.push(this._createDeinflection(rawSource, source, transformedText, conditions, [inflectionRuleChainCandidate])); - } + if (!level2.has(setting)) { + text = process(text, setting); + level2.set(setting, text); + } else { + text = level2.get(setting); } } - return deinflections; + + return text; } /** @@ -507,13 +545,12 @@ export class Translator { /** * @param {string} text - * @param {TextSourceMap} sourceMap * @param {import('translation').FindTermsTextReplacement[]} replacements * @returns {string} */ - _applyTextReplacements(text, sourceMap, replacements) { + _applyTextReplacements(text, replacements) { for (const {pattern, replacement} of replacements) { - text = applyTextReplacement(text, sourceMap, pattern, replacement); + text = applyTextReplacement(text, pattern, replacement); } return text; } @@ -1325,10 +1362,11 @@ export class Translator { /** * @param {Map<string, unknown[]>} arrayVariants - * @yields {Map<string, unknown>} - * @returns {Generator<Map<string, unknown>, void, void>} + * @returns {Map<string, unknown>[]} */ - *_generateArrayVariants(arrayVariants) { + _getArrayVariants(arrayVariants) { + /** @type {Map<string, unknown>[]} */ + const results = []; const variantKeys = [...arrayVariants.keys()]; const entryVariantLengths = []; for (const key of variantKeys) { @@ -1350,8 +1388,9 @@ export class Translator { remainingIndex = Math.floor(remainingIndex / entryVariants.length); } - yield variant; + results.push(variant); } + return results; } /** diff --git a/test/data/anki-note-builder-test-results.json b/test/data/anki-note-builder-test-results.json index d8d5ab0f..162be7fe 100644 --- a/test/data/anki-note-builder-test-results.json +++ b/test/data/anki-note-builder-test-results.json @@ -2548,8 +2548,8 @@ "audio": "", "clipboard-image": "", "clipboard-text": "", - "cloze-body": "打", - "cloze-body-kana": "だ", + "cloze-body": "打(う)", + "cloze-body-kana": "だ(う)", "cloze-prefix": "cloze-prefix", "cloze-suffix": "cloze-suffix", "conjugation": "", @@ -2577,8 +2577,8 @@ "screenshot": "", "search-query": "fullQuery", "selection-text": "", - "sentence": "cloze-prefix打cloze-suffix", - "sentence-furigana": "cloze-prefix打cloze-suffix", + "sentence": "cloze-prefix打(う)cloze-suffix", + "sentence-furigana": "cloze-prefix打(う)cloze-suffix", "tags": "n", "url": "<a href=\"url:\">url:</a>" }, @@ -2586,8 +2586,8 @@ "audio": "", "clipboard-image": "", "clipboard-text": "", - "cloze-body": "打", - "cloze-body-kana": "ダース", + "cloze-body": "打(う)", + "cloze-body-kana": "ダース(う)", "cloze-prefix": "cloze-prefix", "cloze-suffix": "cloze-suffix", "conjugation": "", @@ -2615,8 +2615,8 @@ "screenshot": "", "search-query": "fullQuery", "selection-text": "", - "sentence": "cloze-prefix打cloze-suffix", - "sentence-furigana": "cloze-prefix打cloze-suffix", + "sentence": "cloze-prefix打(う)cloze-suffix", + 
"sentence-furigana": "cloze-prefix打(う)cloze-suffix", "tags": "abbr, n", "url": "<a href=\"url:\">url:</a>" } diff --git a/test/data/translator-test-inputs.json b/test/data/translator-test-inputs.json index c9047716..9e62954e 100644 --- a/test/data/translator-test-inputs.json +++ b/test/data/translator-test-inputs.json @@ -191,7 +191,7 @@ null, [ { - "pattern": "\\(([^)]*)(?:\\)|$)", + "pattern": "\\(([^)]*)(?:\\))", "flags": "g", "replacement": "" } @@ -214,7 +214,7 @@ null, [ { - "pattern": "\\(([^)]*)(?:\\)|$)", + "pattern": "\\(([^)]*)(?:\\))", "flags": "g", "replacement": "$1" } diff --git a/test/data/translator-test-results-note-data1.json b/test/data/translator-test-results-note-data1.json index f580ac53..f0f32fa8 100644 --- a/test/data/translator-test-results-note-data1.json +++ b/test/data/translator-test-results-note-data1.json @@ -22773,7 +22773,7 @@ "type": "term", "id": 1, "source": "打", - "rawSource": "打", + "rawSource": "打(う)", "sourceTerm": "打", "inflectionRuleChainCandidates": [ { @@ -23087,7 +23087,7 @@ "type": "term", "id": 2, "source": "打", - "rawSource": "打", + "rawSource": "打(う)", "sourceTerm": "打", "inflectionRuleChainCandidates": [ { diff --git a/test/data/translator-test-results.json b/test/data/translator-test-results.json index da2f8da2..b3574b46 100644 --- a/test/data/translator-test-results.json +++ b/test/data/translator-test-results.json @@ -12904,7 +12904,7 @@ "dictionaryIndex": 0, "dictionaryPriority": 0, "sourceTermExactMatchCount": 1, - "maxOriginalTextLength": 1, + "maxOriginalTextLength": 4, "headwords": [ { "index": 0, @@ -12912,7 +12912,7 @@ "reading": "だ", "sources": [ { - "originalText": "打", + "originalText": "打(う)", "transformedText": "打", "deinflectedText": "打", "matchType": "exact", @@ -13072,7 +13072,7 @@ "dictionaryIndex": 0, "dictionaryPriority": 0, "sourceTermExactMatchCount": 1, - "maxOriginalTextLength": 1, + "maxOriginalTextLength": 4, "headwords": [ { "index": 0, @@ -13080,7 +13080,7 @@ "reading": "ダース", "sources": [ { - "originalText": "打", + "originalText": "打(う)", "transformedText": "打", "deinflectedText": "打", "matchType": "exact", diff --git a/test/japanese-util.test.js b/test/japanese-util.test.js index 5f64a714..bff51f85 100644 --- a/test/japanese-util.test.js +++ b/test/japanese-util.test.js @@ -17,7 +17,6 @@ */ import {describe, expect, test} from 'vitest'; -import {TextSourceMap} from '../ext/js/general/text-source-map.js'; import * as jpw from '../ext/js/language/ja/japanese-wanakana.js'; import * as jp from '../ext/js/language/ja/japanese.js'; @@ -194,54 +193,46 @@ describe('Japanese utility functions', () => { }); describe('convertHalfWidthKanaToFullWidth', () => { - /** @type {[string: string, expected: string, expectedSourceMapping?: number[]][]} */ + /** @type {[string: string, expected: string][]} */ const data = [ ['0123456789', '0123456789'], ['abcdefghij', 'abcdefghij'], ['カタカナ', 'カタカナ'], ['ひらがな', 'ひらがな'], - ['カキ', 'カキ', [1, 1]], - ['ガキ', 'ガキ', [2, 1]], - ['ニホン', 'ニホン', [1, 1, 1]], - ['ニッポン', 'ニッポン', [1, 1, 2, 1]] + ['カキ', 'カキ'], + ['ガキ', 'ガキ'], + ['ニホン', 'ニホン'], + ['ニッポン', 'ニッポン'] ]; - for (const [string, expected, expectedSourceMapping] of data) { - test(`${string} -> ${expected}${typeof expectedSourceMapping !== 'undefined' ? 
', ' + JSON.stringify(expectedSourceMapping) : ''}`, () => { - const sourceMap = new TextSourceMap(string); - const actual1 = jp.convertHalfWidthKanaToFullWidth(string, null); - const actual2 = jp.convertHalfWidthKanaToFullWidth(string, sourceMap); + for (const [string, expected] of data) { + test(`${string} -> ${expected}`, () => { + const actual1 = jp.convertHalfWidthKanaToFullWidth(string); + const actual2 = jp.convertHalfWidthKanaToFullWidth(string); expect(actual1).toStrictEqual(expected); expect(actual2).toStrictEqual(expected); - if (typeof expectedSourceMapping !== 'undefined') { - expect(sourceMap.equals(new TextSourceMap(string, expectedSourceMapping))).toBe(true); - } }); } }); describe('convertAlphabeticToKana', () => { - /** @type {[string: string, expected: string, expectedSourceMapping?: number[]][]} */ + /** @type {[string: string, expected: string][]} */ const data = [ ['0123456789', '0123456789'], - ['abcdefghij', 'あbcでfgひj', [1, 1, 1, 2, 1, 1, 2, 1]], - ['ABCDEFGHIJ', 'あbcでfgひj', [1, 1, 1, 2, 1, 1, 2, 1]], // wanakana.toHiragana converts text to lower case + ['abcdefghij', 'あbcでfgひj'], + ['ABCDEFGHIJ', 'あbcでfgひj'], // wanakana.toHiragana converts text to lower case ['カタカナ', 'カタカナ'], ['ひらがな', 'ひらがな'], - ['chikara', 'ちから', [3, 2, 2]], - ['CHIKARA', 'ちから', [3, 2, 2]] + ['chikara', 'ちから'], + ['CHIKARA', 'ちから'] ]; - for (const [string, expected, expectedSourceMapping] of data) { - test(`${string} -> ${string}${typeof expectedSourceMapping !== 'undefined' ? ', ' + JSON.stringify(expectedSourceMapping) : ''}`, () => { - const sourceMap = new TextSourceMap(string); - const actual1 = jpw.convertAlphabeticToKana(string, null); - const actual2 = jpw.convertAlphabeticToKana(string, sourceMap); + for (const [string, expected] of data) { + test(`${string} -> ${string}`, () => { + const actual1 = jpw.convertAlphabeticToKana(string); + const actual2 = jpw.convertAlphabeticToKana(string); expect(actual1).toStrictEqual(expected); expect(actual2).toStrictEqual(expected); - if (typeof expectedSourceMapping !== 'undefined') { - expect(sourceMap.equals(new TextSourceMap(string, expectedSourceMapping))).toBe(true); - } }); } }); @@ -765,59 +756,54 @@ describe('Japanese utility functions', () => { }); describe('collapseEmphaticSequences', () => { - /** @type {[input: [text: string, fullCollapse: boolean], output: [expected: string, expectedSourceMapping: number[]]][]} */ + /** @type {[input: [text: string, fullCollapse: boolean], output: string][]} */ const data = [ - [['かこい', false], ['かこい', [1, 1, 1]]], - [['かこい', true], ['かこい', [1, 1, 1]]], - [['かっこい', false], ['かっこい', [1, 1, 1, 1]]], - [['かっこい', true], ['かこい', [2, 1, 1]]], - [['かっっこい', false], ['かっこい', [1, 2, 1, 1]]], - [['かっっこい', true], ['かこい', [3, 1, 1]]], - [['かっっっこい', false], ['かっこい', [1, 3, 1, 1]]], - [['かっっっこい', true], ['かこい', [4, 1, 1]]], - - [['こい', false], ['こい', [1, 1]]], - [['こい', true], ['こい', [1, 1]]], - [['っこい', false], ['っこい', [1, 1, 1]]], - [['っこい', true], ['こい', [2, 1]]], - [['っっこい', false], ['っこい', [2, 1, 1]]], - [['っっこい', true], ['こい', [3, 1]]], - [['っっっこい', false], ['っこい', [3, 1, 1]]], - [['っっっこい', true], ['こい', [4, 1]]], - - [['すごい', false], ['すごい', [1, 1, 1]]], - [['すごい', true], ['すごい', [1, 1, 1]]], - [['すごーい', false], ['すごーい', [1, 1, 1, 1]]], - [['すごーい', true], ['すごい', [1, 2, 1]]], - [['すごーーい', false], ['すごーい', [1, 1, 2, 1]]], - [['すごーーい', true], ['すごい', [1, 3, 1]]], - [['すっごーい', false], ['すっごーい', [1, 1, 1, 1, 1]]], - [['すっごーい', true], ['すごい', [2, 2, 1]]], - [['すっっごーーい', false], ['すっごーい', [1, 2, 1, 2, 1]]], - 
[['すっっごーーい', true], ['すごい', [3, 3, 1]]], - - [['', false], ['', []]], - [['', true], ['', []]], - [['っ', false], ['っ', [1]]], - [['っ', true], ['', [1]]], - [['っっ', false], ['っ', [2]]], - [['っっ', true], ['', [2]]], - [['っっっ', false], ['っ', [3]]], - [['っっっ', true], ['', [3]]] + [['かこい', false], 'かこい'], + [['かこい', true], 'かこい'], + [['かっこい', false], 'かっこい'], + [['かっこい', true], 'かこい'], + [['かっっこい', false], 'かっこい'], + [['かっっこい', true], 'かこい'], + [['かっっっこい', false], 'かっこい'], + [['かっっっこい', true], 'かこい'], + + [['こい', false], 'こい'], + [['こい', true], 'こい'], + [['っこい', false], 'っこい'], + [['っこい', true], 'こい'], + [['っっこい', false], 'っこい'], + [['っっこい', true], 'こい'], + [['っっっこい', false], 'っこい'], + [['っっっこい', true], 'こい'], + + [['すごい', false], 'すごい'], + [['すごい', true], 'すごい'], + [['すごーい', false], 'すごーい'], + [['すごーい', true], 'すごい'], + [['すごーーい', false], 'すごーい'], + [['すごーーい', true], 'すごい'], + [['すっごーい', false], 'すっごーい'], + [['すっごーい', true], 'すごい'], + [['すっっごーーい', false], 'すっごーい'], + [['すっっごーーい', true], 'すごい'], + + [['', false], ''], + [['', true], ''], + [['っ', false], 'っ'], + [['っ', true], ''], + [['っっ', false], 'っ'], + [['っっ', true], ''], + [['っっっ', false], 'っ'], + [['っっっ', true], ''] ]; test.each(data)('%o -> %o', (input, output) => { const [text, fullCollapse] = input; - const [expected, expectedSourceMapping] = output; - - const sourceMap = new TextSourceMap(text); - const actual1 = jp.collapseEmphaticSequences(text, fullCollapse, null); - const actual2 = jp.collapseEmphaticSequences(text, fullCollapse, sourceMap); - expect(actual1).toStrictEqual(expected); - expect(actual2).toStrictEqual(expected); - if (typeof expectedSourceMapping !== 'undefined') { - expect(sourceMap.equals(new TextSourceMap(text, expectedSourceMapping))).toBe(true); - } + + const actual1 = jp.collapseEmphaticSequences(text, fullCollapse); + const actual2 = jp.collapseEmphaticSequences(text, fullCollapse); + expect(actual1).toStrictEqual(output); + expect(actual2).toStrictEqual(output); }); }); diff --git a/test/text-source-map.test.js b/test/text-source-map.test.js deleted file mode 100644 index 09341774..00000000 --- a/test/text-source-map.test.js +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright (C) 2023-2024 Yomitan Authors - * Copyright (C) 2020-2022 Yomichan Authors - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <https://www.gnu.org/licenses/>. 
- */ - -import {describe, expect, test} from 'vitest'; -import {TextSourceMap} from '../ext/js/general/text-source-map.js'; - -describe('TextSourceMap', () => { - describe('Source', () => { - const data = [ - ['source1'], - ['source2'], - ['source3'] - ]; - - test.each(data)('source-test-%#', (source) => { - const sourceMap = new TextSourceMap(source); - expect(source).toStrictEqual(sourceMap.source); - }); - }); - - describe('Equals', () => { - /** @type {[args1: [source1: string, mapping1: ?(number[])], args2: [source2: string, mapping2: ?(number[])], expectedEquals: boolean][]} */ - const data = [ - [['source1', null], ['source1', null], true], - [['source2', null], ['source2', null], true], - [['source3', null], ['source3', null], true], - - [['source1', [1, 1, 1, 1, 1, 1, 1]], ['source1', null], true], - [['source2', [1, 1, 1, 1, 1, 1, 1]], ['source2', null], true], - [['source3', [1, 1, 1, 1, 1, 1, 1]], ['source3', null], true], - - [['source1', null], ['source1', [1, 1, 1, 1, 1, 1, 1]], true], - [['source2', null], ['source2', [1, 1, 1, 1, 1, 1, 1]], true], - [['source3', null], ['source3', [1, 1, 1, 1, 1, 1, 1]], true], - - [['source1', [1, 1, 1, 1, 1, 1, 1]], ['source1', [1, 1, 1, 1, 1, 1, 1]], true], - [['source2', [1, 1, 1, 1, 1, 1, 1]], ['source2', [1, 1, 1, 1, 1, 1, 1]], true], - [['source3', [1, 1, 1, 1, 1, 1, 1]], ['source3', [1, 1, 1, 1, 1, 1, 1]], true], - - [['source1', [1, 2, 1, 3]], ['source1', [1, 2, 1, 3]], true], - [['source2', [1, 2, 1, 3]], ['source2', [1, 2, 1, 3]], true], - [['source3', [1, 2, 1, 3]], ['source3', [1, 2, 1, 3]], true], - - [['source1', [1, 3, 1, 2]], ['source1', [1, 2, 1, 3]], false], - [['source2', [1, 3, 1, 2]], ['source2', [1, 2, 1, 3]], false], - [['source3', [1, 3, 1, 2]], ['source3', [1, 2, 1, 3]], false], - - [['source1', [1, 1, 1, 1, 1, 1, 1]], ['source4', [1, 1, 1, 1, 1, 1, 1]], false], - [['source2', [1, 1, 1, 1, 1, 1, 1]], ['source5', [1, 1, 1, 1, 1, 1, 1]], false], - [['source3', [1, 1, 1, 1, 1, 1, 1]], ['source6', [1, 1, 1, 1, 1, 1, 1]], false] - ]; - - test.each(data)('equals-test-%#', ([source1, mapping1], [source2, mapping2], expectedEquals) => { - const sourceMap1 = new TextSourceMap(source1, mapping1); - const sourceMap2 = new TextSourceMap(source2, mapping2); - expect(sourceMap1.equals(sourceMap1)).toBe(true); - expect(sourceMap2.equals(sourceMap2)).toBe(true); - expect(sourceMap1.equals(sourceMap2)).toStrictEqual(expectedEquals); - }); - }); - - describe('GetSourceLength', () => { - /** @type {[args: [source: string, mapping: number[]], finalLength: number, expectedValue: number][]} */ - const data = [ - [['source', [1, 1, 1, 1, 1, 1]], 1, 1], - [['source', [1, 1, 1, 1, 1, 1]], 2, 2], - [['source', [1, 1, 1, 1, 1, 1]], 3, 3], - [['source', [1, 1, 1, 1, 1, 1]], 4, 4], - [['source', [1, 1, 1, 1, 1, 1]], 5, 5], - [['source', [1, 1, 1, 1, 1, 1]], 6, 6], - - [['source', [2, 2, 2]], 1, 2], - [['source', [2, 2, 2]], 2, 4], - [['source', [2, 2, 2]], 3, 6], - - [['source', [3, 3]], 1, 3], - [['source', [3, 3]], 2, 6], - - [['source', [6, 6]], 1, 6] - ]; - - test.each(data)('get-source-length-test-%#', ([source, mapping], finalLength, expectedValue) => { - const sourceMap = new TextSourceMap(source, mapping); - expect(sourceMap.getSourceLength(finalLength)).toStrictEqual(expectedValue); - }); - }); - - describe('CombineInsert', () => { - /** @type {[args: [source: string, mapping: ?(number[])], expectedArgs: [expectedSource: string, expectedMapping: ?(number[])], operations: [operation: string, arg1: number, arg2: number][]][]} */ - 
const data = [ - // No operations - [ - ['source', null], - ['source', [1, 1, 1, 1, 1, 1]], - [] - ], - - // Combine - [ - ['source', null], - ['source', [3, 1, 1, 1]], - [ - ['combine', 0, 2] - ] - ], - [ - ['source', null], - ['source', [1, 1, 1, 3]], - [ - ['combine', 3, 2] - ] - ], - [ - ['source', null], - ['source', [3, 3]], - [ - ['combine', 0, 2], - ['combine', 1, 2] - ] - ], - [ - ['source', null], - ['source', [3, 3]], - [ - ['combine', 3, 2], - ['combine', 0, 2] - ] - ], - - // Insert - [ - ['source', null], - ['source', [0, 1, 1, 1, 1, 1, 1]], - [ - ['insert', 0, 0] - ] - ], - [ - ['source', null], - ['source', [1, 1, 1, 1, 1, 1, 0]], - [ - ['insert', 6, 0] - ] - ], - [ - ['source', null], - ['source', [0, 1, 1, 1, 1, 1, 1, 0]], - [ - ['insert', 0, 0], - ['insert', 7, 0] - ] - ], - [ - ['source', null], - ['source', [0, 1, 1, 1, 1, 1, 1, 0]], - [ - ['insert', 6, 0], - ['insert', 0, 0] - ] - ], - - // Mixed - [ - ['source', null], - ['source', [3, 0, 3]], - [ - ['combine', 0, 2], - ['insert', 1, 0], - ['combine', 2, 2] - ] - ], - [ - ['source', null], - ['source', [3, 0, 3]], - [ - ['combine', 0, 2], - ['combine', 1, 2], - ['insert', 1, 0] - ] - ], - [ - ['source', null], - ['source', [3, 0, 3]], - [ - ['insert', 3, 0], - ['combine', 0, 2], - ['combine', 2, 2] - ] - ] - ]; - - test.each(data)('combine-insert-test-%#', ([source, mapping], [expectedSource, expectedMapping], operations) => { - const sourceMap = new TextSourceMap(source, mapping); - const expectedSourceMap = new TextSourceMap(expectedSource, expectedMapping); - for (const [operation, ...args] of operations) { - switch (operation) { - case 'combine': - sourceMap.combine(...args); - break; - case 'insert': - sourceMap.insert(...args); - break; - } - } - expect(sourceMap.equals(expectedSourceMap)).toBe(true); - }); - }); -}); diff --git a/types/ext/language-descriptors.d.ts b/types/ext/language-descriptors.d.ts index 6674b28c..37da106c 100644 --- a/types/ext/language-descriptors.d.ts +++ b/types/ext/language-descriptors.d.ts @@ -15,13 +15,17 @@ * along with this program. If not, see <https://www.gnu.org/licenses/>. */ -import type {TextPreprocessor, BidirectionalConversionPreprocessor} from './language'; +import type {TextProcessor, BidirectionalConversionPreprocessor} from './language'; import type {LanguageTransformDescriptor} from './language-transformer'; import type {SafeAny} from './core'; export type IsTextLookupWorthyFunction = (text: string) => boolean; -type LanguageDescriptor<TIso extends string, TTextPreprocessorDescriptor extends TextPreprocessorDescriptor> = { +type LanguageDescriptor< + TIso extends string, + TTextPreprocessorDescriptor extends TextProcessorDescriptor = Record<string, never>, + TTextPostprocessorDescriptor extends TextProcessorDescriptor = Record<string, never>, +> = { iso: TIso; name: string; exampleText: string; @@ -32,75 +36,126 @@ type LanguageDescriptor<TIso extends string, TTextPreprocessorDescriptor extends * If no value is provided, `true` is assumed for all inputs. 
*/ isTextLookupWorthy?: IsTextLookupWorthyFunction; - textPreprocessors: TTextPreprocessorDescriptor; + textPreprocessors?: TTextPreprocessorDescriptor; + textPostprocessors?: TTextPostprocessorDescriptor; languageTransforms?: LanguageTransformDescriptor; }; -type TextPreprocessorDescriptor = { - [key: string]: TextPreprocessor<SafeAny>; +type TextProcessorDescriptor = { + [key: string]: TextProcessor<SafeAny>; }; type LanguageDescriptorObjectMap = { - [key in keyof AllTextPreprocessors]: LanguageDescriptor<key, AllTextPreprocessors[key]>; + [key in keyof AllTextProcessors]: LanguageDescriptor< + key, + AllTextProcessors[key] extends {pre: TextProcessorDescriptor} ? AllTextProcessors[key]['pre'] : Record<string, never>, + AllTextProcessors[key] extends {post: TextProcessorDescriptor} ? AllTextProcessors[key]['post'] : Record<string, never> + >; }; export type LanguageDescriptorAny = LanguageDescriptorObjectMap[keyof LanguageDescriptorObjectMap]; type CapitalizationPreprocessors = { - capitalizeFirstLetter: TextPreprocessor<boolean>; - decapitalize: TextPreprocessor<boolean>; + capitalizeFirstLetter: TextProcessor<boolean>; + decapitalize: TextProcessor<boolean>; }; /** - * This is a mapping of the iso tag to all of the preprocessors for that language. + * This is a mapping of the iso tag to all of the text processors for that language. * Any new language should be added to this object. */ -type AllTextPreprocessors = { +type AllTextProcessors = { ar: { - removeArabicScriptDiacritics: TextPreprocessor<boolean>; + pre: { + removeArabicScriptDiacritics: TextProcessor<boolean>; + }; }; - de: CapitalizationPreprocessors & { - eszettPreprocessor: BidirectionalConversionPreprocessor; + de: { + pre: CapitalizationPreprocessors & { + eszettPreprocessor: BidirectionalConversionPreprocessor; + }; + }; + el: { + pre: CapitalizationPreprocessors; + }; + en: { + pre: CapitalizationPreprocessors; + }; + es: { + pre: CapitalizationPreprocessors; }; - el: CapitalizationPreprocessors; - en: CapitalizationPreprocessors; - es: CapitalizationPreprocessors; fa: { - removeArabicScriptDiacritics: TextPreprocessor<boolean>; + pre: { + removeArabicScriptDiacritics: TextProcessor<boolean>; + }; + }; + fr: { + pre: CapitalizationPreprocessors; + }; + grc: { + pre: CapitalizationPreprocessors & { + removeAlphabeticDiacritics: TextProcessor<boolean>; + }; }; - fr: CapitalizationPreprocessors; - grc: CapitalizationPreprocessors & { - removeAlphabeticDiacritics: TextPreprocessor<boolean>; + hu: { + pre: CapitalizationPreprocessors; }; - hu: CapitalizationPreprocessors; - id: CapitalizationPreprocessors; - it: CapitalizationPreprocessors; - la: CapitalizationPreprocessors & { - removeAlphabeticDiacritics: TextPreprocessor<boolean>; + id: { + pre: CapitalizationPreprocessors; + }; + it: { + pre: CapitalizationPreprocessors; + }; + la: { + pre: CapitalizationPreprocessors & { + removeAlphabeticDiacritics: TextProcessor<boolean>; + }; }; ja: { - convertHalfWidthCharacters: TextPreprocessor<boolean>; - convertNumericCharacters: TextPreprocessor<boolean>; - convertAlphabeticCharacters: TextPreprocessor<boolean>; - convertHiraganaToKatakana: BidirectionalConversionPreprocessor; - collapseEmphaticSequences: TextPreprocessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>; + pre: { + convertHalfWidthCharacters: TextProcessor<boolean>; + convertNumericCharacters: TextProcessor<boolean>; + convertAlphabeticCharacters: TextProcessor<boolean>; + convertHiraganaToKatakana: BidirectionalConversionPreprocessor; + 
collapseEmphaticSequences: TextProcessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>; + }; }; km: Record<string, never>; - pl: CapitalizationPreprocessors; - pt: CapitalizationPreprocessors; - ro: CapitalizationPreprocessors; - ru: CapitalizationPreprocessors & { - yoToE: TextPreprocessor<boolean>; - removeRussianDiacritics: TextPreprocessor<boolean>; - }; - sga: CapitalizationPreprocessors & { - removeAlphabeticDiacritics: TextPreprocessor<boolean>; - }; - sh: CapitalizationPreprocessors; - sq: CapitalizationPreprocessors; - sv: CapitalizationPreprocessors; + pl: { + pre: CapitalizationPreprocessors; + }; + pt: { + pre: CapitalizationPreprocessors; + }; + ro: { + pre: CapitalizationPreprocessors; + }; + ru: { + pre: CapitalizationPreprocessors & { + yoToE: TextProcessor<boolean>; + removeRussianDiacritics: TextProcessor<boolean>; + }; + }; + sga: { + pre: CapitalizationPreprocessors & { + removeAlphabeticDiacritics: TextProcessor<boolean>; + }; + }; + sh: { + pre: CapitalizationPreprocessors; + }; + sq: { + pre: CapitalizationPreprocessors; + }; + sv: { + pre: CapitalizationPreprocessors; + }; th: Record<string, never>; - tr: CapitalizationPreprocessors; - vi: CapitalizationPreprocessors; + tr: { + pre: CapitalizationPreprocessors; + }; + vi: { + pre: CapitalizationPreprocessors; + }; zh: Record<string, never>; }; diff --git a/types/ext/language.d.ts b/types/ext/language.d.ts index 8fa6f0e7..c708f6e7 100644 --- a/types/ext/language.d.ts +++ b/types/ext/language.d.ts @@ -15,32 +15,32 @@ * along with this program. If not, see <https://www.gnu.org/licenses/>. */ -import type {TextSourceMap} from '../../ext/js/general/text-source-map.js'; import type {LanguageTransformDescriptor} from './language-transformer.js'; -export type TextPreprocessorOptions<T = unknown> = T[]; +export type TextProcessorOptions<T = unknown> = T[]; -export type TextPreprocessorFunction<T = unknown> = (str: string, setting: T, sourceMap: TextSourceMap) => string; +export type TextProcessorFunction<T = unknown> = (str: string, setting: T) => string; /** - * Text preprocessors are used during the translation process to create alternate versions of the input text to search for. + * Text pre- and post-processors are used during the translation process to create alternate versions of the input text to search for. * This is helpful when the input text doesn't exactly match the term or expression found in the database. - * When a language has multiple preprocessors, the translator will generate variants of the text by applying all combinations of the preprocessors. + * When a language has multiple processors, the translator will generate variants of the text by applying all combinations of the processors. 
*/ -export type TextPreprocessor<T = unknown> = { +export type TextProcessor<T = unknown> = { name: string; description: string; - options: TextPreprocessorOptions<T>; - process: TextPreprocessorFunction<T>; + options: TextProcessorOptions<T>; + process: TextProcessorFunction<T>; }; export type BidirectionalPreprocessorOptions = 'off' | 'direct' | 'inverse'; -export type BidirectionalConversionPreprocessor = TextPreprocessor<BidirectionalPreprocessorOptions>; +export type BidirectionalConversionPreprocessor = TextProcessor<BidirectionalPreprocessorOptions>; -export type LanguageAndPreprocessors = { +export type LanguageAndProcessors = { iso: string; - textPreprocessors: TextPreprocessorWithId<unknown>[]; + textPreprocessors?: TextProcessorWithId<unknown>[]; + textPostprocessors?: TextProcessorWithId<unknown>[]; }; export type LanguageAndTransforms = { @@ -48,9 +48,9 @@ export type LanguageAndTransforms = { languageTransforms: LanguageTransformDescriptor; }; -export type TextPreprocessorWithId<T = unknown> = { +export type TextProcessorWithId<T = unknown> = { id: string; - textPreprocessor: TextPreprocessor<T>; + textProcessor: TextProcessor<T>; }; export type LanguageSummary = { diff --git a/types/ext/translation-internal.d.ts b/types/ext/translation-internal.d.ts index 7006221e..00056562 100644 --- a/types/ext/translation-internal.d.ts +++ b/types/ext/translation-internal.d.ts @@ -49,4 +49,14 @@ export type DatabaseDeinflection = { databaseEntries: DictionaryDatabase.TermEntry[]; }; -export type PreprocessorOptionsSpace = Map<string, Language.TextPreprocessorOptions<unknown>>; +export type TextProcessorOptionsSpace = Map<string, Language.TextProcessorOptions<unknown>>; + +export type TextProcessorMap = Map< + string, + { + textPreprocessors: Language.TextProcessorWithId<unknown>[]; + preprocessorOptionsSpace: TextProcessorOptionsSpace; + textPostprocessors: Language.TextProcessorWithId<unknown>[]; + postprocessorOptionsSpace: TextProcessorOptionsSpace; + } +>; |
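For reference, a simplified standalone sketch of the caching scheme introduced by the new Translator._applyTextProcessors above: results are memoized per input text, per processor id, per setting, so repeated option variants over the same substring reuse earlier output. The types here are illustrative TypeScript; the real implementation is plain JavaScript with JSDoc annotations and Yomitan's own type imports.

type TextProcessorWithId<T> = {
    id: string;
    textProcessor: {process: (str: string, setting: T) => string};
};

// textCache layout: input text -> processor id -> setting -> processed output
function applyTextProcessors(
    textProcessors: TextProcessorWithId<unknown>[],
    processorVariant: Map<string, unknown>,
    text: string,
    textCache: Map<string, Map<string, Map<unknown, string>>>,
): string {
    for (const {id, textProcessor: {process}} of textProcessors) {
        const setting = processorVariant.get(id);

        // First level: keyed by the current text value.
        let byProcessor = textCache.get(text);
        if (!byProcessor) {
            byProcessor = new Map();
            textCache.set(text, byProcessor);
        }

        // Second level: keyed by processor id.
        let bySetting = byProcessor.get(id);
        if (!bySetting) {
            bySetting = new Map();
            byProcessor.set(id, bySetting);
        }

        // Third level: keyed by the option setting for this variant.
        const cached = bySetting.get(setting);
        if (typeof cached === 'string') {
            text = cached;
        } else {
            text = process(text, setting);
            bySetting.set(setting, text);
        }
    }
    return text;
}

Because each processor's output becomes the next processor's cache key, preprocessor and postprocessor variants that converge on the same intermediate string share the remaining work, which is why _getAlgorithmDeinflections passes a single sourceCache across all substrings and option variants.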