21 files changed, 349 insertions, 694 deletions
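The first functional change in this diff drops the TextSourceMap parameter from applyTextReplacement() in ext/js/general/regex-util.js. The sketch below is illustrative only and not part of the commit; it shows a call under the simplified signature, reusing the replacement pattern from test/data/translator-test-inputs.json and the 打(う) input seen in the updated test expectations.

// Illustrative usage only — not part of the commit.
// applyTextReplacement() no longer takes a TextSourceMap argument.
import {applyTextReplacement} from './ext/js/general/regex-util.js';

// Pattern and replacement mirror the updated test inputs; the input string
// mirrors the "打(う)" source in the updated test results.
const pattern = new RegExp('\\(([^)]*)(?:\\))', 'g');
const stripped = applyTextReplacement('打(う)', pattern, '');
console.log(stripped); // "打"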
| diff --git a/.eslintrc.json b/.eslintrc.json index 8b08e827..e347c978 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -639,7 +639,6 @@                  "ext/js/general/cache-map.js",                  "ext/js/general/object-property-accessor.js",                  "ext/js/general/regex-util.js", -                "ext/js/general/text-source-map.js",                  "ext/js/language/ar/arabic-text-preprocessors.js",                  "ext/js/language/de/german-text-preprocessors.js",                  "ext/js/language/de/german-transforms.js", @@ -656,7 +655,7 @@                  "ext/js/language/ru/russian-text-preprocessors.js",                  "ext/js/language/sga/old-irish-transforms.js",                  "ext/js/language/sq/albanian-transforms.js", -                "ext/js/language/text-preprocessors.js", +                "ext/js/language/text-processors.js",                  "ext/js/language/translator.js",                  "ext/js/media/audio-downloader.js",                  "ext/js/media/media-util.js", diff --git a/ext/js/general/regex-util.js b/ext/js/general/regex-util.js index e0982154..c633ec06 100644 --- a/ext/js/general/regex-util.js +++ b/ext/js/general/regex-util.js @@ -23,13 +23,12 @@ const matchReplacementPattern = /\$(?:\$|&|`|'|(\d\d?)|<([^>]*)>)/g;   * Applies string.replace using a regular expression and replacement string as arguments.   * A source map of the changes is also maintained.   * @param {string} text A string of the text to replace. - * @param {import('./text-source-map.js').TextSourceMap} sourceMap An instance of `TextSourceMap` which corresponds to `text`.   * @param {RegExp} pattern A regular expression to use as the replacement.   * @param {string} replacement A replacement string that follows the format of the standard   *   JavaScript regular expression replacement string.   * @returns {string} A new string with the pattern replacements applied and the source map updated.   */ -export function applyTextReplacement(text, sourceMap, pattern, replacement) { +export function applyTextReplacement(text, pattern, replacement) {      const isGlobal = pattern.global;      if (isGlobal) { pattern.lastIndex = 0; }      for (let loop = true; loop; loop = isGlobal) { @@ -44,15 +43,6 @@ export function applyTextReplacement(text, sourceMap, pattern, replacement) {          text = `${text.substring(0, index)}${actualReplacement}${text.substring(index + matchText.length)}`;          pattern.lastIndex += delta; - -        if (actualReplacementLength > 0) { -            /** @type {number[]} */ -            const zeroes = new Array(actualReplacementLength).fill(0); -            sourceMap.insert(index, ...zeroes); -            sourceMap.combine(index - 1 + actualReplacementLength, matchText.length); -        } else { -            sourceMap.combine(index, matchText.length); -        }      }      return text;  } diff --git a/ext/js/general/text-source-map.js b/ext/js/general/text-source-map.js deleted file mode 100644 index 527c232b..00000000 --- a/ext/js/general/text-source-map.js +++ /dev/null @@ -1,153 +0,0 @@ -/* - * Copyright (C) 2023-2024  Yomitan Authors - * Copyright (C) 2020-2022  Yomichan Authors - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. 
- * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program.  If not, see <https://www.gnu.org/licenses/>. - */ - -export class TextSourceMap { -    /** -     * @param {string} source -     * @param {number[]|null} [mapping=null] -     */ -    constructor(source, mapping = null) { -        /** @type {string} */ -        this._source = source; -        /** @type {?number[]} */ -        this._mapping = (mapping !== null ? TextSourceMap.normalizeMapping(mapping) : null); -    } - -    /** @type {string} */ -    get source() { -        return this._source; -    } - -    /** -     * @param {unknown} other -     * @returns {boolean} -     */ -    equals(other) { -        if (this === other) { -            return true; -        } - -        const source = this._source; -        if (!(other instanceof TextSourceMap && source === other.source)) { -            return false; -        } - -        let mapping = this._mapping; -        let otherMapping = other.getMappingCopy(); -        if (mapping === null) { -            if (otherMapping === null) { -                return true; -            } -            mapping = TextSourceMap.createMapping(source); -        } else if (otherMapping === null) { -            otherMapping = TextSourceMap.createMapping(source); -        } - -        const mappingLength = mapping.length; -        if (mappingLength !== otherMapping.length) { -            return false; -        } - -        for (let i = 0; i < mappingLength; ++i) { -            if (mapping[i] !== otherMapping[i]) { -                return false; -            } -        } - -        return true; -    } - -    /** -     * @param {number} finalLength -     * @returns {number} -     */ -    getSourceLength(finalLength) { -        const mapping = this._mapping; -        if (mapping === null) { -            return finalLength; -        } - -        let sourceLength = 0; -        for (let i = 0; i < finalLength; ++i) { -            sourceLength += mapping[i]; -        } -        return sourceLength; -    } - -    /** -     * @param {number} index -     * @param {number} count -     */ -    combine(index, count) { -        if (count <= 0) { return; } - -        if (this._mapping === null) { -            this._mapping = TextSourceMap.createMapping(this._source); -        } - -        let sum = this._mapping[index]; -        const parts = this._mapping.splice(index + 1, count); -        for (const part of parts) { -            sum += part; -        } -        this._mapping[index] = sum; -    } - -    /** -     * @param {number} index -     * @param {number[]} items -     */ -    insert(index, ...items) { -        if (this._mapping === null) { -            this._mapping = TextSourceMap.createMapping(this._source); -        } - -        this._mapping.splice(index, 0, ...items); -    } - -    /** -     * @returns {?number[]} -     */ -    getMappingCopy() { -        return this._mapping !== null ? 
[...this._mapping] : null; -    } - -    /** -     * @param {string} text -     * @returns {number[]} -     */ -    static createMapping(text) { -        return new Array(text.length).fill(1); -    } - -    /** -     * @param {number[]} mapping -     * @returns {number[]} -     */ -    static normalizeMapping(mapping) { -        const result = []; -        for (const value of mapping) { -            result.push( -                (typeof value === 'number' && Number.isFinite(value)) ? -                Math.floor(value) : -                0 -            ); -        } -        return result; -    } -} diff --git a/ext/js/language/ar/arabic-text-preprocessors.js b/ext/js/language/ar/arabic-text-preprocessors.js index 6007d770..91535ccd 100644 --- a/ext/js/language/ar/arabic-text-preprocessors.js +++ b/ext/js/language/ar/arabic-text-preprocessors.js @@ -15,7 +15,7 @@   * along with this program.  If not, see <https://www.gnu.org/licenses/>.   */ -import {basicTextPreprocessorOptions} from '../text-preprocessors.js'; +import {basicTextProcessorOptions} from '../text-processors.js';  const optionalDiacritics = [      '\u0618', // Small Fatha @@ -38,11 +38,11 @@ const optionalDiacritics = [  const diacriticsRegex = new RegExp(`[${optionalDiacritics.join('')}]`, 'g'); -/** @type {import('language').TextPreprocessor<boolean>} */ +/** @type {import('language').TextProcessor<boolean>} */  export const removeArabicScriptDiacritics = {      name: 'Remove diacritics',      description: 'وَلَدَ ⬅️ ولد', -    options: basicTextPreprocessorOptions, +    options: basicTextProcessorOptions,      process: (text, setting) => {          return setting ? text.replace(diacriticsRegex, '') : text;      } diff --git a/ext/js/language/ja/japanese-text-preprocessors.js b/ext/js/language/ja/japanese-text-preprocessors.js index 06f944c1..b3d50817 100644 --- a/ext/js/language/ja/japanese-text-preprocessors.js +++ b/ext/js/language/ja/japanese-text-preprocessors.js @@ -15,7 +15,7 @@   * along with this program.  If not, see <https://www.gnu.org/licenses/>.   */ -import {basicTextPreprocessorOptions} from '../text-preprocessors.js'; +import {basicTextProcessorOptions} from '../text-processors.js';  import {convertAlphabeticToKana} from './japanese-wanakana.js';  import {      collapseEmphaticSequences as collapseEmphaticSequencesFunction, @@ -25,28 +25,28 @@ import {      convertNumericToFullWidth  } from './japanese.js'; -/** @type {import('language').TextPreprocessor<boolean>} */ +/** @type {import('language').TextProcessor<boolean>} */  export const convertHalfWidthCharacters = {      name: 'Convert half width characters to full width',      description: 'ヨミチャン → ヨミチャン', -    options: basicTextPreprocessorOptions, -    process: (str, setting, sourceMap) => (setting ? convertHalfWidthKanaToFullWidth(str, sourceMap) : str) +    options: basicTextProcessorOptions, +    process: (str, setting) => (setting ? convertHalfWidthKanaToFullWidth(str) : str)  }; -/** @type {import('language').TextPreprocessor<boolean>} */ +/** @type {import('language').TextProcessor<boolean>} */  export const convertNumericCharacters = {      name: 'Convert numeric characters to full width',      description: '1234 → 1234', -    options: basicTextPreprocessorOptions, +    options: basicTextProcessorOptions,      process: (str, setting) => (setting ? 
convertNumericToFullWidth(str) : str)  }; -/** @type {import('language').TextPreprocessor<boolean>} */ +/** @type {import('language').TextProcessor<boolean>} */  export const convertAlphabeticCharacters = {      name: 'Convert alphabetic characters to hiragana',      description: 'yomichan → よみちゃん', -    options: basicTextPreprocessorOptions, -    process: (str, setting, sourceMap) => (setting ? convertAlphabeticToKana(str, sourceMap) : str) +    options: basicTextProcessorOptions, +    process: (str, setting) => (setting ? convertAlphabeticToKana(str) : str)  };  /** @type {import('language').BidirectionalConversionPreprocessor} */ @@ -66,15 +66,15 @@ export const convertHiraganaToKatakana = {      }  }; -/** @type {import('language').TextPreprocessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>} */ +/** @type {import('language').TextProcessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>} */  export const collapseEmphaticSequences = {      name: 'Collapse emphatic character sequences',      description: 'すっっごーーい → すっごーい / すごい',      options: [[false, false], [true, false], [true, true]], -    process: (str, setting, sourceMap) => { +    process: (str, setting) => {          const [collapseEmphatic, collapseEmphaticFull] = setting;          if (collapseEmphatic) { -            str = collapseEmphaticSequencesFunction(str, collapseEmphaticFull, sourceMap); +            str = collapseEmphaticSequencesFunction(str, collapseEmphaticFull);          }          return str;      } diff --git a/ext/js/language/ja/japanese-wanakana.js b/ext/js/language/ja/japanese-wanakana.js index 32260489..a87db6b7 100644 --- a/ext/js/language/ja/japanese-wanakana.js +++ b/ext/js/language/ja/japanese-wanakana.js @@ -19,51 +19,10 @@ import * as wanakana from '../../../lib/wanakana.js';  /**   * @param {string} text - * @param {?import('../../general/text-source-map.js').TextSourceMap} sourceMap - * @param {number} sourceMapStart   * @returns {string}   */ -function convertAlphabeticPartToKana(text, sourceMap, sourceMapStart) { -    const result = wanakana.toHiragana(text); - -    // Generate source mapping -    if (sourceMap !== null) { -        let i = 0; -        let resultPos = 0; -        const ii = text.length; -        while (i < ii) { -            // Find smallest matching substring -            let iNext = i + 1; -            let resultPosNext = result.length; -            while (iNext < ii) { -                const t = wanakana.toHiragana(text.substring(0, iNext)); -                if (t === result.substring(0, t.length)) { -                    resultPosNext = t.length; -                    break; -                } -                ++iNext; -            } - -            // Merge characters -            const removals = iNext - i - 1; -            if (removals > 0) { -                sourceMap.combine(sourceMapStart, removals); -            } -            ++sourceMapStart; - -            // Empty elements -            const additions = resultPosNext - resultPos - 1; -            for (let j = 0; j < additions; ++j) { -                sourceMap.insert(sourceMapStart, 0); -                ++sourceMapStart; -            } - -            i = iNext; -            resultPos = resultPosNext; -        } -    } - -    return result; +function convertAlphabeticPartToKana(text) { +    return wanakana.toHiragana(text);  }  /** @@ -84,10 +43,9 @@ export function convertToRomaji(text) {  /**   * @param {string} text - * @param {?import('../../general/text-source-map.js').TextSourceMap} 
sourceMap   * @returns {string}   */ -export function convertAlphabeticToKana(text, sourceMap = null) { +export function convertAlphabeticToKana(text) {      let part = '';      let result = ''; @@ -106,7 +64,7 @@ export function convertAlphabeticToKana(text, sourceMap = null) {              c = 0x2d; // '-'          } else {              if (part.length > 0) { -                result += convertAlphabeticPartToKana(part, sourceMap, result.length); +                result += convertAlphabeticPartToKana(part);                  part = '';              }              result += char; @@ -116,7 +74,7 @@ export function convertAlphabeticToKana(text, sourceMap = null) {      }      if (part.length > 0) { -        result += convertAlphabeticPartToKana(part, sourceMap, result.length); +        result += convertAlphabeticPartToKana(part);      }      return result;  } diff --git a/ext/js/language/ja/japanese.js b/ext/js/language/ja/japanese.js index 2c9a1f7f..3507e5df 100644 --- a/ext/js/language/ja/japanese.js +++ b/ext/js/language/ja/japanese.js @@ -539,10 +539,9 @@ export function convertNumericToFullWidth(text) {  /**   * @param {string} text - * @param {?import('../../general/text-source-map.js').TextSourceMap} [sourceMap]   * @returns {string}   */ -export function convertHalfWidthKanaToFullWidth(text, sourceMap = null) { +export function convertHalfWidthKanaToFullWidth(text) {      let result = '';      // This function is safe to use charCodeAt instead of codePointAt, since all @@ -575,9 +574,6 @@ export function convertHalfWidthKanaToFullWidth(text, sourceMap = null) {              }          } -        if (sourceMap !== null && index > 0) { -            sourceMap.combine(result.length, 1); -        }          result += c2;      } @@ -705,13 +701,11 @@ export function distributeFuriganaInflected(term, reading, source) {  /**   * @param {string} text   * @param {boolean} fullCollapse - * @param {?import('../../general/text-source-map.js').TextSourceMap} [sourceMap]   * @returns {string}   */ -export function collapseEmphaticSequences(text, fullCollapse, sourceMap = null) { +export function collapseEmphaticSequences(text, fullCollapse) {      let result = '';      let collapseCodePoint = -1; -    const hasSourceMap = (sourceMap !== null);      for (const char of text) {          const c = char.codePointAt(0);          if ( @@ -729,11 +723,6 @@ export function collapseEmphaticSequences(text, fullCollapse, sourceMap = null)          } else {              collapseCodePoint = -1;              result += char; -            continue; -        } - -        if (hasSourceMap) { -            sourceMap.combine(Math.max(0, result.length - 1), 1);          }      }      return result; diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js index c5c3e01e..2df2f794 100644 --- a/ext/js/language/language-descriptors.js +++ b/ext/js/language/language-descriptors.js @@ -25,7 +25,7 @@ import {isStringPartiallyJapanese} from './ja/japanese.js';  import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js';  import {oldIrishTransforms} from './sga/old-irish-transforms.js';  import {albanianTransforms} from './sq/albanian-transforms.js'; -import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-preprocessors.js'; +import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js';  const capitalizationPreprocessors = {      decapitalize, @@ -138,8 +138,7 @@ const languageDescriptors = [      {   
       iso: 'km',          name: 'Khmer', -        exampleText: 'អាន', -        textPreprocessors: {} +        exampleText: 'អាន'      },      {          iso: 'pl', @@ -201,8 +200,7 @@ const languageDescriptors = [      {          iso: 'th',          name: 'Thai', -        exampleText: 'อ่าน', -        textPreprocessors: {} +        exampleText: 'อ่าน'      },      {          iso: 'tr', @@ -219,8 +217,7 @@ const languageDescriptors = [      {          iso: 'zh',          name: 'Chinese', -        exampleText: '读', -        textPreprocessors: {} +        exampleText: '读'      }  ]; diff --git a/ext/js/language/languages.js b/ext/js/language/languages.js index 4b196c2c..b3890aa6 100755 --- a/ext/js/language/languages.js +++ b/ext/js/language/languages.js @@ -29,21 +29,29 @@ export function getLanguageSummaries() {  }  /** - * @returns {import('language').LanguageAndPreprocessors[]} + * @returns {import('language').LanguageAndProcessors[]}   * @throws {Error}   */ -export function getAllLanguageTextPreprocessors() { +export function getAllLanguageTextProcessors() {      const results = []; -    for (const {iso, textPreprocessors} of languageDescriptorMap.values()) { -        /** @type {import('language').TextPreprocessorWithId<unknown>[]} */ +    for (const {iso, textPreprocessors = {}, textPostprocessors = {}} of languageDescriptorMap.values()) { +        /** @type {import('language').TextProcessorWithId<unknown>[]} */          const textPreprocessorsArray = [];          for (const [id, textPreprocessor] of Object.entries(textPreprocessors)) {              textPreprocessorsArray.push({                  id, -                textPreprocessor: /** @type {import('language').TextPreprocessor<unknown>} */ (textPreprocessor) +                textProcessor: /** @type {import('language').TextProcessor<unknown>} */ (textPreprocessor)              });          } -        results.push({iso, textPreprocessors: textPreprocessorsArray}); +        /** @type {import('language').TextProcessorWithId<unknown>[]} */ +        const textPostprocessorsArray = []; +        for (const [id, textPostprocessor] of Object.entries(textPostprocessors)) { +            textPostprocessorsArray.push({ +                id, +                textProcessor: /** @type {import('language').TextProcessor<unknown>} */ (textPostprocessor) +            }); +        } +        results.push({iso, textPreprocessors: textPreprocessorsArray, textPostprocessors: textPostprocessorsArray});      }      return results;  } diff --git a/ext/js/language/ru/russian-text-preprocessors.js b/ext/js/language/ru/russian-text-preprocessors.js index fc4472e9..fbda38c7 100644 --- a/ext/js/language/ru/russian-text-preprocessors.js +++ b/ext/js/language/ru/russian-text-preprocessors.js @@ -15,23 +15,23 @@   * along with this program.  If not, see <https://www.gnu.org/licenses/>.   */ -import {basicTextPreprocessorOptions} from '../text-preprocessors.js'; +import {basicTextProcessorOptions} from '../text-processors.js'; -/** @type {import('language').TextPreprocessor<boolean>} */ +/** @type {import('language').TextProcessor<boolean>} */  export const removeRussianDiacritics = {      name: 'Remove diacritics',      description: 'A\u0301 → A, a\u0301 → a', -    options: basicTextPreprocessorOptions, +    options: basicTextProcessorOptions,      process: (str, setting) => {          return setting ? 
str.replace(/\u0301/g, '') : str;      }  }; -/** @type {import('language').TextPreprocessor<boolean>} */ +/** @type {import('language').TextProcessor<boolean>} */  export const yoToE = {      name: 'Yo to E',      description: 'ё → е, Ё → Е', -    options: basicTextPreprocessorOptions, +    options: basicTextProcessorOptions,      process: (str, setting) => {          return setting ? str.replace(/ё/g, 'е').replace(/Ё/g, 'Е') : str;      } diff --git a/ext/js/language/text-preprocessors.js b/ext/js/language/text-processors.js index e33fccda..e7855df2 100755 --- a/ext/js/language/text-preprocessors.js +++ b/ext/js/language/text-processors.js @@ -15,22 +15,22 @@   * along with this program.  If not, see <https://www.gnu.org/licenses/>.   */ -/** @type {import('language').TextPreprocessorOptions<boolean>} */ -export const basicTextPreprocessorOptions = [false, true]; +/** @type {import('language').TextProcessorOptions<boolean>} */ +export const basicTextProcessorOptions = [false, true]; -/** @type {import('language').TextPreprocessor<boolean>} */ +/** @type {import('language').TextProcessor<boolean>} */  export const decapitalize = {      name: 'Decapitalize text',      description: 'CAPITALIZED TEXT → capitalized text', -    options: basicTextPreprocessorOptions, +    options: basicTextProcessorOptions,      process: (str, setting) => (setting ? str.toLowerCase() : str)  }; -/** @type {import('language').TextPreprocessor<boolean>} */ +/** @type {import('language').TextProcessor<boolean>} */  export const capitalizeFirstLetter = {      name: 'Capitalize first letter',      description: 'lowercase text → Lowercase text', -    options: basicTextPreprocessorOptions, +    options: basicTextProcessorOptions,      process: (str, setting) => (setting ? str.charAt(0).toUpperCase() + str.slice(1) : str)  }; @@ -39,11 +39,11 @@ export const capitalizeFirstLetter = {   *          as it can result in undesirable normalization:   *            - '\u9038'.normalize('NFD') => '\u9038' (逸)   *            - '\ufa67'.normalize('NFD') => '\u9038' (逸 => 逸) - * @type {import('language').TextPreprocessor<boolean>} + * @type {import('language').TextProcessor<boolean>}   */  export const removeAlphabeticDiacritics = {      name: 'Remove Alphabetic Diacritics',      description: 'ἄήé -> αηe', -    options: basicTextPreprocessorOptions, +    options: basicTextProcessorOptions,      process: (str, setting) => (setting ? 
str.normalize('NFD').replace(/[\u0300-\u036f]/g, '') : str)  }; diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js index 6132ee82..845d53d5 100644 --- a/ext/js/language/translator.js +++ b/ext/js/language/translator.js @@ -17,10 +17,9 @@   */  import {applyTextReplacement} from '../general/regex-util.js'; -import {TextSourceMap} from '../general/text-source-map.js';  import {isCodePointJapanese} from './ja/japanese.js';  import {LanguageTransformer} from './language-transformer.js'; -import {getAllLanguageTextPreprocessors} from './languages.js'; +import {getAllLanguageTextProcessors} from './languages.js';  import {MultiLanguageTransformer} from './multi-language-transformer.js';  /** @@ -41,8 +40,8 @@ export class Translator {          this._stringComparer = new Intl.Collator('en-US'); // Invariant locale          /** @type {RegExp} */          this._numberRegex = /[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?/; -        /** @type {Map<string, {textPreprocessors: import('language').TextPreprocessorWithId<unknown>[], optionSpace: import('translation-internal').PreprocessorOptionsSpace}>} */ -        this._textPreprocessors = new Map(); +        /** @type {import('translation-internal').TextProcessorMap} */ +        this._textProcessors = new Map();      }      /** @@ -50,13 +49,19 @@ export class Translator {       */      prepare() {          this._multiLanguageTransformer.prepare(); -        for (const {iso, textPreprocessors} of getAllLanguageTextPreprocessors()) { -            /** @type {Map<string, import('language').TextPreprocessorOptions<unknown>>} */ -            const optionSpace = new Map(); -            for (const {id, textPreprocessor} of textPreprocessors) { -                optionSpace.set(id, textPreprocessor.options); +        for (const {iso, textPreprocessors = [], textPostprocessors = []} of getAllLanguageTextProcessors()) { +            /** @type {import('translation-internal').TextProcessorOptionsSpace}>} */ +            const preprocessorOptionsSpace = new Map(); +            /** @type {import('translation-internal').TextProcessorOptionsSpace}>} */ +            const postprocessorOptionsSpace = new Map(); + +            for (const {id, textProcessor} of textPreprocessors) { +                preprocessorOptionsSpace.set(id, textProcessor.options);              } -            this._textPreprocessors.set(iso, {textPreprocessors, optionSpace}); +            for (const {id, textProcessor} of textPostprocessors) { +                postprocessorOptionsSpace.set(id, textProcessor.options); +            } +            this._textProcessors.set(iso, {textPreprocessors, preprocessorOptionsSpace, textPostprocessors, postprocessorOptionsSpace});          }      } @@ -428,7 +433,7 @@ export class Translator {          }      } -    // Deinflections and text preprocessing +    // Deinflections and text processing      /**       * @param {string} text @@ -438,57 +443,90 @@ export class Translator {       */      _getAlgorithmDeinflections(text, options) {          const {language} = options; -        const info = this._textPreprocessors.get(language); +        const info = this._textProcessors.get(language);          if (typeof info === 'undefined') { throw new Error(`Unsupported language: ${language}`); } -        const {textPreprocessors, optionSpace: textPreprocessorOptionsSpace} = info; +        const {textPreprocessors, preprocessorOptionsSpace, textPostprocessors, postprocessorOptionsSpace} = info; -        /** @type {Map<string, 
import('language').TextPreprocessorOptions<unknown>>} */ -        const variantSpace = new Map(); -        variantSpace.set('textReplacements', this._getTextReplacementsVariants(options)); -        for (const [key, value] of textPreprocessorOptionsSpace) { -            variantSpace.set(key, value); -        } +        const preprocessorVariantSpace = new Map(preprocessorOptionsSpace); +        preprocessorVariantSpace.set('textReplacements', this._getTextReplacementsVariants(options)); +        const preprocessorVariants = this._getArrayVariants(preprocessorVariantSpace); +        const postprocessorVariants = this._getArrayVariants(postprocessorOptionsSpace);          /** @type {import('translation-internal').DatabaseDeinflection[]} */          const deinflections = [];          const used = new Set(); +        /** @type {Map<string, import('core').SafeAny>} */ +        const sourceCache = new Map(); // For reusing text processors' outputs + +        for ( +            let i = text.length; +            i > 0; +            i = this._getNextSubstringLength(options.searchResolution, i, text) +        ) { +            const rawSource = text.substring(0, i); + +            for (const preprocessorVariant of preprocessorVariants) { +                let source = rawSource; + +                const textReplacements = /** @type {import('translation').FindTermsTextReplacement[] | null} */ (preprocessorVariant.get('textReplacements')); +                if (textReplacements !== null) { +                    source = this._applyTextReplacements(source, textReplacements); +                } -        for (const arrayVariant of this._generateArrayVariants(variantSpace)) { -            const textReplacements = /** @type {import('translation').FindTermsTextReplacement[] | null} */ (arrayVariant.get('textReplacements')); +                source = this._applyTextProcessors(textPreprocessors, preprocessorVariant, source, sourceCache); -            let text2 = text; -            const sourceMap = new TextSourceMap(text2); +                if (used.has(source)) { continue; } +                used.add(source); +                for (const deinflection of this._multiLanguageTransformer.transform(language, source)) { +                    const {trace, conditions} = deinflection; +                    for (const postprocessorVariant of postprocessorVariants) { +                        let {text: transformedText} = deinflection; +                        transformedText = this._applyTextProcessors(textPostprocessors, postprocessorVariant, transformedText, sourceCache); + +                        /** @type {import('dictionary').InflectionRuleChainCandidate} */ +                        const inflectionRuleChainCandidate = { +                            source: 'algorithm', +                            inflectionRules: trace.map((frame) => frame.transform) +                        }; +                        deinflections.push(this._createDeinflection(rawSource, source, transformedText, conditions, [inflectionRuleChainCandidate])); +                    } +                } +            } +        } +        return deinflections; +    } -            if (textReplacements !== null) { -                text2 = this._applyTextReplacements(text2, sourceMap, textReplacements); +    /** +     * @param {import('language').TextProcessorWithId<unknown>[]} textProcessors +     * @param {Map<string, unknown>} processorVariant +     * @param {string} text +     * @param {Map<string, import('core').SafeAny>} textCache +     * @returns 
{string} +     */ +    _applyTextProcessors(textProcessors, processorVariant, text, textCache) { +        for (const {id, textProcessor: {process}} of textProcessors) { +            const setting = processorVariant.get(id); +            let level1 = textCache.get(text); +            if (!level1) { +                level1 = new Map(); +                textCache.set(text, level1);              } -            for (const preprocessor of textPreprocessors.values()) { -                const {id, textPreprocessor} = preprocessor; -                const setting = arrayVariant.get(id); -                text2 = textPreprocessor.process(text2, setting, sourceMap); +            let level2 = level1.get(id); +            if (!level2) { +                level2 = new Map(); +                level1.set(id, level2);              } -            for ( -                let source = text2, i = text2.length; -                i > 0; -                i = this._getNextSubstringLength(options.searchResolution, i, source) -            ) { -                source = text2.substring(0, i); -                if (used.has(source)) { break; } -                used.add(source); -                const rawSource = sourceMap.source.substring(0, sourceMap.getSourceLength(i)); -                for (const {text: transformedText, conditions, trace} of this._multiLanguageTransformer.transform(language, source)) { -                    /** @type {import('dictionary').InflectionRuleChainCandidate} */ -                    const inflectionRuleChainCandidate = { -                        source: 'algorithm', -                        inflectionRules: trace.map((frame) => frame.transform) -                    }; -                    deinflections.push(this._createDeinflection(rawSource, source, transformedText, conditions, [inflectionRuleChainCandidate])); -                } +            if (!level2.has(setting)) { +                text = process(text, setting); +                level2.set(setting, text); +            } else { +                text = level2.get(setting);              }          } -        return deinflections; + +        return text;      }      /** @@ -507,13 +545,12 @@ export class Translator {      /**       * @param {string} text -     * @param {TextSourceMap} sourceMap       * @param {import('translation').FindTermsTextReplacement[]} replacements       * @returns {string}       */ -    _applyTextReplacements(text, sourceMap, replacements) { +    _applyTextReplacements(text, replacements) {          for (const {pattern, replacement} of replacements) { -            text = applyTextReplacement(text, sourceMap, pattern, replacement); +            text = applyTextReplacement(text, pattern, replacement);          }          return text;      } @@ -1325,10 +1362,11 @@ export class Translator {      /**       * @param {Map<string, unknown[]>} arrayVariants -     * @yields {Map<string, unknown>} -     * @returns {Generator<Map<string, unknown>, void, void>} +     * @returns {Map<string, unknown>[]}       */ -    *_generateArrayVariants(arrayVariants) { +    _getArrayVariants(arrayVariants) { +        /** @type {Map<string, unknown>[]} */ +        const results = [];          const variantKeys = [...arrayVariants.keys()];          const entryVariantLengths = [];          for (const key of variantKeys) { @@ -1350,8 +1388,9 @@ export class Translator {                  remainingIndex = Math.floor(remainingIndex / entryVariants.length);              } -            yield variant; +            results.push(variant);          } +       
 return results;      }      /** diff --git a/test/data/anki-note-builder-test-results.json b/test/data/anki-note-builder-test-results.json index d8d5ab0f..162be7fe 100644 --- a/test/data/anki-note-builder-test-results.json +++ b/test/data/anki-note-builder-test-results.json @@ -2548,8 +2548,8 @@          "audio": "",          "clipboard-image": "",          "clipboard-text": "", -        "cloze-body": "打", -        "cloze-body-kana": "だ", +        "cloze-body": "打(う)", +        "cloze-body-kana": "だ(う)",          "cloze-prefix": "cloze-prefix",          "cloze-suffix": "cloze-suffix",          "conjugation": "", @@ -2577,8 +2577,8 @@          "screenshot": "",          "search-query": "fullQuery",          "selection-text": "", -        "sentence": "cloze-prefix打cloze-suffix", -        "sentence-furigana": "cloze-prefix打cloze-suffix", +        "sentence": "cloze-prefix打(う)cloze-suffix", +        "sentence-furigana": "cloze-prefix打(う)cloze-suffix",          "tags": "n",          "url": "<a href=\"url:\">url:</a>"        }, @@ -2586,8 +2586,8 @@          "audio": "",          "clipboard-image": "",          "clipboard-text": "", -        "cloze-body": "打", -        "cloze-body-kana": "ダース", +        "cloze-body": "打(う)", +        "cloze-body-kana": "ダース(う)",          "cloze-prefix": "cloze-prefix",          "cloze-suffix": "cloze-suffix",          "conjugation": "", @@ -2615,8 +2615,8 @@          "screenshot": "",          "search-query": "fullQuery",          "selection-text": "", -        "sentence": "cloze-prefix打cloze-suffix", -        "sentence-furigana": "cloze-prefix打cloze-suffix", +        "sentence": "cloze-prefix打(う)cloze-suffix", +        "sentence-furigana": "cloze-prefix打(う)cloze-suffix",          "tags": "abbr, n",          "url": "<a href=\"url:\">url:</a>"        } diff --git a/test/data/translator-test-inputs.json b/test/data/translator-test-inputs.json index c9047716..9e62954e 100644 --- a/test/data/translator-test-inputs.json +++ b/test/data/translator-test-inputs.json @@ -191,7 +191,7 @@                          null,                          [                              { -                                "pattern": "\\(([^)]*)(?:\\)|$)", +                                "pattern": "\\(([^)]*)(?:\\))",                                  "flags": "g",                                  "replacement": ""                              } @@ -214,7 +214,7 @@                          null,                          [                              { -                                "pattern": "\\(([^)]*)(?:\\)|$)", +                                "pattern": "\\(([^)]*)(?:\\))",                                  "flags": "g",                                  "replacement": "$1"                              } diff --git a/test/data/translator-test-results-note-data1.json b/test/data/translator-test-results-note-data1.json index f580ac53..f0f32fa8 100644 --- a/test/data/translator-test-results-note-data1.json +++ b/test/data/translator-test-results-note-data1.json @@ -22773,7 +22773,7 @@            "type": "term",            "id": 1,            "source": "打", -          "rawSource": "打", +          "rawSource": "打(う)",            "sourceTerm": "打",            "inflectionRuleChainCandidates": [              { @@ -23087,7 +23087,7 @@            "type": "term",            "id": 2,            "source": "打", -          "rawSource": "打", +          "rawSource": "打(う)",            "sourceTerm": "打",            "inflectionRuleChainCandidates": [              { diff --git 
a/test/data/translator-test-results.json b/test/data/translator-test-results.json index da2f8da2..b3574b46 100644 --- a/test/data/translator-test-results.json +++ b/test/data/translator-test-results.json @@ -12904,7 +12904,7 @@          "dictionaryIndex": 0,          "dictionaryPriority": 0,          "sourceTermExactMatchCount": 1, -        "maxOriginalTextLength": 1, +        "maxOriginalTextLength": 4,          "headwords": [            {              "index": 0, @@ -12912,7 +12912,7 @@              "reading": "だ",              "sources": [                { -                "originalText": "打", +                "originalText": "打(う)",                  "transformedText": "打",                  "deinflectedText": "打",                  "matchType": "exact", @@ -13072,7 +13072,7 @@          "dictionaryIndex": 0,          "dictionaryPriority": 0,          "sourceTermExactMatchCount": 1, -        "maxOriginalTextLength": 1, +        "maxOriginalTextLength": 4,          "headwords": [            {              "index": 0, @@ -13080,7 +13080,7 @@              "reading": "ダース",              "sources": [                { -                "originalText": "打", +                "originalText": "打(う)",                  "transformedText": "打",                  "deinflectedText": "打",                  "matchType": "exact", diff --git a/test/japanese-util.test.js b/test/japanese-util.test.js index 5f64a714..bff51f85 100644 --- a/test/japanese-util.test.js +++ b/test/japanese-util.test.js @@ -17,7 +17,6 @@   */  import {describe, expect, test} from 'vitest'; -import {TextSourceMap} from '../ext/js/general/text-source-map.js';  import * as jpw from '../ext/js/language/ja/japanese-wanakana.js';  import * as jp from '../ext/js/language/ja/japanese.js'; @@ -194,54 +193,46 @@ describe('Japanese utility functions', () => {      });      describe('convertHalfWidthKanaToFullWidth', () => { -        /** @type {[string: string, expected: string, expectedSourceMapping?: number[]][]} */ +        /** @type {[string: string, expected: string][]} */          const data = [              ['0123456789', '0123456789'],              ['abcdefghij', 'abcdefghij'],              ['カタカナ', 'カタカナ'],              ['ひらがな', 'ひらがな'], -            ['カキ', 'カキ', [1, 1]], -            ['ガキ', 'ガキ', [2, 1]], -            ['ニホン', 'ニホン', [1, 1, 1]], -            ['ニッポン', 'ニッポン', [1, 1, 2, 1]] +            ['カキ', 'カキ'], +            ['ガキ', 'ガキ'], +            ['ニホン', 'ニホン'], +            ['ニッポン', 'ニッポン']          ]; -        for (const [string, expected, expectedSourceMapping] of data) { -            test(`${string} -> ${expected}${typeof expectedSourceMapping !== 'undefined' ? 
', ' + JSON.stringify(expectedSourceMapping) : ''}`, () => { -                const sourceMap = new TextSourceMap(string); -                const actual1 = jp.convertHalfWidthKanaToFullWidth(string, null); -                const actual2 = jp.convertHalfWidthKanaToFullWidth(string, sourceMap); +        for (const [string, expected] of data) { +            test(`${string} -> ${expected}`, () => { +                const actual1 = jp.convertHalfWidthKanaToFullWidth(string); +                const actual2 = jp.convertHalfWidthKanaToFullWidth(string);                  expect(actual1).toStrictEqual(expected);                  expect(actual2).toStrictEqual(expected); -                if (typeof expectedSourceMapping !== 'undefined') { -                    expect(sourceMap.equals(new TextSourceMap(string, expectedSourceMapping))).toBe(true); -                }              });          }      });      describe('convertAlphabeticToKana', () => { -        /** @type {[string: string, expected: string, expectedSourceMapping?: number[]][]} */ +        /** @type {[string: string, expected: string][]} */          const data = [              ['0123456789', '0123456789'], -            ['abcdefghij', 'あbcでfgひj', [1, 1, 1, 2, 1, 1, 2, 1]], -            ['ABCDEFGHIJ', 'あbcでfgひj', [1, 1, 1, 2, 1, 1, 2, 1]], // wanakana.toHiragana converts text to lower case +            ['abcdefghij', 'あbcでfgひj'], +            ['ABCDEFGHIJ', 'あbcでfgひj'], // wanakana.toHiragana converts text to lower case              ['カタカナ', 'カタカナ'],              ['ひらがな', 'ひらがな'], -            ['chikara', 'ちから', [3, 2, 2]], -            ['CHIKARA', 'ちから', [3, 2, 2]] +            ['chikara', 'ちから'], +            ['CHIKARA', 'ちから']          ]; -        for (const [string, expected, expectedSourceMapping] of data) { -            test(`${string} -> ${string}${typeof expectedSourceMapping !== 'undefined' ? 
', ' + JSON.stringify(expectedSourceMapping) : ''}`, () => { -                const sourceMap = new TextSourceMap(string); -                const actual1 = jpw.convertAlphabeticToKana(string, null); -                const actual2 = jpw.convertAlphabeticToKana(string, sourceMap); +        for (const [string, expected] of data) { +            test(`${string} -> ${string}`, () => { +                const actual1 = jpw.convertAlphabeticToKana(string); +                const actual2 = jpw.convertAlphabeticToKana(string);                  expect(actual1).toStrictEqual(expected);                  expect(actual2).toStrictEqual(expected); -                if (typeof expectedSourceMapping !== 'undefined') { -                    expect(sourceMap.equals(new TextSourceMap(string, expectedSourceMapping))).toBe(true); -                }              });          }      }); @@ -765,59 +756,54 @@ describe('Japanese utility functions', () => {      });      describe('collapseEmphaticSequences', () => { -        /** @type {[input: [text: string, fullCollapse: boolean], output: [expected: string, expectedSourceMapping: number[]]][]} */ +        /** @type {[input: [text: string, fullCollapse: boolean], output: string][]} */          const data = [ -            [['かこい', false], ['かこい', [1, 1, 1]]], -            [['かこい', true], ['かこい', [1, 1, 1]]], -            [['かっこい', false], ['かっこい', [1, 1, 1, 1]]], -            [['かっこい', true], ['かこい', [2, 1, 1]]], -            [['かっっこい', false], ['かっこい', [1, 2, 1, 1]]], -            [['かっっこい', true], ['かこい', [3, 1, 1]]], -            [['かっっっこい', false], ['かっこい', [1, 3, 1, 1]]], -            [['かっっっこい', true], ['かこい', [4, 1, 1]]], - -            [['こい', false], ['こい', [1, 1]]], -            [['こい', true], ['こい', [1, 1]]], -            [['っこい', false], ['っこい', [1, 1, 1]]], -            [['っこい', true], ['こい', [2, 1]]], -            [['っっこい', false], ['っこい', [2, 1, 1]]], -            [['っっこい', true], ['こい', [3, 1]]], -            [['っっっこい', false], ['っこい', [3, 1, 1]]], -            [['っっっこい', true], ['こい', [4, 1]]], - -            [['すごい', false], ['すごい', [1, 1, 1]]], -            [['すごい', true], ['すごい', [1, 1, 1]]], -            [['すごーい', false], ['すごーい', [1, 1, 1, 1]]], -            [['すごーい', true], ['すごい', [1, 2, 1]]], -            [['すごーーい', false], ['すごーい', [1, 1, 2, 1]]], -            [['すごーーい', true], ['すごい', [1, 3, 1]]], -            [['すっごーい', false], ['すっごーい', [1, 1, 1, 1, 1]]], -            [['すっごーい', true], ['すごい', [2, 2, 1]]], -            [['すっっごーーい', false], ['すっごーい', [1, 2, 1, 2, 1]]], -            [['すっっごーーい', true], ['すごい', [3, 3, 1]]], - -            [['', false], ['', []]], -            [['', true], ['', []]], -            [['っ', false], ['っ', [1]]], -            [['っ', true], ['', [1]]], -            [['っっ', false], ['っ', [2]]], -            [['っっ', true], ['', [2]]], -            [['っっっ', false], ['っ', [3]]], -            [['っっっ', true], ['', [3]]] +            [['かこい', false], 'かこい'], +            [['かこい', true], 'かこい'], +            [['かっこい', false], 'かっこい'], +            [['かっこい', true], 'かこい'], +            [['かっっこい', false], 'かっこい'], +            [['かっっこい', true], 'かこい'], +            [['かっっっこい', false], 'かっこい'], +            [['かっっっこい', true], 'かこい'], + +            [['こい', false], 'こい'], +            [['こい', true], 'こい'], +            [['っこい', false], 'っこい'], +            [['っこい', true], 'こい'], +            [['っっこい', false], 'っこい'], +            [['っっこい', true], 'こい'], +            [['っっっこい', false], 'っこい'], +            [['っっっこい', true], 'こい'], + 
+            [['すごい', false], 'すごい'], +            [['すごい', true], 'すごい'], +            [['すごーい', false], 'すごーい'], +            [['すごーい', true], 'すごい'], +            [['すごーーい', false], 'すごーい'], +            [['すごーーい', true], 'すごい'], +            [['すっごーい', false], 'すっごーい'], +            [['すっごーい', true], 'すごい'], +            [['すっっごーーい', false], 'すっごーい'], +            [['すっっごーーい', true], 'すごい'], + +            [['', false], ''], +            [['', true], ''], +            [['っ', false], 'っ'], +            [['っ', true], ''], +            [['っっ', false], 'っ'], +            [['っっ', true], ''], +            [['っっっ', false], 'っ'], +            [['っっっ', true], '']          ];          test.each(data)('%o -> %o', (input, output) => {              const [text, fullCollapse] = input; -            const [expected, expectedSourceMapping] = output; - -            const sourceMap = new TextSourceMap(text); -            const actual1 = jp.collapseEmphaticSequences(text, fullCollapse, null); -            const actual2 = jp.collapseEmphaticSequences(text, fullCollapse, sourceMap); -            expect(actual1).toStrictEqual(expected); -            expect(actual2).toStrictEqual(expected); -            if (typeof expectedSourceMapping !== 'undefined') { -                expect(sourceMap.equals(new TextSourceMap(text, expectedSourceMapping))).toBe(true); -            } + +            const actual1 = jp.collapseEmphaticSequences(text, fullCollapse); +            const actual2 = jp.collapseEmphaticSequences(text, fullCollapse); +            expect(actual1).toStrictEqual(output); +            expect(actual2).toStrictEqual(output);          });      }); diff --git a/test/text-source-map.test.js b/test/text-source-map.test.js deleted file mode 100644 index 09341774..00000000 --- a/test/text-source-map.test.js +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright (C) 2023-2024  Yomitan Authors - * Copyright (C) 2020-2022  Yomichan Authors - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program.  If not, see <https://www.gnu.org/licenses/>. 
- */ - -import {describe, expect, test} from 'vitest'; -import {TextSourceMap} from '../ext/js/general/text-source-map.js'; - -describe('TextSourceMap', () => { -    describe('Source', () => { -        const data = [ -            ['source1'], -            ['source2'], -            ['source3'] -        ]; - -        test.each(data)('source-test-%#', (source) => { -            const sourceMap = new TextSourceMap(source); -            expect(source).toStrictEqual(sourceMap.source); -        }); -    }); - -    describe('Equals', () => { -        /** @type {[args1: [source1: string, mapping1: ?(number[])], args2: [source2: string, mapping2: ?(number[])], expectedEquals: boolean][]} */ -        const data = [ -            [['source1', null], ['source1', null], true], -            [['source2', null], ['source2', null], true], -            [['source3', null], ['source3', null], true], - -            [['source1', [1, 1, 1, 1, 1, 1, 1]], ['source1', null], true], -            [['source2', [1, 1, 1, 1, 1, 1, 1]], ['source2', null], true], -            [['source3', [1, 1, 1, 1, 1, 1, 1]], ['source3', null], true], - -            [['source1', null], ['source1', [1, 1, 1, 1, 1, 1, 1]], true], -            [['source2', null], ['source2', [1, 1, 1, 1, 1, 1, 1]], true], -            [['source3', null], ['source3', [1, 1, 1, 1, 1, 1, 1]], true], - -            [['source1', [1, 1, 1, 1, 1, 1, 1]], ['source1', [1, 1, 1, 1, 1, 1, 1]], true], -            [['source2', [1, 1, 1, 1, 1, 1, 1]], ['source2', [1, 1, 1, 1, 1, 1, 1]], true], -            [['source3', [1, 1, 1, 1, 1, 1, 1]], ['source3', [1, 1, 1, 1, 1, 1, 1]], true], - -            [['source1', [1, 2, 1, 3]], ['source1', [1, 2, 1, 3]], true], -            [['source2', [1, 2, 1, 3]], ['source2', [1, 2, 1, 3]], true], -            [['source3', [1, 2, 1, 3]], ['source3', [1, 2, 1, 3]], true], - -            [['source1', [1, 3, 1, 2]], ['source1', [1, 2, 1, 3]], false], -            [['source2', [1, 3, 1, 2]], ['source2', [1, 2, 1, 3]], false], -            [['source3', [1, 3, 1, 2]], ['source3', [1, 2, 1, 3]], false], - -            [['source1', [1, 1, 1, 1, 1, 1, 1]], ['source4', [1, 1, 1, 1, 1, 1, 1]], false], -            [['source2', [1, 1, 1, 1, 1, 1, 1]], ['source5', [1, 1, 1, 1, 1, 1, 1]], false], -            [['source3', [1, 1, 1, 1, 1, 1, 1]], ['source6', [1, 1, 1, 1, 1, 1, 1]], false] -        ]; - -        test.each(data)('equals-test-%#', ([source1, mapping1], [source2, mapping2], expectedEquals) => { -            const sourceMap1 = new TextSourceMap(source1, mapping1); -            const sourceMap2 = new TextSourceMap(source2, mapping2); -            expect(sourceMap1.equals(sourceMap1)).toBe(true); -            expect(sourceMap2.equals(sourceMap2)).toBe(true); -            expect(sourceMap1.equals(sourceMap2)).toStrictEqual(expectedEquals); -        }); -    }); - -    describe('GetSourceLength', () => { -        /** @type {[args: [source: string, mapping: number[]], finalLength: number, expectedValue: number][]} */ -        const data = [ -            [['source', [1, 1, 1, 1, 1, 1]], 1, 1], -            [['source', [1, 1, 1, 1, 1, 1]], 2, 2], -            [['source', [1, 1, 1, 1, 1, 1]], 3, 3], -            [['source', [1, 1, 1, 1, 1, 1]], 4, 4], -            [['source', [1, 1, 1, 1, 1, 1]], 5, 5], -            [['source', [1, 1, 1, 1, 1, 1]], 6, 6], - -            [['source', [2, 2, 2]], 1, 2], -            [['source', [2, 2, 2]], 2, 4], -            [['source', [2, 2, 2]], 3, 6], - -            [['source', [3, 3]], 1, 3], -            
[['source', [3, 3]], 2, 6],
-
-            [['source', [6, 6]], 1, 6]
-        ];
-
-        test.each(data)('get-source-length-test-%#', ([source, mapping], finalLength, expectedValue) => {
-            const sourceMap = new TextSourceMap(source, mapping);
-            expect(sourceMap.getSourceLength(finalLength)).toStrictEqual(expectedValue);
-        });
-    });
-
-    describe('CombineInsert', () => {
-        /** @type {[args: [source: string, mapping: ?(number[])], expectedArgs: [expectedSource: string, expectedMapping: ?(number[])], operations: [operation: string, arg1: number, arg2: number][]][]} */
-        const data = [
-            // No operations
-            [
-                ['source', null],
-                ['source', [1, 1, 1, 1, 1, 1]],
-                []
-            ],
-
-            // Combine
-            [
-                ['source', null],
-                ['source', [3, 1, 1, 1]],
-                [
-                    ['combine', 0, 2]
-                ]
-            ],
-            [
-                ['source', null],
-                ['source', [1, 1, 1, 3]],
-                [
-                    ['combine', 3, 2]
-                ]
-            ],
-            [
-                ['source', null],
-                ['source', [3, 3]],
-                [
-                    ['combine', 0, 2],
-                    ['combine', 1, 2]
-                ]
-            ],
-            [
-                ['source', null],
-                ['source', [3, 3]],
-                [
-                    ['combine', 3, 2],
-                    ['combine', 0, 2]
-                ]
-            ],
-
-            // Insert
-            [
-                ['source', null],
-                ['source', [0, 1, 1, 1, 1, 1, 1]],
-                [
-                    ['insert', 0, 0]
-                ]
-            ],
-            [
-                ['source', null],
-                ['source', [1, 1, 1, 1, 1, 1, 0]],
-                [
-                    ['insert', 6, 0]
-                ]
-            ],
-            [
-                ['source', null],
-                ['source', [0, 1, 1, 1, 1, 1, 1, 0]],
-                [
-                    ['insert', 0, 0],
-                    ['insert', 7, 0]
-                ]
-            ],
-            [
-                ['source', null],
-                ['source', [0, 1, 1, 1, 1, 1, 1, 0]],
-                [
-                    ['insert', 6, 0],
-                    ['insert', 0, 0]
-                ]
-            ],
-
-            // Mixed
-            [
-                ['source', null],
-                ['source', [3, 0, 3]],
-                [
-                    ['combine', 0, 2],
-                    ['insert', 1, 0],
-                    ['combine', 2, 2]
-                ]
-            ],
-            [
-                ['source', null],
-                ['source', [3, 0, 3]],
-                [
-                    ['combine', 0, 2],
-                    ['combine', 1, 2],
-                    ['insert', 1, 0]
-                ]
-            ],
-            [
-                ['source', null],
-                ['source', [3, 0, 3]],
-                [
-                    ['insert', 3, 0],
-                    ['combine', 0, 2],
-                    ['combine', 2, 2]
-                ]
-            ]
-        ];
-
-        test.each(data)('combine-insert-test-%#', ([source, mapping], [expectedSource, expectedMapping], operations) => {
-            const sourceMap = new TextSourceMap(source, mapping);
-            const expectedSourceMap = new TextSourceMap(expectedSource, expectedMapping);
-            for (const [operation, ...args] of operations) {
-                switch (operation) {
-                    case 'combine':
-                        sourceMap.combine(...args);
-                        break;
-                    case 'insert':
-                        sourceMap.insert(...args);
-                        break;
-                }
-            }
-            expect(sourceMap.equals(expectedSourceMap)).toBe(true);
-        });
-    });
-});
diff --git a/types/ext/language-descriptors.d.ts b/types/ext/language-descriptors.d.ts
index 6674b28c..37da106c 100644
--- a/types/ext/language-descriptors.d.ts
+++ b/types/ext/language-descriptors.d.ts
@@ -15,13 +15,17 @@
  * along with this program.  If not, see <https://www.gnu.org/licenses/>.
  */
 
-import type {TextPreprocessor, BidirectionalConversionPreprocessor} from './language';
+import type {TextProcessor, BidirectionalConversionPreprocessor} from './language';
 import type {LanguageTransformDescriptor} from './language-transformer';
 import type {SafeAny} from './core';
 
 export type IsTextLookupWorthyFunction = (text: string) => boolean;
 
-type LanguageDescriptor<TIso extends string, TTextPreprocessorDescriptor extends TextPreprocessorDescriptor> = {
+type LanguageDescriptor<
+    TIso extends string,
+    TTextPreprocessorDescriptor extends TextProcessorDescriptor = Record<string, never>,
+    TTextPostprocessorDescriptor extends TextProcessorDescriptor = Record<string, never>,
+> = {
     iso: TIso;
     name: string;
     exampleText: string;
@@ -32,75 +36,126 @@ type LanguageDescriptor<TIso extends string, TTextPreprocessorDescriptor extends
      * If no value is provided, `true` is assumed for all inputs.
      */
     isTextLookupWorthy?: IsTextLookupWorthyFunction;
-    textPreprocessors: TTextPreprocessorDescriptor;
+    textPreprocessors?: TTextPreprocessorDescriptor;
+    textPostprocessors?: TTextPostprocessorDescriptor;
     languageTransforms?: LanguageTransformDescriptor;
 };
 
-type TextPreprocessorDescriptor = {
-    [key: string]: TextPreprocessor<SafeAny>;
+type TextProcessorDescriptor = {
+    [key: string]: TextProcessor<SafeAny>;
 };
 
 type LanguageDescriptorObjectMap = {
-    [key in keyof AllTextPreprocessors]: LanguageDescriptor<key, AllTextPreprocessors[key]>;
+    [key in keyof AllTextProcessors]: LanguageDescriptor<
+        key,
+        AllTextProcessors[key] extends {pre: TextProcessorDescriptor} ? AllTextProcessors[key]['pre'] : Record<string, never>,
+        AllTextProcessors[key] extends {post: TextProcessorDescriptor} ? AllTextProcessors[key]['post'] : Record<string, never>
+    >;
 };
 
 export type LanguageDescriptorAny = LanguageDescriptorObjectMap[keyof LanguageDescriptorObjectMap];
 
 type CapitalizationPreprocessors = {
-    capitalizeFirstLetter: TextPreprocessor<boolean>;
-    decapitalize: TextPreprocessor<boolean>;
+    capitalizeFirstLetter: TextProcessor<boolean>;
+    decapitalize: TextProcessor<boolean>;
 };
 
 /**
- * This is a mapping of the iso tag to all of the preprocessors for that language.
+ * This is a mapping of the iso tag to all of the text processors for that language.
  * Any new language should be added to this object.
  */
-type AllTextPreprocessors = {
+type AllTextProcessors = {
     ar: {
-        removeArabicScriptDiacritics: TextPreprocessor<boolean>;
+        pre: {
+            removeArabicScriptDiacritics: TextProcessor<boolean>;
+        };
     };
-    de: CapitalizationPreprocessors & {
-        eszettPreprocessor: BidirectionalConversionPreprocessor;
+    de: {
+        pre: CapitalizationPreprocessors & {
+            eszettPreprocessor: BidirectionalConversionPreprocessor;
+        };
+    };
+    el: {
+        pre: CapitalizationPreprocessors;
+    };
+    en: {
+        pre: CapitalizationPreprocessors;
+    };
+    es: {
+        pre: CapitalizationPreprocessors;
     };
-    el: CapitalizationPreprocessors;
-    en: CapitalizationPreprocessors;
-    es: CapitalizationPreprocessors;
     fa: {
-        removeArabicScriptDiacritics: TextPreprocessor<boolean>;
+        pre: {
+            removeArabicScriptDiacritics: TextProcessor<boolean>;
+        };
+    };
+    fr: {
+        pre: CapitalizationPreprocessors;
+    };
+    grc: {
+        pre: CapitalizationPreprocessors & {
+            removeAlphabeticDiacritics: TextProcessor<boolean>;
+        };
     };
-    fr: CapitalizationPreprocessors;
-    grc: CapitalizationPreprocessors & {
-        removeAlphabeticDiacritics: TextPreprocessor<boolean>;
+    hu: {
+        pre: CapitalizationPreprocessors;
     };
-    hu: CapitalizationPreprocessors;
-    id: CapitalizationPreprocessors;
-    it: CapitalizationPreprocessors;
-    la: CapitalizationPreprocessors & {
-        removeAlphabeticDiacritics: TextPreprocessor<boolean>;
+    id: {
+        pre: CapitalizationPreprocessors;
+    };
+    it: {
+        pre: CapitalizationPreprocessors;
+    };
+    la: {
+        pre: CapitalizationPreprocessors & {
+            removeAlphabeticDiacritics: TextProcessor<boolean>;
+        };
     };
     ja: {
-        convertHalfWidthCharacters: TextPreprocessor<boolean>;
-        convertNumericCharacters: TextPreprocessor<boolean>;
-        convertAlphabeticCharacters: TextPreprocessor<boolean>;
-        convertHiraganaToKatakana: BidirectionalConversionPreprocessor;
-        collapseEmphaticSequences: TextPreprocessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>;
+        pre: {
+            convertHalfWidthCharacters: TextProcessor<boolean>;
+            convertNumericCharacters: TextProcessor<boolean>;
+            convertAlphabeticCharacters: TextProcessor<boolean>;
+            convertHiraganaToKatakana: BidirectionalConversionPreprocessor;
+            collapseEmphaticSequences: TextProcessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>;
+        };
     };
     km: Record<string, never>;
-    pl: CapitalizationPreprocessors;
-    pt: CapitalizationPreprocessors;
-    ro: CapitalizationPreprocessors;
-    ru: CapitalizationPreprocessors & {
-        yoToE: TextPreprocessor<boolean>;
-        removeRussianDiacritics: TextPreprocessor<boolean>;
-    };
-    sga: CapitalizationPreprocessors & {
-        removeAlphabeticDiacritics: TextPreprocessor<boolean>;
-    };
-    sh: CapitalizationPreprocessors;
-    sq: CapitalizationPreprocessors;
-    sv: CapitalizationPreprocessors;
+    pl: {
+        pre: CapitalizationPreprocessors;
+    };
+    pt: {
+        pre: CapitalizationPreprocessors;
+    };
+    ro: {
+        pre: CapitalizationPreprocessors;
+    };
+    ru: {
+        pre: CapitalizationPreprocessors & {
+            yoToE: TextProcessor<boolean>;
+            removeRussianDiacritics: TextProcessor<boolean>;
+        };
+    };
+    sga: {
+        pre: CapitalizationPreprocessors & {
+            removeAlphabeticDiacritics: TextProcessor<boolean>;
+        };
+    };
+    sh: {
+        pre: CapitalizationPreprocessors;
+    };
+    sq: {
+        pre: CapitalizationPreprocessors;
+    };
+    sv: {
+        pre: CapitalizationPreprocessors;
+    };
     th: Record<string, never>;
-    tr: CapitalizationPreprocessors;
-    vi: CapitalizationPreprocessors;
+    tr: {
+        pre: CapitalizationPreprocessors;
+    };
+    vi: {
+        pre: CapitalizationPreprocessors;
+    };
     zh: Record<string, never>;
 };
diff --git a/types/ext/language.d.ts b/types/ext/language.d.ts
index 8fa6f0e7..c708f6e7 100644
--- a/types/ext/language.d.ts
+++ b/types/ext/language.d.ts
@@ -15,32 +15,32 @@
  * along with this program.  If not, see <https://www.gnu.org/licenses/>.
  */
 
-import type {TextSourceMap} from '../../ext/js/general/text-source-map.js';
 import type {LanguageTransformDescriptor} from './language-transformer.js';
 
-export type TextPreprocessorOptions<T = unknown> = T[];
+export type TextProcessorOptions<T = unknown> = T[];
 
-export type TextPreprocessorFunction<T = unknown> = (str: string, setting: T, sourceMap: TextSourceMap) => string;
+export type TextProcessorFunction<T = unknown> = (str: string, setting: T) => string;
 
 /**
- * Text preprocessors are used during the translation process to create alternate versions of the input text to search for.
+ * Text pre- and post-processors are used during the translation process to create alternate versions of the input text to search for.
  * This is helpful when the input text doesn't exactly match the term or expression found in the database.
- * When a language has multiple preprocessors, the translator will generate variants of the text by applying all combinations of the preprocessors.
+ * When a language has multiple processors, the translator will generate variants of the text by applying all combinations of the processors.
  */
-export type TextPreprocessor<T = unknown> = {
+export type TextProcessor<T = unknown> = {
     name: string;
     description: string;
-    options: TextPreprocessorOptions<T>;
-    process: TextPreprocessorFunction<T>;
+    options: TextProcessorOptions<T>;
+    process: TextProcessorFunction<T>;
 };
 
 export type BidirectionalPreprocessorOptions = 'off' | 'direct' | 'inverse';
 
-export type BidirectionalConversionPreprocessor = TextPreprocessor<BidirectionalPreprocessorOptions>;
+export type BidirectionalConversionPreprocessor = TextProcessor<BidirectionalPreprocessorOptions>;
 
-export type LanguageAndPreprocessors = {
+export type LanguageAndProcessors = {
     iso: string;
-    textPreprocessors: TextPreprocessorWithId<unknown>[];
+    textPreprocessors?: TextProcessorWithId<unknown>[];
+    textPostprocessors?: TextProcessorWithId<unknown>[];
 };
 
 export type LanguageAndTransforms = {
@@ -48,9 +48,9 @@ export type LanguageAndTransforms = {
     languageTransforms: LanguageTransformDescriptor;
 };
 
-export type TextPreprocessorWithId<T = unknown> = {
+export type TextProcessorWithId<T = unknown> = {
     id: string;
-    textPreprocessor: TextPreprocessor<T>;
+    textProcessor: TextProcessor<T>;
 };
 
 export type LanguageSummary = {
diff --git a/types/ext/translation-internal.d.ts b/types/ext/translation-internal.d.ts
index 7006221e..00056562 100644
--- a/types/ext/translation-internal.d.ts
+++ b/types/ext/translation-internal.d.ts
@@ -49,4 +49,14 @@ export type DatabaseDeinflection = {
     databaseEntries: DictionaryDatabase.TermEntry[];
 };
 
-export type PreprocessorOptionsSpace = Map<string, Language.TextPreprocessorOptions<unknown>>;
+export type TextProcessorOptionsSpace = Map<string, Language.TextProcessorOptions<unknown>>;
+
+export type TextProcessorMap = Map<
+    string,
+    {
+        textPreprocessors: Language.TextProcessorWithId<unknown>[];
+        preprocessorOptionsSpace: TextProcessorOptionsSpace;
+        textPostprocessors: Language.TextProcessorWithId<unknown>[];
+        postprocessorOptionsSpace: TextProcessorOptionsSpace;
+    }
+>;
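Note (illustrative sketch, not part of the change above): the TypeScript below shows how the reshaped TextProcessor<T> contract from types/ext/language.d.ts can be used, and how a caller might expand input text into lookup variants by trying every combination of processor settings, which is the behaviour the updated doc comment describes. Only the type shapes mirror the diff; the decapitalize body and the collectVariants helper are hypothetical stand-ins, not the extension's actual implementations.

// Shapes mirrored from types/ext/language.d.ts after this change.
type TextProcessorOptions<T = unknown> = T[];
type TextProcessorFunction<T = unknown> = (str: string, setting: T) => string;

type TextProcessor<T = unknown> = {
    name: string;
    description: string;
    options: TextProcessorOptions<T>;
    process: TextProcessorFunction<T>;
};

// Hypothetical boolean-valued processor; `options` enumerates every setting a caller may try.
const decapitalize: TextProcessor<boolean> = {
    name: 'Decapitalize text',
    description: 'Lowercase the first character before looking the text up.',
    options: [false, true],
    process: (str, setting) => (setting ? str.charAt(0).toLowerCase() + str.substring(1) : str),
};

// Hypothetical helper: expand `text` into every variant produced by all combinations of
// processor settings. `any` plays the role SafeAny plays in the descriptor types, since
// the processor list is heterogeneous and the setting type cannot be pinned down statically.
function collectVariants(text: string, processors: TextProcessor<any>[]): Set<string> {
    let variants = new Set<string>([text]);
    for (const processor of processors) {
        const next = new Set<string>();
        for (const variant of variants) {
            for (const setting of processor.options) {
                next.add(processor.process(variant, setting));
            }
        }
        variants = next;
    }
    return variants;
}

// collectVariants('Hello', [decapitalize]) -> Set {'Hello', 'hello'}

In the descriptors themselves, such processors would be registered per language under the now-optional textPreprocessors/textPostprocessors keys, and the new TextProcessorMap in translation-internal.d.ts keys both lists, together with their option spaces, by language iso code.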