From 4aaa9f15d97668203741c1731f15e710ae8b8294 Mon Sep 17 00:00:00 2001 From: StefanVukovic99 Date: Sat, 17 Feb 2024 02:45:24 +0100 Subject: add language select, abstract text transformations (#584) * Copy functions from JapaneseUtil * Remove JapaneseUtil * Update usages of JapaneseUtil functions * part1 * frotend done? * fix tests * offscreen and type complications * add tests * start fixing tests * keep fixing tests * fix tests * Copy functions from JapaneseUtil * Remove JapaneseUtil * Update usages of JapaneseUtil functions * delete pt * renames * add tests * kebab-case filenames * lint * minor fixes * merge * fixes * fix part of comments * fix more comments * delete unused types * comment * comment * do backend * other files * move fetch utils to own file * remove extra line * add extra line * remove unnecessary export * simplify folder structure * remove redundant async * fix param type in api * fix language index * undo changes to cssStyleApplier * undo changes to utilities.js * undo changes to utilities.js * simplify language util * lint * undo phantom changes to anki integration * require textTransformations options * explicit locale in localeCompare * punctuate notes * prefer early exit * rename LanguageOptionsObjectMap * rename to textPreprocessor * tuple with names instead of boolean array * safe data setting * optional chaining * simplify LanguageOptions * encapsulate languages * delete language util * nullable language in text preprocessors controller * rename transform to process * remove settings * make translation advanced again * remove unused getTextTransformations api call * comments * change language types * RIP flags * comments * fix tests * lint * Text preprocessor type changes (#10) * Add types * Update types * Simplify type check * Refactor typing and structuring of language definitions * lint * update translator benchmark * undo markdown changes * undo markdown changes * undo markdown changes * more merge * simplify language controller 
--------- Co-authored-by: toasted-nutbread Co-authored-by: Darius Jahandarie --- .eslintrc.json | 4 + benches/translator.bench.js | 15 +- dev/jsconfig.json | 3 + docs/anki-integration.md | 120 ++--- ext/data/schemas/options-schema.json | 41 +- ext/js/background/backend.js | 26 +- ext/js/comm/api.js | 7 + ext/js/data/options-util.js | 28 +- ext/js/language/en/language-english.js | 29 ++ ext/js/language/ja/language-japanese.js | 77 +++ ext/js/language/languages.js | 61 +++ ext/js/language/text-preprocessors.js | 35 ++ ext/js/language/translator.js | 134 +++-- ext/js/pages/settings/languages-controller.js | 49 ++ ext/js/pages/settings/settings-main.js | 4 + ext/settings.html | 91 +--- test/data/anki-note-builder-test-results.json | 135 ++++- test/data/database-test-cases.json | 6 +- .../valid-dictionary1/term_bank_1.json | 4 +- test/data/translator-test-inputs.json | 60 ++- test/data/translator-test-results-note-data1.json | 561 ++++++++++++++++++++- test/data/translator-test-results.json | 349 ++++++++++++- test/options-util.test.js | 9 +- test/utilities/translator.js | 18 +- types/ext/api.d.ts | 5 + types/ext/language-english.d.ts | 25 + types/ext/language-japanese.d.ts | 29 ++ types/ext/language.d.ts | 57 +++ types/ext/settings.d.ts | 1 + types/ext/translation-internal.d.ts | 3 + types/ext/translation.d.ts | 38 +- types/test/translator.d.ts | 11 +- 32 files changed, 1648 insertions(+), 387 deletions(-) create mode 100644 ext/js/language/en/language-english.js create mode 100644 ext/js/language/ja/language-japanese.js create mode 100755 ext/js/language/languages.js create mode 100755 ext/js/language/text-preprocessors.js create mode 100755 ext/js/pages/settings/languages-controller.js create mode 100644 types/ext/language-english.d.ts create mode 100644 types/ext/language-japanese.d.ts create mode 100644 types/ext/language.d.ts diff --git a/.eslintrc.json b/.eslintrc.json index 361e5f24..a5f0f85e 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -637,9 +637,13 @@ 
"ext/js/general/object-property-accessor.js", "ext/js/general/regex-util.js", "ext/js/general/text-source-map.js", + "ext/js/language/en/language-english.js", "ext/js/language/ja/japanese-wanakana.js", "ext/js/language/ja/japanese.js", + "ext/js/language/ja/language-japanese.js", "ext/js/language/language-transformer.js", + "ext/js/language/languages.js", + "ext/js/language/text-preprocessors.js", "ext/js/language/translator.js", "ext/js/media/audio-downloader.js", "ext/js/media/media-util.js", diff --git a/benches/translator.bench.js b/benches/translator.bench.js index 1231c31c..8b9b5118 100644 --- a/benches/translator.bench.js +++ b/benches/translator.bench.js @@ -20,8 +20,8 @@ import {fileURLToPath} from 'node:url'; import path from 'path'; import {bench, describe} from 'vitest'; import {parseJson} from '../dev/json.js'; -import {createFindKanjiOptions, createFindTermsOptions} from '../test/utilities/translator.js'; import {createTranslatorContext} from '../test/fixtures/translator-test.js'; +import {createFindKanjiOptions, createFindTermsOptions} from '../test/utilities/translator.js'; const dirname = path.dirname(fileURLToPath(import.meta.url)); const dictionaryName = 'Test Dictionary 2'; @@ -33,10 +33,9 @@ describe('Translator', () => { const {optionsPresets, tests} = parseJson(readFileSync(testInputsFilePath, {encoding: 'utf8'})); const findKanjiTests = tests.filter((data) => data.options === 'kanji'); - const findTermTests = tests.filter((data) => data.options === 'default'); - const findTermWithTextTransformationsTests = tests.filter((data) => data.options !== 'kanji' && data.options !== 'default'); + const findTermTests = tests.filter((data) => data.options !== 'kanji'); - bench(`Translator.prototype.findTerms - no text transformations (n=${findTermTests.length})`, async () => { + bench(`Translator.prototype.findTerms - (n=${findTermTests.length})`, async () => { for (const data of /** @type {import('test/translator').TestInputFindTerm[]} */ 
(findTermTests)) { const {mode, text} = data; const options = createFindTermsOptions(dictionaryName, optionsPresets, data.options); @@ -44,14 +43,6 @@ describe('Translator', () => { } }); - bench(`Translator.prototype.findTerms - text transformations (n=${findTermWithTextTransformationsTests.length})`, async () => { - for (const data of /** @type {import('test/translator').TestInputFindTerm[]} */ (findTermWithTextTransformationsTests)) { - const {mode, text} = data; - const options = createFindTermsOptions(dictionaryName, optionsPresets, data.options); - await translator.findTerms(mode, text, options); - } - }); - bench(`Translator.prototype.findKanji - (n=${findKanjiTests.length})`, async () => { for (const data of /** @type {import('test/translator').TestInputFindKanji[]} */ (findKanjiTests)) { const {text} = data; diff --git a/dev/jsconfig.json b/dev/jsconfig.json index d9465108..6a5fb13b 100644 --- a/dev/jsconfig.json +++ b/dev/jsconfig.json @@ -28,6 +28,9 @@ "error": ["../types/ext/error"], "event-listener-collection": ["../types/ext/event-listener-collection"], "japanese-util": ["../types/ext/japanese-util"], + "language": ["../types/ext/language"], + "language-english": ["../types/ext/language-english"], + "language-japanese": ["../types/ext/language-japanese"], "ext/json-schema": ["../types/ext/json-schema"], "language-transformer": ["../types/ext/language-transformer"], "language-transformer-internal": ["../types/ext/language-transformer-internal"], diff --git a/docs/anki-integration.md b/docs/anki-integration.md index 2bd9fad9..9bd5bd94 100644 --- a/docs/anki-integration.md +++ b/docs/anki-integration.md @@ -23,71 +23,71 @@ Flashcard fields can be configured with the following steps: #### Markers for Term Cards - | Marker | Description | - | -------------------------- | ------------------------------------------------------------------------------------------------------------------------ | - | `{audio}` | Audio sample of a native speaker's pronunciation 
in MP3 format (if available). | - | `{clipboard-image}` | An image which is stored in the system clipboard, if present. | - | `{clipboard-text}` | Text which is stored in the system clipboard, if present. | - | `{cloze-body}` | Raw, inflected term as it appeared before being reduced to dictionary form by Yomitan. | - | `{cloze-body-kana}` | Kana reading for `{cloze-body}`. | - | `{cloze-prefix}` | Fragment of the containing `{sentence}` starting at the beginning of `{sentence}` until the beginning of `{cloze-body}`. | - | `{cloze-suffix}` | Fragment of the containing `{sentence}` starting at the end of `{cloze-body}` until the end of `{sentence}`. | - | `{conjugation}` | Conjugation path from the raw inflected term to the source term. | - | `{dictionary}` | Name of the dictionary from which the card is being created (unavailable in _grouped_ mode). | - | `{document-title}` | Title of the web page that the term appeared in. | - | `{expression}` | Term expressed as kanji (will be displayed in kana if kanji is not available). | - | `{frequencies}` | Frequency information for the term. | - | `{frequency-harmonic-rank}` | The harmonic mean of frequency data for the current term. Defaults to rank 9999999 when frequency data is not found, indicating extremely low rank-based term usage. | + | Marker | Description | + | --------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | + | `{audio}` | Audio sample of a native speaker's pronunciation in MP3 format (if available). | + | `{clipboard-image}` | An image which is stored in the system clipboard, if present. | + | `{clipboard-text}` | Text which is stored in the system clipboard, if present. | + | `{cloze-body}` | Raw, inflected term as it appeared before being reduced to dictionary form by Yomitan. | + | `{cloze-body-kana}` | Kana reading for `{cloze-body}`. 
| + | `{cloze-prefix}` | Fragment of the containing `{sentence}` starting at the beginning of `{sentence}` until the beginning of `{cloze-body}`. | + | `{cloze-suffix}` | Fragment of the containing `{sentence}` starting at the end of `{cloze-body}` until the end of `{sentence}`. | + | `{conjugation}` | Conjugation path from the raw inflected term to the source term. | + | `{dictionary}` | Name of the dictionary from which the card is being created (unavailable in _grouped_ mode). | + | `{document-title}` | Title of the web page that the term appeared in. | + | `{expression}` | Term expressed as kanji (will be displayed in kana if kanji is not available). | + | `{frequencies}` | Frequency information for the term. | + | `{frequency-harmonic-rank}` | The harmonic mean of frequency data for the current term. Defaults to rank 9999999 when frequency data is not found, indicating extremely low rank-based term usage. | | `{frequency-harmonic-occurrence}` | The harmonic mean of frequency data for the current term. Defaults to 0 occurrences when frequency data is not found, the lowest possible occurrence-based term usage. | - | `{frequency-average-rank}` | The average of frequency data for the current term. Defaults to rank 9999999 when frequency data is not found, indicating extremely low rank-based term usage. | - | `{frequency-average-occurrence}` | The average of frequency data for the current term. Defaults to 0 occurrences when frequency data is not found, the lowest possible occurrence-based term usage. | - | `{furigana}` | Term expressed as kanji with furigana displayed above it (e.g. 日本語にほんご). | - | `{furigana-plain}` | Term expressed as kanji with furigana displayed next to it in brackets (e.g. 日本語[にほんご]). | - | `{glossary}` | List of definitions for the term (output format depends on whether running in _grouped_ mode). | - | `{glossary-brief}` | List of definitions for the term in a more compact format. 
| - | `{glossary-no-dictionary}` | List of definitions for the term, except the dictionary tag is omitted. | - | `{part-of-speech}` | Part of speech information for the term. | - | `{phonetic-transcriptions}`| List of phonetic transcriptions for the term. | - | `{pitch-accents}` | List of pitch accent downstep notations for the term. | - | `{pitch-accent-graphs}` | List of pitch accent graphs for the term. | - | `{pitch-accent-positions}` | List of accent downstep positions for the term as a number. | - | `{pitch-accent-categories}`| List of pitch accent categories for the term (e.g. heiban, kifuku, atamadaka, odaka, nakadaka). | - | `{reading}` | Kana reading for the term (empty for terms where the expression is the reading). | - | `{screenshot}` | Screenshot of the web page taken at the time the term was added. | - | `{search-query}` | The full search query shown on the search page. | - | `{selection-text}` | The selected text on the search page or popup. | - | `{sentence}` | Sentence, quote, or phrase that the term appears in from the source content. | - | `{sentence-furigana}` | Sentence, quote, or phrase that the term appears in from the source content, with furigana added. | - | `{tags}` | Grammar and usage tags providing information about the term (unavailable in _grouped_ mode). | - | `{url}` | Address of the web page in which the term appeared in. | + | `{frequency-average-rank}` | The average of frequency data for the current term. Defaults to rank 9999999 when frequency data is not found, indicating extremely low rank-based term usage. | + | `{frequency-average-occurrence}` | The average of frequency data for the current term. Defaults to 0 occurrences when frequency data is not found, the lowest possible occurrence-based term usage. | + | `{furigana}` | Term expressed as kanji with furigana displayed above it (e.g. 日本語にほんご). | + | `{furigana-plain}` | Term expressed as kanji with furigana displayed next to it in brackets (e.g. 日本語[にほんご]). 
| + | `{glossary}` | List of definitions for the term (output format depends on whether running in _grouped_ mode). | + | `{glossary-brief}` | List of definitions for the term in a more compact format. | + | `{glossary-no-dictionary}` | List of definitions for the term, except the dictionary tag is omitted. | + | `{part-of-speech}` | Part of speech information for the term. | + | `{phonetic-transcriptions}` | List of phonetic transcriptions for the term. | + | `{pitch-accents}` | List of pitch accent downstep notations for the term. | + | `{pitch-accent-graphs}` | List of pitch accent graphs for the term. | + | `{pitch-accent-positions}` | List of accent downstep positions for the term as a number. | + | `{pitch-accent-categories}` | List of pitch accent categories for the term (e.g. heiban, kifuku, atamadaka, odaka, nakadaka). | + | `{reading}` | Kana reading for the term (empty for terms where the expression is the reading). | + | `{screenshot}` | Screenshot of the web page taken at the time the term was added. | + | `{search-query}` | The full search query shown on the search page. | + | `{selection-text}` | The selected text on the search page or popup. | + | `{sentence}` | Sentence, quote, or phrase that the term appears in from the source content. | + | `{sentence-furigana}` | Sentence, quote, or phrase that the term appears in from the source content, with furigana added. | + | `{tags}` | Grammar and usage tags providing information about the term (unavailable in _grouped_ mode). | + | `{url}` | Address of the web page in which the term appeared in. | #### Markers for Kanji Cards - | Marker | Description | - | --------------------- | ------------------------------------------------------------------------------------------------------------------------ | - | `{character}` | Unicode glyph representing the current kanji. | - | `{clipboard-image}` | An image which is stored in the system clipboard, if present. 
| - | `{clipboard-text}` | Text which is stored in the system clipboard, if present. | - | `{cloze-body}` | Raw, inflected parent term as it appeared before being reduced to dictionary form by Yomitan. | - | `{cloze-prefix}` | Fragment of the containing `{sentence}` starting at the beginning of `{sentence}` until the beginning of `{cloze-body}`. | - | `{cloze-suffix}` | Fragment of the containing `{sentence}` starting at the end of `{cloze-body}` until the end of `{sentence}`. | - | `{dictionary}` | Name of the dictionary from which the card is being created. | - | `{document-title}` | Title of the web page that the kanji appeared in. | - | `{frequencies}` | Frequency information for the kanji. | - | `{frequency-harmonic-rank}` | The harmonic mean of frequency data for the current kanji. Defaults to rank 9999999 when frequency data is not found, indicating extremely low rank-based kanji usage. | + | Marker | Description | + | --------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | + | `{character}` | Unicode glyph representing the current kanji. | + | `{clipboard-image}` | An image which is stored in the system clipboard, if present. | + | `{clipboard-text}` | Text which is stored in the system clipboard, if present. | + | `{cloze-body}` | Raw, inflected parent term as it appeared before being reduced to dictionary form by Yomitan. | + | `{cloze-prefix}` | Fragment of the containing `{sentence}` starting at the beginning of `{sentence}` until the beginning of `{cloze-body}`. | + | `{cloze-suffix}` | Fragment of the containing `{sentence}` starting at the end of `{cloze-body}` until the end of `{sentence}`. | + | `{dictionary}` | Name of the dictionary from which the card is being created. | + | `{document-title}` | Title of the web page that the kanji appeared in. 
| + | `{frequencies}` | Frequency information for the kanji. | + | `{frequency-harmonic-rank}` | The harmonic mean of frequency data for the current kanji. Defaults to rank 9999999 when frequency data is not found, indicating extremely low rank-based kanji usage. | | `{frequency-harmonic-occurrence}` | The harmonic mean of frequency data for the current kanji. Defaults to 0 occurrences when frequency data is not found, the lowest possible occurrence-based kanji usage. | - | `{frequency-average-rank}` | The average of frequency data for the current kanji. Defaults to rank 9999999 when frequency data is not found, indicating extremely low rank-based kanji usage. | - | `{frequency-average-occurrence}` | The average of frequency data for the current kanji. Defaults to 0 occurrences when frequency data is not found, the lowest possible occurrence-based kanji usage. | - | `{glossary}` | List of definitions for the kanji. | - | `{kunyomi}` | Kunyomi (Japanese reading) for the kanji expressed as katakana. | - | `{onyomi}` | Onyomi (Chinese reading) for the kanji expressed as hiragana. | - | `{screenshot}` | Screenshot of the web page taken at the time the kanji was added. | - | `{search-query}` | The full search query shown on the search page. | - | `{selection-text}` | The selected text on the search page or popup. | - | `{sentence}` | Sentence, quote, or phrase that the character appears in from the source content. | - | `{sentence-furigana}` | Sentence, quote, or phrase that the character appears in from the source content, with furigana added. | - | `{stroke-count}` | Number of strokes that the kanji character has. | - | `{url}` | Address of the web page in which the kanji appeared in. | + | `{frequency-average-rank}` | The average of frequency data for the current kanji. Defaults to rank 9999999 when frequency data is not found, indicating extremely low rank-based kanji usage. 
| + | `{frequency-average-occurrence}` | The average of frequency data for the current kanji. Defaults to 0 occurrences when frequency data is not found, the lowest possible occurrence-based kanji usage. | + | `{glossary}` | List of definitions for the kanji. | + | `{kunyomi}` | Kunyomi (Japanese reading) for the kanji expressed as katakana. | + | `{onyomi}` | Onyomi (Chinese reading) for the kanji expressed as hiragana. | + | `{screenshot}` | Screenshot of the web page taken at the time the kanji was added. | + | `{search-query}` | The full search query shown on the search page. | + | `{selection-text}` | The selected text on the search page or popup. | + | `{sentence}` | Sentence, quote, or phrase that the character appears in from the source content. | + | `{sentence-furigana}` | Sentence, quote, or phrase that the character appears in from the source content, with furigana added. | + | `{stroke-count}` | Number of strokes that the kanji character has. | + | `{url}` | Address of the web page in which the kanji appeared. | When creating your model for Yomitan, _make sure that you pick a unique field to be first_; fields that will contain `{expression}` or `{character}` are ideal candidates for this. 
Anki does not allow duplicate flashcards to be diff --git a/ext/data/schemas/options-schema.json b/ext/data/schemas/options-schema.json index ea7caf0f..d86eedf7 100644 --- a/ext/data/schemas/options-schema.json +++ b/ext/data/schemas/options-schema.json @@ -81,6 +81,7 @@ "type": "object", "required": [ "enable", + "language", "resultOutputMode", "debugInfo", "maxResults", @@ -126,6 +127,10 @@ "type": "boolean", "default": true }, + "language": { + "type": "string", + "default": "ja" + }, "resultOutputMode": { "type": "string", "enum": ["group", "merge", "split"], @@ -722,12 +727,6 @@ "translation": { "type": "object", "required": [ - "convertHalfWidthCharacters", - "convertNumericCharacters", - "convertAlphabeticCharacters", - "convertHiraganaToKatakana", - "convertKatakanaToHiragana", - "collapseEmphaticSequences", "textReplacements", "searchResolution" ], @@ -740,36 +739,6 @@ ], "default": "letter" }, - "convertHalfWidthCharacters": { - "type": "string", - "enum": ["false", "true", "variant"], - "default": "false" - }, - "convertNumericCharacters": { - "type": "string", - "enum": ["false", "true", "variant"], - "default": "false" - }, - "convertAlphabeticCharacters": { - "type": "string", - "enum": ["false", "true", "variant"], - "default": "false" - }, - "convertHiraganaToKatakana": { - "type": "string", - "enum": ["false", "true", "variant"], - "default": "false" - }, - "convertKatakanaToHiragana": { - "type": "string", - "enum": ["false", "true", "variant"], - "default": "variant" - }, - "collapseEmphaticSequences": { - "type": "string", - "enum": ["false", "true", "full"], - "default": "false" - }, "textReplacements": { "type": "object", "required": [ diff --git a/ext/js/background/backend.js b/ext/js/background/backend.js index e246f0bb..31191612 100644 --- a/ext/js/background/backend.js +++ b/ext/js/background/backend.js @@ -34,6 +34,7 @@ import {DictionaryDatabase} from '../dictionary/dictionary-database.js'; import {Environment} from 
'../extension/environment.js'; import {ObjectPropertyAccessor} from '../general/object-property-accessor.js'; import {distributeFuriganaInflected, isCodePointJapanese, isStringPartiallyJapanese, convertKatakanaToHiragana as jpConvertKatakanaToHiragana} from '../language/ja/japanese.js'; +import {getLanguageSummaries} from '../language/languages.js'; import {Translator} from '../language/translator.js'; import {AudioDownloader} from '../media/audio-downloader.js'; import {getFileExtensionFromAudioMediaType, getFileExtensionFromImageMediaType} from '../media/media-util.js'; @@ -183,7 +184,8 @@ export class Backend { ['textHasJapaneseCharacters', this._onApiTextHasJapaneseCharacters.bind(this)], ['getTermFrequencies', this._onApiGetTermFrequencies.bind(this)], ['findAnkiNotes', this._onApiFindAnkiNotes.bind(this)], - ['openCrossFramePort', this._onApiOpenCrossFramePort.bind(this)] + ['openCrossFramePort', this._onApiOpenCrossFramePort.bind(this)], + ['getLanguageSummaries', this._onApiGetLanguageSummaries.bind(this)] ]); /* eslint-enable @stylistic/no-multi-spaces */ @@ -906,6 +908,11 @@ export class Backend { return {targetTabId, targetFrameId}; } + /** @type {import('api').ApiHandler<'getLanguageSummaries'>} */ + _onApiGetLanguageSummaries() { + return getLanguageSummaries(); + } + // Command handlers /** @@ -2361,15 +2368,9 @@ export class Backend { if (typeof deinflect !== 'boolean') { deinflect = true; } const enabledDictionaryMap = this._getTranslatorEnabledDictionaryMap(options); const { - general: {mainDictionary, sortFrequencyDictionary, sortFrequencyDictionaryOrder}, + general: {mainDictionary, sortFrequencyDictionary, sortFrequencyDictionaryOrder, language}, scanning: {alphanumeric}, translation: { - convertHalfWidthCharacters, - convertNumericCharacters, - convertAlphabeticCharacters, - convertHiraganaToKatakana, - convertKatakanaToHiragana, - collapseEmphaticSequences, textReplacements: textReplacementsOptions, searchResolution } @@ -2394,16 +2395,11 @@ 
export class Backend { sortFrequencyDictionary, sortFrequencyDictionaryOrder, removeNonJapaneseCharacters: !alphanumeric, - convertHalfWidthCharacters, - convertNumericCharacters, - convertAlphabeticCharacters, - convertHiraganaToKatakana, - convertKatakanaToHiragana, - collapseEmphaticSequences, searchResolution, textReplacements, enabledDictionaryMap, - excludeDictionaryDefinitions + excludeDictionaryDefinitions, + language }; } diff --git a/ext/js/comm/api.js b/ext/js/comm/api.js index b4fdbeb5..40b8e252 100644 --- a/ext/js/comm/api.js +++ b/ext/js/comm/api.js @@ -361,6 +361,13 @@ export class API { return this._invoke('openCrossFramePort', {targetTabId, targetFrameId}); } + /** + * @returns {Promise<import('api').ApiReturn<'getLanguageSummaries'>>} + */ + getLanguageSummaries() { + return this._invoke('getLanguageSummaries', void 0); + } + // Utilities /** diff --git a/ext/js/data/options-util.js b/ext/js/data/options-util.js index 1644df2f..7952eafc 100644 --- a/ext/js/data/options-util.js +++ b/ext/js/data/options-util.js @@ -522,7 +522,8 @@ export class OptionsUtil { this._updateVersion22, this._updateVersion23, this._updateVersion24, - this._updateVersion25 + this._updateVersion25, + this._updateVersion26 ]; /* eslint-enable @typescript-eslint/unbound-method */ if (typeof targetVersion === 'number' && targetVersion < result.length) { @@ -1154,6 +1155,31 @@ export class OptionsUtil { } } + /** + * - Added general.language. + * - Modularized text preprocessors. 
+ * @type {import('options-util').UpdateFunction} + */ + _updateVersion26(options) { + const textPreprocessors = [ + 'convertHalfWidthCharacters', + 'convertNumericCharacters', + 'convertAlphabeticCharacters', + 'convertHiraganaToKatakana', + 'convertKatakanaToHiragana', + 'collapseEmphaticSequences' + ]; + + for (const {options: profileOptions} of options.profiles) { + profileOptions.general.language = 'ja'; + + for (const preprocessor of textPreprocessors) { + delete profileOptions.translation[preprocessor]; + } + } + } + + /** * @param {string} url * @returns {Promise} diff --git a/ext/js/language/en/language-english.js b/ext/js/language/en/language-english.js new file mode 100644 index 00000000..8268653f --- /dev/null +++ b/ext/js/language/en/language-english.js @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. 
+ */ + +import {capitalizeFirstLetter, decapitalize} from '../text-preprocessors.js'; + +/** @type {import('language-english').EnglishLanguageDescriptor} */ +export const descriptor = { + name: 'English', + iso: 'en', + exampleText: 'read', + textPreprocessors: { + capitalizeFirstLetter, + decapitalize + } +}; diff --git a/ext/js/language/ja/language-japanese.js b/ext/js/language/ja/language-japanese.js new file mode 100644 index 00000000..ced34bcd --- /dev/null +++ b/ext/js/language/ja/language-japanese.js @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +import {basicTextPreprocessorOptions} from '../text-preprocessors.js'; +import {convertAlphabeticToKana} from './japanese-wanakana.js'; +import {collapseEmphaticSequences, convertHalfWidthKanaToFullWidth, convertHiraganaToKatakana, convertKatakanaToHiragana, convertNumericToFullWidth} from './japanese.js'; + +/** @type {import('language-japanese').JapaneseLanguageDescriptor} */ +export const descriptor = { + name: 'Japanese', + iso: 'ja', + exampleText: '読め', + textPreprocessors: { + convertHalfWidthCharacters: { + name: 'Convert half width characters to full width', + description: 'ヨミチャン → ヨミチャン', + options: basicTextPreprocessorOptions, + /** @type {import('language').TextPreprocessorFunction} */ + process: (str, setting, sourceMap) => (setting ? 
convertHalfWidthKanaToFullWidth(str, sourceMap) : str) + }, + convertNumericCharacters: { + name: 'Convert numeric characters to full width', + description: '1234 → 1234', + options: basicTextPreprocessorOptions, + /** @type {import('language').TextPreprocessorFunction} */ + process: (str, setting) => (setting ? convertNumericToFullWidth(str) : str) + }, + convertAlphabeticCharacters: { + name: 'Convert alphabetic characters to hiragana', + description: 'yomichan → よみちゃん', + options: basicTextPreprocessorOptions, + /** @type {import('language').TextPreprocessorFunction} */ + process: (str, setting, sourceMap) => (setting ? convertAlphabeticToKana(str, sourceMap) : str) + }, + convertHiraganaToKatakana: { + name: 'Convert hiragana to katakana', + description: 'よみちゃん → ヨミチャン', + options: basicTextPreprocessorOptions, + /** @type {import('language').TextPreprocessorFunction} */ + process: (str, setting) => (setting ? convertHiraganaToKatakana(str) : str) + }, + convertKatakanaToHiragana: { + name: 'Convert katakana to hiragana', + description: 'ヨミチャン → よみちゃん', + options: basicTextPreprocessorOptions, + /** @type {import('language').TextPreprocessorFunction} */ + process: (str, setting) => (setting ? 
convertKatakanaToHiragana(str) : str) + }, + collapseEmphaticSequences: { + name: 'Collapse emphatic character sequences', + description: 'すっっごーーい → すっごーい / すごい', + options: [[false, false], [true, false], [true, true]], + /** @type {import('language').TextPreprocessorFunction<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>} */ + process: (str, setting, sourceMap) => { + const [collapseEmphatic, collapseEmphaticFull] = setting; + if (collapseEmphatic) { + str = collapseEmphaticSequences(str, collapseEmphaticFull, sourceMap); + } + return str; + } + } + } +}; diff --git a/ext/js/language/languages.js b/ext/js/language/languages.js new file mode 100755 index 00000000..f51ca163 --- /dev/null +++ b/ext/js/language/languages.js @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. 
+ */ + +import {descriptor as descriptorEnglish} from './en/language-english.js'; +import {descriptor as descriptorJapanese} from './ja/language-japanese.js'; + +const languageDescriptors = [ + descriptorEnglish, + descriptorJapanese +]; + +/** @type {Map} */ +const languageDescriptorMap = new Map(); +for (const languageDescriptor of languageDescriptors) { + languageDescriptorMap.set(languageDescriptor.iso, languageDescriptor); +} + +/** + * @returns {import('language').LanguageSummary[]} + */ +export function getLanguageSummaries() { + const results = []; + for (const {name, iso, exampleText} of languageDescriptorMap.values()) { + results.push({name, iso, exampleText}); + } + return results; +} + +/** + * @returns {import('language').LanguageAndPreprocessors[]} + * @throws {Error} + */ +export function getAllLanguageTextPreprocessors() { + const results = []; + for (const {iso, textPreprocessors} of languageDescriptorMap.values()) { + /** @type {import('language').TextPreprocessorWithId[]} */ + const textPreprocessorsArray = []; + for (const [id, textPreprocessor] of Object.entries(textPreprocessors)) { + textPreprocessorsArray.push({ + id, + textPreprocessor: /** @type {import('language').TextPreprocessor} */ (textPreprocessor) + }); + } + results.push({iso, textPreprocessors: textPreprocessorsArray}); + } + return results; +} diff --git a/ext/js/language/text-preprocessors.js b/ext/js/language/text-preprocessors.js new file mode 100755 index 00000000..12b3d1b6 --- /dev/null +++ b/ext/js/language/text-preprocessors.js @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +/** @type {import('language').TextPreprocessorOptions} */ +export const basicTextPreprocessorOptions = [false, true]; + +/** @type {import('language').TextPreprocessor} */ +export const decapitalize = { + name: 'Decapitalize text', + description: 'CAPITALIZED TEXT → capitalized text', + options: basicTextPreprocessorOptions, + process: (str, setting) => (setting ? str.toLowerCase() : str) +}; + +/** @type {import('language').TextPreprocessor} */ +export const capitalizeFirstLetter = { + name: 'Capitalize first letter', + description: 'lowercase text → Lowercase text', + options: basicTextPreprocessorOptions, + process: (str, setting) => (setting ? str.charAt(0).toUpperCase() + str.slice(1) : str) +}; diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js index b2342e8d..4f9304b5 100644 --- a/ext/js/language/translator.js +++ b/ext/js/language/translator.js @@ -18,9 +18,9 @@ import {applyTextReplacement} from '../general/regex-util.js'; import {TextSourceMap} from '../general/text-source-map.js'; -import {convertAlphabeticToKana} from './ja/japanese-wanakana.js'; -import {collapseEmphaticSequences, convertHalfWidthKanaToFullWidth, convertHiraganaToKatakana, convertKatakanaToHiragana, convertNumericToFullWidth, isCodePointJapanese} from './ja/japanese.js'; +import {isCodePointJapanese} from './ja/japanese.js'; import {LanguageTransformer} from './language-transformer.js'; +import {getAllLanguageTextPreprocessors} from './languages.js'; /** * Class which finds term and kanji dictionary entries for text. 
@@ -41,6 +41,8 @@ export class Translator { this._stringComparer = new Intl.Collator('en-US'); // Invariant locale /** @type {RegExp} */ this._numberRegex = /[+-]?(\d+(\.\d*)?|\.\d+)([eE][+-]?\d+)?/; + /** @type {Map[], optionSpace: import('translation-internal').PreprocessorOptionsSpace}>} */ + this._textPreprocessors = new Map(); } /** @@ -49,6 +51,14 @@ export class Translator { */ prepare(descriptor) { this._languageTransformer.addDescriptor(descriptor); + for (const {iso, textPreprocessors} of getAllLanguageTextPreprocessors()) { + /** @type {Map>} */ + const optionSpace = new Map(); + for (const {id, textPreprocessor} of textPreprocessors) { + optionSpace.set(id, textPreprocessor.options); + } + this._textPreprocessors.set(iso, {textPreprocessors, optionSpace}); + } } /** @@ -415,51 +425,45 @@ export class Translator { } } - // Deinflections and text transformations + // Deinflections and text preprocessing /** * @param {string} text * @param {import('translation').FindTermsOptions} options * @returns {import('translation-internal').DatabaseDeinflection[]} + * @throws {Error} */ _getAlgorithmDeinflections(text, options) { - /** @type {import('translation-internal').TextDeinflectionOptionsArrays} */ - const textOptionVariantArray = [ - this._getTextReplacementsVariants(options), - this._getTextOptionEntryVariants(options.convertHalfWidthCharacters), - this._getTextOptionEntryVariants(options.convertNumericCharacters), - this._getTextOptionEntryVariants(options.convertAlphabeticCharacters), - this._getTextOptionEntryVariants(options.convertHiraganaToKatakana), - this._getTextOptionEntryVariants(options.convertKatakanaToHiragana), - this._getCollapseEmphaticOptions(options) - ]; + const {language} = options; + const info = this._textPreprocessors.get(language); + if (typeof info === 'undefined') { throw new Error(`Unsupported language: ${language}`); } + const {textPreprocessors, optionSpace: textPreprocessorOptionsSpace} = info; + + /** @type {Map>} */ + const 
variantSpace = new Map(); + variantSpace.set('textReplacements', this._getTextReplacementsVariants(options)); + for (const [key, value] of textPreprocessorOptionsSpace) { + variantSpace.set(key, value); + } /** @type {import('translation-internal').DatabaseDeinflection[]} */ const deinflections = []; const used = new Set(); - for (const [textReplacements, halfWidth, numeric, alphabetic, katakana, hiragana, [collapseEmphatic, collapseEmphaticFull]] of /** @type {Generator} */ (this._getArrayVariants(textOptionVariantArray))) { + + for (const arrayVariant of this._generateArrayVariants(variantSpace)) { + const textReplacements = /** @type {import('translation').FindTermsTextReplacement[] | null} */ (arrayVariant.get('textReplacements')); + let text2 = text; const sourceMap = new TextSourceMap(text2); + if (textReplacements !== null) { text2 = this._applyTextReplacements(text2, sourceMap, textReplacements); } - if (halfWidth) { - text2 = convertHalfWidthKanaToFullWidth(text2, sourceMap); - } - if (numeric) { - text2 = convertNumericToFullWidth(text2); - } - if (alphabetic) { - text2 = convertAlphabeticToKana(text2, sourceMap); - } - if (katakana) { - text2 = convertHiraganaToKatakana(text2); - } - if (hiragana) { - text2 = convertKatakanaToHiragana(text2); - } - if (collapseEmphatic) { - text2 = collapseEmphaticSequences(text2, collapseEmphaticFull, sourceMap); + + for (const preprocessor of textPreprocessors.values()) { + const {id, textPreprocessor} = preprocessor; + const setting = arrayVariant.get(id); + text2 = textPreprocessor.process(text2, setting, sourceMap); } for ( @@ -526,36 +530,6 @@ export class Translator { return text; } - /** - * @param {import('translation').FindTermsVariantMode} value - * @returns {boolean[]} - */ - _getTextOptionEntryVariants(value) { - switch (value) { - case 'true': return [true]; - case 'variant': return [false, true]; - default: return [false]; - } - } - - /** - * @param {import('translation').FindTermsOptions} options - * 
@returns {[collapseEmphatic: boolean, collapseEmphaticFull: boolean][]} - */ - _getCollapseEmphaticOptions(options) { - /** @type {[collapseEmphatic: boolean, collapseEmphaticFull: boolean][]} */ - const collapseEmphaticOptions = [[false, false]]; - switch (options.collapseEmphaticSequences) { - case 'true': - collapseEmphaticOptions.push([true, false]); - break; - case 'full': - collapseEmphaticOptions.push([true, false], [true, true]); - break; - } - return collapseEmphaticOptions; - } - /** * @param {import('translation').FindTermsOptions} options * @returns {(import('translation').FindTermsTextReplacement[] | null)[]} @@ -1343,26 +1317,32 @@ export class Translator { } /** - * @param {[...args: unknown[][]]} arrayVariants - * @yields {[...args: unknown[]]} - * @returns {Generator} + * @param {Map} arrayVariants + * @yields {Map} + * @returns {Generator, void, void>} */ - *_getArrayVariants(arrayVariants) { - const ii = arrayVariants.length; - - let total = 1; - for (let i = 0; i < ii; ++i) { - total *= arrayVariants[i].length; + *_generateArrayVariants(arrayVariants) { + const variantKeys = [...arrayVariants.keys()]; + const entryVariantLengths = []; + for (const key of variantKeys) { + const entryVariants = /** @type {unknown[]} */ (arrayVariants.get(key)); + entryVariantLengths.push(entryVariants.length); } + const totalVariants = entryVariantLengths.reduce((acc, length) => acc * length, 1); + + for (let variantIndex = 0; variantIndex < totalVariants; ++variantIndex) { + /** @type {Map} */ + const variant = new Map(); + let remainingIndex = variantIndex; - for (let a = 0; a < total; ++a) { - const variant = []; - let index = a; - for (let i = 0; i < ii; ++i) { - const entryVariants = arrayVariants[i]; - variant.push(entryVariants[index % entryVariants.length]); - index = Math.floor(index / entryVariants.length); + for (let keyIndex = 0; keyIndex < variantKeys.length; ++keyIndex) { + const key = variantKeys[keyIndex]; + const entryVariants = /** @type 
{unknown[]} */ (arrayVariants.get(key)); + const entryIndex = remainingIndex % entryVariants.length; + variant.set(key, entryVariants[entryIndex]); + remainingIndex = Math.floor(remainingIndex / entryVariants.length); } + yield variant; } } diff --git a/ext/js/pages/settings/languages-controller.js b/ext/js/pages/settings/languages-controller.js new file mode 100755 index 00000000..78f036df --- /dev/null +++ b/ext/js/pages/settings/languages-controller.js @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2023-2024 Yomitan Authors + * Copyright (C) 2021-2022 Yomichan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +import {querySelectorNotNull} from '../../dom/query-selector.js'; + +export class LanguagesController { + /** + * @param {import('./settings-controller.js').SettingsController} settingsController + */ + constructor(settingsController) { + /** @type {import('./settings-controller.js').SettingsController} */ + this._settingsController = settingsController; + } + + /** */ + async prepare() { + const languages = await this._settingsController.application.api.getLanguageSummaries(); + languages.sort((a, b) => a.iso.localeCompare(b.iso, 'en')); + this._fillSelect(languages); + } + + /** + * @param {import('language').LanguageSummary[]} languages + */ + _fillSelect(languages) { + const selectElement = querySelectorNotNull(document, '#language-select'); + for (const {iso, name} of languages) { + const option = document.createElement('option'); + option.value = iso; + option.text = `(${iso}) ${name}`; + selectElement.appendChild(option); + } + } +} diff --git a/ext/js/pages/settings/settings-main.js b/ext/js/pages/settings/settings-main.js index dc4b36c9..0b115246 100644 --- a/ext/js/pages/settings/settings-main.js +++ b/ext/js/pages/settings/settings-main.js @@ -30,6 +30,7 @@ import {DictionaryImportController} from './dictionary-import-controller.js'; import {ExtensionKeyboardShortcutController} from './extension-keyboard-shortcuts-controller.js'; import {GenericSettingController} from './generic-setting-controller.js'; import {KeyboardShortcutController} from './keyboard-shortcuts-controller.js'; +import {LanguagesController} from './languages-controller.js'; import {MecabController} from './mecab-controller.js'; import {ModalController} from './modal-controller.js'; import {NestedPopupsController} from './nested-popups-controller.js'; @@ -137,6 +138,9 @@ await Application.main(async (application) => { const secondarySearchDictionaryController = new SecondarySearchDictionaryController(settingsController); secondarySearchDictionaryController.prepare(); + const 
languagesController = new LanguagesController(settingsController); + languagesController.prepare(); + const translationTextReplacementsController = new TranslationTextReplacementsController(settingsController); translationTextReplacementsController.prepare(); diff --git a/ext/settings.html b/ext/settings.html index 2af2a666..8dc70cf3 100644 --- a/ext/settings.html +++ b/ext/settings.html @@ -212,6 +212,19 @@ +
+
+
+ Language +
+
+ Language of the text that is being looked up. +
+
+
+ +
+
Show the welcome guide on browser startup
@@ -1531,84 +1544,6 @@
-
-
-
Convert half width characters to full width
-
ヨミタン → ヨミタン
-
-
- -
-
-
-
-
Convert numeric characters to full width
-
1234 → 1234
-
-
- -
-
-
-
-
Convert alphabetic characters to hiragana
-
yomitan → よみたん
-
-
- -
-
-
-
-
Convert hiragana to katakana
-
よみたん → ヨミタン
-
-
- -
-
-
-
-
Convert katakana to hiragana
-
ヨミタン → よみたん
-
-
- -
-
-
-
-
Collapse emphatic character sequences
-
すっっごーーい → すっごーい / すごい
-
-
- -
-
diff --git a/test/data/anki-note-builder-test-results.json b/test/data/anki-note-builder-test-results.json index a2dd0923..1d84712d 100644 --- a/test/data/anki-note-builder-test-results.json +++ b/test/data/anki-note-builder-test-results.json @@ -869,6 +869,43 @@ "sentence-furigana": "cloze-prefixダースcloze-suffix", "tags": "abbr, n", "url": "url:" + }, + { + "audio": "", + "clipboard-image": "", + "clipboard-text": "", + "cloze-body": "ダ", + "cloze-body-kana": "ダ", + "cloze-prefix": "cloze-prefix", + "cloze-suffix": "cloze-suffix", + "conjugation": "", + "dictionary": "Test Dictionary 2", + "document-title": "title", + "expression": "打", + "frequencies": "
  • Test Dictionary 2: 1
  • Test Dictionary 2: four
  • Test Dictionary 2: five (5)
  • Test Dictionary 2: 8
  • Test Dictionary 2: fourteen
  • Test Dictionary 2: twenty (20)
  • Test Dictionary 2: 26
", + "frequency-harmonic-rank": "1", + "frequency-harmonic-occurrence": "1", + "frequency-average-rank": "1", + "frequency-average-occurrence": "1", + "furigana": "", + "furigana-plain": "打[だ]", + "glossary": "
(n, Test Dictionary 2)
  • da definition 1
  • da definition 2
", + "glossary-brief": "
  • da definition 1
  • da definition 2
", + "glossary-no-dictionary": "
(n)
  • da definition 1
  • da definition 2
", + "part-of-speech": "Noun", + "pitch-accents": "No pitch accent data", + "pitch-accent-graphs": "No pitch accent data", + "pitch-accent-positions": "No pitch accent data", + "pitch-accent-categories": "", + "phonetic-transcriptions": "", + "reading": "だ", + "screenshot": "", + "search-query": "fullQuery", + "selection-text": "", + "sentence": "cloze-prefixダcloze-suffix", + "sentence-furigana": "cloze-prefixダcloze-suffix", + "tags": "n", + "url": "url:" } ] }, @@ -3437,7 +3474,7 @@ ] }, { - "name": "Test text transformations - convertNumericCharacters", + "name": "Test text preprocessors - convertNumericCharacters", "results": [ { "audio": "", @@ -3479,7 +3516,7 @@ ] }, { - "name": "Test text transformations - convertAlphabeticCharacters", + "name": "Test text preprocessors - convertAlphabeticCharacters", "results": [ { "audio": "", @@ -3558,7 +3595,7 @@ ] }, { - "name": "Test text transformations - convertKatakanaToHiragana", + "name": "Test text preprocessors - convertKatakanaToHiragana", "results": [ { "audio": "", @@ -3637,7 +3674,7 @@ ] }, { - "name": "Test text transformations - convertHiraganaToKatakana", + "name": "Test text preprocessors - convertHiraganaToKatakana", "results": [ { "audio": "", @@ -3679,7 +3716,7 @@ ] }, { - "name": "Test text transformations - convertHalfWidthCharacters, convertKatakanaToHiragana", + "name": "Test text preprocessors - convertHalfWidthCharacters, convertKatakanaToHiragana", "results": [ { "audio": "", @@ -3758,7 +3795,7 @@ ] }, { - "name": "Test text transformations - collapseEmphaticSequences", + "name": "Test text preprocessors - collapseEmphaticSequences", "results": [ { "audio": "", @@ -3768,7 +3805,7 @@ "cloze-body-kana": "すっっごーーい", "cloze-prefix": "cloze-prefix", "cloze-suffix": "cloze-suffix", - "conjugation": "", + "conjugation": "
  • adv « kansai-ben
", "dictionary": "Test Dictionary 2", "document-title": "title", "expression": "凄い", @@ -3798,5 +3835,89 @@ "url": "url:" } ] + }, + { + "name": "Test text preprocessors - capitalizeFirstLetter", + "results": [ + { + "audio": "", + "clipboard-image": "", + "clipboard-text": "", + "cloze-body": "english", + "cloze-body-kana": "english", + "cloze-prefix": "cloze-prefix", + "cloze-suffix": "cloze-suffix", + "conjugation": "", + "dictionary": "Test Dictionary 2", + "document-title": "title", + "expression": "English", + "frequencies": "", + "frequency-harmonic-rank": "9999999", + "frequency-harmonic-occurrence": "0", + "frequency-average-rank": "9999999", + "frequency-average-occurrence": "0", + "furigana": "English", + "furigana-plain": "English", + "glossary": "
(n, Test Dictionary 2) English definition
", + "glossary-brief": "
English definition
", + "glossary-no-dictionary": "
(n) English definition
", + "part-of-speech": "Noun", + "pitch-accents": "No pitch accent data", + "pitch-accent-graphs": "No pitch accent data", + "pitch-accent-positions": "No pitch accent data", + "pitch-accent-categories": "", + "phonetic-transcriptions": "", + "reading": "English", + "screenshot": "", + "search-query": "fullQuery", + "selection-text": "", + "sentence": "cloze-prefixenglishcloze-suffix", + "sentence-furigana": "cloze-prefixenglishcloze-suffix", + "tags": "n", + "url": "url:" + } + ] + }, + { + "name": "Test text preprocessors - decapitalize", + "results": [ + { + "audio": "", + "clipboard-image": "", + "clipboard-text": "", + "cloze-body": "LANGUAGE", + "cloze-body-kana": "LANGUAGE", + "cloze-prefix": "cloze-prefix", + "cloze-suffix": "cloze-suffix", + "conjugation": "", + "dictionary": "Test Dictionary 2", + "document-title": "title", + "expression": "language", + "frequencies": "", + "frequency-harmonic-rank": "9999999", + "frequency-harmonic-occurrence": "0", + "frequency-average-rank": "9999999", + "frequency-average-occurrence": "0", + "furigana": "language", + "furigana-plain": "language", + "glossary": "
(n, Test Dictionary 2) language definition
", + "glossary-brief": "
language definition
", + "glossary-no-dictionary": "
(n) language definition
", + "part-of-speech": "Noun", + "pitch-accents": "No pitch accent data", + "pitch-accent-graphs": "No pitch accent data", + "pitch-accent-positions": "No pitch accent data", + "pitch-accent-categories": "", + "phonetic-transcriptions": "", + "reading": "language", + "screenshot": "", + "search-query": "fullQuery", + "selection-text": "", + "sentence": "cloze-prefixLANGUAGEcloze-suffix", + "sentence-furigana": "cloze-prefixLANGUAGEcloze-suffix", + "tags": "n", + "url": "url:" + } + ] } ] diff --git a/test/data/database-test-cases.json b/test/data/database-test-cases.json index 6d0f74ce..5747f59e 100644 --- a/test/data/database-test-cases.json +++ b/test/data/database-test-cases.json @@ -27,7 +27,7 @@ "ipa": 1 }, "terms": { - "total": 27 + "total": 29 } } }, @@ -36,7 +36,7 @@ { "kanji": 2, "kanjiMeta": 6, - "terms": 27, + "terms": 29, "termMeta": 39, "tagMeta": 15, "media": 6 @@ -45,7 +45,7 @@ "total": { "kanji": 2, "kanjiMeta": 6, - "terms": 27, + "terms": 29, "termMeta": 39, "tagMeta": 15, "media": 6 diff --git a/test/data/dictionaries/valid-dictionary1/term_bank_1.json b/test/data/dictionaries/valid-dictionary1/term_bank_1.json index 03a9900c..e7fb015c 100644 --- a/test/data/dictionaries/valid-dictionary1/term_bank_1.json +++ b/test/data/dictionaries/valid-dictionary1/term_bank_1.json @@ -341,5 +341,7 @@ ["のたまう", "のたまう", "v5", "v5", 1, ["notamau definition"], 15, ""], ["のたもうた", "のたもうた", "", "", 1, [["のたまう", ["past"]]], 16, ""], ["39", "さんきゅう", "", "", 1, ["sankyuu definition"], 17, ""], - ["凄い", "すごい", "adj-i", "adj-i", 1, ["sugoi definition"], 18, ""] + ["凄い", "すごい", "adj-i", "adj-i", 1, ["sugoi definition"], 18, ""], + ["English", "", "n", "n", 1, ["English definition"], 19, ""], + ["language", "", "n", "n", 1, ["language definition"], 20, ""] ] diff --git a/test/data/translator-test-inputs.json b/test/data/translator-test-inputs.json index 03ec938a..c9047716 100644 --- a/test/data/translator-test-inputs.json +++ b/test/data/translator-test-inputs.json @@ 
-21,12 +21,7 @@ "sortFrequencyDictionary": null, "sortFrequencyDictionaryOrder": "descending", "removeNonJapaneseCharacters": true, - "convertHalfWidthCharacters": "false", - "convertNumericCharacters": "false", - "convertAlphabeticCharacters": "false", - "convertHiraganaToKatakana": "false", - "convertKatakanaToHiragana": "false", - "collapseEmphaticSequences": "false", + "language": "ja", "textReplacements": [ null ], @@ -361,7 +356,7 @@ "options": "default" }, { - "name": "Test text transformations - convertNumericCharacters", + "name": "Test text preprocessors - convertNumericCharacters", "func": "findTerms", "mode": "split", "text": "39", @@ -369,13 +364,12 @@ "default", { "type": "terms", - "convertNumericCharacters": "true", "removeNonJapaneseCharacters": false } ] }, { - "name": "Test text transformations - convertAlphabeticCharacters", + "name": "Test text preprocessors - convertAlphabeticCharacters", "func": "findTerms", "mode": "split", "text": "utsu", @@ -383,61 +377,83 @@ "default", { "type": "terms", - "convertAlphabeticCharacters": "true", "removeNonJapaneseCharacters": false } ] }, { - "name": "Test text transformations - convertKatakanaToHiragana", + "name": "Test text preprocessors - convertKatakanaToHiragana", "func": "findTerms", "mode": "split", "text": "ウツ", "options": [ "default", { - "type": "terms", - "convertKatakanaToHiragana": "true" + "type": "terms" } ] }, { - "name": "Test text transformations - convertHiraganaToKatakana", + "name": "Test text preprocessors - convertHiraganaToKatakana", "func": "findTerms", "mode": "split", "text": "てきすと", "options": [ "default", { - "type": "terms", - "convertHiraganaToKatakana": "true" + "type": "terms" } ] }, { - "name": "Test text transformations - convertHalfWidthCharacters, convertKatakanaToHiragana", + "name": "Test text preprocessors - convertHalfWidthCharacters, convertKatakanaToHiragana", "func": "findTerms", "mode": "split", "text": "ウツ", "options": [ "default", { - "type": "terms", - 
"convertHalfWidthCharacters": "true", - "convertKatakanaToHiragana": "true" + "type": "terms" } ] }, { - "name": "Test text transformations - collapseEmphaticSequences", + "name": "Test text preprocessors - collapseEmphaticSequences", "func": "findTerms", "mode": "split", "text": "すっっごーーい", + "options": [ + "default", + { + "type": "terms" + } + ] + }, + { + "name": "Test text preprocessors - capitalizeFirstLetter", + "func": "findTerms", + "mode": "split", + "text": "english", "options": [ "default", { "type": "terms", - "collapseEmphaticSequences": "full" + "language": "en", + "removeNonJapaneseCharacters": false + } + ] + }, + { + "name": "Test text preprocessors - decapitalize", + "func": "findTerms", + "mode": "split", + "text": "LANGUAGE", + "options": [ + "default", + { + "type": "terms", + "language": "en", + "removeNonJapaneseCharacters": false } ] } diff --git a/test/data/translator-test-results-note-data1.json b/test/data/translator-test-results-note-data1.json index f84f28a4..f580ac53 100644 --- a/test/data/translator-test-results-note-data1.json +++ b/test/data/translator-test-results-note-data1.json @@ -7276,6 +7276,320 @@ } }, "media": {} + }, + { + "marker": "{marker}", + "definition": { + "type": "term", + "id": 1, + "source": "だ", + "rawSource": "ダ", + "sourceTerm": "だ", + "inflectionRuleChainCandidates": [ + { + "source": "algorithm", + "inflectionRules": [] + } + ], + "score": 1, + "isPrimary": true, + "sequence": 1, + "dictionary": "Test Dictionary 2", + "dictionaryOrder": { + "index": 0, + "priority": 0 + }, + "dictionaryNames": [ + "Test Dictionary 2" + ], + "expression": "打", + "reading": "だ", + "expressions": [ + { + "sourceTerm": "だ", + "expression": "打", + "reading": "だ", + "termTags": [ + { + "name": "E1", + "category": "default", + "notes": "example tag 1", + "order": 0, + "score": 0, + "dictionary": "Test Dictionary 2", + "redundant": false + } + ], + "frequencies": [ + { + "index": 0, + "expressionIndex": 0, + "dictionary": "Test 
Dictionary 2", + "dictionaryOrder": { + "index": 0, + "priority": 0 + }, + "expression": "打", + "reading": "だ", + "hasReading": false, + "frequency": 1 + }, + { + "index": 1, + "expressionIndex": 0, + "dictionary": "Test Dictionary 2", + "dictionaryOrder": { + "index": 0, + "priority": 0 + }, + "expression": "打", + "reading": "だ", + "hasReading": false, + "frequency": "four" + }, + { + "index": 2, + "expressionIndex": 0, + "dictionary": "Test Dictionary 2", + "dictionaryOrder": { + "index": 0, + "priority": 0 + }, + "expression": "打", + "reading": "だ", + "hasReading": false, + "frequency": "five (5)" + }, + { + "index": 3, + "expressionIndex": 0, + "dictionary": "Test Dictionary 2", + "dictionaryOrder": { + "index": 0, + "priority": 0 + }, + "expression": "打", + "reading": "だ", + "hasReading": true, + "frequency": 8 + }, + { + "index": 4, + "expressionIndex": 0, + "dictionary": "Test Dictionary 2", + "dictionaryOrder": { + "index": 0, + "priority": 0 + }, + "expression": "打", + "reading": "だ", + "hasReading": true, + "frequency": "fourteen" + }, + { + "index": 5, + "expressionIndex": 0, + "dictionary": "Test Dictionary 2", + "dictionaryOrder": { + "index": 0, + "priority": 0 + }, + "expression": "打", + "reading": "だ", + "hasReading": true, + "frequency": "twenty (20)" + }, + { + "index": 6, + "expressionIndex": 0, + "dictionary": "Test Dictionary 2", + "dictionaryOrder": { + "index": 0, + "priority": 0 + }, + "expression": "打", + "reading": "だ", + "hasReading": true, + "frequency": 26 + } + ], + "pitches": [], + "furiganaSegments": [ + { + "text": "打", + "furigana": "だ" + } + ], + "termFrequency": "normal", + "wordClasses": [ + "n" + ] + } + ], + "glossary": [ + "da definition 1", + "da definition 2" + ], + "definitionTags": [ + { + "name": "n", + "category": "partOfSpeech", + "notes": "noun", + "order": 0, + "score": 0, + "dictionary": "Test Dictionary 2", + "redundant": false + } + ], + "termTags": [ + { + "name": "E1", + "category": "default", + "notes": 
"example tag 1", + "order": 0, + "score": 0, + "dictionary": "Test Dictionary 2", + "redundant": false + } + ], + "frequencies": [ + { + "index": 0, + "expressionIndex": 0, + "dictionary": "Test Dictionary 2", + "dictionaryOrder": { + "index": 0, + "priority": 0 + }, + "expression": "打", + "reading": "だ", + "hasReading": false, + "frequency": 1 + }, + { + "index": 1, + "expressionIndex": 0, + "dictionary": "Test Dictionary 2", + "dictionaryOrder": { + "index": 0, + "priority": 0 + }, + "expression": "打", + "reading": "だ", + "hasReading": false, + "frequency": "four" + }, + { + "index": 2, + "expressionIndex": 0, + "dictionary": "Test Dictionary 2", + "dictionaryOrder": { + "index": 0, + "priority": 0 + }, + "expression": "打", + "reading": "だ", + "hasReading": false, + "frequency": "five (5)" + }, + { + "index": 3, + "expressionIndex": 0, + "dictionary": "Test Dictionary 2", + "dictionaryOrder": { + "index": 0, + "priority": 0 + }, + "expression": "打", + "reading": "だ", + "hasReading": true, + "frequency": 8 + }, + { + "index": 4, + "expressionIndex": 0, + "dictionary": "Test Dictionary 2", + "dictionaryOrder": { + "index": 0, + "priority": 0 + }, + "expression": "打", + "reading": "だ", + "hasReading": true, + "frequency": "fourteen" + }, + { + "index": 5, + "expressionIndex": 0, + "dictionary": "Test Dictionary 2", + "dictionaryOrder": { + "index": 0, + "priority": 0 + }, + "expression": "打", + "reading": "だ", + "hasReading": true, + "frequency": "twenty (20)" + }, + { + "index": 6, + "expressionIndex": 0, + "dictionary": "Test Dictionary 2", + "dictionaryOrder": { + "index": 0, + "priority": 0 + }, + "expression": "打", + "reading": "だ", + "hasReading": true, + "frequency": 26 + } + ], + "frequencyHarmonic": 1, + "frequencyAverage": 1, + "pitches": [], + "phoneticTranscriptions": [], + "sourceTermExactMatchCount": 0, + "url": "url:", + "cloze": { + "sentence": "", + "prefix": "", + "body": "", + "bodyKana": "", + "suffix": "" + }, + "furiganaSegments": [ + { + 
"text": "打", + "furigana": "だ" + } + ] + }, + "glossaryLayoutMode": "default", + "compactTags": false, + "group": false, + "merge": false, + "modeTermKanji": false, + "modeTermKana": false, + "modeKanji": false, + "compactGlossaries": false, + "uniqueExpressions": [ + "打" + ], + "uniqueReadings": [ + "だ" + ], + "pitches": [], + "pitchCount": 0, + "phoneticTranscriptions": [], + "context": { + "query": "query", + "fullQuery": "fullQuery", + "document": { + "title": "title" + } + }, + "media": {} } ] }, @@ -30117,7 +30431,7 @@ ] }, { - "name": "Test text transformations - convertNumericCharacters", + "name": "Test text preprocessors - convertNumericCharacters", "noteDataList": [ { "marker": "{marker}", @@ -30219,7 +30533,7 @@ ] }, { - "name": "Test text transformations - convertAlphabeticCharacters", + "name": "Test text preprocessors - convertAlphabeticCharacters", "noteDataList": [ { "marker": "{marker}", @@ -30852,7 +31166,7 @@ ] }, { - "name": "Test text transformations - convertKatakanaToHiragana", + "name": "Test text preprocessors - convertKatakanaToHiragana", "noteDataList": [ { "marker": "{marker}", @@ -31485,7 +31799,7 @@ ] }, { - "name": "Test text transformations - convertHiraganaToKatakana", + "name": "Test text preprocessors - convertHiraganaToKatakana", "noteDataList": [ { "marker": "{marker}", @@ -31638,7 +31952,7 @@ ] }, { - "name": "Test text transformations - convertHalfWidthCharacters, convertKatakanaToHiragana", + "name": "Test text preprocessors - convertHalfWidthCharacters, convertKatakanaToHiragana", "noteDataList": [ { "marker": "{marker}", @@ -32271,7 +32585,7 @@ ] }, { - "name": "Test text transformations - collapseEmphaticSequences", + "name": "Test text preprocessors - collapseEmphaticSequences", "noteDataList": [ { "marker": "{marker}", @@ -32285,6 +32599,13 @@ { "source": "algorithm", "inflectionRules": [] + }, + { + "source": "algorithm", + "inflectionRules": [ + "adv", + "kansai-ben" + ] } ], "score": 1, @@ -32391,5 +32712,233 @@ 
"media": {} } ] + }, + { + "name": "Test text preprocessors - capitalizeFirstLetter", + "noteDataList": [ + { + "marker": "{marker}", + "definition": { + "type": "term", + "id": 27, + "source": "English", + "rawSource": "english", + "sourceTerm": "English", + "inflectionRuleChainCandidates": [ + { + "source": "algorithm", + "inflectionRules": [] + } + ], + "score": 1, + "isPrimary": true, + "sequence": 19, + "dictionary": "Test Dictionary 2", + "dictionaryOrder": { + "index": 0, + "priority": 0 + }, + "dictionaryNames": [ + "Test Dictionary 2" + ], + "expression": "English", + "reading": "English", + "expressions": [ + { + "sourceTerm": "English", + "expression": "English", + "reading": "English", + "termTags": [], + "frequencies": [], + "pitches": [], + "furiganaSegments": [ + { + "text": "English", + "furigana": "" + } + ], + "termFrequency": "normal", + "wordClasses": [ + "n" + ] + } + ], + "glossary": [ + "English definition" + ], + "definitionTags": [ + { + "name": "n", + "category": "partOfSpeech", + "notes": "noun", + "order": 0, + "score": 0, + "dictionary": "Test Dictionary 2", + "redundant": false + } + ], + "termTags": [], + "frequencies": [], + "frequencyHarmonic": -1, + "frequencyAverage": -1, + "pitches": [], + "phoneticTranscriptions": [], + "sourceTermExactMatchCount": 1, + "url": "url:", + "cloze": { + "sentence": "", + "prefix": "", + "body": "", + "bodyKana": "", + "suffix": "" + }, + "furiganaSegments": [ + { + "text": "English", + "furigana": "" + } + ] + }, + "glossaryLayoutMode": "default", + "compactTags": false, + "group": false, + "merge": false, + "modeTermKanji": false, + "modeTermKana": false, + "modeKanji": false, + "compactGlossaries": false, + "uniqueExpressions": [ + "English" + ], + "uniqueReadings": [ + "English" + ], + "pitches": [], + "pitchCount": 0, + "phoneticTranscriptions": [], + "context": { + "query": "query", + "fullQuery": "fullQuery", + "document": { + "title": "title" + } + }, + "media": {} + } + ] + }, + { + "name": 
"Test text preprocessors - decapitalize", + "noteDataList": [ + { + "marker": "{marker}", + "definition": { + "type": "term", + "id": 28, + "source": "language", + "rawSource": "LANGUAGE", + "sourceTerm": "language", + "inflectionRuleChainCandidates": [ + { + "source": "algorithm", + "inflectionRules": [] + } + ], + "score": 1, + "isPrimary": true, + "sequence": 20, + "dictionary": "Test Dictionary 2", + "dictionaryOrder": { + "index": 0, + "priority": 0 + }, + "dictionaryNames": [ + "Test Dictionary 2" + ], + "expression": "language", + "reading": "language", + "expressions": [ + { + "sourceTerm": "language", + "expression": "language", + "reading": "language", + "termTags": [], + "frequencies": [], + "pitches": [], + "furiganaSegments": [ + { + "text": "language", + "furigana": "" + } + ], + "termFrequency": "normal", + "wordClasses": [ + "n" + ] + } + ], + "glossary": [ + "language definition" + ], + "definitionTags": [ + { + "name": "n", + "category": "partOfSpeech", + "notes": "noun", + "order": 0, + "score": 0, + "dictionary": "Test Dictionary 2", + "redundant": false + } + ], + "termTags": [], + "frequencies": [], + "frequencyHarmonic": -1, + "frequencyAverage": -1, + "pitches": [], + "phoneticTranscriptions": [], + "sourceTermExactMatchCount": 1, + "url": "url:", + "cloze": { + "sentence": "", + "prefix": "", + "body": "", + "bodyKana": "", + "suffix": "" + }, + "furiganaSegments": [ + { + "text": "language", + "furigana": "" + } + ] + }, + "glossaryLayoutMode": "default", + "compactTags": false, + "group": false, + "merge": false, + "modeTermKanji": false, + "modeTermKana": false, + "modeKanji": false, + "compactGlossaries": false, + "uniqueExpressions": [ + "language" + ], + "uniqueReadings": [ + "language" + ], + "pitches": [], + "pitchCount": 0, + "phoneticTranscriptions": [], + "context": { + "query": "query", + "fullQuery": "fullQuery", + "document": { + "title": "title" + } + }, + "media": {} + } + ] } ] diff --git 
a/test/data/translator-test-results.json b/test/data/translator-test-results.json index e11cb4a9..cd3f7ab6 100644 --- a/test/data/translator-test-results.json +++ b/test/data/translator-test-results.json @@ -3969,6 +3969,174 @@ "displayValueParsed": false } ] + }, + { + "type": "term", + "isPrimary": true, + "inflectionRuleChainCandidates": [ + { + "source": "algorithm", + "inflectionRules": [] + } + ], + "score": 1, + "frequencyOrder": 0, + "dictionaryIndex": 0, + "dictionaryPriority": 0, + "sourceTermExactMatchCount": 0, + "maxTransformedTextLength": 1, + "headwords": [ + { + "index": 0, + "term": "打", + "reading": "だ", + "sources": [ + { + "originalText": "ダ", + "transformedText": "だ", + "deinflectedText": "だ", + "matchType": "exact", + "matchSource": "reading", + "isPrimary": true + } + ], + "tags": [ + { + "name": "E1", + "category": "default", + "order": 0, + "score": 0, + "content": [ + "example tag 1" + ], + "dictionaries": [ + "Test Dictionary 2" + ], + "redundant": false + } + ], + "wordClasses": [ + "n" + ] + } + ], + "definitions": [ + { + "index": 0, + "headwordIndices": [ + 0 + ], + "dictionary": "Test Dictionary 2", + "dictionaryIndex": 0, + "dictionaryPriority": 0, + "id": 1, + "score": 1, + "frequencyOrder": 0, + "sequences": [ + 1 + ], + "isPrimary": true, + "tags": [ + { + "name": "n", + "category": "partOfSpeech", + "order": 0, + "score": 0, + "content": [ + "noun" + ], + "dictionaries": [ + "Test Dictionary 2" + ], + "redundant": false + } + ], + "entries": [ + "da definition 1", + "da definition 2" + ] + } + ], + "pronunciations": [], + "frequencies": [ + { + "index": 0, + "headwordIndex": 0, + "dictionary": "Test Dictionary 2", + "dictionaryIndex": 0, + "dictionaryPriority": 0, + "hasReading": false, + "frequency": 1, + "displayValue": null, + "displayValueParsed": false + }, + { + "index": 1, + "headwordIndex": 0, + "dictionary": "Test Dictionary 2", + "dictionaryIndex": 0, + "dictionaryPriority": 0, + "hasReading": false, + "frequency": 0, 
+ "displayValue": "four", + "displayValueParsed": true + }, + { + "index": 2, + "headwordIndex": 0, + "dictionary": "Test Dictionary 2", + "dictionaryIndex": 0, + "dictionaryPriority": 0, + "hasReading": false, + "frequency": 5, + "displayValue": "five (5)", + "displayValueParsed": true + }, + { + "index": 3, + "headwordIndex": 0, + "dictionary": "Test Dictionary 2", + "dictionaryIndex": 0, + "dictionaryPriority": 0, + "hasReading": true, + "frequency": 8, + "displayValue": null, + "displayValueParsed": false + }, + { + "index": 4, + "headwordIndex": 0, + "dictionary": "Test Dictionary 2", + "dictionaryIndex": 0, + "dictionaryPriority": 0, + "hasReading": true, + "frequency": 0, + "displayValue": "fourteen", + "displayValueParsed": true + }, + { + "index": 5, + "headwordIndex": 0, + "dictionary": "Test Dictionary 2", + "dictionaryIndex": 0, + "dictionaryPriority": 0, + "hasReading": true, + "frequency": 20, + "displayValue": "twenty (20)", + "displayValueParsed": true + }, + { + "index": 6, + "headwordIndex": 0, + "dictionary": "Test Dictionary 2", + "dictionaryIndex": 0, + "dictionaryPriority": 0, + "hasReading": true, + "frequency": 26, + "displayValue": null, + "displayValueParsed": false + } + ] } ] }, @@ -16978,7 +17146,7 @@ ] }, { - "name": "Test text transformations - convertNumericCharacters", + "name": "Test text preprocessors - convertNumericCharacters", "originalTextLength": 2, "dictionaryEntries": [ { @@ -17043,7 +17211,7 @@ ] }, { - "name": "Test text transformations - convertAlphabeticCharacters", + "name": "Test text preprocessors - convertAlphabeticCharacters", "originalTextLength": 4, "dictionaryEntries": [ { @@ -17389,7 +17557,7 @@ ] }, { - "name": "Test text transformations - convertKatakanaToHiragana", + "name": "Test text preprocessors - convertKatakanaToHiragana", "originalTextLength": 2, "dictionaryEntries": [ { @@ -17735,7 +17903,7 @@ ] }, { - "name": "Test text transformations - convertHiraganaToKatakana", + "name": "Test text preprocessors 
- convertHiraganaToKatakana", "originalTextLength": 4, "dictionaryEntries": [ { @@ -17844,7 +18012,7 @@ ] }, { - "name": "Test text transformations - convertHalfWidthCharacters, convertKatakanaToHiragana", + "name": "Test text preprocessors - convertHalfWidthCharacters, convertKatakanaToHiragana", "originalTextLength": 2, "dictionaryEntries": [ { @@ -18190,7 +18358,7 @@ ] }, { - "name": "Test text transformations - collapseEmphaticSequences", + "name": "Test text preprocessors - collapseEmphaticSequences", "originalTextLength": 7, "dictionaryEntries": [ { @@ -18200,6 +18368,13 @@ { "source": "algorithm", "inflectionRules": [] + }, + { + "source": "algorithm", + "inflectionRules": [ + "adv", + "kansai-ben" + ] } ], "score": 1, @@ -18267,5 +18442,167 @@ "frequencies": [] } ] + }, + { + "name": "Test text preprocessors - capitalizeFirstLetter", + "originalTextLength": 7, + "dictionaryEntries": [ + { + "type": "term", + "isPrimary": true, + "inflectionRuleChainCandidates": [ + { + "source": "algorithm", + "inflectionRules": [] + } + ], + "score": 1, + "frequencyOrder": 0, + "dictionaryIndex": 0, + "dictionaryPriority": 0, + "sourceTermExactMatchCount": 1, + "maxTransformedTextLength": 7, + "headwords": [ + { + "index": 0, + "term": "English", + "reading": "English", + "sources": [ + { + "originalText": "english", + "transformedText": "English", + "deinflectedText": "English", + "matchType": "exact", + "matchSource": "term", + "isPrimary": true + } + ], + "tags": [], + "wordClasses": [ + "n" + ] + } + ], + "definitions": [ + { + "index": 0, + "headwordIndices": [ + 0 + ], + "dictionary": "Test Dictionary 2", + "dictionaryIndex": 0, + "dictionaryPriority": 0, + "id": 27, + "score": 1, + "frequencyOrder": 0, + "sequences": [ + 19 + ], + "isPrimary": true, + "tags": [ + { + "name": "n", + "category": "partOfSpeech", + "order": 0, + "score": 0, + "content": [ + "noun" + ], + "dictionaries": [ + "Test Dictionary 2" + ], + "redundant": false + } + ], + "entries": [ + "English 
definition" + ] + } + ], + "pronunciations": [], + "frequencies": [] + } + ] + }, + { + "name": "Test text preprocessors - decapitalize", + "originalTextLength": 8, + "dictionaryEntries": [ + { + "type": "term", + "isPrimary": true, + "inflectionRuleChainCandidates": [ + { + "source": "algorithm", + "inflectionRules": [] + } + ], + "score": 1, + "frequencyOrder": 0, + "dictionaryIndex": 0, + "dictionaryPriority": 0, + "sourceTermExactMatchCount": 1, + "maxTransformedTextLength": 8, + "headwords": [ + { + "index": 0, + "term": "language", + "reading": "language", + "sources": [ + { + "originalText": "LANGUAGE", + "transformedText": "language", + "deinflectedText": "language", + "matchType": "exact", + "matchSource": "term", + "isPrimary": true + } + ], + "tags": [], + "wordClasses": [ + "n" + ] + } + ], + "definitions": [ + { + "index": 0, + "headwordIndices": [ + 0 + ], + "dictionary": "Test Dictionary 2", + "dictionaryIndex": 0, + "dictionaryPriority": 0, + "id": 28, + "score": 1, + "frequencyOrder": 0, + "sequences": [ + 20 + ], + "isPrimary": true, + "tags": [ + { + "name": "n", + "category": "partOfSpeech", + "order": 0, + "score": 0, + "content": [ + "noun" + ], + "dictionaries": [ + "Test Dictionary 2" + ], + "redundant": false + } + ], + "entries": [ + "language definition" + ] + } + ], + "pronunciations": [], + "frequencies": [] + } + ] } ] diff --git a/test/options-util.test.js b/test/options-util.test.js index 3a1b1efb..8c2bd775 100644 --- a/test/options-util.test.js +++ b/test/options-util.test.js @@ -241,6 +241,7 @@ function createProfileOptionsUpdatedTestData1() { return { general: { enable: true, + language: 'ja', resultOutputMode: 'group', debugInfo: false, maxResults: 32, @@ -405,12 +406,6 @@ function createProfileOptionsUpdatedTestData1() { ] }, translation: { - convertHalfWidthCharacters: 'false', - convertNumericCharacters: 'false', - convertAlphabeticCharacters: 'false', - convertHiraganaToKatakana: 'false', - convertKatakanaToHiragana: 
'variant', - collapseEmphaticSequences: 'false', searchResolution: 'letter', textReplacements: { searchOriginal: true, @@ -604,7 +599,7 @@ function createOptionsUpdatedTestData1() { } ], profileCurrent: 0, - version: 25, + version: 26, global: { database: { prefixWildcardsSupported: false diff --git a/test/utilities/translator.js b/test/utilities/translator.js index f452e688..e9c08443 100644 --- a/test/utilities/translator.js +++ b/test/utilities/translator.js @@ -124,14 +124,9 @@ export function createFindTermsOptions(dictionaryName, optionsPresets, optionsAr sortFrequencyDictionary, sortFrequencyDictionaryOrder, removeNonJapaneseCharacters, - convertHalfWidthCharacters, - convertNumericCharacters, - convertAlphabeticCharacters, - convertHiraganaToKatakana, - convertKatakanaToHiragana, - collapseEmphaticSequences, excludeDictionaryDefinitions, - searchResolution + searchResolution, + language } = preset; return { @@ -141,15 +136,10 @@ export function createFindTermsOptions(dictionaryName, optionsPresets, optionsAr sortFrequencyDictionary: typeof sortFrequencyDictionary !== 'undefined' ? sortFrequencyDictionary : null, sortFrequencyDictionaryOrder: typeof sortFrequencyDictionaryOrder !== 'undefined' ? sortFrequencyDictionaryOrder : 'ascending', removeNonJapaneseCharacters: typeof removeNonJapaneseCharacters !== 'undefined' ? removeNonJapaneseCharacters : false, - convertHalfWidthCharacters: typeof convertHalfWidthCharacters !== 'undefined' ? convertHalfWidthCharacters : 'false', - convertNumericCharacters: typeof convertNumericCharacters !== 'undefined' ? convertNumericCharacters : 'false', - convertAlphabeticCharacters: typeof convertAlphabeticCharacters !== 'undefined' ? convertAlphabeticCharacters : 'false', - convertHiraganaToKatakana: typeof convertHiraganaToKatakana !== 'undefined' ? convertHiraganaToKatakana : 'false', - convertKatakanaToHiragana: typeof convertKatakanaToHiragana !== 'undefined' ? 
convertKatakanaToHiragana : 'false', - collapseEmphaticSequences: typeof collapseEmphaticSequences !== 'undefined' ? collapseEmphaticSequences : 'false', textReplacements, enabledDictionaryMap, excludeDictionaryDefinitions: Array.isArray(excludeDictionaryDefinitions) ? new Set(excludeDictionaryDefinitions) : null, - searchResolution: typeof searchResolution !== 'undefined' ? searchResolution : 'letter' + searchResolution: typeof searchResolution !== 'undefined' ? searchResolution : 'letter', + language: typeof language !== 'undefined' ? language : 'ja' }; } diff --git a/types/ext/api.d.ts b/types/ext/api.d.ts index 1f4fc0a9..85f4c146 100644 --- a/types/ext/api.d.ts +++ b/types/ext/api.d.ts @@ -26,6 +26,7 @@ import type * as DictionaryDatabase from './dictionary-database'; import type * as DictionaryImporter from './dictionary-importer'; import type * as Environment from './environment'; import type * as Extension from './extension'; +import type * as Language from './language'; import type * as Log from './log'; import type * as Settings from './settings'; import type * as SettingsModifications from './settings-modifications'; @@ -380,6 +381,10 @@ type ApiSurface = { params: void; return: boolean; }; + getLanguageSummaries: { + params: void; + return: Language.LanguageSummary[]; + }; }; type ApiExtraArgs = [sender: chrome.runtime.MessageSender]; diff --git a/types/ext/language-english.d.ts b/types/ext/language-english.d.ts new file mode 100644 index 00000000..ed501d57 --- /dev/null +++ b/types/ext/language-english.d.ts @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +import type {LanguageDescriptor, TextPreprocessor} from './language'; + +export type EnglishTextPreprocessorDescriptor = { + capitalizeFirstLetter: TextPreprocessor; + decapitalize: TextPreprocessor; +}; + +export type EnglishLanguageDescriptor = LanguageDescriptor; diff --git a/types/ext/language-japanese.d.ts b/types/ext/language-japanese.d.ts new file mode 100644 index 00000000..1a627ed1 --- /dev/null +++ b/types/ext/language-japanese.d.ts @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */ + +import type {LanguageDescriptor, TextPreprocessor} from './language'; + +export type JapaneseTextPreprocessorDescriptor = { + convertHalfWidthCharacters: TextPreprocessor; + convertNumericCharacters: TextPreprocessor; + convertAlphabeticCharacters: TextPreprocessor; + convertHiraganaToKatakana: TextPreprocessor; + convertKatakanaToHiragana: TextPreprocessor; + collapseEmphaticSequences: TextPreprocessor<[collapseEmphatic: boolean, collapseEmphaticFull: boolean]>; +}; + +export type JapaneseLanguageDescriptor = LanguageDescriptor; diff --git a/types/ext/language.d.ts b/types/ext/language.d.ts new file mode 100644 index 00000000..247c7795 --- /dev/null +++ b/types/ext/language.d.ts @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2024 Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>.
+ */ + +import type {TextSourceMap} from '../../ext/js/general/text-source-map.js'; +import type {SafeAny} from './core'; + +export type TextPreprocessorOptions = T[]; + +export type TextPreprocessorFunction = (str: string, setting: T, sourceMap: TextSourceMap) => string; + +export type TextPreprocessor = { + name: string; + description: string; + options: TextPreprocessorOptions; + process: TextPreprocessorFunction; +}; + +export type LanguageAndPreprocessors = { + iso: string; + textPreprocessors: TextPreprocessorWithId[]; +}; + +export type TextPreprocessorWithId = { + id: string; + textPreprocessor: TextPreprocessor; +}; + +export type LanguageSummary = { + name: string; + iso: string; + exampleText: string; +}; + +export type LanguageDescriptor = { + name: string; + iso: string; + exampleText: string; + textPreprocessors: TTextPreprocessorDescriptor; +}; + +export type TextPreprocessorDescriptor = { + [key: string]: TextPreprocessor; +}; diff --git a/types/ext/settings.d.ts b/types/ext/settings.d.ts index a900dbe6..45466c3d 100644 --- a/types/ext/settings.d.ts +++ b/types/ext/settings.d.ts @@ -101,6 +101,7 @@ export type ProfileOptions = { export type GeneralOptions = { enable: boolean; + language: string; resultOutputMode: ResultOutputMode; debugInfo: boolean; maxResults: number; diff --git a/types/ext/translation-internal.d.ts b/types/ext/translation-internal.d.ts index 82704c54..7006221e 100644 --- a/types/ext/translation-internal.d.ts +++ b/types/ext/translation-internal.d.ts @@ -18,6 +18,7 @@ import type * as DictionaryDatabase from './dictionary-database'; import type * as Dictionary from './dictionary'; import type * as Translation from './translation'; +import type * as Language from './language'; export type TextDeinflectionOptions = [ textReplacements: Translation.FindTermsTextReplacement[] | null, @@ -47,3 +48,5 @@ export type DatabaseDeinflection = { inflectionRuleChainCandidates: Dictionary.InflectionRuleChainCandidate[]; databaseEntries: 
DictionaryDatabase.TermEntry[]; }; + +export type PreprocessorOptionsSpace = Map>; diff --git a/types/ext/translation.d.ts b/types/ext/translation.d.ts index c9a61be0..2e4d1a66 100644 --- a/types/ext/translation.d.ts +++ b/types/ext/translation.d.ts @@ -80,30 +80,6 @@ export type FindTermsOptions = { * Whether or not non-Japanese characters should be searched. */ removeNonJapaneseCharacters: boolean; - /** - * Whether or not half-width characters should be converted to full-width characters. - */ - convertHalfWidthCharacters: FindTermsVariantMode; - /** - * Whether or not ASCII numeric characters should be converted to full-width numeric characters. - */ - convertNumericCharacters: FindTermsVariantMode; - /** - * Whether or not alphabetic characters should be converted to kana. - */ - convertAlphabeticCharacters: FindTermsVariantMode; - /** - * Whether or not hiragana characters should be converted to katakana. - */ - convertHiraganaToKatakana: FindTermsVariantMode; - /** - * Whether or not katakana characters should be converted to hiragana. - */ - convertKatakanaToHiragana: FindTermsVariantMode; - /** - * How emphatic character sequences should be collapsed. - */ - collapseEmphaticSequences: FindTermsEmphaticSequencesMode; /** * An iterable sequence of text replacements to be applied during the term lookup process. */ @@ -121,6 +97,10 @@ export type FindTermsOptions = { * Whether every substring should be searched for, or only whole words. */ searchResolution: SearchResolution; + /** + * ISO-639 code of the language. + */ + language: string; }; /** @@ -133,16 +113,6 @@ export type FindTermsMatchType = Dictionary.TermSourceMatchType; */ export type FindTermsSortOrder = 'ascending' | 'descending'; -/** - * Mode describing how to handle variations. - */ -export type FindTermsVariantMode = 'false' | 'true' | 'variant'; - -/** - * Mode describing how to handle emphatic sequence variations. 
- */ -export type FindTermsEmphaticSequencesMode = 'false' | 'true' | 'full'; - /** * Information about how text should be replaced when looking up terms. */ diff --git a/types/test/translator.d.ts b/types/test/translator.d.ts index e3199225..efd5cc3f 100644 --- a/types/test/translator.d.ts +++ b/types/test/translator.d.ts @@ -15,8 +15,8 @@ * along with this program. If not, see <https://www.gnu.org/licenses/>. */ -import type {FindTermsMatchType, FindTermsSortOrder, FindTermsVariantMode, FindTermsEmphaticSequencesMode, FindKanjiDictionary, FindTermDictionary} from '../ext/translation'; -import type {SearchResolution} from 'settings'; +import type {FindTermsMatchType, FindTermsSortOrder, FindKanjiDictionary, FindTermDictionary} from '../ext/translation'; +import type {SearchResolution} from '../ext/settings'; import type {FindTermsMode} from 'translator'; import type {DictionaryEntry} from 'dictionary'; import type {NoteData} from 'anki-templates'; @@ -44,16 +44,11 @@ export type FindTermsOptionsPreset = { sortFrequencyDictionary?: string | null; sortFrequencyDictionaryOrder?: FindTermsSortOrder; removeNonJapaneseCharacters?: boolean; - convertHalfWidthCharacters?: FindTermsVariantMode; - convertNumericCharacters?: FindTermsVariantMode; - convertAlphabeticCharacters?: FindTermsVariantMode; - convertHiraganaToKatakana?: FindTermsVariantMode; - convertKatakanaToHiragana?: FindTermsVariantMode; - collapseEmphaticSequences?: FindTermsEmphaticSequencesMode; textReplacements?: (FindTermsTextReplacement[] | null)[]; enabledDictionaryMap?: [key: string, value: FindTermDictionary][]; excludeDictionaryDefinitions?: string[] | null; searchResolution?: SearchResolution; + language?: string; }; export type OptionsType = OptionsPreset['type']; -- cgit v1.2.3