diff options
-rw-r--r-- | ext/bg/data/options-schema.json | 8 | ||||
-rw-r--r-- | ext/bg/js/japanese.js | 40 | ||||
-rw-r--r-- | ext/bg/js/options.js | 3 | ||||
-rw-r--r-- | ext/bg/js/settings/main.js | 2 | ||||
-rw-r--r-- | ext/bg/js/translator.js | 17 | ||||
-rw-r--r-- | ext/bg/settings.html | 11 | ||||
-rw-r--r-- | test/test-japanese.js | 54 |
7 files changed, 129 insertions, 6 deletions
diff --git a/ext/bg/data/options-schema.json b/ext/bg/data/options-schema.json index da1f1ce0..4f9e694d 100644 --- a/ext/bg/data/options-schema.json +++ b/ext/bg/data/options-schema.json @@ -388,7 +388,8 @@ "convertNumericCharacters", "convertAlphabeticCharacters", "convertHiraganaToKatakana", - "convertKatakanaToHiragana" + "convertKatakanaToHiragana", + "collapseEmphaticSequences" ], "properties": { "convertHalfWidthCharacters": { @@ -415,6 +416,11 @@ "type": "string", "enum": ["false", "true", "variant"], "default": "variant" + }, + "collapseEmphaticSequences": { + "type": "string", + "enum": ["false", "true", "full"], + "default": "false" } } }, diff --git a/ext/bg/js/japanese.js b/ext/bg/js/japanese.js index 5c49cca7..5fef27a7 100644 --- a/ext/bg/js/japanese.js +++ b/ext/bg/js/japanese.js @@ -82,6 +82,9 @@ const ITERATION_MARK_CODE_POINT = 0x3005; + const HIRAGANA_SMALL_TSU_CODE_POINT = 0x3063; + const KATAKANA_SMALL_TSU_CODE_POINT = 0x30c3; + const KANA_PROLONGED_SOUND_MARK_CODE_POINT = 0x30fc; // Existing functions @@ -372,6 +375,40 @@ } + // Miscellaneous + + function collapseEmphaticSequences(text, fullCollapse, sourceMap=null) { + let result = ''; + let collapseCodePoint = -1; + const hasSourceMap = (sourceMap !== null); + for (const char of text) { + const c = char.codePointAt(0); + if ( + c === HIRAGANA_SMALL_TSU_CODE_POINT || + c === KATAKANA_SMALL_TSU_CODE_POINT || + c === KANA_PROLONGED_SOUND_MARK_CODE_POINT + ) { + if (collapseCodePoint !== c) { + collapseCodePoint = c; + if (!fullCollapse) { + result += char; + continue; + } + } + } else { + collapseCodePoint = -1; + result += char; + continue; + } + + if (hasSourceMap) { + sourceMap.combine(Math.max(0, result.length - 1), 1); + } + } + return result; + } + + // Exports Object.assign(jp, { @@ -383,6 +420,7 @@ convertHalfWidthKanaToFullWidth, convertAlphabeticToKana, distributeFurigana, - distributeFuriganaInflected + distributeFuriganaInflected, + collapseEmphaticSequences }); })(); diff --git a/ext/bg/js/options.js b/ext/bg/js/options.js index 20df2a68..f3e5f60d 100644 --- a/ext/bg/js/options.js +++ b/ext/bg/js/options.js @@ -170,7 +170,8 @@ function profileOptionsCreateDefaults() { convertNumericCharacters: 'false', convertAlphabeticCharacters: 'false', convertHiraganaToKatakana: 'false', - convertKatakanaToHiragana: 'variant' + convertKatakanaToHiragana: 'variant', + collapseEmphaticSequences: 'false' }, dictionaries: {}, diff --git a/ext/bg/js/settings/main.js b/ext/bg/js/settings/main.js index 8fd94562..308e92eb 100644 --- a/ext/bg/js/settings/main.js +++ b/ext/bg/js/settings/main.js @@ -118,6 +118,7 @@ async function formRead(options) { options.translation.convertAlphabeticCharacters = $('#translation-convert-alphabetic-characters').val(); options.translation.convertHiraganaToKatakana = $('#translation-convert-hiragana-to-katakana').val(); options.translation.convertKatakanaToHiragana = $('#translation-convert-katakana-to-hiragana').val(); + options.translation.collapseEmphaticSequences = $('#translation-collapse-emphatic-sequences').val(); options.parsing.enableScanningParser = $('#parsing-scan-enable').prop('checked'); options.parsing.enableMecabParser = $('#parsing-mecab-enable').prop('checked'); @@ -199,6 +200,7 @@ async function formWrite(options) { $('#translation-convert-alphabetic-characters').val(options.translation.convertAlphabeticCharacters); $('#translation-convert-hiragana-to-katakana').val(options.translation.convertHiraganaToKatakana); $('#translation-convert-katakana-to-hiragana').val(options.translation.convertKatakanaToHiragana); + $('#translation-collapse-emphatic-sequences').val(options.translation.collapseEmphaticSequences); $('#parsing-scan-enable').prop('checked', options.parsing.enableScanningParser); $('#parsing-mecab-enable').prop('checked', options.parsing.enableMecabParser); diff --git a/ext/bg/js/translator.js b/ext/bg/js/translator.js index e4441384..aaa1a0ec 100644 --- a/ext/bg/js/translator.js +++ b/ext/bg/js/translator.js @@ -347,17 +347,27 @@ class Translator { getAllDeinflections(text, options) { const translationOptions = options.translation; + const collapseEmphaticOptions = [[false, false]]; + switch (translationOptions.collapseEmphaticSequences) { + case 'true': + collapseEmphaticOptions.push([true, false]); + break; + case 'full': + collapseEmphaticOptions.push([true, false], [true, true]); + break; + } const textOptionVariantArray = [ Translator.getTextOptionEntryVariants(translationOptions.convertHalfWidthCharacters), Translator.getTextOptionEntryVariants(translationOptions.convertNumericCharacters), Translator.getTextOptionEntryVariants(translationOptions.convertAlphabeticCharacters), Translator.getTextOptionEntryVariants(translationOptions.convertHiraganaToKatakana), - Translator.getTextOptionEntryVariants(translationOptions.convertKatakanaToHiragana) + Translator.getTextOptionEntryVariants(translationOptions.convertKatakanaToHiragana), + collapseEmphaticOptions ]; const deinflections = []; const used = new Set(); - for (const [halfWidth, numeric, alphabetic, katakana, hiragana] of Translator.getArrayVariants(textOptionVariantArray)) { + for (const [halfWidth, numeric, alphabetic, katakana, hiragana, [collapseEmphatic, collapseEmphaticFull]] of Translator.getArrayVariants(textOptionVariantArray)) { let text2 = text; const sourceMap = new TextSourceMap(text2); if (halfWidth) { @@ -375,6 +385,9 @@ class Translator { if (hiragana) { text2 = jp.convertKatakanaToHiragana(text2); } + if (collapseEmphatic) { + text2 = jp.collapseEmphaticSequences(text2, collapseEmphaticFull, sourceMap); + } for (let i = text2.length; i > 0; --i) { const text2Substring = text2.substring(0, i); diff --git a/ext/bg/settings.html b/ext/bg/settings.html index 1297a9cc..96c1db82 100644 --- a/ext/bg/settings.html +++ b/ext/bg/settings.html @@ -427,7 +427,7 @@ <p class="help-block"> The conversion options below are listed in the order that the conversions are applied to the input text. - Each conversion has three possible values: + Conversions commonly have three possible values: </p> <ul class="help-block"> @@ -490,6 +490,15 @@ <option value="variant">Use both variants</option> </select> </div> + + <div class="form-group"> + <label for="translation-collapse-emphatic-sequences">Collapse emphatic character sequences <span class="label-light">(すっっごーーい → すっごーい / すごい)</span></label> + <select class="form-control" id="translation-collapse-emphatic-sequences"> + <option value="false">Disabled</option> + <option value="true">Collapse into single character</option> + <option value="full">Remove all characters</option> + </select> + </div> </div> <div id="popup-content-scanning"> diff --git a/test/test-japanese.js b/test/test-japanese.js index f4b084ac..89e41c36 100644 --- a/test/test-japanese.js +++ b/test/test-japanese.js @@ -393,6 +393,59 @@ function testDistributeFuriganaInflected() { } } +function testCollapseEmphaticSequences() { + const data = [ + [['かこい', false], ['かこい', [1, 1, 1]]], + [['かこい', true], ['かこい', [1, 1, 1]]], + [['かっこい', false], ['かっこい', [1, 1, 1, 1]]], + [['かっこい', true], ['かこい', [2, 1, 1]]], + [['かっっこい', false], ['かっこい', [1, 2, 1, 1]]], + [['かっっこい', true], ['かこい', [3, 1, 1]]], + [['かっっっこい', false], ['かっこい', [1, 3, 1, 1]]], + [['かっっっこい', true], ['かこい', [4, 1, 1]]], + + [['こい', false], ['こい', [1, 1]]], + [['こい', true], ['こい', [1, 1]]], + [['っこい', false], ['っこい', [1, 1, 1]]], + [['っこい', true], ['こい', [2, 1]]], + [['っっこい', false], ['っこい', [2, 1, 1]]], + [['っっこい', true], ['こい', [3, 1]]], + [['っっっこい', false], ['っこい', [3, 1, 1]]], + [['っっっこい', true], ['こい', [4, 1]]], + + [['すごい', false], ['すごい', [1, 1, 1]]], + [['すごい', true], ['すごい', [1, 1, 1]]], + [['すごーい', false], ['すごーい', [1, 1, 1, 1]]], + [['すごーい', true], ['すごい', [1, 2, 1]]], + [['すごーーい', false], ['すごーい', [1, 1, 2, 1]]], + [['すごーーい', true], ['すごい', [1, 3, 1]]], + [['すっごーい', false], ['すっごーい', [1, 1, 1, 1, 1]]], + [['すっごーい', true], ['すごい', [2, 2, 1]]], + [['すっっごーーい', false], ['すっごーい', [1, 2, 1, 2, 1]]], + [['すっっごーーい', true], ['すごい', [3, 3, 1]]], + + [['', false], ['', []]], + [['', true], ['', []]], + [['っ', false], ['っ', [1]]], + [['っ', true], ['', [1]]], + [['っっ', false], ['っ', [2]]], + [['っっ', true], ['', [2]]], + [['っっっ', false], ['っ', [3]]], + [['っっっ', true], ['', [3]]] + ]; + + for (const [[text, fullCollapse], [expected, expectedSourceMapping]] of data) { + const sourceMap = new TextSourceMap(text); + const actual1 = jp.collapseEmphaticSequences(text, fullCollapse, null); + const actual2 = jp.collapseEmphaticSequences(text, fullCollapse, sourceMap); + assert.strictEqual(actual1, expected); + assert.strictEqual(actual2, expected); + if (typeof expectedSourceMapping !== 'undefined') { + assert.ok(sourceMap.equals(new TextSourceMap(text, expectedSourceMapping))); + } + } +} + function testIsMoraPitchHigh() { const data = [ [[0, 0], false], @@ -462,6 +515,7 @@ function main() { testConvertAlphabeticToKana(); testDistributeFurigana(); testDistributeFuriganaInflected(); + testCollapseEmphaticSequences(); testIsMoraPitchHigh(); testGetKanaMorae(); } |