From 42a2917bf7aa3ab424ada2fc3acf224b74020a7f Mon Sep 17 00:00:00 2001 From: toasted-nutbread <toasted-nutbread@users.noreply.github.com> Date: Fri, 10 Apr 2020 11:56:18 -0400 Subject: Add support for collapsing emphatic character sequences --- ext/bg/data/options-schema.json | 8 +++++++- ext/bg/js/japanese.js | 38 +++++++++++++++++++++++++++++++++++++- ext/bg/js/options.js | 3 ++- ext/bg/js/settings/main.js | 2 ++ ext/bg/js/translator.js | 17 +++++++++++++++-- ext/bg/settings.html | 11 ++++++++++- 6 files changed, 73 insertions(+), 6 deletions(-) diff --git a/ext/bg/data/options-schema.json b/ext/bg/data/options-schema.json index da1f1ce0..4f9e694d 100644 --- a/ext/bg/data/options-schema.json +++ b/ext/bg/data/options-schema.json @@ -388,7 +388,8 @@ "convertNumericCharacters", "convertAlphabeticCharacters", "convertHiraganaToKatakana", - "convertKatakanaToHiragana" + "convertKatakanaToHiragana", + "collapseEmphaticSequences" ], "properties": { "convertHalfWidthCharacters": { @@ -415,6 +416,11 @@ "type": "string", "enum": ["false", "true", "variant"], "default": "variant" + }, + "collapseEmphaticSequences": { + "type": "string", + "enum": ["false", "true", "full"], + "default": "false" } } }, diff --git a/ext/bg/js/japanese.js b/ext/bg/js/japanese.js index 2a2b39fd..e8b258cb 100644 --- a/ext/bg/js/japanese.js +++ b/ext/bg/js/japanese.js @@ -83,6 +83,8 @@ const ITERATION_MARK_CODE_POINT = 0x3005; + const HIRAGANA_SMALL_TSU_CODE_POINT = 0x3063; + const KATAKANA_SMALL_TSU_CODE_POINT = 0x30c3; // Existing functions @@ -373,6 +375,39 @@ } + // Miscellaneous + + function collapseEmphaticSequences(sourceText, fullCollapse, sourceMap=null) { + let result = ''; + let collapseCodePoint = -1; + const hasSourceMap = (sourceMap !== null); + for (const char of sourceText) { + const c = char.codePointAt(0); + if (c === HIRAGANA_SMALL_TSU_CODE_POINT || c === KATAKANA_SMALL_TSU_CODE_POINT) { + if (collapseCodePoint !== c) { + collapseCodePoint = c; + if (!fullCollapse) { + result += char; + continue; + } + } + } else { + collapseCodePoint = -1; + result += char; + continue; + } + + if (hasSourceMap) { + const index = result.length; + if (index > 0) { + sourceMap.combine(index - 1, 1); + } + } + } + return result; + } + + // Exports Object.assign(jp, { @@ -384,6 +419,7 @@ convertHalfWidthKanaToFullWidth, convertAlphabeticToKana, distributeFurigana, - distributeFuriganaInflected + distributeFuriganaInflected, + collapseEmphaticSequences }); })(); diff --git a/ext/bg/js/options.js b/ext/bg/js/options.js index abb054d4..fa96c96c 100644 --- a/ext/bg/js/options.js +++ b/ext/bg/js/options.js @@ -171,7 +171,8 @@ function profileOptionsCreateDefaults() { convertNumericCharacters: 'false', convertAlphabeticCharacters: 'false', convertHiraganaToKatakana: 'false', - convertKatakanaToHiragana: 'variant' + convertKatakanaToHiragana: 'variant', + collapseEmphaticSequences: 'false' }, dictionaries: {}, diff --git a/ext/bg/js/settings/main.js b/ext/bg/js/settings/main.js index 1653ee35..18c2da73 100644 --- a/ext/bg/js/settings/main.js +++ b/ext/bg/js/settings/main.js @@ -119,6 +119,7 @@ async function formRead(options) { options.translation.convertAlphabeticCharacters = $('#translation-convert-alphabetic-characters').val(); options.translation.convertHiraganaToKatakana = $('#translation-convert-hiragana-to-katakana').val(); options.translation.convertKatakanaToHiragana = $('#translation-convert-katakana-to-hiragana').val(); + options.translation.collapseEmphaticSequences = $('#translation-collapse-emphatic-sequences').val(); options.parsing.enableScanningParser = $('#parsing-scan-enable').prop('checked'); options.parsing.enableMecabParser = $('#parsing-mecab-enable').prop('checked'); @@ -200,6 +201,7 @@ async function formWrite(options) { $('#translation-convert-alphabetic-characters').val(options.translation.convertAlphabeticCharacters); $('#translation-convert-hiragana-to-katakana').val(options.translation.convertHiraganaToKatakana); $('#translation-convert-katakana-to-hiragana').val(options.translation.convertKatakanaToHiragana); + $('#translation-collapse-emphatic-sequences').val(options.translation.collapseEmphaticSequences); $('#parsing-scan-enable').prop('checked', options.parsing.enableScanningParser); $('#parsing-mecab-enable').prop('checked', options.parsing.enableMecabParser); diff --git a/ext/bg/js/translator.js b/ext/bg/js/translator.js index 27f91c05..402ac6bd 100644 --- a/ext/bg/js/translator.js +++ b/ext/bg/js/translator.js @@ -348,17 +348,27 @@ class Translator { getAllDeinflections(text, options) { const translationOptions = options.translation; + const collapseEmphaticOptions = [[false, false]]; + switch (translationOptions.collapseEmphaticSequences) { + case 'true': + collapseEmphaticOptions.push([true, false]); + break; + case 'full': + collapseEmphaticOptions.push([true, true]); + break; + } const textOptionVariantArray = [ Translator.getTextOptionEntryVariants(translationOptions.convertHalfWidthCharacters), Translator.getTextOptionEntryVariants(translationOptions.convertNumericCharacters), Translator.getTextOptionEntryVariants(translationOptions.convertAlphabeticCharacters), Translator.getTextOptionEntryVariants(translationOptions.convertHiraganaToKatakana), - Translator.getTextOptionEntryVariants(translationOptions.convertKatakanaToHiragana) + Translator.getTextOptionEntryVariants(translationOptions.convertKatakanaToHiragana), + collapseEmphaticOptions ]; const deinflections = []; const used = new Set(); - for (const [halfWidth, numeric, alphabetic, katakana, hiragana] of Translator.getArrayVariants(textOptionVariantArray)) { + for (const [halfWidth, numeric, alphabetic, katakana, hiragana, [collapseEmphatic, collapseEmphaticFull]] of Translator.getArrayVariants(textOptionVariantArray)) { let text2 = text; const sourceMap = new TextSourceMap(text2); if (halfWidth) { @@ -376,6 +386,9 @@ class Translator { if (hiragana) { text2 = jp.convertKatakanaToHiragana(text2); } + if (collapseEmphatic) { + text2 = jp.collapseEmphaticSequences(text2, collapseEmphaticFull, sourceMap); + } for (let i = text2.length; i > 0; --i) { const text2Substring = text2.substring(0, i); diff --git a/ext/bg/settings.html b/ext/bg/settings.html index 1297a9cc..91051f3e 100644 --- a/ext/bg/settings.html +++ b/ext/bg/settings.html @@ -427,7 +427,7 @@ <p class="help-block"> The conversion options below are listed in the order that the conversions are applied to the input text. - Each conversion has three possible values: + Conversions commonly have three possible values: </p> <ul class="help-block"> @@ -490,6 +490,15 @@ <option value="variant">Use both variants</option> </select> </div> + + <div class="form-group"> + <label for="translation-collapse-emphatic-sequences">Collapse emphatic character sequences <span class="label-light">(かっっっこいい → かっこいい)</span></label> + <select class="form-control" id="translation-collapse-emphatic-sequences"> + <option value="false">Disabled</option> + <option value="true">Collapse into single character</option> + <option value="full">Remove all characters</option> + </select> + </div> </div> <div id="popup-content-scanning"> -- cgit v1.2.3 From 0b7791c103508e4b23d57717a97644993edf76d5 Mon Sep 17 00:00:00 2001 From: toasted-nutbread <toasted-nutbread@users.noreply.github.com> Date: Fri, 10 Apr 2020 12:25:24 -0400 Subject: Fix source map for characters collapsed at the start of a string --- ext/bg/js/japanese.js | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/ext/bg/js/japanese.js b/ext/bg/js/japanese.js index e8b258cb..71fbebb5 100644 --- a/ext/bg/js/japanese.js +++ b/ext/bg/js/japanese.js @@ -377,11 +377,11 @@ // Miscellaneous - function collapseEmphaticSequences(sourceText, fullCollapse, sourceMap=null) { + function collapseEmphaticSequences(text, fullCollapse, sourceMap=null) { let result = ''; let collapseCodePoint = -1; const hasSourceMap = (sourceMap !== null); - for (const char of sourceText) { + for (const char of text) { const c = char.codePointAt(0); if (c === HIRAGANA_SMALL_TSU_CODE_POINT || c === KATAKANA_SMALL_TSU_CODE_POINT) { if (collapseCodePoint !== c) { @@ -398,10 +398,7 @@ } if (hasSourceMap) { - const index = result.length; - if (index > 0) { - sourceMap.combine(index - 1, 1); - } + sourceMap.combine(Math.max(0, result.length - 1), 1); } } return result; -- cgit v1.2.3 From fb87b1ad69b37e75d1e2e46c91235aa6b44c2980 Mon Sep 17 00:00:00 2001 From: toasted-nutbread <toasted-nutbread@users.noreply.github.com> Date: Fri, 10 Apr 2020 12:31:06 -0400 Subject: Add tests --- test/test-japanese.js | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/test/test-japanese.js b/test/test-japanese.js index ca65dde2..ac28a579 100644 --- a/test/test-japanese.js +++ b/test/test-japanese.js @@ -394,6 +394,50 @@ function testDistributeFuriganaInflected() { } } +function testCollapseEmphaticSequences() { + const data = [ + [['かこい', false], ['かこい', [1, 1, 1]]], + [['かこい', true], ['かこい', [1, 1, 1]]], + [['かっこい', false], ['かっこい', [1, 1, 1, 1]]], + [['かっこい', true], ['かこい', [2, 1, 1]]], + [['かっっこい', false], ['かっこい', [1, 2, 1, 1]]], + [['かっっこい', true], ['かこい', [3, 1, 1]]], + [['かっっっこい', false], ['かっこい', [1, 3, 1, 1]]], + [['かっっっこい', true], ['かこい', [4, 1, 1]]], + + [['こい', false], ['こい', [1, 1]]], + [['こい', true], ['こい', [1, 1]]], + [['っこい', false], ['っこい', [1, 1, 1]]], + [['っこい', true], ['こい', [2, 1]]], + [['っっこい', false], ['っこい', [2, 1, 1]]], + [['っっこい', true], ['こい', [3, 1]]], + [['っっっこい', false], ['っこい', [3, 1, 1]]], + [['っっっこい', true], ['こい', [4, 1]]], + + [['', false], ['', []]], + [['', true], ['', []]], + [['っ', false], ['っ', [1]]], + [['っ', true], ['', [1]]], + [['っっ', false], ['っ', [2]]], + [['っっ', true], ['', [2]]], + [['っっっ', false], ['っ', [3]]], + [['っっっ', true], ['', [3]]] + ]; + + for (const [[text, fullCollapse], [expected, expectedSourceMapping]] of data) { + const sourceMap = new TextSourceMap(text); + const actual1 = jp.collapseEmphaticSequences(text, fullCollapse, null); + const actual2 = jp.collapseEmphaticSequences(text, fullCollapse, sourceMap); + assert.strictEqual(actual1, expected); + assert.strictEqual(actual2, expected); + if (typeof expectedSourceMapping !== 'undefined') { + console.log('actual', JSON.stringify(actual1), sourceMap); + console.log('expected', JSON.stringify(expected), new TextSourceMap(text, expectedSourceMapping)); + assert.ok(sourceMap.equals(new TextSourceMap(text, expectedSourceMapping))); + } + } +} + function testIsMoraPitchHigh() { const data = [ [[0, 0], false], @@ -463,6 +507,7 @@ function main() { testConvertAlphabeticToKana(); testDistributeFurigana(); testDistributeFuriganaInflected(); + testCollapseEmphaticSequences(); testIsMoraPitchHigh(); testGetKanaMorae(); } -- cgit v1.2.3 From 90392ac9d6d3b54f811e3d056043a1ffe26fa963 Mon Sep 17 00:00:00 2001 From: toasted-nutbread <toasted-nutbread@users.noreply.github.com> Date: Sat, 11 Apr 2020 15:43:12 -0400 Subject: Add support for collapsing the Katakana-Hiragana Prolonged Sound Mark --- ext/bg/js/japanese.js | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/ext/bg/js/japanese.js b/ext/bg/js/japanese.js index 71fbebb5..78f5b48f 100644 --- a/ext/bg/js/japanese.js +++ b/ext/bg/js/japanese.js @@ -85,6 +85,7 @@ const HIRAGANA_SMALL_TSU_CODE_POINT = 0x3063; const KATAKANA_SMALL_TSU_CODE_POINT = 0x30c3; + const KANA_PROLONGED_SOUND_MARK_CODE_POINT = 0x30fc; // Existing functions @@ -383,7 +384,11 @@ const hasSourceMap = (sourceMap !== null); for (const char of text) { const c = char.codePointAt(0); - if (c === HIRAGANA_SMALL_TSU_CODE_POINT || c === KATAKANA_SMALL_TSU_CODE_POINT) { + if ( + c === HIRAGANA_SMALL_TSU_CODE_POINT || + c === KATAKANA_SMALL_TSU_CODE_POINT || + c === KANA_PROLONGED_SOUND_MARK_CODE_POINT + ) { if (collapseCodePoint !== c) { collapseCodePoint = c; if (!fullCollapse) { -- cgit v1.2.3 From 92f2466cfff40d47fb5e6350dae5d7ff82770973 Mon Sep 17 00:00:00 2001 From: toasted-nutbread <toasted-nutbread@users.noreply.github.com> Date: Sat, 11 Apr 2020 16:53:29 -0400 Subject: Add tests, remove logs --- test/test-japanese.js | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/test/test-japanese.js b/test/test-japanese.js index ac28a579..b1bba9a7 100644 --- a/test/test-japanese.js +++ b/test/test-japanese.js @@ -414,6 +414,17 @@ function testCollapseEmphaticSequences() { [['っっっこい', false], ['っこい', [3, 1, 1]]], [['っっっこい', true], ['こい', [4, 1]]], + [['すごい', false], ['すごい', [1, 1, 1]]], + [['すごい', true], ['すごい', [1, 1, 1]]], + [['すごーい', false], ['すごーい', [1, 1, 1, 1]]], + [['すごーい', true], ['すごい', [1, 2, 1]]], + [['すごーーい', false], ['すごーい', [1, 1, 2, 1]]], + [['すごーーい', true], ['すごい', [1, 3, 1]]], + [['すっごーい', false], ['すっごーい', [1, 1, 1, 1, 1]]], + [['すっごーい', true], ['すごい', [2, 2, 1]]], + [['すっっごーーい', false], ['すっごーい', [1, 2, 1, 2, 1]]], + [['すっっごーーい', true], ['すごい', [3, 3, 1]]], + [['', false], ['', []]], [['', true], ['', []]], [['っ', false], ['っ', [1]]], @@ -431,8 +442,6 @@ function testCollapseEmphaticSequences() { assert.strictEqual(actual1, expected); assert.strictEqual(actual2, expected); if (typeof expectedSourceMapping !== 'undefined') { - console.log('actual', JSON.stringify(actual1), sourceMap); - console.log('expected', JSON.stringify(expected), new TextSourceMap(text, expectedSourceMapping)); assert.ok(sourceMap.equals(new TextSourceMap(text, expectedSourceMapping))); } } -- cgit v1.2.3 From 70f0b8b0cd7c85bd8af230cf6a74a0d0e1d0bbc2 Mon Sep 17 00:00:00 2001 From: toasted-nutbread <toasted-nutbread@users.noreply.github.com> Date: Sat, 11 Apr 2020 18:58:14 -0400 Subject: Fix 'full' mode not being a superset of 'true' mode --- ext/bg/js/translator.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/bg/js/translator.js b/ext/bg/js/translator.js index 402ac6bd..fd14b72d 100644 --- a/ext/bg/js/translator.js +++ b/ext/bg/js/translator.js @@ -354,7 +354,7 @@ class Translator { collapseEmphaticOptions.push([true, false]); break; case 'full': - collapseEmphaticOptions.push([true, true]); + collapseEmphaticOptions.push([true, false], [true, true]); break; } const textOptionVariantArray = [ -- cgit v1.2.3 From c2bf474d1f71c29b848e12a4af4b0860d8adb4ab Mon Sep 17 00:00:00 2001 From: toasted-nutbread <toasted-nutbread@users.noreply.github.com> Date: Sat, 11 Apr 2020 19:00:01 -0400 Subject: Update example --- ext/bg/settings.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/bg/settings.html b/ext/bg/settings.html index 91051f3e..96c1db82 100644 --- a/ext/bg/settings.html +++ b/ext/bg/settings.html @@ -492,7 +492,7 @@ </div> <div class="form-group"> - <label for="translation-collapse-emphatic-sequences">Collapse emphatic character sequences <span class="label-light">(かっっっこいい → かっこいい)</span></label> + <label for="translation-collapse-emphatic-sequences">Collapse emphatic character sequences <span class="label-light">(すっっごーーい → すっごーい / すごい)</span></label> <select class="form-control" id="translation-collapse-emphatic-sequences"> <option value="false">Disabled</option> <option value="true">Collapse into single character</option> -- cgit v1.2.3