aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authortoasted-nutbread <toasted-nutbread@users.noreply.github.com>2020-04-12 11:42:46 -0400
committerGitHub <noreply@github.com>2020-04-12 11:42:46 -0400
commit649adb13d8e1a775bb54e28b12401f7f95c5ab8d (patch)
treefe48c094ab45c00729742962e05da407b42af73a
parent82f83970001682018f1f5b595ffdcd13123fed91 (diff)
parentc2bf474d1f71c29b848e12a4af4b0860d8adb4ab (diff)
Merge pull request #440 from toasted-nutbread/collapse-emphatic-sequences
Add support for collapsing emphatic character sequences
-rw-r--r--ext/bg/data/options-schema.json8
-rw-r--r--ext/bg/js/japanese.js40
-rw-r--r--ext/bg/js/options.js3
-rw-r--r--ext/bg/js/settings/main.js2
-rw-r--r--ext/bg/js/translator.js17
-rw-r--r--ext/bg/settings.html11
-rw-r--r--test/test-japanese.js54
7 files changed, 129 insertions, 6 deletions
diff --git a/ext/bg/data/options-schema.json b/ext/bg/data/options-schema.json
index da1f1ce0..4f9e694d 100644
--- a/ext/bg/data/options-schema.json
+++ b/ext/bg/data/options-schema.json
@@ -388,7 +388,8 @@
"convertNumericCharacters",
"convertAlphabeticCharacters",
"convertHiraganaToKatakana",
- "convertKatakanaToHiragana"
+ "convertKatakanaToHiragana",
+ "collapseEmphaticSequences"
],
"properties": {
"convertHalfWidthCharacters": {
@@ -415,6 +416,11 @@
"type": "string",
"enum": ["false", "true", "variant"],
"default": "variant"
+ },
+ "collapseEmphaticSequences": {
+ "type": "string",
+ "enum": ["false", "true", "full"],
+ "default": "false"
}
}
},
diff --git a/ext/bg/js/japanese.js b/ext/bg/js/japanese.js
index 5c49cca7..5fef27a7 100644
--- a/ext/bg/js/japanese.js
+++ b/ext/bg/js/japanese.js
@@ -82,6 +82,9 @@
const ITERATION_MARK_CODE_POINT = 0x3005;
+ const HIRAGANA_SMALL_TSU_CODE_POINT = 0x3063;
+ const KATAKANA_SMALL_TSU_CODE_POINT = 0x30c3;
+ const KANA_PROLONGED_SOUND_MARK_CODE_POINT = 0x30fc;
// Existing functions
@@ -372,6 +375,40 @@
}
+ // Miscellaneous
+
+ function collapseEmphaticSequences(text, fullCollapse, sourceMap=null) {
+ let result = '';
+ let collapseCodePoint = -1;
+ const hasSourceMap = (sourceMap !== null);
+ for (const char of text) {
+ const c = char.codePointAt(0);
+ if (
+ c === HIRAGANA_SMALL_TSU_CODE_POINT ||
+ c === KATAKANA_SMALL_TSU_CODE_POINT ||
+ c === KANA_PROLONGED_SOUND_MARK_CODE_POINT
+ ) {
+ if (collapseCodePoint !== c) {
+ collapseCodePoint = c;
+ if (!fullCollapse) {
+ result += char;
+ continue;
+ }
+ }
+ } else {
+ collapseCodePoint = -1;
+ result += char;
+ continue;
+ }
+
+ if (hasSourceMap) {
+ sourceMap.combine(Math.max(0, result.length - 1), 1);
+ }
+ }
+ return result;
+ }
+
+
// Exports
Object.assign(jp, {
@@ -383,6 +420,7 @@
convertHalfWidthKanaToFullWidth,
convertAlphabeticToKana,
distributeFurigana,
- distributeFuriganaInflected
+ distributeFuriganaInflected,
+ collapseEmphaticSequences
});
})();
diff --git a/ext/bg/js/options.js b/ext/bg/js/options.js
index 20df2a68..f3e5f60d 100644
--- a/ext/bg/js/options.js
+++ b/ext/bg/js/options.js
@@ -170,7 +170,8 @@ function profileOptionsCreateDefaults() {
convertNumericCharacters: 'false',
convertAlphabeticCharacters: 'false',
convertHiraganaToKatakana: 'false',
- convertKatakanaToHiragana: 'variant'
+ convertKatakanaToHiragana: 'variant',
+ collapseEmphaticSequences: 'false'
},
dictionaries: {},
diff --git a/ext/bg/js/settings/main.js b/ext/bg/js/settings/main.js
index 8fd94562..308e92eb 100644
--- a/ext/bg/js/settings/main.js
+++ b/ext/bg/js/settings/main.js
@@ -118,6 +118,7 @@ async function formRead(options) {
options.translation.convertAlphabeticCharacters = $('#translation-convert-alphabetic-characters').val();
options.translation.convertHiraganaToKatakana = $('#translation-convert-hiragana-to-katakana').val();
options.translation.convertKatakanaToHiragana = $('#translation-convert-katakana-to-hiragana').val();
+ options.translation.collapseEmphaticSequences = $('#translation-collapse-emphatic-sequences').val();
options.parsing.enableScanningParser = $('#parsing-scan-enable').prop('checked');
options.parsing.enableMecabParser = $('#parsing-mecab-enable').prop('checked');
@@ -199,6 +200,7 @@ async function formWrite(options) {
$('#translation-convert-alphabetic-characters').val(options.translation.convertAlphabeticCharacters);
$('#translation-convert-hiragana-to-katakana').val(options.translation.convertHiraganaToKatakana);
$('#translation-convert-katakana-to-hiragana').val(options.translation.convertKatakanaToHiragana);
+ $('#translation-collapse-emphatic-sequences').val(options.translation.collapseEmphaticSequences);
$('#parsing-scan-enable').prop('checked', options.parsing.enableScanningParser);
$('#parsing-mecab-enable').prop('checked', options.parsing.enableMecabParser);
diff --git a/ext/bg/js/translator.js b/ext/bg/js/translator.js
index e4441384..aaa1a0ec 100644
--- a/ext/bg/js/translator.js
+++ b/ext/bg/js/translator.js
@@ -347,17 +347,27 @@ class Translator {
getAllDeinflections(text, options) {
const translationOptions = options.translation;
+ const collapseEmphaticOptions = [[false, false]];
+ switch (translationOptions.collapseEmphaticSequences) {
+ case 'true':
+ collapseEmphaticOptions.push([true, false]);
+ break;
+ case 'full':
+ collapseEmphaticOptions.push([true, false], [true, true]);
+ break;
+ }
const textOptionVariantArray = [
Translator.getTextOptionEntryVariants(translationOptions.convertHalfWidthCharacters),
Translator.getTextOptionEntryVariants(translationOptions.convertNumericCharacters),
Translator.getTextOptionEntryVariants(translationOptions.convertAlphabeticCharacters),
Translator.getTextOptionEntryVariants(translationOptions.convertHiraganaToKatakana),
- Translator.getTextOptionEntryVariants(translationOptions.convertKatakanaToHiragana)
+ Translator.getTextOptionEntryVariants(translationOptions.convertKatakanaToHiragana),
+ collapseEmphaticOptions
];
const deinflections = [];
const used = new Set();
- for (const [halfWidth, numeric, alphabetic, katakana, hiragana] of Translator.getArrayVariants(textOptionVariantArray)) {
+ for (const [halfWidth, numeric, alphabetic, katakana, hiragana, [collapseEmphatic, collapseEmphaticFull]] of Translator.getArrayVariants(textOptionVariantArray)) {
let text2 = text;
const sourceMap = new TextSourceMap(text2);
if (halfWidth) {
@@ -375,6 +385,9 @@ class Translator {
if (hiragana) {
text2 = jp.convertKatakanaToHiragana(text2);
}
+ if (collapseEmphatic) {
+ text2 = jp.collapseEmphaticSequences(text2, collapseEmphaticFull, sourceMap);
+ }
for (let i = text2.length; i > 0; --i) {
const text2Substring = text2.substring(0, i);
diff --git a/ext/bg/settings.html b/ext/bg/settings.html
index 1297a9cc..96c1db82 100644
--- a/ext/bg/settings.html
+++ b/ext/bg/settings.html
@@ -427,7 +427,7 @@
<p class="help-block">
The conversion options below are listed in the order that the conversions are applied to the input text.
- Each conversion has three possible values:
+ Conversions commonly have three possible values:
</p>
<ul class="help-block">
@@ -490,6 +490,15 @@
<option value="variant">Use both variants</option>
</select>
</div>
+
+ <div class="form-group">
+ <label for="translation-collapse-emphatic-sequences">Collapse emphatic character sequences <span class="label-light">(すっっごーーい &rarr; すっごーい / すごい)</span></label>
+ <select class="form-control" id="translation-collapse-emphatic-sequences">
+ <option value="false">Disabled</option>
+ <option value="true">Collapse into single character</option>
+ <option value="full">Remove all characters</option>
+ </select>
+ </div>
</div>
<div id="popup-content-scanning">
diff --git a/test/test-japanese.js b/test/test-japanese.js
index f4b084ac..89e41c36 100644
--- a/test/test-japanese.js
+++ b/test/test-japanese.js
@@ -393,6 +393,59 @@ function testDistributeFuriganaInflected() {
}
}
+function testCollapseEmphaticSequences() {
+ const data = [
+ [['かこい', false], ['かこい', [1, 1, 1]]],
+ [['かこい', true], ['かこい', [1, 1, 1]]],
+ [['かっこい', false], ['かっこい', [1, 1, 1, 1]]],
+ [['かっこい', true], ['かこい', [2, 1, 1]]],
+ [['かっっこい', false], ['かっこい', [1, 2, 1, 1]]],
+ [['かっっこい', true], ['かこい', [3, 1, 1]]],
+ [['かっっっこい', false], ['かっこい', [1, 3, 1, 1]]],
+ [['かっっっこい', true], ['かこい', [4, 1, 1]]],
+
+ [['こい', false], ['こい', [1, 1]]],
+ [['こい', true], ['こい', [1, 1]]],
+ [['っこい', false], ['っこい', [1, 1, 1]]],
+ [['っこい', true], ['こい', [2, 1]]],
+ [['っっこい', false], ['っこい', [2, 1, 1]]],
+ [['っっこい', true], ['こい', [3, 1]]],
+ [['っっっこい', false], ['っこい', [3, 1, 1]]],
+ [['っっっこい', true], ['こい', [4, 1]]],
+
+ [['すごい', false], ['すごい', [1, 1, 1]]],
+ [['すごい', true], ['すごい', [1, 1, 1]]],
+ [['すごーい', false], ['すごーい', [1, 1, 1, 1]]],
+ [['すごーい', true], ['すごい', [1, 2, 1]]],
+ [['すごーーい', false], ['すごーい', [1, 1, 2, 1]]],
+ [['すごーーい', true], ['すごい', [1, 3, 1]]],
+ [['すっごーい', false], ['すっごーい', [1, 1, 1, 1, 1]]],
+ [['すっごーい', true], ['すごい', [2, 2, 1]]],
+ [['すっっごーーい', false], ['すっごーい', [1, 2, 1, 2, 1]]],
+ [['すっっごーーい', true], ['すごい', [3, 3, 1]]],
+
+ [['', false], ['', []]],
+ [['', true], ['', []]],
+ [['っ', false], ['っ', [1]]],
+ [['っ', true], ['', [1]]],
+ [['っっ', false], ['っ', [2]]],
+ [['っっ', true], ['', [2]]],
+ [['っっっ', false], ['っ', [3]]],
+ [['っっっ', true], ['', [3]]]
+ ];
+
+ for (const [[text, fullCollapse], [expected, expectedSourceMapping]] of data) {
+ const sourceMap = new TextSourceMap(text);
+ const actual1 = jp.collapseEmphaticSequences(text, fullCollapse, null);
+ const actual2 = jp.collapseEmphaticSequences(text, fullCollapse, sourceMap);
+ assert.strictEqual(actual1, expected);
+ assert.strictEqual(actual2, expected);
+ if (typeof expectedSourceMapping !== 'undefined') {
+ assert.ok(sourceMap.equals(new TextSourceMap(text, expectedSourceMapping)));
+ }
+ }
+}
+
function testIsMoraPitchHigh() {
const data = [
[[0, 0], false],
@@ -462,6 +515,7 @@ function main() {
testConvertAlphabeticToKana();
testDistributeFurigana();
testDistributeFuriganaInflected();
+ testCollapseEmphaticSequences();
testIsMoraPitchHigh();
testGetKanaMorae();
}