aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author toasted-nutbread <toasted-nutbread@users.noreply.github.com> 2021-02-26 23:23:16 -0500
committer GitHub <noreply@github.com> 2021-02-26 23:23:16 -0500
commit 0bf0620c3579a5fe94c529673db105a83d6c3755 (patch)
tree 8a49078a858c4e8b41959f93bb0a8aea162e97cc
parent b994414b14b224c02359b5e31f6994653a3d4458 (diff)
Improve kana segmentation (#1446)
* Improve edge case furigana distribution for mixed hiragana/katakana
* Update/add tests
-rw-r--r-- ext/js/language/japanese-util.js 23
-rw-r--r-- test/test-japanese.js 21
2 files changed, 38 insertions, 6 deletions
diff --git a/ext/js/language/japanese-util.js b/ext/js/language/japanese-util.js
index c2ce9627..e47cdf55 100644
--- a/ext/js/language/japanese-util.js
+++ b/ext/js/language/japanese-util.js
@@ -521,8 +521,11 @@ const JapaneseUtil = (() => {
groupsStart + 1
);
if (segments !== null) {
- const furigana = reading.startsWith(text) ? '' : reading.substring(0, textLength);
- segments.unshift(this._createFuriganaSegment(text, furigana));
+ if (reading.startsWith(text)) {
+ segments.unshift(this._createFuriganaSegment(text, ''));
+ } else {
+ segments.unshift(...this._getFuriganaKanaSegments(text, reading));
+ }
return segments;
}
}
@@ -554,6 +557,22 @@ const JapaneseUtil = (() => {
}
}
+ _getFuriganaKanaSegments(text, reading) { // Splits text into consecutive runs according to whether each character equals the reading character at the same index.
+ const textLength = text.length; // Only indices below text.length are compared; any further characters of reading are not examined here.
+ const newSegments = []; // Segments are collected in left-to-right order.
+ let start = 0; // Index at which the current run begins.
+ let state = (reading[0] === text[0]); // true while the current run is one where text and reading are identical.
+ for (let i = 1; i < textLength; ++i) {
+ const newState = (reading[i] === text[i]); // Does position i match the reading?
+ if (state === newState) { continue; } // Same kind of run as before: keep extending it.
+ newSegments.push(this._createFuriganaSegment(text.substring(start, i), state ? '' : reading.substring(start, i))); // Close run [start, i): a matching run gets empty furigana, a differing run gets the same-index slice of reading.
+ state = newState; // A new run of the opposite kind starts at i.
+ start = i;
+ }
+ newSegments.push(this._createFuriganaSegment(text.substring(start, textLength), state ? '' : reading.substring(start, textLength))); // Flush the final run, which extends to the end of text.
+ return newSegments; // Array of whatever _createFuriganaSegment returns, one entry per run.
+ }
+
_getWanakana() {
const wanakana = this._wanakana;
if (wanakana === null) { throw new Error('Functions which use WanaKana are not supported in this context'); }
diff --git a/test/test-japanese.js b/test/test-japanese.js
index 590d3157..1a4fc494 100644
--- a/test/test-japanese.js
+++ b/test/test-japanese.js
@@ -402,7 +402,8 @@ function testDistributeFurigana() {
[
['スズメの涙', 'すずめのなみだ'],
[
- {text: 'スズメの', furigana: 'すずめの'},
+ {text: 'スズメ', furigana: 'すずめ'},
+ {text: 'の', furigana: ''},
{text: '涙', furigana: 'なみだ'}
]
],
@@ -464,14 +465,16 @@ function testDistributeFurigana() {
[
['くノ一', 'くのいち'],
[
- {text: 'くノ', furigana: 'くの'},
+ {text: 'く', furigana: ''},
+ {text: 'ノ', furigana: 'の'},
{text: '一', furigana: 'いち'}
]
],
[
['くノ一', 'くのいち'],
[
- {text: 'くノ', furigana: 'くの'},
+ {text: 'く', furigana: ''},
+ {text: 'ノ', furigana: 'の'},
{text: '一', furigana: 'いち'}
]
],
@@ -691,9 +694,19 @@ function testDistributeFurigana() {
[
['ページ違反', 'ぺーじいはん'],
[
- {text: 'ページ', furigana: 'ぺーじ'},
+ {text: 'ペ', furigana: 'ぺ'},
+ {text: 'ー', furigana: ''},
+ {text: 'ジ', furigana: 'じ'},
{text: '違反', furigana: 'いはん'}
]
+ ],
+ // Mismatched kana
+ [
+ ['サボる', 'サボル'],
+ [
+ {text: 'サボ', furigana: ''},
+ {text: 'る', furigana: 'ル'}
+ ]
]
];