aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author toasted-nutbread <toasted-nutbread@users.noreply.github.com> 2021-03-10 20:27:01 -0500
committer toasted-nutbread <toasted-nutbread@users.noreply.github.com> 2021-03-10 20:29:21 -0500
commit 1f09be350fd371e7618d2b9cb398c7c6a36ee7e7 (patch)
tree 2e67c5f7140d30c6d39b31f72c13283eb1cfc01b
parent 55ac05eba147edb3cc81e9f302055c15cf7fd7a4 (diff)
Fix incorrect furigana distribution (#1514)
* Improve distributeFuriganaInflected implementation
* Update tests
-rw-r--r-- ext/js/language/japanese-util.js 81
-rw-r--r-- test/test-japanese.js 18
2 files changed, 79 insertions, 20 deletions
diff --git a/ext/js/language/japanese-util.js b/ext/js/language/japanese-util.js
index 861d66e8..2760e4af 100644
--- a/ext/js/language/japanese-util.js
+++ b/ext/js/language/japanese-util.js
@@ -466,27 +466,55 @@ const JapaneseUtil = (() => {
}
distributeFuriganaInflected(expression, reading, source) {
- let stemLength = 0;
- const shortest = Math.min(source.length, expression.length);
- const sourceHiragana = this.convertKatakanaToHiragana(source);
- const expressionHiragana = this.convertKatakanaToHiragana(expression);
- while (stemLength < shortest && sourceHiragana[stemLength] === expressionHiragana[stemLength]) {
- ++stemLength;
+ const expressionNormalized = this.convertKatakanaToHiragana(expression); // Converted copies are used only for the prefix comparisons below
+ const readingNormalized = this.convertKatakanaToHiragana(reading);
+ const sourceNormalized = this.convertKatakanaToHiragana(source);
+
+ let mainText = expression; // Text whose furigana segments are kept for the stem
+ let stemLength = this._getStemLength(expressionNormalized, sourceNormalized); // Length of the leading part shared with source
+
+ // Check if source is derived from the reading instead of the expression
+ const readingStemLength = this._getStemLength(readingNormalized, sourceNormalized);
+ if (readingStemLength > stemLength) { // The reading shares a longer prefix with source than the expression does
+ mainText = reading;
+ stemLength = readingStemLength;
}
- const offset = source.length - stemLength;
- const stemExpression = source.substring(0, source.length - offset);
- const stemReading = reading.substring(
- 0,
- offset === 0 ? reading.length : reading.length - expression.length + stemLength
- );
- const result = this.distributeFurigana(stemExpression, stemReading);
+ const segments = []; // Result: stem segments followed by the rest of source
+ if (stemLength > 0) {
+ const segments2 = this.distributeFurigana(mainText, reading); // Segments for the whole of mainText; only those inside the stem are kept
+ let consumed = 0; // Number of mainText characters covered by the segments visited so far
+ for (const segment of segments2) {
+ const {text} = segment;
+ const start = consumed;
+ consumed += text.length;
+ if (consumed < stemLength) { // Segment ends before the stem boundary: keep it and continue
+ segments.push(segment);
+ } else if (consumed === stemLength) { // Segment ends exactly on the stem boundary: keep it and stop
+ segments.push(segment);
+ break;
+ } else { // Segment extends past the stem boundary
+ if (start < stemLength) {
+ segments.push(this._createFuriganaSegment(mainText.substring(start, stemLength), '')); // Keep only the in-stem part of the text, with empty furigana
+ }
+ break;
+ }
+ }
+ }
- if (stemLength !== source.length) {
- result.push(this._createFuriganaSegment(source.substring(stemLength), ''));
+ if (stemLength < source.length) { // source has characters after the shared stem
+ const remainder = source.substring(stemLength);
+ const segmentCount = segments.length;
+ if (segmentCount > 0 && segments[segmentCount - 1].furigana.length === 0) {
+ // Append to the last segment if it has an empty reading
+ segments[segmentCount - 1].text += remainder;
+ } else {
+ // Otherwise, create a new segment
+ segments.push(this._createFuriganaSegment(remainder, ''));
+ }
}
- return result;
+ return segments;
}
// Miscellaneous
@@ -648,6 +676,27 @@ const JapaneseUtil = (() => {
return result;
}
+
+ _getStemLength(text1, text2) { // Returns the length, in UTF-16 code units, of the leading part that text1 and text2 have in common
+ const minLength = Math.min(text1.length, text2.length);
+ if (minLength === 0) { return 0; } // Nothing in common if either string is empty
+
+ let i = 0; // Current offset in UTF-16 code units; also the result
+ while (true) {
+ const char1 = text1.codePointAt(i); // Compare by code point rather than by individual UTF-16 code unit
+ const char2 = text2.codePointAt(i);
+ if (char1 !== char2) { break; } // First mismatch ends the common prefix
+ const charLength = String.fromCodePoint(char1).length; // Number of UTF-16 code units this code point occupies
+ i += charLength;
+ if (i >= minLength) { // Reached or passed the end of the shorter string
+ if (i > minLength) {
+ i -= charLength; // Don't consume partial UTF16 surrogate characters
+ }
+ break;
+ }
+ }
+ return i;
+ }
}
diff --git a/test/test-japanese.js b/test/test-japanese.js
index e9fb9c90..8e8078d1 100644
--- a/test/test-japanese.js
+++ b/test/test-japanese.js
@@ -729,16 +729,26 @@ function testDistributeFuriganaInflected() {
['美味しい', 'おいしい', '美味しかた'],
[
{text: '美味', furigana: 'おい'},
- {text: 'し', furigana: ''},
- {text: 'かた', furigana: ''}
+ {text: 'しかた', furigana: ''}
]
],
[
['食べる', 'たべる', '食べた'],
[
{text: '食', furigana: 'た'},
- {text: 'べ', furigana: ''},
- {text: 'た', furigana: ''}
+ {text: 'べた', furigana: ''}
+ ]
+ ],
+ [
+ ['迄に', 'までに', 'までに'],
+ [
+ {text: 'までに', furigana: ''}
+ ]
+ ],
+ [
+ ['行う', 'おこなう', 'おこなわなかった'],
+ [
+ {text: 'おこなわなかった', furigana: ''}
]
]
];