From 800ce9ed9e33690af518e0236ec22f19f413903f Mon Sep 17 00:00:00 2001
From: toasted-nutbread <toasted-nutbread@users.noreply.github.com>
Date: Wed, 10 Mar 2021 20:27:01 -0500
Subject: Fix incorrect furigana distribution (#1514)

* Improve distributeFuriganaInflected implementation

* Update tests
---
 ext/js/language/japanese-util.js | 81 ++++++++++++++++++++++++++++++++--------
 1 file changed, 65 insertions(+), 16 deletions(-)

(limited to 'ext/js')

diff --git a/ext/js/language/japanese-util.js b/ext/js/language/japanese-util.js
index b363ed5c..aa1e3f00 100644
--- a/ext/js/language/japanese-util.js
+++ b/ext/js/language/japanese-util.js
@@ -466,27 +466,55 @@ const JapaneseUtil = (() => {
         }
 
         distributeFuriganaInflected(expression, reading, source) {
-            let stemLength = 0;
-            const shortest = Math.min(source.length, expression.length);
-            const sourceHiragana = this.convertKatakanaToHiragana(source);
-            const expressionHiragana = this.convertKatakanaToHiragana(expression);
-            while (stemLength < shortest && sourceHiragana[stemLength] === expressionHiragana[stemLength]) {
-                ++stemLength;
+            const expressionNormalized = this.convertKatakanaToHiragana(expression); // The *Normalized copies are used only for the stem-length comparisons below
+            const readingNormalized = this.convertKatakanaToHiragana(reading);
+            const sourceNormalized = this.convertKatakanaToHiragana(source);
+
+            let mainText = expression; // Text the stem is cut from; switched to the reading below if that matches source better
+            let stemLength = this._getStemLength(expressionNormalized, sourceNormalized); // Length of the prefix shared by expression and source
+
+            // Check if source is derived from the reading instead of the expression
+            const readingStemLength = this._getStemLength(readingNormalized, sourceNormalized);
+            if (readingStemLength > stemLength) { // Reading shares a strictly longer prefix with source than expression does
+                mainText = reading;
+                stemLength = readingStemLength;
             }
-            const offset = source.length - stemLength;
 
-            const stemExpression = source.substring(0, source.length - offset);
-            const stemReading = reading.substring(
-                0,
-                offset === 0 ? reading.length : reading.length - expression.length + stemLength
-            );
-            const result = this.distributeFurigana(stemExpression, stemReading);
+            const segments = []; // Segments returned to the caller
+            if (stemLength > 0) {
+                const segments2 = this.distributeFurigana(mainText, reading); // Segments for the whole of mainText; only those inside the stem are kept
+                let consumed = 0; // Total length of the segment texts visited so far
+                for (const segment of segments2) {
+                    const {text} = segment;
+                    const start = consumed; // Offset at which this segment's text begins
+                    consumed += text.length;
+                    if (consumed < stemLength) { // Segment ends before the stem boundary: keep it unchanged
+                        segments.push(segment);
+                    } else if (consumed === stemLength) { // Segment ends exactly at the stem boundary: keep it and stop
+                        segments.push(segment);
+                        break;
+                    } else { // Segment extends past the stem boundary
+                        if (start < stemLength) { // Keep only the in-stem part of the text, with empty furigana
+                            segments.push(this._createFuriganaSegment(mainText.substring(start, stemLength), ''));
+                        }
+                        break;
+                    }
+                }
+            }
 
-            if (stemLength !== source.length) {
-                result.push(this._createFuriganaSegment(source.substring(stemLength), ''));
+            if (stemLength < source.length) { // Source continues past the shared stem
+                const remainder = source.substring(stemLength); // Part of source after the stem; emitted without furigana
+                const segmentCount = segments.length;
+                if (segmentCount > 0 && segments[segmentCount - 1].furigana.length === 0) {
+                    // Append to the last segment if it has an empty reading
+                    segments[segmentCount - 1].text += remainder;
+                } else {
+                    // Otherwise, create a new segment
+                    segments.push(this._createFuriganaSegment(remainder, ''));
+                }
             }
 
-            return result;
+            return segments; // Stem segments followed by (or merged with) the furigana-less remainder of source
         }
 
         // Miscellaneous
@@ -648,6 +676,27 @@ const JapaneseUtil = (() => {
 
             return result;
         }
+
+        _getStemLength(text1, text2) { // Returns the length, in UTF-16 code units, of the common prefix of text1 and text2
+            const minLength = Math.min(text1.length, text2.length); // The common prefix cannot exceed the shorter string
+            if (minLength === 0) { return 0; } // An empty string shares no prefix; also keeps codePointAt below from returning undefined
+
+            let i = 0; // Current offset in UTF-16 code units; always advanced by a whole code point
+            while (true) {
+                const char1 = text1.codePointAt(i);
+                const char2 = text2.codePointAt(i);
+                if (char1 !== char2) { break; } // First differing code point ends the common prefix
+                const charLength = String.fromCodePoint(char1).length; // Number of UTF-16 code units this code point occupies
+                i += charLength;
+                if (i >= minLength) { // Reached (or stepped past) the end of the shorter string
+                    if (i > minLength) {
+                        i -= charLength; // Don't consume partial UTF16 surrogate characters
+                    }
+                    break;
+                }
+            }
+            return i;
+        }
     }
 
 
-- 
cgit v1.2.3