diff options
| author | toasted-nutbread <toasted-nutbread@users.noreply.github.com> | 2021-02-26 23:23:16 -0500 | 
|---|---|---|
| committer | GitHub <noreply@github.com> | 2021-02-26 23:23:16 -0500 | 
| commit | 0bf0620c3579a5fe94c529673db105a83d6c3755 (patch) | |
| tree | 8a49078a858c4e8b41959f93bb0a8aea162e97cc | |
| parent | b994414b14b224c02359b5e31f6994653a3d4458 (diff) | |
Improve kana segmentation (#1446)
* Improve edge case furigana distribution for mixed hiragana/katakana
* Update/add tests
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | ext/js/language/japanese-util.js | 23 |
| -rw-r--r-- | test/test-japanese.js | 21 |
2 files changed, 38 insertions, 6 deletions
| diff --git a/ext/js/language/japanese-util.js b/ext/js/language/japanese-util.js index c2ce9627..e47cdf55 100644 --- a/ext/js/language/japanese-util.js +++ b/ext/js/language/japanese-util.js @@ -521,8 +521,11 @@ const JapaneseUtil = (() => {                          groupsStart + 1                      );                      if (segments !== null) { -                        const furigana = reading.startsWith(text) ? '' : reading.substring(0, textLength); -                        segments.unshift(this._createFuriganaSegment(text, furigana)); +                        if (reading.startsWith(text)) { +                            segments.unshift(this._createFuriganaSegment(text, '')); +                        } else { +                            segments.unshift(...this._getFuriganaKanaSegments(text, reading)); +                        }                          return segments;                      }                  } @@ -554,6 +557,22 @@ const JapaneseUtil = (() => {              }          } +        _getFuriganaKanaSegments(text, reading) { +            const textLength = text.length; +            const newSegments = []; +            let start = 0; +            let state = (reading[0] === text[0]); +            for (let i = 1; i < textLength; ++i) { +                const newState = (reading[i] === text[i]); +                if (state === newState) { continue; } +                newSegments.push(this._createFuriganaSegment(text.substring(start, i), state ? '' : reading.substring(start, i))); +                state = newState; +                start = i; +            } +            newSegments.push(this._createFuriganaSegment(text.substring(start, textLength), state ? 
'' : reading.substring(start, textLength))); +            return newSegments; +        } +          _getWanakana() {              const wanakana = this._wanakana;              if (wanakana === null) { throw new Error('Functions which use WanaKana are not supported in this context'); } diff --git a/test/test-japanese.js b/test/test-japanese.js index 590d3157..1a4fc494 100644 --- a/test/test-japanese.js +++ b/test/test-japanese.js @@ -402,7 +402,8 @@ function testDistributeFurigana() {          [              ['スズメの涙', 'すずめのなみだ'],              [ -                {text: 'スズメの', furigana: 'すずめの'}, +                {text: 'スズメ', furigana: 'すずめ'}, +                {text: 'の', furigana: ''},                  {text: '涙', furigana: 'なみだ'}              ]          ], @@ -464,14 +465,16 @@ function testDistributeFurigana() {          [              ['くノ一', 'くのいち'],              [ -                {text: 'くノ', furigana: 'くの'}, +                {text: 'く', furigana: ''}, +                {text: 'ノ', furigana: 'の'},                  {text: '一', furigana: 'いち'}              ]          ],          [              ['くノ一', 'くのいち'],              [ -                {text: 'くノ', furigana: 'くの'}, +                {text: 'く', furigana: ''}, +                {text: 'ノ', furigana: 'の'},                  {text: '一', furigana: 'いち'}              ]          ], @@ -691,9 +694,19 @@ function testDistributeFurigana() {          [              ['ページ違反', 'ぺーじいはん'],              [ -                {text: 'ページ', furigana: 'ぺーじ'}, +                {text: 'ペ', furigana: 'ぺ'}, +                {text: 'ー', furigana: ''}, +                {text: 'ジ', furigana: 'じ'},                  {text: '違反', furigana: 'いはん'}              ] +        ], +        // Mismatched kana +        [ +            ['サボる', 'サボル'], +            [ +                {text: 'サボ', furigana: ''}, +                {text: 'る', furigana: 'ル'} +            ]          ]      ]; |