diff options
| field | value | date |
|---|---|---|
| author | toasted-nutbread <toasted-nutbread@users.noreply.github.com> | 2020-12-22 11:02:19 -0500 |
| committer | GitHub <noreply@github.com> | 2020-12-22 11:02:19 -0500 |
| commit | a354becd5110217f36e4b9fdb883aa0e7bc520bc (patch) | |
| tree | 5611a591320b955919ec37ba933afcb0fd99a4f9 | |
| parent | b083e9f08f5236c3a23534554cafa91eb558e061 (diff) | |
Furigana distribution improvements (#1157)
* Improve furigana when reading kana is not an exact match
* Simplify group structure
* Return consistent type
* Add more tests
* Remove redundant isAmbiguous assignment
* Simplify group usage
* Add helper function
* Optimize returned arrays
* Use variable
* Remove s(p)lice calls
* Reduce number of convertKatakanaToHiragana calls
* Optimize text length access
* Optimize reading substring
* Move segmentize to a separate function
* Use var
* Use _createFuriganaSegment
* Optimize distributeFuriganaInflected
-rw-r--r-- | ext/mixed/js/japanese.js | 132 |
-rw-r--r-- | test/test-japanese.js | 341 |
2 files changed, 418 insertions, 55 deletions
diff --git a/ext/mixed/js/japanese.js b/ext/mixed/js/japanese.js index 4bd0dc65..a6b5c150 100644 --- a/ext/mixed/js/japanese.js +++ b/ext/mixed/js/japanese.js @@ -406,72 +406,40 @@ const JapaneseUtil = (() => { distributeFurigana(expression, reading) { if (!reading || reading === expression) { // Same - return [{furigana: '', text: expression}]; + return [this._createFuriganaSegment(expression, '')]; } - let isAmbiguous = false; - const segmentize = (reading2, groups) => { - if (groups.length === 0 || isAmbiguous) { - return []; - } - - const group = groups[0]; - if (group.mode === 'kana') { - if (this.convertKatakanaToHiragana(reading2).startsWith(this.convertKatakanaToHiragana(group.text))) { - const readingLeft = reading2.substring(group.text.length); - const segs = segmentize(readingLeft, groups.splice(1)); - if (segs) { - return [{text: group.text, furigana: ''}].concat(segs); - } - } - } else { - let foundSegments = null; - for (let i = reading2.length; i >= group.text.length; --i) { - const readingUsed = reading2.substring(0, i); - const readingLeft = reading2.substring(i); - const segs = segmentize(readingLeft, groups.slice(1)); - if (segs) { - if (foundSegments !== null) { - // more than one way to segmentize the tail, mark as ambiguous - isAmbiguous = true; - return null; - } - foundSegments = [{text: group.text, furigana: readingUsed}].concat(segs); - } - // there is only one way to segmentize the last non-kana group - if (groups.length === 1) { - break; - } - } - return foundSegments; - } - }; - const groups = []; - let modePrev = null; + let groupPre = null; + let isKanaPre = null; for (const c of expression) { const codePoint = c.codePointAt(0); - const modeCurr = this.isCodePointKanji(codePoint) || codePoint === ITERATION_MARK_CODE_POINT ? 
'kanji' : 'kana'; - if (modeCurr === modePrev) { - groups[groups.length - 1].text += c; + const isKana = !(this.isCodePointKanji(codePoint) || codePoint === ITERATION_MARK_CODE_POINT); + if (isKana === isKanaPre) { + groupPre.text += c; } else { - groups.push({mode: modeCurr, text: c}); - modePrev = modeCurr; + groupPre = {isKana, text: c, textNormalized: null}; + groups.push(groupPre); + isKanaPre = isKana; + } + } + for (const group of groups) { + if (group.isKana) { + group.textNormalized = this.convertKatakanaToHiragana(group.text); } } - const segments = segmentize(reading, groups); - if (segments && !isAmbiguous) { + const readingNormalized = this.convertKatakanaToHiragana(reading); + const segments = this._segmentizeFurigana(reading, readingNormalized, groups, 0); + if (segments !== null) { return segments; } // Fallback - return [{furigana: reading, text: expression}]; + return [this._createFuriganaSegment(expression, reading)]; } distributeFuriganaInflected(expression, reading, source) { - const output = []; - let stemLength = 0; const shortest = Math.min(source.length, expression.length); const sourceHiragana = this.convertKatakanaToHiragana(source); @@ -486,15 +454,13 @@ const JapaneseUtil = (() => { 0, offset === 0 ? 
reading.length : reading.length - expression.length + stemLength ); - for (const segment of this.distributeFurigana(stemExpression, stemReading)) { - output.push(segment); - } + const result = this.distributeFurigana(stemExpression, stemReading); if (stemLength !== source.length) { - output.push({text: source.substring(stemLength), furigana: ''}); + result.push(this._createFuriganaSegment(source.substring(stemLength), '')); } - return output; + return result; } // Miscellaneous @@ -532,6 +498,62 @@ const JapaneseUtil = (() => { // Private + _createFuriganaSegment(text, furigana) { + return {text, furigana}; + } + + _segmentizeFurigana(reading, readingNormalized, groups, groupsStart) { + const groupCount = groups.length - groupsStart; + if (groupCount <= 0) { + return []; + } + + const group = groups[groupsStart]; + const {isKana, text} = group; + const textLength = text.length; + if (isKana) { + const {textNormalized} = group; + if (readingNormalized.startsWith(textNormalized)) { + const segments = this._segmentizeFurigana( + reading.substring(textLength), + readingNormalized.substring(textLength), + groups, + groupsStart + 1 + ); + if (segments !== null) { + const furigana = reading.startsWith(text) ? 
'' : reading.substring(0, textLength); + segments.unshift(this._createFuriganaSegment(text, furigana)); + return segments; + } + } + return null; + } else { + let result = null; + for (let i = reading.length; i >= textLength; --i) { + const segments = this._segmentizeFurigana( + reading.substring(i), + readingNormalized.substring(i), + groups, + groupsStart + 1 + ); + if (segments !== null) { + if (result !== null) { + // More than one way to segmentize the tail; mark as ambiguous + return null; + } + const furigana = reading.substring(0, i); + segments.unshift(this._createFuriganaSegment(text, furigana)); + result = segments; + } + // There is only one way to segmentize the last non-kana group + if (groupCount === 1) { + break; + } + } + return result; + } + } + _getWanakana() { const wanakana = this._wanakana; if (wanakana === null) { throw new Error('Functions which use WanaKana are not supported in this context'); } diff --git a/test/test-japanese.js b/test/test-japanese.js index 97d613fe..b4fcd7a0 100644 --- a/test/test-japanese.js +++ b/test/test-japanese.js @@ -353,6 +353,347 @@ function testDistributeFurigana() { [ {text: 'かいぬ', furigana: ''} ] + ], + // Misc + [ + ['月', 'か'], + [ + {text: '月', furigana: 'か'} + ] + ], + [ + ['月', 'カ'], + [ + {text: '月', furigana: 'カ'} + ] + ], + // Mismatched kana readings + [ + ['有り難う', 'アリガトウ'], + [ + {text: '有', furigana: 'ア'}, + {text: 'り', furigana: 'リ'}, + {text: '難', furigana: 'ガト'}, + {text: 'う', furigana: 'ウ'} + ] + ], + [ + ['ありがとう', 'アリガトウ'], + [ + {text: 'ありがとう', furigana: 'アリガトウ'} + ] + ], + // Mismatched kana readings (real examples) + [ + ['カ月', 'かげつ'], + [ + {text: 'カ', furigana: 'か'}, + {text: '月', furigana: 'げつ'} + ] + ], + [ + ['序ノ口', 'じょのくち'], + [ + {text: '序', furigana: 'じょ'}, + {text: 'ノ', furigana: 'の'}, + {text: '口', furigana: 'くち'} + ] + ], + [ + ['スズメの涙', 'すずめのなみだ'], + [ + {text: 'スズメの', furigana: 'すずめの'}, + {text: '涙', furigana: 'なみだ'} + ] + ], + [ + ['二カ所', 'にかしょ'], + [ + {text: '二', furigana: 
'に'}, + {text: 'カ', furigana: 'か'}, + {text: '所', furigana: 'しょ'} + ] + ], + [ + ['八ツ橋', 'やつはし'], + [ + {text: '八', furigana: 'や'}, + {text: 'ツ', furigana: 'つ'}, + {text: '橋', furigana: 'はし'} + ] + ], + [ + ['八ツ橋', 'やつはし'], + [ + {text: '八', furigana: 'や'}, + {text: 'ツ', furigana: 'つ'}, + {text: '橋', furigana: 'はし'} + ] + ], + [ + ['一カ月', 'いっかげつ'], + [ + {text: '一', furigana: 'いっ'}, + {text: 'カ', furigana: 'か'}, + {text: '月', furigana: 'げつ'} + ] + ], + [ + ['一カ所', 'いっかしょ'], + [ + {text: '一', furigana: 'いっ'}, + {text: 'カ', furigana: 'か'}, + {text: '所', furigana: 'しょ'} + ] + ], + [ + ['カ所', 'かしょ'], + [ + {text: 'カ', furigana: 'か'}, + {text: '所', furigana: 'しょ'} + ] + ], + [ + ['数カ月', 'すうかげつ'], + [ + {text: '数', furigana: 'すう'}, + {text: 'カ', furigana: 'か'}, + {text: '月', furigana: 'げつ'} + ] + ], + [ + ['くノ一', 'くのいち'], + [ + {text: 'くノ', furigana: 'くの'}, + {text: '一', furigana: 'いち'} + ] + ], + [ + ['くノ一', 'くのいち'], + [ + {text: 'くノ', furigana: 'くの'}, + {text: '一', furigana: 'いち'} + ] + ], + [ + ['数カ国', 'すうかこく'], + [ + {text: '数', furigana: 'すう'}, + {text: 'カ', furigana: 'か'}, + {text: '国', furigana: 'こく'} + ] + ], + [ + ['数カ所', 'すうかしょ'], + [ + {text: '数', furigana: 'すう'}, + {text: 'カ', furigana: 'か'}, + {text: '所', furigana: 'しょ'} + ] + ], + [ + ['壇ノ浦の戦い', 'だんのうらのたたかい'], + [ + {text: '壇', furigana: 'だん'}, + {text: 'ノ', furigana: 'の'}, + {text: '浦', furigana: 'うら'}, + {text: 'の', furigana: ''}, + {text: '戦', furigana: 'たたか'}, + {text: 'い', furigana: ''} + ] + ], + [ + ['壇ノ浦の戦', 'だんのうらのたたかい'], + [ + {text: '壇', furigana: 'だん'}, + {text: 'ノ', furigana: 'の'}, + {text: '浦', furigana: 'うら'}, + {text: 'の', furigana: ''}, + {text: '戦', furigana: 'たたかい'} + ] + ], + [ + ['序ノ口格', 'じょのくちかく'], + [ + {text: '序', furigana: 'じょ'}, + {text: 'ノ', furigana: 'の'}, + {text: '口格', furigana: 'くちかく'} + ] + ], + [ + ['二カ国語', 'にかこくご'], + [ + {text: '二', furigana: 'に'}, + {text: 'カ', furigana: 'か'}, + {text: '国語', furigana: 'こくご'} + ] + ], + [ + ['カ国', 'かこく'], + [ + {text: 'カ', furigana: 'か'}, 
+ {text: '国', furigana: 'こく'} + ] + ], + [ + ['カ国語', 'かこくご'], + [ + {text: 'カ', furigana: 'か'}, + {text: '国語', furigana: 'こくご'} + ] + ], + [ + ['壇ノ浦の合戦', 'だんのうらのかっせん'], + [ + {text: '壇', furigana: 'だん'}, + {text: 'ノ', furigana: 'の'}, + {text: '浦', furigana: 'うら'}, + {text: 'の', furigana: ''}, + {text: '合戦', furigana: 'かっせん'} + ] + ], + [ + ['一タ偏', 'いちたへん'], + [ + {text: '一', furigana: 'いち'}, + {text: 'タ', furigana: 'た'}, + {text: '偏', furigana: 'へん'} + ] + ], + [ + ['ル又', 'るまた'], + [ + {text: 'ル', furigana: 'る'}, + {text: '又', furigana: 'また'} + ] + ], + [ + ['ノ木偏', 'のぎへん'], + [ + {text: 'ノ', furigana: 'の'}, + {text: '木偏', furigana: 'ぎへん'} + ] + ], + [ + ['一ノ貝', 'いちのかい'], + [ + {text: '一', furigana: 'いち'}, + {text: 'ノ', furigana: 'の'}, + {text: '貝', furigana: 'かい'} + ] + ], + [ + ['虎ノ門事件', 'とらのもんじけん'], + [ + {text: '虎', furigana: 'とら'}, + {text: 'ノ', furigana: 'の'}, + {text: '門事件', furigana: 'もんじけん'} + ] + ], + [ + ['教育ニ関スル勅語', 'きょういくにかんするちょくご'], + [ + {text: '教育', furigana: 'きょういく'}, + {text: 'ニ', furigana: 'に'}, + {text: '関', furigana: 'かん'}, + {text: 'スル', furigana: 'する'}, + {text: '勅語', furigana: 'ちょくご'} + ] + ], + [ + ['二カ年', 'にかねん'], + [ + {text: '二', furigana: 'に'}, + {text: 'カ', furigana: 'か'}, + {text: '年', furigana: 'ねん'} + ] + ], + [ + ['三カ年', 'さんかねん'], + [ + {text: '三', furigana: 'さん'}, + {text: 'カ', furigana: 'か'}, + {text: '年', furigana: 'ねん'} + ] + ], + [ + ['四カ年', 'よんかねん'], + [ + {text: '四', furigana: 'よん'}, + {text: 'カ', furigana: 'か'}, + {text: '年', furigana: 'ねん'} + ] + ], + [ + ['五カ年', 'ごかねん'], + [ + {text: '五', furigana: 'ご'}, + {text: 'カ', furigana: 'か'}, + {text: '年', furigana: 'ねん'} + ] + ], + [ + ['六カ年', 'ろっかねん'], + [ + {text: '六', furigana: 'ろっ'}, + {text: 'カ', furigana: 'か'}, + {text: '年', furigana: 'ねん'} + ] + ], + [ + ['七カ年', 'ななかねん'], + [ + {text: '七', furigana: 'なな'}, + {text: 'カ', furigana: 'か'}, + {text: '年', furigana: 'ねん'} + ] + ], + [ + ['八カ年', 'はちかねん'], + [ + {text: '八', furigana: 'はち'}, + {text: 'カ', furigana: 'か'}, + {text: 
'年', furigana: 'ねん'} + ] + ], + [ + ['九カ年', 'きゅうかねん'], + [ + {text: '九', furigana: 'きゅう'}, + {text: 'カ', furigana: 'か'}, + {text: '年', furigana: 'ねん'} + ] + ], + [ + ['十カ年', 'じゅうかねん'], + [ + {text: '十', furigana: 'じゅう'}, + {text: 'カ', furigana: 'か'}, + {text: '年', furigana: 'ねん'} + ] + ], + [ + ['鏡ノ間', 'かがみのま'], + [ + {text: '鏡', furigana: 'かがみ'}, + {text: 'ノ', furigana: 'の'}, + {text: '間', furigana: 'ま'} + ] + ], + [ + ['鏡ノ間', 'かがみのま'], + [ + {text: '鏡', furigana: 'かがみ'}, + {text: 'ノ', furigana: 'の'}, + {text: '間', furigana: 'ま'} + ] + ], + [ + ['ページ違反', 'ぺーじいはん'], + [ + {text: 'ページ', furigana: 'ぺーじ'}, + {text: '違反', furigana: 'いはん'} + ] ] ]; |