diff options
| -rw-r--r-- | ext/mixed/js/japanese.js | 132 | ||||
| -rw-r--r-- | test/test-japanese.js | 341 | 
2 files changed, 418 insertions, 55 deletions
| diff --git a/ext/mixed/js/japanese.js b/ext/mixed/js/japanese.js index 4bd0dc65..a6b5c150 100644 --- a/ext/mixed/js/japanese.js +++ b/ext/mixed/js/japanese.js @@ -406,72 +406,40 @@ const JapaneseUtil = (() => {          distributeFurigana(expression, reading) {              if (!reading || reading === expression) {                  // Same -                return [{furigana: '', text: expression}]; +                return [this._createFuriganaSegment(expression, '')];              } -            let isAmbiguous = false; -            const segmentize = (reading2, groups) => { -                if (groups.length === 0 || isAmbiguous) { -                    return []; -                } - -                const group = groups[0]; -                if (group.mode === 'kana') { -                    if (this.convertKatakanaToHiragana(reading2).startsWith(this.convertKatakanaToHiragana(group.text))) { -                        const readingLeft = reading2.substring(group.text.length); -                        const segs = segmentize(readingLeft, groups.splice(1)); -                        if (segs) { -                            return [{text: group.text, furigana: ''}].concat(segs); -                        } -                    } -                } else { -                    let foundSegments = null; -                    for (let i = reading2.length; i >= group.text.length; --i) { -                        const readingUsed = reading2.substring(0, i); -                        const readingLeft = reading2.substring(i); -                        const segs = segmentize(readingLeft, groups.slice(1)); -                        if (segs) { -                            if (foundSegments !== null) { -                                // more than one way to segmentize the tail, mark as ambiguous -                                isAmbiguous = true; -                                return null; -                            } -                            foundSegments = [{text: group.text, furigana: readingUsed}].concat(segs); -                        } -                        // there is only one way to segmentize the last non-kana group -                        if (groups.length === 1) { -                            break; -                        } -                    } -                    return foundSegments; -                } -            }; -              const groups = []; -            let modePrev = null; +            let groupPre = null; +            let isKanaPre = null;              for (const c of expression) {                  const codePoint = c.codePointAt(0); -                const modeCurr = this.isCodePointKanji(codePoint) || codePoint === ITERATION_MARK_CODE_POINT ? 'kanji' : 'kana'; -                if (modeCurr === modePrev) { -                    groups[groups.length - 1].text += c; +                const isKana = !(this.isCodePointKanji(codePoint) || codePoint === ITERATION_MARK_CODE_POINT); +                if (isKana === isKanaPre) { +                    groupPre.text += c;                  } else { -                    groups.push({mode: modeCurr, text: c}); -                    modePrev = modeCurr; +                    groupPre = {isKana, text: c, textNormalized: null}; +                    groups.push(groupPre); +                    isKanaPre = isKana; +                } +            } +            for (const group of groups) { +                if (group.isKana) { +                    group.textNormalized = this.convertKatakanaToHiragana(group.text);                  }              } -            const segments = segmentize(reading, groups); -            if (segments && !isAmbiguous) { +            const readingNormalized = this.convertKatakanaToHiragana(reading); +            const segments = this._segmentizeFurigana(reading, readingNormalized, groups, 0); +            if (segments !== null) {                  return segments;              }              // Fallback -            return [{furigana: reading, text: expression}]; +            return [this._createFuriganaSegment(expression, reading)];          }          distributeFuriganaInflected(expression, reading, source) { -            const output = []; -              let stemLength = 0;              const shortest = Math.min(source.length, expression.length);              const sourceHiragana = this.convertKatakanaToHiragana(source); @@ -486,15 +454,13 @@ const JapaneseUtil = (() => {                  0,                  offset === 0 ? reading.length : reading.length - expression.length + stemLength              ); -            for (const segment of this.distributeFurigana(stemExpression, stemReading)) { -                output.push(segment); -            } +            const result = this.distributeFurigana(stemExpression, stemReading);              if (stemLength !== source.length) { -                output.push({text: source.substring(stemLength), furigana: ''}); +                result.push(this._createFuriganaSegment(source.substring(stemLength), ''));              } -            return output; +            return result;          }          // Miscellaneous @@ -532,6 +498,62 @@ const JapaneseUtil = (() => {          // Private +        _createFuriganaSegment(text, furigana) { +            return {text, furigana}; +        } + +        _segmentizeFurigana(reading, readingNormalized, groups, groupsStart) { +            const groupCount = groups.length - groupsStart; +            if (groupCount <= 0) { +                return []; +            } + +            const group = groups[groupsStart]; +            const {isKana, text} = group; +            const textLength = text.length; +            if (isKana) { +                const {textNormalized} = group; +                if (readingNormalized.startsWith(textNormalized)) { +                    const segments = this._segmentizeFurigana( +                        reading.substring(textLength), +                        readingNormalized.substring(textLength), +                        groups, +                        groupsStart + 1 +                    ); +                    if (segments !== null) { +                        const furigana = reading.startsWith(text) ? '' : reading.substring(0, textLength); +                        segments.unshift(this._createFuriganaSegment(text, furigana)); +                        return segments; +                    } +                } +                return null; +            } else { +                let result = null; +                for (let i = reading.length; i >= textLength; --i) { +                    const segments = this._segmentizeFurigana( +                        reading.substring(i), +                        readingNormalized.substring(i), +                        groups, +                        groupsStart + 1 +                    ); +                    if (segments !== null) { +                        if (result !== null) { +                            // More than one way to segmentize the tail; mark as ambiguous +                            return null; +                        } +                        const furigana = reading.substring(0, i); +                        segments.unshift(this._createFuriganaSegment(text, furigana)); +                        result = segments; +                    } +                    // There is only one way to segmentize the last non-kana group +                    if (groupCount === 1) { +                        break; +                    } +                } +                return result; +            } +        } +          _getWanakana() {              const wanakana = this._wanakana;              if (wanakana === null) { throw new Error('Functions which use WanaKana are not supported in this context'); } diff --git a/test/test-japanese.js b/test/test-japanese.js index 97d613fe..b4fcd7a0 100644 --- a/test/test-japanese.js +++ b/test/test-japanese.js @@ -353,6 +353,347 @@ function testDistributeFurigana() {              [                  {text: 'かいぬ', furigana: ''}              ] +        ], +        // Misc +        [ +            ['月', 'か'], +            [ +                {text: '月', furigana: 'か'} +            ] +        ], +        [ +            ['月', 'カ'], +            [ +                {text: '月', furigana: 'カ'} +            ] +        ], +        // Mismatched kana readings +        [ +            ['有り難う', 'アリガトウ'], +            [ +                {text: '有', furigana: 'ア'}, +                {text: 'り', furigana: 'リ'}, +                {text: '難', furigana: 'ガト'}, +                {text: 'う', furigana: 'ウ'} +            ] +        ], +        [ +            ['ありがとう', 'アリガトウ'], +            [ +                {text: 'ありがとう', furigana: 'アリガトウ'} +            ] +        ], +        // Mismatched kana readings (real examples) +        [ +            ['カ月', 'かげつ'], +            [ +                {text: 'カ', furigana: 'か'}, +                {text: '月', furigana: 'げつ'} +            ] +        ], +        [ +            ['序ノ口', 'じょのくち'], +            [ +                {text: '序', furigana: 'じょ'}, +                {text: 'ノ', furigana: 'の'}, +                {text: '口', furigana: 'くち'} +            ] +        ], +        [ +            ['スズメの涙', 'すずめのなみだ'], +            [ +                {text: 'スズメの', furigana: 'すずめの'}, +                {text: '涙', furigana: 'なみだ'} +            ] +        ], +        [ +            ['二カ所', 'にかしょ'], +            [ +                {text: '二', furigana: 'に'}, +                {text: 'カ', furigana: 'か'}, +                {text: '所', furigana: 'しょ'} +            ] +        ], +        [ +            ['八ツ橋', 'やつはし'], +            [ +                {text: '八', furigana: 'や'}, +                {text: 'ツ', furigana: 'つ'}, +                {text: '橋', furigana: 'はし'} +            ] +        ], +        [ +            ['八ツ橋', 'やつはし'], +            [ +                {text: '八', furigana: 'や'}, +                {text: 'ツ', furigana: 'つ'}, +                {text: '橋', furigana: 'はし'} +            ] +        ], +        [ +            ['一カ月', 'いっかげつ'], +            [ +                {text: '一', furigana: 'いっ'}, +                {text: 'カ', furigana: 'か'}, +                {text: '月', furigana: 'げつ'} +            ] +        ], +        [ +            ['一カ所', 'いっかしょ'], +            [ +                {text: '一', furigana: 'いっ'}, +                {text: 'カ', furigana: 'か'}, +                {text: '所', furigana: 'しょ'} +            ] +        ], +        [ +            ['カ所', 'かしょ'], +            [ +                {text: 'カ', furigana: 'か'}, +                {text: '所', furigana: 'しょ'} +            ] +        ], +        [ +            ['数カ月', 'すうかげつ'], +            [ +                {text: '数', furigana: 'すう'}, +                {text: 'カ', furigana: 'か'}, +                {text: '月', furigana: 'げつ'} +            ] +        ], +        [ +            ['くノ一', 'くのいち'], +            [ +                {text: 'くノ', furigana: 'くの'}, +                {text: '一', furigana: 'いち'} +            ] +        ], +        [ +            ['くノ一', 'くのいち'], +            [ +                {text: 'くノ', furigana: 'くの'}, +                {text: '一', furigana: 'いち'} +            ] +        ], +        [ +            ['数カ国', 'すうかこく'], +            [ +                {text: '数', furigana: 'すう'}, +                {text: 'カ', furigana: 'か'}, +                {text: '国', furigana: 'こく'} +            ] +        ], +        [ +            ['数カ所', 'すうかしょ'], +            [ +                {text: '数', furigana: 'すう'}, +                {text: 'カ', furigana: 'か'}, +                {text: '所', furigana: 'しょ'} +            ] +        ], +        [ +            ['壇ノ浦の戦い', 'だんのうらのたたかい'], +            [ +                {text: '壇', furigana: 'だん'}, +                {text: 'ノ', furigana: 'の'}, +                {text: '浦', furigana: 'うら'}, +                {text: 'の', furigana: ''}, +                {text: '戦', furigana: 'たたか'}, +                {text: 'い', furigana: ''} +            ] +        ], +        [ +            ['壇ノ浦の戦', 'だんのうらのたたかい'], +            [ +                {text: '壇', furigana: 'だん'}, +                {text: 'ノ', furigana: 'の'}, +                {text: '浦', furigana: 'うら'}, +                {text: 'の', furigana: ''}, +                {text: '戦', furigana: 'たたかい'} +            ] +        ], +        [ +            ['序ノ口格', 'じょのくちかく'], +            [ +                {text: '序', furigana: 'じょ'}, +                {text: 'ノ', furigana: 'の'}, +                {text: '口格', furigana: 'くちかく'} +            ] +        ], +        [ +            ['二カ国語', 'にかこくご'], +            [ +                {text: '二', furigana: 'に'}, +                {text: 'カ', furigana: 'か'}, +                {text: '国語', furigana: 'こくご'} +            ] +        ], +        [ +            ['カ国', 'かこく'], +            [ +                {text: 'カ', furigana: 'か'}, +                {text: '国', furigana: 'こく'} +            ] +        ], +        [ +            ['カ国語', 'かこくご'], +            [ +                {text: 'カ', furigana: 'か'}, +                {text: '国語', furigana: 'こくご'} +            ] +        ], +        [ +            ['壇ノ浦の合戦', 'だんのうらのかっせん'], +            [ +                {text: '壇', furigana: 'だん'}, +                {text: 'ノ', furigana: 'の'}, +                {text: '浦', furigana: 'うら'}, +                {text: 'の', furigana: ''}, +                {text: '合戦', furigana: 'かっせん'} +            ] +        ], +        [ +            ['一タ偏', 'いちたへん'], +            [ +                {text: '一', furigana: 'いち'}, +                {text: 'タ', furigana: 'た'}, +                {text: '偏', furigana: 'へん'} +            ] +        ], +        [ +            ['ル又', 'るまた'], +            [ +                {text: 'ル', furigana: 'る'}, +                {text: '又', furigana: 'また'} +            ] +        ], +        [ +            ['ノ木偏', 'のぎへん'], +            [ +                {text: 'ノ', furigana: 'の'}, +                {text: '木偏', furigana: 'ぎへん'} +            ] +        ], +        [ +            ['一ノ貝', 'いちのかい'], +            [ +                {text: '一', furigana: 'いち'}, +                {text: 'ノ', furigana: 'の'}, +                {text: '貝', furigana: 'かい'} +            ] +        ], +        [ +            ['虎ノ門事件', 'とらのもんじけん'], +            [ +                {text: '虎', furigana: 'とら'}, +                {text: 'ノ', furigana: 'の'}, +                {text: '門事件', furigana: 'もんじけん'} +            ] +        ], +        [ +            ['教育ニ関スル勅語', 'きょういくにかんするちょくご'], +            [ +                {text: '教育', furigana: 'きょういく'}, +                {text: 'ニ', furigana: 'に'}, +                {text: '関', furigana: 'かん'}, +                {text: 'スル', furigana: 'する'}, +                {text: '勅語', furigana: 'ちょくご'} +            ] +        ], +        [ +            ['二カ年', 'にかねん'], +            [ +                {text: '二', furigana: 'に'}, +                {text: 'カ', furigana: 'か'}, +                {text: '年', furigana: 'ねん'} +            ] +        ], +        [ +            ['三カ年', 'さんかねん'], +            [ +                {text: '三', furigana: 'さん'}, +                {text: 'カ', furigana: 'か'}, +                {text: '年', furigana: 'ねん'} +            ] +        ], +        [ +            ['四カ年', 'よんかねん'], +            [ +                {text: '四', furigana: 'よん'}, +                {text: 'カ', furigana: 'か'}, +                {text: '年', furigana: 'ねん'} +            ] +        ], +        [ +            ['五カ年', 'ごかねん'], +            [ +                {text: '五', furigana: 'ご'}, +                {text: 'カ', furigana: 'か'}, +                {text: '年', furigana: 'ねん'} +            ] +        ], +        [ +            ['六カ年', 'ろっかねん'], +            [ +                {text: '六', furigana: 'ろっ'}, +                {text: 'カ', furigana: 'か'}, +                {text: '年', furigana: 'ねん'} +            ] +        ], +        [ +            ['七カ年', 'ななかねん'], +            [ +                {text: '七', furigana: 'なな'}, +                {text: 'カ', furigana: 'か'}, +                {text: '年', furigana: 'ねん'} +            ] +        ], +        [ +            ['八カ年', 'はちかねん'], +            [ +                {text: '八', furigana: 'はち'}, +                {text: 'カ', furigana: 'か'}, +                {text: '年', furigana: 'ねん'} +            ] +        ], +        [ +            ['九カ年', 'きゅうかねん'], +            [ +                {text: '九', furigana: 'きゅう'}, +                {text: 'カ', furigana: 'か'}, +                {text: '年', furigana: 'ねん'} +            ] +        ], +        [ +            ['十カ年', 'じゅうかねん'], +            [ +                {text: '十', furigana: 'じゅう'}, +                {text: 'カ', furigana: 'か'}, +                {text: '年', furigana: 'ねん'} +            ] +        ], +        [ +            ['鏡ノ間', 'かがみのま'], +            [ +                {text: '鏡', furigana: 'かがみ'}, +                {text: 'ノ', furigana: 'の'}, +                {text: '間', furigana: 'ま'} +            ] +        ], +        [ +            ['鏡ノ間', 'かがみのま'], +            [ +                {text: '鏡', furigana: 'かがみ'}, +                {text: 'ノ', furigana: 'の'}, +                {text: '間', furigana: 'ま'} +            ] +        ], +        [ +            ['ページ違反', 'ぺーじいはん'], +            [ +                {text: 'ページ', furigana: 'ぺーじ'}, +                {text: '違反', furigana: 'いはん'} +            ]          ]      ]; |