diff options
Diffstat (limited to 'ext/mixed')
| -rw-r--r-- | ext/mixed/js/japanese.js | 507 | 
1 files changed, 452 insertions, 55 deletions
| diff --git a/ext/mixed/js/japanese.js b/ext/mixed/js/japanese.js index ced486dd..801dec84 100644 --- a/ext/mixed/js/japanese.js +++ b/ext/mixed/js/japanese.js @@ -16,6 +16,11 @@   */  const jp = (() => { +    const ITERATION_MARK_CODE_POINT = 0x3005; +    const HIRAGANA_SMALL_TSU_CODE_POINT = 0x3063; +    const KATAKANA_SMALL_TSU_CODE_POINT = 0x30c3; +    const KANA_PROLONGED_SOUND_MARK_CODE_POINT = 0x30fc; +      const HIRAGANA_RANGE = [0x3040, 0x309f];      const KATAKANA_RANGE = [0x30a0, 0x30ff];      const KANA_RANGES = [HIRAGANA_RANGE, KATAKANA_RANGE]; @@ -65,20 +70,65 @@ const jp = (() => {      const SMALL_KANA_SET = new Set(Array.from('ぁぃぅぇぉゃゅょゎァィゥェォャュョヮ')); +    const HALFWIDTH_KATAKANA_MAPPING = new Map([ +        ['ヲ', 'ヲヺ-'], +        ['ァ', 'ァ--'], +        ['ィ', 'ィ--'], +        ['ゥ', 'ゥ--'], +        ['ェ', 'ェ--'], +        ['ォ', 'ォ--'], +        ['ャ', 'ャ--'], +        ['ュ', 'ュ--'], +        ['ョ', 'ョ--'], +        ['ッ', 'ッ--'], +        ['ー', 'ー--'], +        ['ア', 'ア--'], +        ['イ', 'イ--'], +        ['ウ', 'ウヴ-'], +        ['エ', 'エ--'], +        ['オ', 'オ--'], +        ['カ', 'カガ-'], +        ['キ', 'キギ-'], +        ['ク', 'クグ-'], +        ['ケ', 'ケゲ-'], +        ['コ', 'コゴ-'], +        ['サ', 'サザ-'], +        ['シ', 'シジ-'], +        ['ス', 'スズ-'], +        ['セ', 'セゼ-'], +        ['ソ', 'ソゾ-'], +        ['タ', 'タダ-'], +        ['チ', 'チヂ-'], +        ['ツ', 'ツヅ-'], +        ['テ', 'テデ-'], +        ['ト', 'トド-'], +        ['ナ', 'ナ--'], +        ['ニ', 'ニ--'], +        ['ヌ', 'ヌ--'], +        ['ネ', 'ネ--'], +        ['ノ', 'ノ--'], +        ['ハ', 'ハバパ'], +        ['ヒ', 'ヒビピ'], +        ['フ', 'フブプ'], +        ['ヘ', 'ヘベペ'], +        ['ホ', 'ホボポ'], +        ['マ', 'マ--'], +        ['ミ', 'ミ--'], +        ['ム', 'ム--'], +        ['メ', 'メ--'], +        ['モ', 'モ--'], +        ['ヤ', 'ヤ--'], +        ['ユ', 'ユ--'], +        ['ヨ', 'ヨ--'], +        ['ラ', 'ラ--'], +        ['リ', 'リ--'], +        ['ル', 'ル--'], +        ['レ', 'レ--'], +        ['ロ', 'ロ--'], +        ['ワ', 'ワ--'], +        ['ン', 'ン--'] +    ]); -    // Character code testing functions - -    function isCodePointKanji(codePoint) { -        return isCodePointInRanges(codePoint, CJK_UNIFIED_IDEOGRAPHS_RANGES); -    } - -    function isCodePointKana(codePoint) { -        return isCodePointInRanges(codePoint, KANA_RANGES); -    } - -    function isCodePointJapanese(codePoint) { -        return isCodePointInRanges(codePoint, JAPANESE_RANGES); -    }      function isCodePointInRanges(codePoint, ranges) {          for (const [min, max] of ranges) { @@ -89,63 +139,410 @@ const jp = (() => {          return false;      } +    function getWanakana() { +        try { +            if (typeof wanakana !== 'undefined') { +                // eslint-disable-next-line no-undef +                return wanakana; +            } +        } catch (e) { +            // NOP +        } +        return null; +    } + -    // String testing functions +    class JapaneseUtil { +        constructor(wanakana=null) { +            this._wanakana = wanakana; +        } -    function isStringEntirelyKana(str) { -        if (str.length === 0) { return false; } -        for (const c of str) { -            if (!isCodePointKana(c.codePointAt(0))) { -                return false; +        // Character code testing functions + +        isCodePointKanji(codePoint) { +            return isCodePointInRanges(codePoint, CJK_UNIFIED_IDEOGRAPHS_RANGES); +        } + +        isCodePointKana(codePoint) { +            return isCodePointInRanges(codePoint, KANA_RANGES); +        } + +        isCodePointJapanese(codePoint) { +            return isCodePointInRanges(codePoint, JAPANESE_RANGES); +        } + +        // String testing functions + +        isStringEntirelyKana(str) { +            if (str.length === 0) { return false; } +            for (const c of str) { +                if (!isCodePointInRanges(c.codePointAt(0), KANA_RANGES)) { +                    return false; +                }              } +            return true;          } -        return true; -    } -    function isStringPartiallyJapanese(str) { -        if (str.length === 0) { return false; } -        for (const c of str) { -            if (isCodePointJapanese(c.codePointAt(0))) { -                return true; +        isStringPartiallyJapanese(str) { +            if (str.length === 0) { return false; } +            for (const c of str) { +                if (isCodePointInRanges(c.codePointAt(0), JAPANESE_RANGES)) { +                    return true; +                }              } +            return false;          } -        return false; -    } +        // Mora functions -    // Mora functions +        isMoraPitchHigh(moraIndex, pitchAccentPosition) { +            switch (pitchAccentPosition) { +                case 0: return (moraIndex > 0); +                case 1: return (moraIndex < 1); +                default: return (moraIndex > 0 && moraIndex < pitchAccentPosition); +            } +        } -    function isMoraPitchHigh(moraIndex, pitchAccentPosition) { -        switch (pitchAccentPosition) { -            case 0: return (moraIndex > 0); -            case 1: return (moraIndex < 1); -            default: return (moraIndex > 0 && moraIndex < pitchAccentPosition); +        getKanaMorae(text) { +            const morae = []; +            let i; +            for (const c of text) { +                if (SMALL_KANA_SET.has(c) && (i = morae.length) > 0) { +                    morae[i - 1] += c; +                } else { +                    morae.push(c); +                } +            } +            return morae;          } -    } -    function getKanaMorae(text) { -        const morae = []; -        let i; -        for (const c of text) { -            if (SMALL_KANA_SET.has(c) && (i = morae.length) > 0) { -                morae[i - 1] += c; -            } else { -                morae.push(c); +        // Conversion functions + +        convertKatakanaToHiragana(text) { +            const wanakana = this._getWanakana(); +            let result = ''; +            for (const c of text) { +                if (wanakana.isKatakana(c)) { +                    result += wanakana.toHiragana(c); +                } else { +                    result += c; +                }              } + +            return result;          } -        return morae; -    } +        convertHiraganaToKatakana(text) { +            const wanakana = this._getWanakana(); +            let result = ''; +            for (const c of text) { +                if (wanakana.isHiragana(c)) { +                    result += wanakana.toKatakana(c); +                } else { +                    result += c; +                } +            } + +            return result; +        } + +        convertToRomaji(text) { +            const wanakana = this._getWanakana(); +            return wanakana.toRomaji(text); +        } + +        convertReading(expression, reading, readingMode) { +            switch (readingMode) { +                case 'hiragana': +                    return this.convertKatakanaToHiragana(reading); +                case 'katakana': +                    return this.convertHiraganaToKatakana(reading); +                case 'romaji': +                    if (reading) { +                        return this.convertToRomaji(reading); +                    } else { +                        if (this.isStringEntirelyKana(expression)) { +                            return this.convertToRomaji(expression); +                        } +                    } +                    return reading; +                case 'none': +                    return ''; +                default: +                    return reading; +            } +        } + +        convertNumericToFullWidth(text) { +            let result = ''; +            for (const char of text) { +                let c = char.codePointAt(0); +                if (c >= 0x30 && c <= 0x39) { // ['0', '9'] +                    c += 0xff10 - 0x30; // 0xff10 = '0' full width +                    result += String.fromCodePoint(c); +                } else { +                    result += char; +                } +            } +            return result; +        } + +        convertHalfWidthKanaToFullWidth(text, sourceMap=null) { +            let result = ''; + +            // This function is safe to use charCodeAt instead of codePointAt, since all +            // the relevant characters are represented with a single UTF-16 character code. +            for (let i = 0, ii = text.length; i < ii; ++i) { +                const c = text[i]; +                const mapping = HALFWIDTH_KATAKANA_MAPPING.get(c); +                if (typeof mapping !== 'string') { +                    result += c; +                    continue; +                } + +                let index = 0; +                switch (text.charCodeAt(i + 1)) { +                    case 0xff9e: // dakuten +                        index = 1; +                        break; +                    case 0xff9f: // handakuten +                        index = 2; +                        break; +                } + +                let c2 = mapping[index]; +                if (index > 0) { +                    if (c2 === '-') { // invalid +                        index = 0; +                        c2 = mapping[0]; +                    } else { +                        ++i; +                    } +                } + +                if (sourceMap !== null && index > 0) { +                    sourceMap.combine(result.length, 1); +                } +                result += c2; +            } + +            return result; +        } + +        convertAlphabeticToKana(text, sourceMap=null) { +            let part = ''; +            let result = ''; + +            for (const char of text) { +                // Note: 0x61 is the character code for 'a' +                let c = char.codePointAt(0); +                if (c >= 0x41 && c <= 0x5a) { // ['A', 'Z'] +                    c += (0x61 - 0x41); +                } else if (c >= 0x61 && c <= 0x7a) { // ['a', 'z'] +                    // NOP; c += (0x61 - 0x61); +                } else if (c >= 0xff21 && c <= 0xff3a) { // ['A', 'Z'] fullwidth +                    c += (0x61 - 0xff21); +                } else if (c >= 0xff41 && c <= 0xff5a) { // ['a', 'z'] fullwidth +                    c += (0x61 - 0xff41); +                } else if (c === 0x2d || c === 0xff0d) { // '-' or fullwidth dash +                    c = 0x2d; // '-' +                } else { +                    if (part.length > 0) { +                        result += this._convertAlphabeticPartToKana(part, sourceMap, result.length); +                        part = ''; +                    } +                    result += char; +                    continue; +                } +                part += String.fromCodePoint(c); +            } + +            if (part.length > 0) { +                result += this._convertAlphabeticPartToKana(part, sourceMap, result.length); +            } +            return result; +        } + +        // Furigana distribution + +        distributeFurigana(expression, reading) { +            const fallback = [{furigana: reading, text: expression}]; +            if (!reading) { +                return fallback; +            } + +            let isAmbiguous = false; +            const segmentize = (reading2, groups) => { +                if (groups.length === 0 || isAmbiguous) { +                    return []; +                } + +                const group = groups[0]; +                if (group.mode === 'kana') { +                    if (this.convertKatakanaToHiragana(reading2).startsWith(this.convertKatakanaToHiragana(group.text))) { +                        const readingLeft = reading2.substring(group.text.length); +                        const segs = segmentize(readingLeft, groups.splice(1)); +                        if (segs) { +                            return [{text: group.text, furigana: ''}].concat(segs); +                        } +                    } +                } else { +                    let foundSegments = null; +                    for (let i = reading2.length; i >= group.text.length; --i) { +                        const readingUsed = reading2.substring(0, i); +                        const readingLeft = reading2.substring(i); +                        const segs = segmentize(readingLeft, groups.slice(1)); +                        if (segs) { +                            if (foundSegments !== null) { +                                // more than one way to segmentize the tail, mark as ambiguous +                                isAmbiguous = true; +                                return null; +                            } +                            foundSegments = [{text: group.text, furigana: readingUsed}].concat(segs); +                        } +                        // there is only one way to segmentize the last non-kana group +                        if (groups.length === 1) { +                            break; +                        } +                    } +                    return foundSegments; +                } +            }; + +            const groups = []; +            let modePrev = null; +            for (const c of expression) { +                const codePoint = c.codePointAt(0); +                const modeCurr = this.isCodePointKanji(codePoint) || codePoint === ITERATION_MARK_CODE_POINT ? 'kanji' : 'kana'; +                if (modeCurr === modePrev) { +                    groups[groups.length - 1].text += c; +                } else { +                    groups.push({mode: modeCurr, text: c}); +                    modePrev = modeCurr; +                } +            } + +            const segments = segmentize(reading, groups); +            if (segments && !isAmbiguous) { +                return segments; +            } +            return fallback; +        } + +        distributeFuriganaInflected(expression, reading, source) { +            const output = []; + +            let stemLength = 0; +            const shortest = Math.min(source.length, expression.length); +            const sourceHiragana = this.convertKatakanaToHiragana(source); +            const expressionHiragana = this.convertKatakanaToHiragana(expression); +            while (stemLength < shortest && sourceHiragana[stemLength] === expressionHiragana[stemLength]) { +                ++stemLength; +            } +            const offset = source.length - stemLength; + +            const stemExpression = source.substring(0, source.length - offset); +            const stemReading = reading.substring( +                0, +                offset === 0 ? reading.length : reading.length - expression.length + stemLength +            ); +            for (const segment of this.distributeFurigana(stemExpression, stemReading)) { +                output.push(segment); +            } + +            if (stemLength !== source.length) { +                output.push({text: source.substring(stemLength), furigana: ''}); +            } + +            return output; +        } + +        // Miscellaneous + +        collapseEmphaticSequences(text, fullCollapse, sourceMap=null) { +            let result = ''; +            let collapseCodePoint = -1; +            const hasSourceMap = (sourceMap !== null); +            for (const char of text) { +                const c = char.codePointAt(0); +                if ( +                    c === HIRAGANA_SMALL_TSU_CODE_POINT || +                    c === KATAKANA_SMALL_TSU_CODE_POINT || +                    c === KANA_PROLONGED_SOUND_MARK_CODE_POINT +                ) { +                    if (collapseCodePoint !== c) { +                        collapseCodePoint = c; +                        if (!fullCollapse) { +                            result += char; +                            continue; +                        } +                    } +                } else { +                    collapseCodePoint = -1; +                    result += char; +                    continue; +                } + +                if (hasSourceMap) { +                    sourceMap.combine(Math.max(0, result.length - 1), 1); +                } +            } +            return result; +        } + +        // Private + +        _getWanakana() { +            const wanakana = this._wanakana; +            if (wanakana === null) { throw new Error('Functions which use WanaKana are not supported in this context'); } +            return wanakana; +        } + +        _convertAlphabeticPartToKana(text, sourceMap, sourceMapStart) { +            const wanakana = this._getWanakana(); +            const result = wanakana.toHiragana(text); + +            // Generate source mapping +            if (sourceMap !== null) { +                let i = 0; +                let resultPos = 0; +                const ii = text.length; +                while (i < ii) { +                    // Find smallest matching substring +                    let iNext = i + 1; +                    let resultPosNext = result.length; +                    while (iNext < ii) { +                        const t = wanakana.toHiragana(text.substring(0, iNext)); +                        if (t === result.substring(0, t.length)) { +                            resultPosNext = t.length; +                            break; +                        } +                        ++iNext; +                    } + +                    // Merge characters +                    const removals = iNext - i - 1; +                    if (removals > 0) { +                        sourceMap.combine(sourceMapStart, removals); +                    } +                    ++sourceMapStart; + +                    // Empty elements +                    const additions = resultPosNext - resultPos - 1; +                    for (let j = 0; j < additions; ++j) { +                        sourceMap.insert(sourceMapStart, 0); +                        ++sourceMapStart; +                    } + +                    i = iNext; +                    resultPos = resultPosNext; +                } +            } + +            return result; +        } +    } -    // Exports -    return { -        isCodePointKanji, -        isCodePointKana, -        isCodePointJapanese, -        isStringEntirelyKana, -        isStringPartiallyJapanese, -        isMoraPitchHigh, -        getKanaMorae -    }; +    return new JapaneseUtil(getWanakana());  })(); |