diff options
| -rw-r--r-- | ext/bg/background.html | 1 | ||||
| -rw-r--r-- | ext/bg/js/japanese.js | 426 | ||||
| -rw-r--r-- | ext/bg/search.html | 1 | ||||
| -rw-r--r-- | ext/bg/settings.html | 1 | ||||
| -rw-r--r-- | ext/mixed/js/japanese.js | 507 | ||||
| -rw-r--r-- | test/test-japanese.js | 3 | 
6 files changed, 453 insertions, 486 deletions
| diff --git a/ext/bg/background.html b/ext/bg/background.html index 9c740adf..7cb76ec3 100644 --- a/ext/bg/background.html +++ b/ext/bg/background.html @@ -36,7 +36,6 @@          <script src="/bg/js/deinflector.js"></script>          <script src="/bg/js/dictionary.js"></script>          <script src="/bg/js/handlebars.js"></script> -        <script src="/bg/js/japanese.js"></script>          <script src="/bg/js/json-schema.js"></script>          <script src="/bg/js/media-utility.js"></script>          <script src="/bg/js/options.js"></script> diff --git a/ext/bg/js/japanese.js b/ext/bg/js/japanese.js deleted file mode 100644 index ac81acb5..00000000 --- a/ext/bg/js/japanese.js +++ /dev/null @@ -1,426 +0,0 @@ -/* - * Copyright (C) 2016-2020  Yomichan Authors - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program.  If not, see <https://www.gnu.org/licenses/>. - */ - -/* global - * jp - * wanakana - */ - -(() => { -    const HALFWIDTH_KATAKANA_MAPPING = new Map([ -        ['ヲ', 'ヲヺ-'], -        ['ァ', 'ァ--'], -        ['ィ', 'ィ--'], -        ['ゥ', 'ゥ--'], -        ['ェ', 'ェ--'], -        ['ォ', 'ォ--'], -        ['ャ', 'ャ--'], -        ['ュ', 'ュ--'], -        ['ョ', 'ョ--'], -        ['ッ', 'ッ--'], -        ['ー', 'ー--'], -        ['ア', 'ア--'], -        ['イ', 'イ--'], -        ['ウ', 'ウヴ-'], -        ['エ', 'エ--'], -        ['オ', 'オ--'], -        ['カ', 'カガ-'], -        ['キ', 'キギ-'], -        ['ク', 'クグ-'], -        ['ケ', 'ケゲ-'], -        ['コ', 'コゴ-'], -        ['サ', 'サザ-'], -        ['シ', 'シジ-'], -        ['ス', 'スズ-'], -        ['セ', 'セゼ-'], -        ['ソ', 'ソゾ-'], -        ['タ', 'タダ-'], -        ['チ', 'チヂ-'], -        ['ツ', 'ツヅ-'], -        ['テ', 'テデ-'], -        ['ト', 'トド-'], -        ['ナ', 'ナ--'], -        ['ニ', 'ニ--'], -        ['ヌ', 'ヌ--'], -        ['ネ', 'ネ--'], -        ['ノ', 'ノ--'], -        ['ハ', 'ハバパ'], -        ['ヒ', 'ヒビピ'], -        ['フ', 'フブプ'], -        ['ヘ', 'ヘベペ'], -        ['ホ', 'ホボポ'], -        ['マ', 'マ--'], -        ['ミ', 'ミ--'], -        ['ム', 'ム--'], -        ['メ', 'メ--'], -        ['モ', 'モ--'], -        ['ヤ', 'ヤ--'], -        ['ユ', 'ユ--'], -        ['ヨ', 'ヨ--'], -        ['ラ', 'ラ--'], -        ['リ', 'リ--'], -        ['ル', 'ル--'], -        ['レ', 'レ--'], -        ['ロ', 'ロ--'], -        ['ワ', 'ワ--'], -        ['ン', 'ン--'] -    ]); - -    const ITERATION_MARK_CODE_POINT = 0x3005; - -    const HIRAGANA_SMALL_TSU_CODE_POINT = 0x3063; -    const KATAKANA_SMALL_TSU_CODE_POINT = 0x30c3; -    const KANA_PROLONGED_SOUND_MARK_CODE_POINT = 0x30fc; - -    // Existing functions - -    const isCodePointKanji = jp.isCodePointKanji; -    const isStringEntirelyKana = jp.isStringEntirelyKana; - - -    // Conversion functions - -    function convertKatakanaToHiragana(text) { -        let result = ''; -        for (const c of text) { -            if (wanakana.isKatakana(c)) { -                result += wanakana.toHiragana(c); -            } else { -                result += c; -            } -        } - -        return result; -    } - -    function convertHiraganaToKatakana(text) { -        let result = ''; -        for (const c of text) { -            if (wanakana.isHiragana(c)) { -                result += wanakana.toKatakana(c); -            } else { -                result += c; -            } -        } - -        return result; -    } - -    function convertToRomaji(text) { -        return wanakana.toRomaji(text); -    } - -    function convertReading(expression, reading, readingMode) { -        switch (readingMode) { -            case 'hiragana': -                return convertKatakanaToHiragana(reading); -            case 'katakana': -                return convertHiraganaToKatakana(reading); -            case 'romaji': -                if (reading) { -                    return convertToRomaji(reading); -                } else { -                    if (isStringEntirelyKana(expression)) { -                        return convertToRomaji(expression); -                    } -                } -                return reading; -            case 'none': -                return ''; -            default: -                return reading; -        } -    } - -    function convertNumericToFullWidth(text) { -        let result = ''; -        for (const char of text) { -            let c = char.codePointAt(0); -            if (c >= 0x30 && c <= 0x39) { // ['0', '9'] -                c += 0xff10 - 0x30; // 0xff10 = '0' full width -                result += String.fromCodePoint(c); -            } else { -                result += char; -            } -        } -        return result; -    } - -    function convertHalfWidthKanaToFullWidth(text, sourceMap=null) { -        let result = ''; - -        // This function is safe to use charCodeAt instead of codePointAt, since all -        // the relevant characters are represented with a single UTF-16 character code. -        for (let i = 0, ii = text.length; i < ii; ++i) { -            const c = text[i]; -            const mapping = HALFWIDTH_KATAKANA_MAPPING.get(c); -            if (typeof mapping !== 'string') { -                result += c; -                continue; -            } - -            let index = 0; -            switch (text.charCodeAt(i + 1)) { -                case 0xff9e: // dakuten -                    index = 1; -                    break; -                case 0xff9f: // handakuten -                    index = 2; -                    break; -            } - -            let c2 = mapping[index]; -            if (index > 0) { -                if (c2 === '-') { // invalid -                    index = 0; -                    c2 = mapping[0]; -                } else { -                    ++i; -                } -            } - -            if (sourceMap !== null && index > 0) { -                sourceMap.combine(result.length, 1); -            } -            result += c2; -        } - -        return result; -    } - -    function convertAlphabeticToKana(text, sourceMap=null) { -        let part = ''; -        let result = ''; - -        for (const char of text) { -            // Note: 0x61 is the character code for 'a' -            let c = char.codePointAt(0); -            if (c >= 0x41 && c <= 0x5a) { // ['A', 'Z'] -                c += (0x61 - 0x41); -            } else if (c >= 0x61 && c <= 0x7a) { // ['a', 'z'] -                // NOP; c += (0x61 - 0x61); -            } else if (c >= 0xff21 && c <= 0xff3a) { // ['A', 'Z'] fullwidth -                c += (0x61 - 0xff21); -            } else if (c >= 0xff41 && c <= 0xff5a) { // ['a', 'z'] fullwidth -                c += (0x61 - 0xff41); -            } else if (c === 0x2d || c === 0xff0d) { // '-' or fullwidth dash -                c = 0x2d; // '-' -            } else { -                if (part.length > 0) { -                    result += convertAlphabeticPartToKana(part, sourceMap, result.length); -                    part = ''; -                } -                result += char; -                continue; -            } -            part += String.fromCodePoint(c); -        } - -        if (part.length > 0) { -            result += convertAlphabeticPartToKana(part, sourceMap, result.length); -        } -        return result; -    } - -    function convertAlphabeticPartToKana(text, sourceMap, sourceMapStart) { -        const result = wanakana.toHiragana(text); - -        // Generate source mapping -        if (sourceMap !== null) { -            let i = 0; -            let resultPos = 0; -            const ii = text.length; -            while (i < ii) { -                // Find smallest matching substring -                let iNext = i + 1; -                let resultPosNext = result.length; -                while (iNext < ii) { -                    const t = wanakana.toHiragana(text.substring(0, iNext)); -                    if (t === result.substring(0, t.length)) { -                        resultPosNext = t.length; -                        break; -                    } -                    ++iNext; -                } - -                // Merge characters -                const removals = iNext - i - 1; -                if (removals > 0) { -                    sourceMap.combine(sourceMapStart, removals); -                } -                ++sourceMapStart; - -                // Empty elements -                const additions = resultPosNext - resultPos - 1; -                for (let j = 0; j < additions; ++j) { -                    sourceMap.insert(sourceMapStart, 0); -                    ++sourceMapStart; -                } - -                i = iNext; -                resultPos = resultPosNext; -            } -        } - -        return result; -    } - - -    // Furigana distribution - -    function distributeFurigana(expression, reading) { -        const fallback = [{furigana: reading, text: expression}]; -        if (!reading) { -            return fallback; -        } - -        let isAmbiguous = false; -        const segmentize = (reading2, groups) => { -            if (groups.length === 0 || isAmbiguous) { -                return []; -            } - -            const group = groups[0]; -            if (group.mode === 'kana') { -                if (convertKatakanaToHiragana(reading2).startsWith(convertKatakanaToHiragana(group.text))) { -                    const readingLeft = reading2.substring(group.text.length); -                    const segs = segmentize(readingLeft, groups.splice(1)); -                    if (segs) { -                        return [{text: group.text, furigana: ''}].concat(segs); -                    } -                } -            } else { -                let foundSegments = null; -                for (let i = reading2.length; i >= group.text.length; --i) { -                    const readingUsed = reading2.substring(0, i); -                    const readingLeft = reading2.substring(i); -                    const segs = segmentize(readingLeft, groups.slice(1)); -                    if (segs) { -                        if (foundSegments !== null) { -                            // more than one way to segmentize the tail, mark as ambiguous -                            isAmbiguous = true; -                            return null; -                        } -                        foundSegments = [{text: group.text, furigana: readingUsed}].concat(segs); -                    } -                    // there is only one way to segmentize the last non-kana group -                    if (groups.length === 1) { -                        break; -                    } -                } -                return foundSegments; -            } -        }; - -        const groups = []; -        let modePrev = null; -        for (const c of expression) { -            const codePoint = c.codePointAt(0); -            const modeCurr = isCodePointKanji(codePoint) || codePoint === ITERATION_MARK_CODE_POINT ? 'kanji' : 'kana'; -            if (modeCurr === modePrev) { -                groups[groups.length - 1].text += c; -            } else { -                groups.push({mode: modeCurr, text: c}); -                modePrev = modeCurr; -            } -        } - -        const segments = segmentize(reading, groups); -        if (segments && !isAmbiguous) { -            return segments; -        } -        return fallback; -    } - -    function distributeFuriganaInflected(expression, reading, source) { -        const output = []; - -        let stemLength = 0; -        const shortest = Math.min(source.length, expression.length); -        const sourceHiragana = convertKatakanaToHiragana(source); -        const expressionHiragana = convertKatakanaToHiragana(expression); -        while (stemLength < shortest && sourceHiragana[stemLength] === expressionHiragana[stemLength]) { -            ++stemLength; -        } -        const offset = source.length - stemLength; - -        const stemExpression = source.substring(0, source.length - offset); -        const stemReading = reading.substring( -            0, -            offset === 0 ? reading.length : reading.length - expression.length + stemLength -        ); -        for (const segment of distributeFurigana(stemExpression, stemReading)) { -            output.push(segment); -        } - -        if (stemLength !== source.length) { -            output.push({text: source.substring(stemLength), furigana: ''}); -        } - -        return output; -    } - - -    // Miscellaneous - -    function collapseEmphaticSequences(text, fullCollapse, sourceMap=null) { -        let result = ''; -        let collapseCodePoint = -1; -        const hasSourceMap = (sourceMap !== null); -        for (const char of text) { -            const c = char.codePointAt(0); -            if ( -                c === HIRAGANA_SMALL_TSU_CODE_POINT || -                c === KATAKANA_SMALL_TSU_CODE_POINT || -                c === KANA_PROLONGED_SOUND_MARK_CODE_POINT -            ) { -                if (collapseCodePoint !== c) { -                    collapseCodePoint = c; -                    if (!fullCollapse) { -                        result += char; -                        continue; -                    } -                } -            } else { -                collapseCodePoint = -1; -                result += char; -                continue; -            } - -            if (hasSourceMap) { -                sourceMap.combine(Math.max(0, result.length - 1), 1); -            } -        } -        return result; -    } - - -    // Exports - -    Object.assign(jp, { -        convertKatakanaToHiragana, -        convertHiraganaToKatakana, -        convertToRomaji, -        convertReading, -        convertNumericToFullWidth, -        convertHalfWidthKanaToFullWidth, -        convertAlphabeticToKana, -        distributeFurigana, -        distributeFuriganaInflected, -        collapseEmphaticSequences -    }); -})(); diff --git a/ext/bg/search.html b/ext/bg/search.html index 52915b76..a30b1d60 100644 --- a/ext/bg/search.html +++ b/ext/bg/search.html @@ -79,7 +79,6 @@          <script src="/bg/js/dictionary.js"></script>          <script src="/bg/js/handlebars.js"></script> -        <script src="/bg/js/japanese.js"></script>          <script src="/fg/js/document.js"></script>          <script src="/fg/js/source.js"></script>          <script src="/mixed/js/audio-system.js"></script> diff --git a/ext/bg/settings.html b/ext/bg/settings.html index b8477e46..a0981687 100644 --- a/ext/bg/settings.html +++ b/ext/bg/settings.html @@ -1139,7 +1139,6 @@          <script src="/bg/js/conditions.js"></script>          <script src="/bg/js/dictionary.js"></script>          <script src="/bg/js/handlebars.js"></script> -        <script src="/bg/js/japanese.js"></script>          <script src="/bg/js/options.js"></script>          <script src="/bg/js/page-exit-prevention.js"></script>          <script src="/bg/js/profile-conditions.js"></script> diff --git a/ext/mixed/js/japanese.js b/ext/mixed/js/japanese.js index ced486dd..801dec84 100644 --- a/ext/mixed/js/japanese.js +++ b/ext/mixed/js/japanese.js @@ -16,6 +16,11 @@   */  const jp = (() => { +    const ITERATION_MARK_CODE_POINT = 0x3005; +    const HIRAGANA_SMALL_TSU_CODE_POINT = 0x3063; +    const KATAKANA_SMALL_TSU_CODE_POINT = 0x30c3; +    const KANA_PROLONGED_SOUND_MARK_CODE_POINT = 0x30fc; +      const HIRAGANA_RANGE = [0x3040, 0x309f];      const KATAKANA_RANGE = [0x30a0, 0x30ff];      const KANA_RANGES = [HIRAGANA_RANGE, KATAKANA_RANGE]; @@ -65,20 +70,65 @@ const jp = (() => {      const SMALL_KANA_SET = new Set(Array.from('ぁぃぅぇぉゃゅょゎァィゥェォャュョヮ')); +    const HALFWIDTH_KATAKANA_MAPPING = new Map([ +        ['ヲ', 'ヲヺ-'], +        ['ァ', 'ァ--'], +        ['ィ', 'ィ--'], +        ['ゥ', 'ゥ--'], +        ['ェ', 'ェ--'], +        ['ォ', 'ォ--'], +        ['ャ', 'ャ--'], +        ['ュ', 'ュ--'], +        ['ョ', 'ョ--'], +        ['ッ', 'ッ--'], +        ['ー', 'ー--'], +        ['ア', 'ア--'], +        ['イ', 'イ--'], +        ['ウ', 'ウヴ-'], +        ['エ', 'エ--'], +        ['オ', 'オ--'], +        ['カ', 'カガ-'], +        ['キ', 'キギ-'], +        ['ク', 'クグ-'], +        ['ケ', 'ケゲ-'], +        ['コ', 'コゴ-'], +        ['サ', 'サザ-'], +        ['シ', 'シジ-'], +        ['ス', 'スズ-'], +        ['セ', 'セゼ-'], +        ['ソ', 'ソゾ-'], +        ['タ', 'タダ-'], +        ['チ', 'チヂ-'], +        ['ツ', 'ツヅ-'], +        ['テ', 'テデ-'], +        ['ト', 'トド-'], +        ['ナ', 'ナ--'], +        ['ニ', 'ニ--'], +        ['ヌ', 'ヌ--'], +        ['ネ', 'ネ--'], +        ['ノ', 'ノ--'], +        ['ハ', 'ハバパ'], +        ['ヒ', 'ヒビピ'], +        ['フ', 'フブプ'], +        ['ヘ', 'ヘベペ'], +        ['ホ', 'ホボポ'], +        ['マ', 'マ--'], +        ['ミ', 'ミ--'], +        ['ム', 'ム--'], +        ['メ', 'メ--'], +        ['モ', 'モ--'], +        ['ヤ', 'ヤ--'], +        ['ユ', 'ユ--'], +        ['ヨ', 'ヨ--'], +        ['ラ', 'ラ--'], +        ['リ', 'リ--'], +        ['ル', 'ル--'], +        ['レ', 'レ--'], +        ['ロ', 'ロ--'], +        ['ワ', 'ワ--'], +        ['ン', 'ン--'] +    ]); -    // Character code testing functions - -    function isCodePointKanji(codePoint) { -        return isCodePointInRanges(codePoint, CJK_UNIFIED_IDEOGRAPHS_RANGES); -    } - -    function isCodePointKana(codePoint) { -        return isCodePointInRanges(codePoint, KANA_RANGES); -    } - -    function isCodePointJapanese(codePoint) { -        return isCodePointInRanges(codePoint, JAPANESE_RANGES); -    }      function isCodePointInRanges(codePoint, ranges) {          for (const [min, max] of ranges) { @@ -89,63 +139,410 @@ const jp = (() => {          return false;      } +    function getWanakana() { +        try { +            if (typeof wanakana !== 'undefined') { +                // eslint-disable-next-line no-undef +                return wanakana; +            } +        } catch (e) { +            // NOP +        } +        return null; +    } + -    // String testing functions +    class JapaneseUtil { +        constructor(wanakana=null) { +            this._wanakana = wanakana; +        } -    function isStringEntirelyKana(str) { -        if (str.length === 0) { return false; } -        for (const c of str) { -            if (!isCodePointKana(c.codePointAt(0))) { -                return false; +        // Character code testing functions + +        isCodePointKanji(codePoint) { +            return isCodePointInRanges(codePoint, CJK_UNIFIED_IDEOGRAPHS_RANGES); +        } + +        isCodePointKana(codePoint) { +            return isCodePointInRanges(codePoint, KANA_RANGES); +        } + +        isCodePointJapanese(codePoint) { +            return isCodePointInRanges(codePoint, JAPANESE_RANGES); +        } + +        // String testing functions + +        isStringEntirelyKana(str) { +            if (str.length === 0) { return false; } +            for (const c of str) { +                if (!isCodePointInRanges(c.codePointAt(0), KANA_RANGES)) { +                    return false; +                }              } +            return true;          } -        return true; -    } -    function isStringPartiallyJapanese(str) { -        if (str.length === 0) { return false; } -        for (const c of str) { -            if (isCodePointJapanese(c.codePointAt(0))) { -                return true; +        isStringPartiallyJapanese(str) { +            if (str.length === 0) { return false; } +            for (const c of str) { +                if (isCodePointInRanges(c.codePointAt(0), JAPANESE_RANGES)) { +                    return true; +                }              } +            return false;          } -        return false; -    } +        // Mora functions -    // Mora functions +        isMoraPitchHigh(moraIndex, pitchAccentPosition) { +            switch (pitchAccentPosition) { +                case 0: return (moraIndex > 0); +                case 1: return (moraIndex < 1); +                default: return (moraIndex > 0 && moraIndex < pitchAccentPosition); +            } +        } -    function isMoraPitchHigh(moraIndex, pitchAccentPosition) { -        switch (pitchAccentPosition) { -            case 0: return (moraIndex > 0); -            case 1: return (moraIndex < 1); -            default: return (moraIndex > 0 && moraIndex < pitchAccentPosition); +        getKanaMorae(text) { +            const morae = []; +            let i; +            for (const c of text) { +                if (SMALL_KANA_SET.has(c) && (i = morae.length) > 0) { +                    morae[i - 1] += c; +                } else { +                    morae.push(c); +                } +            } +            return morae;          } -    } -    function getKanaMorae(text) { -        const morae = []; -        let i; -        for (const c of text) { -            if (SMALL_KANA_SET.has(c) && (i = morae.length) > 0) { -                morae[i - 1] += c; -            } else { -                morae.push(c); +        // Conversion functions + +        convertKatakanaToHiragana(text) { +            const wanakana = this._getWanakana(); +            let result = ''; +            for (const c of text) { +                if (wanakana.isKatakana(c)) { +                    result += wanakana.toHiragana(c); +                } else { +                    result += c; +                }              } + +            return result;          } -        return morae; -    } +        convertHiraganaToKatakana(text) { +            const wanakana = this._getWanakana(); +            let result = ''; +            for (const c of text) { +                if (wanakana.isHiragana(c)) { +                    result += wanakana.toKatakana(c); +                } else { +                    result += c; +                } +            } + +            return result; +        } + +        convertToRomaji(text) { +            const wanakana = this._getWanakana(); +            return wanakana.toRomaji(text); +        } + +        convertReading(expression, reading, readingMode) { +            switch (readingMode) { +                case 'hiragana': +                    return this.convertKatakanaToHiragana(reading); +                case 'katakana': +                    return this.convertHiraganaToKatakana(reading); +                case 'romaji': +                    if (reading) { +                        return this.convertToRomaji(reading); +                    } else { +                        if (this.isStringEntirelyKana(expression)) { +                            return this.convertToRomaji(expression); +                        } +                    } +                    return reading; +                case 'none': +                    return ''; +                default: +                    return reading; +            } +        } + +        convertNumericToFullWidth(text) { +            let result = ''; +            for (const char of text) { +                let c = char.codePointAt(0); +                if (c >= 0x30 && c <= 0x39) { // ['0', '9'] +                    c += 0xff10 - 0x30; // 0xff10 = '0' full width +                    result += String.fromCodePoint(c); +                } else { +                    result += char; +                } +            } +            return result; +        } + +        convertHalfWidthKanaToFullWidth(text, sourceMap=null) { +            let result = ''; + +            // This function is safe to use charCodeAt instead of codePointAt, since all +            // the relevant characters are represented with a single UTF-16 character code. +            for (let i = 0, ii = text.length; i < ii; ++i) { +                const c = text[i]; +                const mapping = HALFWIDTH_KATAKANA_MAPPING.get(c); +                if (typeof mapping !== 'string') { +                    result += c; +                    continue; +                } + +                let index = 0; +                switch (text.charCodeAt(i + 1)) { +                    case 0xff9e: // dakuten +                        index = 1; +                        break; +                    case 0xff9f: // handakuten +                        index = 2; +                        break; +                } + +                let c2 = mapping[index]; +                if (index > 0) { +                    if (c2 === '-') { // invalid +                        index = 0; +                        c2 = mapping[0]; +                    } else { +                        ++i; +                    } +                } + +                if (sourceMap !== null && index > 0) { +                    sourceMap.combine(result.length, 1); +                } +                result += c2; +            } + +            return result; +        } + +        convertAlphabeticToKana(text, sourceMap=null) { +            let part = ''; +            let result = ''; + +            for (const char of text) { +                // Note: 0x61 is the character code for 'a' +                let c = char.codePointAt(0); +                if (c >= 0x41 && c <= 0x5a) { // ['A', 'Z'] +                    c += (0x61 - 0x41); +                } else if (c >= 0x61 && c <= 0x7a) { // ['a', 'z'] +                    // NOP; c += (0x61 - 0x61); +                } else if (c >= 0xff21 && c <= 0xff3a) { // ['A', 'Z'] fullwidth +                    c += (0x61 - 0xff21); +                } else if (c >= 0xff41 && c <= 0xff5a) { // ['a', 'z'] fullwidth +                    c += (0x61 - 0xff41); +                } else if (c === 0x2d || c === 0xff0d) { // '-' or fullwidth dash +                    c = 0x2d; // '-' +                } else { +                    if (part.length > 0) { +                        result += this._convertAlphabeticPartToKana(part, sourceMap, result.length); +                        part = ''; +                    } +                    result += char; +                    continue; +                } +                part += String.fromCodePoint(c); +            } + +            if (part.length > 0) { +                result += this._convertAlphabeticPartToKana(part, sourceMap, result.length); +            } +            return result; +        } + +        // Furigana distribution + +        distributeFurigana(expression, reading) { +            const fallback = [{furigana: reading, text: expression}]; +            if (!reading) { +                return fallback; +            } + +            let isAmbiguous = false; +            const segmentize = (reading2, groups) => { +                if (groups.length === 0 || isAmbiguous) { +                    return []; +                } + +                const group = groups[0]; +                if (group.mode === 'kana') { +                    if (this.convertKatakanaToHiragana(reading2).startsWith(this.convertKatakanaToHiragana(group.text))) { +                        const readingLeft = reading2.substring(group.text.length); +                        const segs = segmentize(readingLeft, groups.splice(1)); +                        if (segs) { +                            return [{text: group.text, furigana: ''}].concat(segs); +                        } +                    } +                } else { +                    let foundSegments = null; +                    for (let i = reading2.length; i >= group.text.length; --i) { +                        const readingUsed = reading2.substring(0, i); +                        const readingLeft = reading2.substring(i); +                        const segs = segmentize(readingLeft, groups.slice(1)); +                        if (segs) { +                            if (foundSegments !== null) { +                                // more than one way to segmentize the tail, mark as ambiguous +                                isAmbiguous = true; +                                return null; +                            } +                            foundSegments = [{text: group.text, furigana: readingUsed}].concat(segs); +                        } +                        // there is only one way to segmentize the last non-kana group +                        if (groups.length === 1) { +                            break; +                        } +                    } +                    return foundSegments; +                } +            }; + +            const groups = []; +            let modePrev = null; +            for (const c of expression) { +                const codePoint = c.codePointAt(0); +                const modeCurr = this.isCodePointKanji(codePoint) || codePoint === ITERATION_MARK_CODE_POINT ? 'kanji' : 'kana'; +                if (modeCurr === modePrev) { +                    groups[groups.length - 1].text += c; +                } else { +                    groups.push({mode: modeCurr, text: c}); +                    modePrev = modeCurr; +                } +            } + +            const segments = segmentize(reading, groups); +            if (segments && !isAmbiguous) { +                return segments; +            } +            return fallback; +        } + +        distributeFuriganaInflected(expression, reading, source) { +            const output = []; + +            let stemLength = 0; +            const shortest = Math.min(source.length, expression.length); +            const sourceHiragana = this.convertKatakanaToHiragana(source); +            const expressionHiragana = this.convertKatakanaToHiragana(expression); +            while (stemLength < shortest && sourceHiragana[stemLength] === expressionHiragana[stemLength]) { +                ++stemLength; +            } +            const offset = source.length - stemLength; + +            const stemExpression = source.substring(0, source.length - offset); +            const stemReading = reading.substring( +                0, +                offset === 0 ? reading.length : reading.length - expression.length + stemLength +            ); +            for (const segment of this.distributeFurigana(stemExpression, stemReading)) { +                output.push(segment); +            } + +            if (stemLength !== source.length) { +                output.push({text: source.substring(stemLength), furigana: ''}); +            } + +            return output; +        } + +        // Miscellaneous + +        collapseEmphaticSequences(text, fullCollapse, sourceMap=null) { +            let result = ''; +            let collapseCodePoint = -1; +            const hasSourceMap = (sourceMap !== null); +            for (const char of text) { +                const c = char.codePointAt(0); +                if ( +                    c === HIRAGANA_SMALL_TSU_CODE_POINT || +                    c === KATAKANA_SMALL_TSU_CODE_POINT || +                    c === KANA_PROLONGED_SOUND_MARK_CODE_POINT +                ) { +                    if (collapseCodePoint !== c) { +                        collapseCodePoint = c; +                        if (!fullCollapse) { +                            result += char; +                            continue; +                        } +                    } +                } else { +                    collapseCodePoint = -1; +                    result += char; +                    continue; +                } + +                if (hasSourceMap) { +                    sourceMap.combine(Math.max(0, result.length - 1), 1); +                } +            } +            return result; +        } + +        // Private + +        _getWanakana() { +            const wanakana = this._wanakana; +            if (wanakana === null) { throw new Error('Functions which use WanaKana are not supported in this context'); } +            return wanakana; +        } + +        _convertAlphabeticPartToKana(text, sourceMap, sourceMapStart) { +            const wanakana = this._getWanakana(); +            const result = wanakana.toHiragana(text); + +            // Generate source mapping +            if (sourceMap !== null) { +                let i = 0; +                let resultPos = 0; +                const ii = text.length; +                while (i < ii) { +                    // Find smallest matching substring +                    let iNext = i + 1; +                    let resultPosNext = result.length; +                    while (iNext < ii) { +                        const t = wanakana.toHiragana(text.substring(0, iNext)); +                        if (t === result.substring(0, t.length)) { +                            resultPosNext = t.length; +                            break; +                        } +                        ++iNext; +                    } + +                    // Merge characters +                    const removals = iNext - i - 1; +                    if (removals > 0) { +                        sourceMap.combine(sourceMapStart, removals); +                    } +                    ++sourceMapStart; + +                    // Empty elements +                    const additions = resultPosNext - resultPos - 1; +                    for (let j = 0; j < additions; ++j) { +                        sourceMap.insert(sourceMapStart, 0); +                        ++sourceMapStart; +                    } + +                    i = iNext; +                    resultPos = resultPosNext; +                } +            } + +            return result; +        } +    } -    // Exports -    return { -        isCodePointKanji, -        isCodePointKana, -        isCodePointJapanese, -        isStringEntirelyKana, -        isStringPartiallyJapanese, -        isMoraPitchHigh, -        getKanaMorae -    }; +    return new JapaneseUtil(getWanakana());  })(); diff --git a/test/test-japanese.js b/test/test-japanese.js index 321861d5..39004128 100644 --- a/test/test-japanese.js +++ b/test/test-japanese.js @@ -22,8 +22,7 @@ const vm = new VM();  vm.execute([      'mixed/lib/wanakana.min.js',      'mixed/js/japanese.js', -    'bg/js/text-source-map.js', -    'bg/js/japanese.js' +    'bg/js/text-source-map.js'  ]);  const jp = vm.get('jp');  const TextSourceMap = vm.get('TextSourceMap'); |