diff options
author | Alex Yatskov <alex@foosoft.net> | 2017-08-26 11:57:34 -0700 |
---|---|---|
committer | Alex Yatskov <alex@foosoft.net> | 2017-08-26 11:57:34 -0700 |
commit | 190c749527c5c5f8afec7ead6956a5f3d7c1a422 (patch) | |
tree | 2945b2c95657e71b1e99212d9646e6bd30961ab3 /ext/mixed/js | |
parent | 49f0243527e504aa3fe196bc6dc759b6948f8a0b (diff) |
improved furigana support
Diffstat (limited to 'ext/mixed/js')
-rw-r--r-- | ext/mixed/js/japanese.js | 112 |
1 files changed, 41 insertions, 71 deletions
/**
 * Splits an expression into consecutive segments and assigns each kanji run
 * the part of the reading that belongs to it.
 *
 * The expression is first partitioned into alternating runs ("groups") of
 * kanji and non-kanji characters. The iteration mark U+3005 is treated as
 * kanji; every other character that jpIsKanji() rejects is treated as kana.
 * The reading is then matched against the groups with backtracking:
 *   - a kana run must appear literally at the current position of the
 *     reading (no katakana/hiragana normalization is applied here);
 *   - a kanji run takes the longest prefix of the remaining reading that
 *     still allows the rest of the expression to match, and never fewer
 *     characters than the run itself contains.
 *
 * @param {string} expression - The written form, e.g. a word containing kanji.
 * @param {string} reading - The kana reading of the whole expression.
 * @returns {Array<{text: string, furigana?: string}>} Segments in expression
 *     order. Kana segments carry only `text`; kanji segments also carry
 *     `furigana`. If the reading is empty/missing or cannot be distributed,
 *     a single segment covering the whole expression with the whole reading
 *     as furigana is returned.
 */
function jpDistributeFurigana(expression, reading) {
    const fallback = [{furigana: reading, text: expression}];
    if (!reading) {
        return fallback;
    }

    // Recursively matches `readingLeft` against `groups`.
    // Returns the list of segments on success, or null when no split works.
    const segmentize = (readingLeft, groups) => {
        if (groups.length === 0) {
            // Only a full match counts: leftover reading characters mean the
            // split chosen so far silently dropped part of the reading.
            return readingLeft.length === 0 ? [] : null;
        }

        const group = groups[0];
        // slice() (not splice()) so the caller's array is never mutated while
        // it may still be needed for backtracking.
        const groupsLeft = groups.slice(1);

        if (group.mode === 'kana') {
            if (!readingLeft.startsWith(group.text)) {
                return null;
            }

            const segs = segmentize(readingLeft.substring(group.text.length), groupsLeft);
            return segs ? [{text: group.text}].concat(segs) : null;
        }

        // Kanji run: try the longest candidate reading first and shrink it
        // until the remainder of the expression can be matched. Each kanji
        // is assumed to consume at least one reading character.
        for (let i = readingLeft.length; i >= group.text.length; --i) {
            const segs = segmentize(readingLeft.substring(i), groupsLeft);
            if (segs) {
                return [{text: group.text, furigana: readingLeft.substring(0, i)}].concat(segs);
            }
        }

        return null;
    };

    // Partition the expression into alternating kanji / kana runs.
    const groups = [];
    let modePrev = null;
    for (const c of expression) {
        const isKanji = jpIsKanji(c) || c.charCodeAt(0) === 0x3005; /* noma */
        const modeCurr = isKanji ? 'kanji' : 'kana';
        if (modeCurr === modePrev) {
            groups[groups.length - 1].text += c;
        } else {
            groups.push({mode: modeCurr, text: c});
            modePrev = modeCurr;
        }
    }

    if (groups.length === 0) {
        // An empty expression has nothing to annotate; keep returning an
        // empty segment list as before.
        return [];
    }

    return segmentize(reading, groups) || fallback;
}