diff options
author | toasted-nutbread <toasted-nutbread@users.noreply.github.com> | 2022-08-20 11:32:20 -0400 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-08-20 11:32:20 -0400 |
commit | c5c5308ff2addbce1f1d55dd5a8a91bbed610ee1 (patch) | |
tree | 08f484dfc9bab9666554a9d9a9b650507d6a102a | |
parent | 5c267f4bb772bb6c67576f2b40234a356c040550 (diff) |
Update DOMTextScanner to support UTF-16 surrogate pairs (#2213)
-rw-r--r-- | ext/js/dom/dom-text-scanner.js | 54 |
1 files changed, 46 insertions, 8 deletions
diff --git a/ext/js/dom/dom-text-scanner.js b/ext/js/dom/dom-text-scanner.js
index 83b16028..7bed94a9 100644
--- a/ext/js/dom/dom-text-scanner.js
+++ b/ext/js/dom/dom-text-scanner.js
@@ -146,6 +146,44 @@ class DOMTextScanner {
     // Private
 
     /**
+     * Reads a code point in a string in the forward direction.
+     * @param {string} text The text to read the code point from.
+     * @param {number} position The index of the first character to read.
+     * @returns {string} The code point from the string.
+     */
+    _readCodePointForward(text, position) {
+        let char = text[position];
+        const charCode = char.charCodeAt(0);
+        if (charCode >= 0xd800 && charCode < 0xdc00 && ++position < text.length) {
+            const char2 = text[position];
+            const charCode2 = char2.charCodeAt(0);
+            if (charCode2 >= 0xdc00 && charCode2 < 0xe000) {
+                char += char2;
+            }
+        }
+        return char;
+    }
+
+    /**
+     * Reads a code point in a string in the backward direction.
+     * @param {string} text The text to read the code point from.
+     * @param {number} position The index of the first character to read.
+     * @returns {string} The code point from the string.
+     */
+    _readCodePointBackward(text, position) {
+        let char = text[position];
+        const charCode = char.charCodeAt(0);
+        if (charCode >= 0xdc00 && charCode < 0xe000 && position > 0) {
+            const char2 = text[position - 1];
+            const charCode2 = char2.charCodeAt(0);
+            if (charCode2 >= 0xd800 && charCode2 < 0xdc00) {
+                char = char2 + char;
+            }
+        }
+        return char;
+    }
+
+    /**
      * Seeks forward in a text node.
      * @param {Text} textNode The text node to use.
      * @param {boolean} resetOffset Whether or not the text offset should be reset.
@@ -164,9 +202,9 @@ class DOMTextScanner {
         let newlines = this._newlines;
 
         while (offset < nodeValueLength) {
-            const char = nodeValue[offset];
+            const char = this._readCodePointForward(nodeValue, offset);
+            offset += char.length;
             const charAttributes = DOMTextScanner.getCharacterAttributes(char, preserveNewlines, preserveWhitespace);
-            ++offset;
 
             if (charAttributes === 0) {
                 // Character should be ignored
@@ -188,7 +226,7 @@ class DOMTextScanner {
                     lineHasContent = false;
                     lineHasWhitespace = false;
                     if (remainder <= 0) {
-                        --offset; // Revert character offset
+                        offset -= char.length; // Revert character offset
                         break;
                     }
                 }
@@ -200,7 +238,7 @@ class DOMTextScanner {
                         content += ' ';
                         lineHasWhitespace = false;
                         if (--remainder <= 0) {
-                            --offset; // Revert character offset
+                            offset -= char.length; // Revert character offset
                             break;
                         }
                     } else {
@@ -250,8 +288,8 @@ class DOMTextScanner {
         let newlines = this._newlines;
 
         while (offset > 0) {
-            --offset;
-            const char = nodeValue[offset];
+            const char = this._readCodePointBackward(nodeValue, offset - 1);
+            offset -= char.length;
             const charAttributes = DOMTextScanner.getCharacterAttributes(char, preserveNewlines, preserveWhitespace);
 
             if (charAttributes === 0) {
@@ -274,7 +312,7 @@ class DOMTextScanner {
                     lineHasContent = false;
                     lineHasWhitespace = false;
                     if (remainder <= 0) {
-                        ++offset; // Revert character offset
+                        offset += char.length; // Revert character offset
                         break;
                     }
                 }
@@ -286,7 +324,7 @@ class DOMTextScanner {
                         content = ' ' + content;
                         lineHasWhitespace = false;
                         if (--remainder <= 0) {
-                            ++offset; // Revert character offset
+                            offset += char.length; // Revert character offset
                             break;
                         }
                     } else { |