about | summary | refs | log | tree | commit | diff
path: root/ext/js/dom/dom-text-scanner.js
diff options
context:
space:
mode:
author toasted-nutbread <toasted-nutbread@users.noreply.github.com> 2022-08-20 11:32:20 -0400
committer GitHub <noreply@github.com> 2022-08-20 11:32:20 -0400
commit c5c5308ff2addbce1f1d55dd5a8a91bbed610ee1 (patch)
tree 08f484dfc9bab9666554a9d9a9b650507d6a102a /ext/js/dom/dom-text-scanner.js
parent 5c267f4bb772bb6c67576f2b40234a356c040550 (diff)
Update DOMTextScanner to support UTF-16 surrogate pairs (#2213)
Diffstat (limited to 'ext/js/dom/dom-text-scanner.js')
-rw-r--r-- ext/js/dom/dom-text-scanner.js 54
1 files changed, 46 insertions, 8 deletions
diff --git a/ext/js/dom/dom-text-scanner.js b/ext/js/dom/dom-text-scanner.js
index 83b16028..7bed94a9 100644
--- a/ext/js/dom/dom-text-scanner.js
+++ b/ext/js/dom/dom-text-scanner.js
@@ -146,6 +146,44 @@ class DOMTextScanner {
// Private
/**
+ * Reads the code point that starts at a position in a string, keeping a UTF-16 surrogate pair together.
+ * @param {string} text The text to read the code point from.
+ * @param {number} position The index of the first code unit to read; must be a valid index into the text.
+ * @returns {string} One code unit, or two code units when a high surrogate is followed by a low surrogate.
+ */
+ _readCodePointForward(text, position) {
+ let char = text[position];
+ const charCode = char.charCodeAt(0);
+ if (charCode >= 0xd800 && charCode < 0xdc00 && ++position < text.length) { // High surrogate with a code unit after it
+ const char2 = text[position];
+ const charCode2 = char2.charCodeAt(0);
+ if (charCode2 >= 0xdc00 && charCode2 < 0xe000) { // Low surrogate completes the pair
+ char += char2;
+ }
+ }
+ return char; // An unpaired surrogate is returned as a single code unit
+ }
+
+ /**
+ * Reads the code point that ends at a position in a string, keeping a UTF-16 surrogate pair together.
+ * @param {string} text The text to read the code point from.
+ * @param {number} position The index of the last code unit of the code point; must be a valid index into the text.
+ * @returns {string} One code unit, or two code units when a low surrogate is preceded by a high surrogate.
+ */
+ _readCodePointBackward(text, position) {
+ let char = text[position];
+ const charCode = char.charCodeAt(0);
+ if (charCode >= 0xdc00 && charCode < 0xe000 && position > 0) { // Low surrogate with a code unit before it
+ const char2 = text[position - 1];
+ const charCode2 = char2.charCodeAt(0);
+ if (charCode2 >= 0xd800 && charCode2 < 0xdc00) { // High surrogate completes the pair
+ char = char2 + char;
+ }
+ }
+ return char; // An unpaired surrogate is returned as a single code unit
+ }
+
+ /**
* Seeks forward in a text node.
* @param {Text} textNode The text node to use.
* @param {boolean} resetOffset Whether or not the text offset should be reset.
@@ -164,9 +202,9 @@ class DOMTextScanner {
let newlines = this._newlines;
while (offset < nodeValueLength) {
- const char = nodeValue[offset];
+ const char = this._readCodePointForward(nodeValue, offset);
+ offset += char.length;
const charAttributes = DOMTextScanner.getCharacterAttributes(char, preserveNewlines, preserveWhitespace);
- ++offset;
if (charAttributes === 0) {
// Character should be ignored
@@ -188,7 +226,7 @@ class DOMTextScanner {
lineHasContent = false;
lineHasWhitespace = false;
if (remainder <= 0) {
- --offset; // Revert character offset
+ offset -= char.length; // Revert character offset
break;
}
}
@@ -200,7 +238,7 @@ class DOMTextScanner {
content += ' ';
lineHasWhitespace = false;
if (--remainder <= 0) {
- --offset; // Revert character offset
+ offset -= char.length; // Revert character offset
break;
}
} else {
@@ -250,8 +288,8 @@ class DOMTextScanner {
let newlines = this._newlines;
while (offset > 0) {
- --offset;
- const char = nodeValue[offset];
+ const char = this._readCodePointBackward(nodeValue, offset - 1);
+ offset -= char.length;
const charAttributes = DOMTextScanner.getCharacterAttributes(char, preserveNewlines, preserveWhitespace);
if (charAttributes === 0) {
@@ -274,7 +312,7 @@ class DOMTextScanner {
lineHasContent = false;
lineHasWhitespace = false;
if (remainder <= 0) {
- ++offset; // Revert character offset
+ offset += char.length; // Revert character offset
break;
}
}
@@ -286,7 +324,7 @@ class DOMTextScanner {
content = ' ' + content;
lineHasWhitespace = false;
if (--remainder <= 0) {
- ++offset; // Revert character offset
+ offset += char.length; // Revert character offset
break;
}
} else {