diff options
| author | toasted-nutbread <toasted-nutbread@users.noreply.github.com> | 2022-08-20 11:32:20 -0400 | 
|---|---|---|
| committer | GitHub <noreply@github.com> | 2022-08-20 11:32:20 -0400 | 
| commit | c5c5308ff2addbce1f1d55dd5a8a91bbed610ee1 (patch) | |
| tree | 08f484dfc9bab9666554a9d9a9b650507d6a102a | |
| parent | 5c267f4bb772bb6c67576f2b40234a356c040550 (diff) | |
Update DOMTextScanner to support UTF-16 surrogate pairs (#2213)
| -rw-r--r-- | ext/js/dom/dom-text-scanner.js | 54 | 
1 file changed, 46 insertions, 8 deletions
| diff --git a/ext/js/dom/dom-text-scanner.js b/ext/js/dom/dom-text-scanner.js index 83b16028..7bed94a9 100644 --- a/ext/js/dom/dom-text-scanner.js +++ b/ext/js/dom/dom-text-scanner.js @@ -146,6 +146,44 @@ class DOMTextScanner {      // Private      /** +     * Reads a code point in a string in the forward direction. +     * @param {string} text The text to read the code point from. +     * @param {number} position The index of the first character to read. +     * @returns {string} The code point from the string. +     */ +    _readCodePointForward(text, position) { +        let char = text[position]; +        const charCode = char.charCodeAt(0); +        if (charCode >= 0xd800 && charCode < 0xdc00 && ++position < text.length) { +            const char2 = text[position]; +            const charCode2 = char2.charCodeAt(0); +            if (charCode2 >= 0xdc00 && charCode2 < 0xe000) { +                char += char2; +            } +        } +        return char; +    } + +    /** +     * Reads a code point in a string in the backward direction. +     * @param {string} text The text to read the code point from. +     * @param {number} position The index of the first character to read. +     * @returns {string} The code point from the string. +     */ +    _readCodePointBackward(text, position) { +        let char = text[position]; +        const charCode = char.charCodeAt(0); +        if (charCode >= 0xdc00 && charCode < 0xe000 && position > 0) { +            const char2 = text[position - 1]; +            const charCode2 = char2.charCodeAt(0); +            if (charCode2 >= 0xd800 && charCode2 < 0xdc00) { +                char = char2 + char; +            } +        } +        return char; +    } + +    /**       * Seeks forward in a text node.       * @param {Text} textNode The text node to use.       * @param {boolean} resetOffset Whether or not the text offset should be reset. 
@@ -164,9 +202,9 @@ class DOMTextScanner {          let newlines = this._newlines;          while (offset < nodeValueLength) { -            const char = nodeValue[offset]; +            const char = this._readCodePointForward(nodeValue, offset); +            offset += char.length;              const charAttributes = DOMTextScanner.getCharacterAttributes(char, preserveNewlines, preserveWhitespace); -            ++offset;              if (charAttributes === 0) {                  // Character should be ignored @@ -188,7 +226,7 @@ class DOMTextScanner {                      lineHasContent = false;                      lineHasWhitespace = false;                      if (remainder <= 0) { -                        --offset; // Revert character offset +                        offset -= char.length; // Revert character offset                          break;                      }                  } @@ -200,7 +238,7 @@ class DOMTextScanner {                          content += ' ';                          lineHasWhitespace = false;                          if (--remainder <= 0) { -                            --offset; // Revert character offset +                            offset -= char.length; // Revert character offset                              break;                          }                      } else { @@ -250,8 +288,8 @@ class DOMTextScanner {          let newlines = this._newlines;          while (offset > 0) { -            --offset; -            const char = nodeValue[offset]; +            const char = this._readCodePointBackward(nodeValue, offset - 1); +            offset -= char.length;              const charAttributes = DOMTextScanner.getCharacterAttributes(char, preserveNewlines, preserveWhitespace);              if (charAttributes === 0) { @@ -274,7 +312,7 @@ class DOMTextScanner {                      lineHasContent = false;                      lineHasWhitespace = false;                      if (remainder <= 0) { -                        ++offset; // 
Revert character offset +                        offset += char.length; // Revert character offset                          break;                      }                  } @@ -286,7 +324,7 @@ class DOMTextScanner {                          content = ' ' + content;                          lineHasWhitespace = false;                          if (--remainder <= 0) { -                            ++offset; // Revert character offset +                            offset += char.length; // Revert character offset                              break;                          }                      } else { |