DOMTextScanner (#458)

* Create new class for scanning text in a document * Update test styles * Add tests
author: toasted-nutbread <toasted-nutbread@users.noreply.github.com> 2020-05-02 13:05:43 -0400
committer: GitHub <noreply@github.com> 2020-05-02 13:05:43 -0400
commit: d4ae9aa501ece99ea6c5e6b8fb01c3005f5b7f03 (patch)
tree: f96211038ffac0be88da912cb40bd3980c212c18 /ext/fg/js/dom-text-scanner.js
parent: d581bffa15419b3b55773f1ed08a2e787e574f1f (diff)
1 files changed, 538 insertions, 0 deletions
diff --git a/ext/fg/js/dom-text-scanner.js b/ext/fg/js/dom-text-scanner.js
new file mode 100644
index 00000000..2de65041
--- /dev/null
+++ b/ext/fg/js/dom-text-scanner.js
@@ -0,0 +1,538 @@
+/*
+ * Copyright (C) 2020  Yomichan Authors
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+/**
+ * A class used to scan text in a document.
+ * Text is accumulated by walking DOM nodes forward or backward from a starting position,
+ * collapsing whitespace according to the computed white-space style (unless overridden)
+ * and optionally inserting newlines for elements which change the layout.
+ */
+class DOMTextScanner {
+    /**
+     * Creates a new instance of a DOMTextScanner.
+     * If the node's closest element is an <rt> directly inside a <ruby>, scanning starts at the
+     * <ruby> element instead, and the offset is reset when the first text node is reached.
+     * @param node The DOM Node to start at.
+     * @param offset The character offset to start at when node is a text node.
+     *   Use 0 for non-text nodes.
+     * @param forcePreserveWhitespace When true, all text nodes are scanned as if both newlines
+     *   and whitespace are preserved, and the computed white-space style is not consulted.
+     * @param generateLayoutContent When true, newlines are added to the content for elements
+     *   which change the layout; when false, elements never contribute newlines.
+     */
+    constructor(node, offset, forcePreserveWhitespace=false, generateLayoutContent=true) {
+        const ruby = DOMTextScanner.getParentRubyElement(node);
+        const resetOffset = (ruby !== null);
+        if (resetOffset) { node = ruby; }
+
+        this._node = node;
+        this._offset = offset;
+        this._content = '';
+        this._remainder = 0;
+        this._resetOffset = resetOffset;
+        this._newlines = 0;
+        this._lineHasWhitespace = false;
+        this._lineHasContent = false;
+        this._forcePreserveWhitespace = forcePreserveWhitespace;
+        this._generateLayoutContent = generateLayoutContent;
+    }
+
+    /**
+     * Gets the current node being scanned.
+     * After a call to seek(), this is the last text or element node which was visited,
+     * even if the end of the document was reached before the requested length.
+     * @returns A DOM Node.
+     */
+    get node() {
+        return this._node;
+    }
+
+    /**
+     * Gets the current offset corresponding to the node being scanned.
+     * This value is only applicable for text nodes; it is 0 when the current node is an
+     * element which was visited by seek().
+     * @returns An integer.
+     */
+    get offset() {
+        return this._offset;
+    }
+
+    /**
+     * Gets the accumulated content string resulting from calls to seek().
+     * @returns A string.
+     */
+    get content() {
+        return this._content;
+    }
+
+    /**
+     * Seeks a given length in the document and accumulates the text content.
+     * Consecutive calls with the same sign continue from where the previous call stopped.
+     * @param length A positive or negative integer corresponding to how many characters
+     *   should be added to content. Content is only added to the accumulation string,
+     *   never removed, so mixing seek calls with differently signed length values
+     *   may give unexpected results.
+     * @returns this
+     */
+    seek(length) {
+        const forward = (length >= 0);
+        this._remainder = (forward ? length : -length);
+        if (length === 0) { return this; }
+
+        const TEXT_NODE = Node.TEXT_NODE;
+        const ELEMENT_NODE = Node.ELEMENT_NODE;
+
+        const generateLayoutContent = this._generateLayoutContent;
+        let node = this._node;
+        // Last text/element node visited; stored instead of null when the document runs out
+        let lastNode = node;
+        let resetOffset = this._resetOffset;
+        let newlines = 0;
+        while (node !== null) {
+            let enterable = false;
+            const nodeType = node.nodeType;
+
+            if (nodeType === TEXT_NODE) {
+                lastNode = node;
+                if (!(
+                    forward ?
+                    this._seekTextNodeForward(node, resetOffset) :
+                    this._seekTextNodeBackward(node, resetOffset)
+                )) {
+                    // Length reached
+                    break;
+                }
+            } else if (nodeType === ELEMENT_NODE) {
+                lastNode = node;
+                // A text offset has no meaning for an element
+                this._offset = 0;
+                [enterable, newlines] = DOMTextScanner.getElementSeekInfo(node);
+                if (newlines > this._newlines && generateLayoutContent) {
+                    this._newlines = newlines;
+                }
+            }
+
+            const exitedNodes = [];
+            node = DOMTextScanner.getNextNode(node, forward, enterable, exitedNodes);
+
+            // Elements which were left may also introduce line breaks
+            for (const exitedNode of exitedNodes) {
+                if (exitedNode.nodeType !== ELEMENT_NODE) { continue; }
+                newlines = DOMTextScanner.getElementSeekInfo(exitedNode)[1];
+                if (newlines > this._newlines && generateLayoutContent) {
+                    this._newlines = newlines;
+                }
+            }
+
+            // Any node reached after the first one is scanned from its start (or end)
+            resetOffset = true;
+        }
+
+        this._node = lastNode;
+        // Whether the length was reached or the document ran out, this._offset now describes
+        // the position inside this._node, so the next seek must continue from it rather than
+        // rescanning the node from its start.
+        this._resetOffset = false;
+
+        return this;
+    }
+
+    // Private
+
+    /**
+     * Seeks forward in a text node.
+     * Pending newlines are only emitted once some content already exists, and are limited
+     * by the remaining length. Collapsed whitespace is emitted as a single space, and only
+     * directly before a non-newline character.
+     * @param textNode The text node to use.
+     * @param resetOffset Whether or not the text offset should be reset.
+     * @returns true if scanning should continue, or false if the scan length has been reached.
+     */
+    _seekTextNodeForward(textNode, resetOffset) {
+        const nodeValue = textNode.nodeValue;
+        const nodeValueLength = nodeValue.length;
+        const [preserveNewlines, preserveWhitespace] = (
+            this._forcePreserveWhitespace ?
+            [true, true] :
+            DOMTextScanner.getWhitespaceSettings(textNode)
+        );
+
+        let lineHasWhitespace = this._lineHasWhitespace;
+        let lineHasContent = this._lineHasContent;
+        let content = this._content;
+        let offset = resetOffset ? 0 : this._offset;
+        let remainder = this._remainder;
+        let newlines = this._newlines;
+
+        while (offset < nodeValueLength) {
+            const char = nodeValue[offset];
+            const charAttributes = DOMTextScanner.getCharacterAttributes(char, preserveNewlines, preserveWhitespace);
+            ++offset;
+
+            if (charAttributes === 0) {
+                // Character should be ignored
+                continue;
+            } else if (charAttributes === 1) {
+                // Character is collapsable whitespace
+                lineHasWhitespace = true;
+            } else {
+                // Character should be added to the content
+                if (newlines > 0) {
+                    if (content.length > 0) {
+                        const useNewlineCount = Math.min(remainder, newlines);
+                        content += '\n'.repeat(useNewlineCount);
+                        remainder -= useNewlineCount;
+                        newlines -= useNewlineCount;
+                    } else {
+                        newlines = 0;
+                    }
+                    lineHasContent = false;
+                    lineHasWhitespace = false;
+                    if (remainder <= 0) {
+                        --offset; // Revert character offset
+                        break;
+                    }
+                }
+
+                lineHasContent = (charAttributes === 2); // 3 = character is a newline
+
+                if (lineHasWhitespace) {
+                    if (lineHasContent) {
+                        content += ' ';
+                        lineHasWhitespace = false;
+                        if (--remainder <= 0) {
+                            --offset; // Revert character offset
+                            break;
+                        }
+                    } else {
+                        lineHasWhitespace = false;
+                    }
+                }
+
+                content += char;
+
+                if (--remainder <= 0) { break; }
+            }
+        }
+
+        this._lineHasWhitespace = lineHasWhitespace;
+        this._lineHasContent = lineHasContent;
+        this._content = content;
+        this._offset = offset;
+        this._remainder = remainder;
+        this._newlines = newlines;
+
+        return (remainder > 0);
+    }
+
+    /**
+     * Seeks backward in a text node.
+     * This function is nearly the same as _seekTextNodeForward, with the following differences:
+     * - Iteration condition is reversed to check if offset is greater than 0.
+     * - offset is reset to nodeValueLength instead of 0.
+     * - offset is decremented instead of incremented.
+     * - offset is decremented before getting the character.
+     * - offset is reverted by incrementing instead of decrementing.
+     * - content string is prepended instead of appended.
+     * @param textNode The text node to use.
+     * @param resetOffset Whether or not the text offset should be reset.
+     * @returns true if scanning should continue, or false if the scan length has been reached.
+     */
+    _seekTextNodeBackward(textNode, resetOffset) {
+        const nodeValue = textNode.nodeValue;
+        const nodeValueLength = nodeValue.length;
+        const [preserveNewlines, preserveWhitespace] = (
+            this._forcePreserveWhitespace ?
+            [true, true] :
+            DOMTextScanner.getWhitespaceSettings(textNode)
+        );
+
+        let lineHasWhitespace = this._lineHasWhitespace;
+        let lineHasContent = this._lineHasContent;
+        let content = this._content;
+        let offset = resetOffset ? nodeValueLength : this._offset;
+        let remainder = this._remainder;
+        let newlines = this._newlines;
+
+        while (offset > 0) {
+            --offset;
+            const char = nodeValue[offset];
+            const charAttributes = DOMTextScanner.getCharacterAttributes(char, preserveNewlines, preserveWhitespace);
+
+            if (charAttributes === 0) {
+                // Character should be ignored
+                continue;
+            } else if (charAttributes === 1) {
+                // Character is collapsable whitespace
+                lineHasWhitespace = true;
+            } else {
+                // Character should be added to the content
+                if (newlines > 0) {
+                    if (content.length > 0) {
+                        const useNewlineCount = Math.min(remainder, newlines);
+                        content = '\n'.repeat(useNewlineCount) + content;
+                        remainder -= useNewlineCount;
+                        newlines -= useNewlineCount;
+                    } else {
+                        newlines = 0;
+                    }
+                    lineHasContent = false;
+                    lineHasWhitespace = false;
+                    if (remainder <= 0) {
+                        ++offset; // Revert character offset
+                        break;
+                    }
+                }
+
+                lineHasContent = (charAttributes === 2); // 3 = character is a newline
+
+                if (lineHasWhitespace) {
+                    if (lineHasContent) {
+                        content = ' ' + content;
+                        lineHasWhitespace = false;
+                        if (--remainder <= 0) {
+                            ++offset; // Revert character offset
+                            break;
+                        }
+                    } else {
+                        lineHasWhitespace = false;
+                    }
+                }
+
+                content = char + content;
+
+                if (--remainder <= 0) { break; }
+            }
+        }
+
+        this._lineHasWhitespace = lineHasWhitespace;
+        this._lineHasContent = lineHasContent;
+        this._content = content;
+        this._offset = offset;
+        this._remainder = remainder;
+        this._newlines = newlines;
+
+        return (remainder > 0);
+    }
+
+    // Static helpers
+
+    /**
+     * Gets the next node in the document for a specified scanning direction.
+     * When the children are not visited (or there are none), the current node and every
+     * ancestor which has no further sibling in the scanning direction are recorded as exited.
+     * @param node The current DOM Node.
+     * @param forward Whether to scan forward in the document or backward.
+     * @param visitChildren Whether the children of the current node should be visited.
+     * @param exitedNodes An array which stores nodes which were exited.
+     * @returns The next node in the document, or null if there is no next node.
+     */
+    static getNextNode(node, forward, visitChildren, exitedNodes) {
+        let next = visitChildren ? (forward ? node.firstChild : node.lastChild) : null;
+        if (next === null) {
+            while (true) {
+                exitedNodes.push(node);
+
+                next = (forward ? node.nextSibling : node.previousSibling);
+                if (next !== null) { break; }
+
+                next = node.parentNode;
+                if (next === null) { break; }
+
+                node = next;
+            }
+        }
+        return next;
+    }
+
+    /**
+     * Gets the closest element for a given Node.
+     * @param node The node to check.
+     * @returns The node itself if it is an element, otherwise its nearest ancestor element
+     *   if one exists, otherwise null.
+     */
+    static getParentElement(node) {
+        while (node !== null && node.nodeType !== Node.ELEMENT_NODE) {
+            node = node.parentNode;
+        }
+        return node;
+    }
+
+    /**
+     * Gets the parent <ruby> element of a given node, if one exists. For efficiency purposes,
+     * this only checks the immediate parent elements and does not check all ancestors, so
+     * there are cases where the node may be in a ruby element but it is not returned.
+     * Only the pattern of an <rt> element directly inside a <ruby> element is detected.
+     * @param node The node to check.
+     * @returns A <ruby> node if the input node is contained in one, otherwise null.
+     */
+    static getParentRubyElement(node) {
+        node = DOMTextScanner.getParentElement(node);
+        if (node !== null && node.nodeName.toUpperCase() === 'RT') {
+            node = node.parentNode;
+            if (node !== null && node.nodeName.toUpperCase() === 'RUBY') {
+                return node;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Gets seeking information about an element, based on its tag name and computed style.
+     * @param element The element to check.
+     * @returns [enterable: boolean, newlines: integer]
+     *   The enterable value indicates whether the content of this node should be entered.
+     *   The newlines value corresponds to the number of newline characters that should be added.
+     *     1 newline corresponds to a simple new line in the layout.
+     *     2 newlines corresponds to a significant visual distinction since the previous content.
+     */
+    static getElementSeekInfo(element) {
+        let enterable = true;
+        switch (element.nodeName.toUpperCase()) {
+            case 'HEAD':
+            case 'RT':
+            case 'SCRIPT':
+            case 'STYLE':
+                return [false, 0];
+            case 'BR':
+                return [false, 1];
+            case 'TEXTAREA':
+            case 'INPUT':
+            case 'BUTTON':
+                // Not entered, but the computed style may still add newlines
+                enterable = false;
+                break;
+        }
+
+        const style = window.getComputedStyle(element);
+        const display = style.display;
+
+        const visible = (display !== 'none' && DOMTextScanner.isStyleVisible(style));
+        let newlines = 0;
+
+        if (!visible) {
+            enterable = false;
+        } else {
+            switch (style.position) {
+                case 'absolute':
+                case 'fixed':
+                case 'sticky':
+                    newlines = 2;
+                    break;
+            }
+            if (newlines === 0 && DOMTextScanner.doesCSSDisplayChangeLayout(display)) {
+                newlines = 1;
+            }
+        }
+
+        return [enterable, newlines];
+    }
+
+    /**
+     * Gets information about how whitespace characters are treated.
+     * The result is based on the computed white-space style of the closest element.
+     * @param textNode The Text node to check.
+     * @returns [preserveNewlines: boolean, preserveWhitespace: boolean]
+     *   The value of preserveNewlines indicates whether or not newline characters are treated as line breaks.
+     *   The value of preserveWhitespace indicates whether or not sequences of whitespace characters are collapsed.
+     */
+    static getWhitespaceSettings(textNode) {
+        const element = DOMTextScanner.getParentElement(textNode);
+        if (element !== null) {
+            const style = window.getComputedStyle(element);
+            switch (style.whiteSpace) {
+                case 'pre':
+                case 'pre-wrap':
+                case 'break-spaces':
+                    return [true, true];
+                case 'pre-line':
+                    return [true, false];
+            }
+        }
+        return [false, false];
+    }
+
+    /**
+     * Gets attributes for the specified character.
+     * @param character A string containing a single character.
+     * @param preserveNewlines Whether line feed characters are kept as newlines
+     *   instead of being treated as collapsable whitespace.
+     * @param preserveWhitespace Whether tab, form feed, carriage return and space characters
+     *   are kept as content instead of being treated as collapsable whitespace.
+     * @returns An integer representing the attributes of the character.
+     *   0: Character should be ignored.
+     *   1: Character is collapsable whitespace.
+     *   2: Character should be added to the content.
+     *   3: Character should be added to the content and is a newline.
+     */
+    static getCharacterAttributes(character, preserveNewlines, preserveWhitespace) {
+        switch (character.charCodeAt(0)) {
+            case 0x09: // Tab ('\t')
+            case 0x0c: // Form feed ('\f')
+            case 0x0d: // Carriage return ('\r')
+            case 0x20: // Space (' ')
+                return preserveWhitespace ? 2 : 1;
+            case 0x0a: // Line feed ('\n')
+                return preserveNewlines ? 3 : 1;
+            case 0x200c: // Zero-width non-joiner ('\u200c')
+                return 0;
+            default: // Other
+                return 2;
+        }
+    }
+
+    /**
+     * Checks whether a given style is visible or not.
+     * This function does not check style.display === 'none'.
+     * Transparent text is only treated as invisible when it is also not selectable.
+     * @param style An object implementing the CSSStyleDeclaration interface.
+     * @returns true if the style should result in an element being visible, otherwise false.
+     */
+    static isStyleVisible(style) {
+        return !(
+            style.visibility === 'hidden' ||
+            parseFloat(style.opacity) <= 0 ||
+            parseFloat(style.fontSize) <= 0 ||
+            (
+                !DOMTextScanner.isStyleSelectable(style) &&
+                (
+                    DOMTextScanner.isCSSColorTransparent(style.color) ||
+                    DOMTextScanner.isCSSColorTransparent(style.webkitTextFillColor)
+                )
+            )
+        );
+    }
+
+    /**
+     * Checks whether a given style is selectable or not.
+     * Both the standard and the vendor-prefixed user-select properties are checked.
+     * @param style An object implementing the CSSStyleDeclaration interface.
+     * @returns true if the style is selectable, otherwise false.
+     */
+    static isStyleSelectable(style) {
+        return !(
+            style.userSelect === 'none' ||
+            style.webkitUserSelect === 'none' ||
+            style.MozUserSelect === 'none' ||
+            style.msUserSelect === 'none'
+        );
+    }
+
+    /**
+     * Checks whether a CSS color is transparent or not.
+     * Only colors in rgba() form whose alpha component is written as zero
+     * (such as 0, 0.0 or 0.00) are treated as transparent.
+     * @param cssColor A CSS color string, expected to be encoded in rgb(a) form.
+     * @returns true if the color is transparent, otherwise false.
+     */
+    static isCSSColorTransparent(cssColor) {
+        return (
+            typeof cssColor === 'string' &&
+            cssColor.startsWith('rgba(') &&
+            // The decimal point is escaped so that values such as "05" are not matched
+            /,\s*0\.?0*\)$/.test(cssColor)
+        );
+    }
+
+    /**
+     * Checks whether a CSS display value will cause a layout change for text.
+     * For multi-keyword values, only the first keyword is considered.
+     * @param cssDisplay A CSS string corresponding to the value of the display property.
+     * @returns true if the layout is changed by this value, otherwise false.
+     */
+    static doesCSSDisplayChangeLayout(cssDisplay) {
+        let pos = cssDisplay.indexOf(' ');
+        if (pos >= 0) {
+            // Truncate to <display-outside> part
+            cssDisplay = cssDisplay.substring(0, pos);
+        }
+
+        pos = cssDisplay.indexOf('-');
+        if (pos >= 0) {
+            // Truncate to first part of kebab-case value
+            cssDisplay = cssDisplay.substring(0, pos);
+        }
+
+        switch (cssDisplay) {
+            case 'block':
+            case 'flex':
+            case 'flow': // flow-root (block-level)
+            case 'grid':
+            case 'list': // list-item
+            case 'table': // table, table-*
+                return true;
+            case 'ruby': // ruby-*
+                return (pos >= 0);
+            default:
+                return false;
+        }
+    }
+}
author	toasted-nutbread <toasted-nutbread@users.noreply.github.com>	2020-05-02 13:05:43 -0400
committer	GitHub <noreply@github.com>	2020-05-02 13:05:43 -0400
commit	d4ae9aa501ece99ea6c5e6b8fb01c3005f5b7f03 (patch)
tree	f96211038ffac0be88da912cb40bd3980c212c18 /ext/fg/js/dom-text-scanner.js
parent	d581bffa15419b3b55773f1ed08a2e787e574f1f (diff)