/* * Copyright (C) 2023 Yomitan Authors * Copyright (C) 2020-2022 Yomichan Authors * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see <https://www.gnu.org/licenses/>. */ import {StringUtil} from '../data/sandbox/string-util.js'; /** * A class used to scan text in a document. */ export class DOMTextScanner { /** * Creates a new instance of a DOMTextScanner. * @param {Node} node The DOM Node to start at. * @param {number} offset The character offset in to start at when node is a text node. * Use 0 for non-text nodes. * @param {boolean} forcePreserveWhitespace Whether or not whitespace should be forced to be preserved, * regardless of CSS styling. * @param {boolean} generateLayoutContent Whether or not newlines should be added based on CSS styling. */ constructor(node, offset, forcePreserveWhitespace=false, generateLayoutContent=true) { const ruby = DOMTextScanner.getParentRubyElement(node); const resetOffset = (ruby !== null); if (resetOffset) { node = ruby; } this._node = node; this._offset = offset; this._content = ''; this._remainder = 0; this._resetOffset = resetOffset; this._newlines = 0; this._lineHasWhitespace = false; this._lineHasContent = false; this._forcePreserveWhitespace = forcePreserveWhitespace; this._generateLayoutContent = generateLayoutContent; } /** * Gets the current node being scanned. * @type {Node} */ get node() { return this._node; } /** * Gets the current offset corresponding to the node being scanned. * This value is only applicable for text nodes. * @type {number} */ get offset() { return this._offset; } /** * Gets the remaining number of characters that weren't scanned in the last seek() call. * This value is usually 0 unless the end of the document was reached. * @type {number} */ get remainder() { return this._remainder; } /** * Gets the accumulated content string resulting from calls to seek(). * @type {string} */ get content() { return this._content; } /** * Seeks a given length in the document and accumulates the text content. * @param {number} length A positive or negative integer corresponding to how many characters * should be added to content. Content is only added to the accumulation string, * never removed, so mixing seek calls with differently signed length values * may give unexpected results. * @returns {DOMTextScanner} this */ seek(length) { const forward = (length >= 0); this._remainder = (forward ? length : -length); if (length === 0) { return this; } const TEXT_NODE = Node.TEXT_NODE; const ELEMENT_NODE = Node.ELEMENT_NODE; const generateLayoutContent = this._generateLayoutContent; let node = this._node; let lastNode = node; let resetOffset = this._resetOffset; let newlines = 0; while (node !== null) { let enterable = false; const nodeType = node.nodeType; if (nodeType === TEXT_NODE) { lastNode = node; if (!( forward ? this._seekTextNodeForward(node, resetOffset) : this._seekTextNodeBackward(node, resetOffset) )) { // Length reached break; } } else if (nodeType === ELEMENT_NODE) { lastNode = node; this._offset = 0; ({enterable, newlines} = DOMTextScanner.getElementSeekInfo(node)); if (newlines > this._newlines && generateLayoutContent) { this._newlines = newlines; } } const exitedNodes = []; node = DOMTextScanner.getNextNode(node, forward, enterable, exitedNodes); for (const exitedNode of exitedNodes) { if (exitedNode.nodeType !== ELEMENT_NODE) { continue; } ({newlines} = DOMTextScanner.getElementSeekInfo(exitedNode)); if (newlines > this._newlines && generateLayoutContent) { this._newlines = newlines; } } resetOffset = true; } this._node = lastNode; this._resetOffset = resetOffset; return this; } // Private /** * Seeks forward in a text node. * @param {Text} textNode The text node to use. * @param {boolean} resetOffset Whether or not the text offset should be reset. * @returns {boolean} `true` if scanning should continue, or `false` if the scan length has been reached. */ _seekTextNodeForward(textNode, resetOffset) { const nodeValue = textNode.nodeValue; const nodeValueLength = nodeValue.length; const {preserveNewlines, preserveWhitespace} = this._getWhitespaceSettings(textNode); let lineHasWhitespace = this._lineHasWhitespace; let lineHasContent = this._lineHasContent; let content = this._content; let offset = resetOffset ? 0 : this._offset; let remainder = this._remainder; let newlines = this._newlines; while (offset < nodeValueLength) { const char = StringUtil.readCodePointsForward(nodeValue, offset, 1); offset += char.length; const charAttributes = DOMTextScanner.getCharacterAttributes(char, preserveNewlines, preserveWhitespace); if (charAttributes === 0) { // Character should be ignored continue; } else if (charAttributes === 1) { // Character is collapsible whitespace lineHasWhitespace = true; } else { // Character should be added to the content if (newlines > 0) { if (content.length > 0) { const useNewlineCount = Math.min(remainder, newlines); content += '\n'.repeat(useNewlineCount); remainder -= useNewlineCount; newlines -= useNewlineCount; } else { newlines = 0; } lineHasContent = false; lineHasWhitespace = false; if (remainder <= 0) { offset -= char.length; // Revert character offset break; } } lineHasContent = (charAttributes === 2); // 3 = character is a newline if (lineHasWhitespace) { if (lineHasContent) { content += ' '; lineHasWhitespace = false; if (--remainder <= 0) { offset -= char.length; // Revert character offset break; } } else { lineHasWhitespace = false; } } content += char; if (--remainder <= 0) { break; } } } this._lineHasWhitespace = lineHasWhitespace; this._lineHasContent = lineHasContent; this._content = content; this._offset = offset; this._remainder = remainder; this._newlines = newlines; return (remainder > 0); } /** * Seeks backward in a text node. * This function is nearly the same as _seekTextNodeForward, with the following differences: * - Iteration condition is reversed to check if offset is greater than 0. * - offset is reset to nodeValueLength instead of 0. * - offset is decremented instead of incremented. * - offset is decremented before getting the character. * - offset is reverted by incrementing instead of decrementing. * - content string is prepended instead of appended. * @param {Text} textNode The text node to use. * @param {boolean} resetOffset Whether or not the text offset should be reset. * @returns {boolean} `true` if scanning should continue, or `false` if the scan length has been reached. */ _seekTextNodeBackward(textNode, resetOffset) { const nodeValue = textNode.nodeValue; const nodeValueLength = nodeValue.length; const {preserveNewlines, preserveWhitespace} = this._getWhitespaceSettings(textNode); let lineHasWhitespace = this._lineHasWhitespace; let lineHasContent = this._lineHasContent; let content = this._content; let offset = resetOffset ? nodeValueLength : this._offset; let remainder = this._remainder; let newlines = this._newlines; while (offset > 0) { const char = StringUtil.readCodePointsBackward(nodeValue, offset - 1, 1); offset -= char.length; const charAttributes = DOMTextScanner.getCharacterAttributes(char, preserveNewlines, preserveWhitespace); if (charAttributes === 0) { // Character should be ignored continue; } else if (charAttributes === 1) { // Character is collapsible whitespace lineHasWhitespace = true; } else { // Character should be added to the content if (newlines > 0) { if (content.length > 0) { const useNewlineCount = Math.min(remainder, newlines); content = '\n'.repeat(useNewlineCount) + content; remainder -= useNewlineCount; newlines -= useNewlineCount; } else { newlines = 0; } lineHasContent = false; lineHasWhitespace = false; if (remainder <= 0) { offset += char.length; // Revert character offset break; } } lineHasContent = (charAttributes === 2); // 3 = character is a newline if (lineHasWhitespace) { if (lineHasContent) { content = ' ' + content; lineHasWhitespace = false; if (--remainder <= 0) { offset += char.length; // Revert character offset break; } } else { lineHasWhitespace = false; } } content = char + content; if (--remainder <= 0) { break; } } } this._lineHasWhitespace = lineHasWhitespace; this._lineHasContent = lineHasContent; this._content = content; this._offset = offset; this._remainder = remainder; this._newlines = newlines; return (remainder > 0); } /** * Gets information about how whitespace characters are treated. * @param {Text} textNode The text node to check. * @returns {{preserveNewlines: boolean, preserveWhitespace: boolean}} Information about the whitespace. * The value of `preserveNewlines` indicates whether or not newline characters are treated as line breaks. * The value of `preserveWhitespace` indicates whether or not sequences of whitespace characters are collapsed. */ _getWhitespaceSettings(textNode) { if (this._forcePreserveWhitespace) { return {preserveNewlines: true, preserveWhitespace: true}; } const element = DOMTextScanner.getParentElement(textNode); if (element !== null) { const style = window.getComputedStyle(element); switch (style.whiteSpace) { case 'pre': case 'pre-wrap': case 'break-spaces': return {preserveNewlines: true, preserveWhitespace: true}; case 'pre-line': return {preserveNewlines: true, preserveWhitespace: false}; } } return {preserveNewlines: false, preserveWhitespace: false}; } // Static helpers /** * Gets the next node in the document for a specified scanning direction. * @param {Node} node The current DOM Node. * @param {boolean} forward Whether to scan forward in the document or backward. * @param {boolean} visitChildren Whether the children of the current node should be visited. * @param {Node[]} exitedNodes An array which stores nodes which were exited. * @returns {?Node} The next node in the document, or `null` if there is no next node. */ static getNextNode(node, forward, visitChildren, exitedNodes) { let next = visitChildren ? (forward ? node.firstChild : node.lastChild) : null; if (next === null) { while (true) { exitedNodes.push(node); next = (forward ? node.nextSibling : node.previousSibling); if (next !== null) { break; } next = node.parentNode; if (next === null) { break; } node = next; } } return next; } /** * Gets the parent element of a given Node. * @param {Node} node The node to check. * @returns {?Node} The parent element if one exists, otherwise `null`. */ static getParentElement(node) { while (node !== null && node.nodeType !== Node.ELEMENT_NODE) { node = node.parentNode; } return node; } /** * Gets the parent <ruby> element of a given node, if one exists. For efficiency purposes, * this only checks the immediate parent elements and does not check all ancestors, so * there are cases where the node may be in a ruby element but it is not returned. * @param {Node} node The node to check. * @returns {?HTMLElement} A <ruby> node if the input node is contained in one, otherwise `null`. */ static getParentRubyElement(node) { node = DOMTextScanner.getParentElement(node); if (node !== null && node.nodeName.toUpperCase() === 'RT') { node = node.parentNode; if (node !== null && node.nodeName.toUpperCase() === 'RUBY') { return node; } } return null; } /** * Gets seek information about an element. * @param {Element} element The element to check. * @returns {{enterable: boolean, newlines: number}} The seek information. * The `enterable` value indicates whether the content of this node should be entered. * The `newlines` value corresponds to the number of newline characters that should be added. * - 1 newline corresponds to a simple new line in the layout. * - 2 newlines corresponds to a significant visual distinction since the previous content. */ static getElementSeekInfo(element) { let enterable = true; switch (element.nodeName.toUpperCase()) { case 'HEAD': case 'RT': case 'SCRIPT': case 'STYLE': return {enterable: false, newlines: 0}; case 'BR': return {enterable: false, newlines: 1}; case 'TEXTAREA': case 'INPUT': case 'BUTTON': enterable = false; break; } const style = window.getComputedStyle(element); const display = style.display; const visible = (display !== 'none' && DOMTextScanner.isStyleVisible(style)); let newlines = 0; if (!visible) { enterable = false; } else { switch (style.position) { case 'absolute': case 'fixed': case 'sticky': newlines = 2; break; } if (newlines === 0 && DOMTextScanner.doesCSSDisplayChangeLayout(display)) { newlines = 1; } } return {enterable, newlines}; } /** * Gets attributes for the specified character. * @param {string} character A string containing a single character. * @param {boolean} preserveNewlines Whether or not newlines should be preserved. * @param {boolean} preserveWhitespace Whether or not whitespace should be preserved. * @returns {number} An integer representing the attributes of the character. * 0: Character should be ignored. * 1: Character is collapsible whitespace. * 2: Character should be added to the content. * 3: Character should be added to the content and is a newline. */ static getCharacterAttributes(character, preserveNewlines, preserveWhitespace) { switch (character.charCodeAt(0)) { case 0x09: // Tab ('\t') case 0x0c: // Form feed ('\f') case 0x0d: // Carriage return ('\r') case 0x20: // Space (' ') return preserveWhitespace ? 2 : 1; case 0x0a: // Line feed ('\n') return preserveNewlines ? 3 : 1; case 0x200b: // Zero-width space case 0x200c: // Zero-width non-joiner return 0; default: // Other return 2; } } /** * Checks whether a given style is visible or not. * This function does not check `style.display === 'none'`. * @param {CSSStyleDeclaration} style An object implementing the CSSStyleDeclaration interface. * @returns {boolean} `true` if the style should result in an element being visible, otherwise `false`. */ static isStyleVisible(style) { return !( style.visibility === 'hidden' || parseFloat(style.opacity) <= 0 || parseFloat(style.fontSize) <= 0 || ( !DOMTextScanner.isStyleSelectable(style) && ( DOMTextScanner.isCSSColorTransparent(style.color) || DOMTextScanner.isCSSColorTransparent(style.webkitTextFillColor) ) ) ); } /** * Checks whether a given style is selectable or not. * @param {CSSStyleDeclaration} style An object implementing the CSSStyleDeclaration interface. * @returns {boolean} `true` if the style is selectable, otherwise `false`. */ static isStyleSelectable(style) { return !( style.userSelect === 'none' || style.webkitUserSelect === 'none' || style.MozUserSelect === 'none' || style.msUserSelect === 'none' ); } /** * Checks whether a CSS color is transparent or not. * @param {string} cssColor A CSS color string, expected to be encoded in rgb(a) form. * @returns {false} `true` if the color is transparent, otherwise `false`. */ static isCSSColorTransparent(cssColor) { return ( typeof cssColor === 'string' && cssColor.startsWith('rgba(') && /,\s*0.?0*\)$/.test(cssColor) ); } /** * Checks whether a CSS display value will cause a layout change for text. * @param {string} cssDisplay A CSS string corresponding to the value of the display property. * @returns {boolean} `true` if the layout is changed by this value, otherwise `false`. */ static doesCSSDisplayChangeLayout(cssDisplay) { let pos = cssDisplay.indexOf(' '); if (pos >= 0) { // Truncate to <display-outside> part cssDisplay = cssDisplay.substring(0, pos); } pos = cssDisplay.indexOf('-'); if (pos >= 0) { // Truncate to first part of kebab-case value cssDisplay = cssDisplay.substring(0, pos); } switch (cssDisplay) { case 'block': case 'flex': case 'grid': case 'list': // list-item case 'table': // table, table-* return true; case 'ruby': // ruby-* return (pos >= 0); default: return false; } } }