From 521e87d01142063c785054741d3703de37a1636c Mon Sep 17 00:00:00 2001 From: Cashew <52880648+Scrub1492@users.noreply.github.com> Date: Tue, 19 Dec 2023 12:45:14 +0900 Subject: capture all terminators and quotes in the sentence (#360) * capture all terminators and quotes in the sentence * fix negative position, add documents and tests * fix comments giving wrong semantics * add test case coverage * remove cursor namespace for possible performance penalties * while loop optimization --- ext/js/dom/document-util.js | 65 ++++++++++++++++++++++++++++---------- test/data/html/test-document1.html | 32 +++++++++++++++++++ 2 files changed, 81 insertions(+), 16 deletions(-) diff --git a/ext/js/dom/document-util.js b/ext/js/dom/document-util.js index fe10d670..9e4f451a 100644 --- a/ext/js/dom/document-util.js +++ b/ext/js/dom/document-util.js @@ -113,19 +113,30 @@ export class DocumentUtil { const text = source.text(); const textLength = text.length; const textEndAnchor = textLength - endLength; - let pos1 = startLength; - let pos2 = textEndAnchor; + + /** Relative start position of the sentence (inclusive). */ + let cursorStart = startLength; + /** Relative end position of the sentence (exclusive). */ + let cursorEnd = textEndAnchor; // Move backward let quoteStack = []; - for (; pos1 > 0; --pos1) { - const c = text[pos1 - 1]; + for (; cursorStart > 0; --cursorStart) { + // Check if the previous character should be included. + let c = text[cursorStart - 1]; if (c === '\n' && terminateAtNewlines) { break; } if (quoteStack.length === 0) { - const terminatorInfo = terminatorMap.get(c); + let terminatorInfo = terminatorMap.get(c); if (typeof terminatorInfo !== 'undefined') { - if (terminatorInfo[0]) { --pos1; } + // Include the previous character while it is a terminator character and is included at start. + while (terminatorInfo[0] && cursorStart > 0) { + --cursorStart; + if (cursorStart === 0) { break; } + c = text[cursorStart - 1]; + terminatorInfo = terminatorMap.get(c); + if (typeof terminatorInfo === 'undefined') { break; } + } break; } } @@ -133,7 +144,14 @@ export class DocumentUtil { let quoteInfo = forwardQuoteMap.get(c); if (typeof quoteInfo !== 'undefined') { if (quoteStack.length === 0) { - if (quoteInfo[1]) { --pos1; } + // Include the previous character while it is a quote character and is included at start. + while (quoteInfo[1] && cursorStart > 0) { + --cursorStart; + if (cursorStart === 0) { break; } + c = text[cursorStart - 1]; + quoteInfo = forwardQuoteMap.get(c); + if (typeof quoteInfo === 'undefined') { break; } + } break; } else if (quoteStack[0] === c) { quoteStack.pop(); @@ -149,14 +167,22 @@ export class DocumentUtil { // Move forward quoteStack = []; - for (; pos2 < textLength; ++pos2) { - const c = text[pos2]; + for (; cursorEnd < textLength; ++cursorEnd) { + // Check if the following character should be included. + let c = text[cursorEnd]; if (c === '\n' && terminateAtNewlines) { break; } if (quoteStack.length === 0) { - const terminatorInfo = terminatorMap.get(c); + let terminatorInfo = terminatorMap.get(c); if (typeof terminatorInfo !== 'undefined') { - if (terminatorInfo[1]) { ++pos2; } + // Include the following character while it is a terminator character and is included at end. + while (terminatorInfo[1] && cursorEnd < textLength) { + ++cursorEnd; + if (cursorEnd === textLength) { break; } + c = text[cursorEnd]; + terminatorInfo = terminatorMap.get(c); + if (typeof terminatorInfo === 'undefined') { break; } + } break; } } @@ -164,7 +190,14 @@ export class DocumentUtil { let quoteInfo = backwardQuoteMap.get(c); if (typeof quoteInfo !== 'undefined') { if (quoteStack.length === 0) { - if (quoteInfo[1]) { ++pos2; } + // Include the following character while it is a quote character and is included at end. + while (quoteInfo[1] && cursorEnd < textLength) { + ++cursorEnd; + if (cursorEnd === textLength) { break; } + c = text[cursorEnd]; + quoteInfo = forwardQuoteMap.get(c); + if (typeof quoteInfo === 'undefined') { break; } + } break; } else if (quoteStack[0] === c) { quoteStack.pop(); @@ -179,13 +212,13 @@ export class DocumentUtil { } // Trim whitespace - for (; pos1 < startLength && this._isWhitespace(text[pos1]); ++pos1) { /* NOP */ } - for (; pos2 > textEndAnchor && this._isWhitespace(text[pos2 - 1]); --pos2) { /* NOP */ } + for (; cursorStart < startLength && this._isWhitespace(text[cursorStart]); ++cursorStart) { /* NOP */ } + for (; cursorEnd > textEndAnchor && this._isWhitespace(text[cursorEnd - 1]); --cursorEnd) { /* NOP */ } // Result return { - text: text.substring(pos1, pos2), - offset: startLength - pos1 + text: text.substring(cursorStart, cursorEnd), + offset: startLength - cursorStart }; } diff --git a/test/data/html/test-document1.html b/test/data/html/test-document1.html index 964d12c0..d66e459d 100644 --- a/test/data/html/test-document1.html +++ b/test/data/html/test-document1.html @@ -91,6 +91,38 @@ ありがとございます。ありがとございます。 +