From 521e87d01142063c785054741d3703de37a1636c Mon Sep 17 00:00:00 2001
From: Cashew <52880648+Scrub1492@users.noreply.github.com>
Date: Tue, 19 Dec 2023 12:45:14 +0900
Subject: capture all terminators and quotes in the sentence (#360)

* capture all terminators and quotes in the sentence

* fix negative position, add documentation and tests

* fix comments giving wrong semantics

* add test case coverage

* remove cursor namespace to avoid possible performance penalties

* while loop optimization
---
 ext/js/dom/document-util.js        | 65 ++++++++++++++++++++++++++++----------
 test/data/html/test-document1.html | 32 +++++++++++++++++++
 2 files changed, 81 insertions(+), 16 deletions(-)

diff --git a/ext/js/dom/document-util.js b/ext/js/dom/document-util.js
index fe10d670..9e4f451a 100644
--- a/ext/js/dom/document-util.js
+++ b/ext/js/dom/document-util.js
@@ -113,19 +113,30 @@ export class DocumentUtil {
         const text = source.text();
         const textLength = text.length;
         const textEndAnchor = textLength - endLength;
-        let pos1 = startLength;
-        let pos2 = textEndAnchor;
+
+        /** Relative start position of the sentence (inclusive). */
+        let cursorStart = startLength;
+        /** Relative end position of the sentence (exclusive). */
+        let cursorEnd = textEndAnchor;
 
         // Move backward
         let quoteStack = [];
-        for (; pos1 > 0; --pos1) {
-            const c = text[pos1 - 1];
+        for (; cursorStart > 0; --cursorStart) {
+            // Check if the previous character should be included.
+            let c = text[cursorStart - 1];
             if (c === '\n' && terminateAtNewlines) { break; }
 
             if (quoteStack.length === 0) {
-                const terminatorInfo = terminatorMap.get(c);
+                let terminatorInfo = terminatorMap.get(c);
                 if (typeof terminatorInfo !== 'undefined') {
-                    if (terminatorInfo[0]) { --pos1; }
+                    // Include the previous character while it is a terminator character and is included at start.
+                    while (terminatorInfo[0] && cursorStart > 0) {
+                        --cursorStart;
+                        if (cursorStart === 0) { break; }
+                        c = text[cursorStart - 1];
+                        terminatorInfo = terminatorMap.get(c);
+                        if (typeof terminatorInfo === 'undefined') { break; }
+                    }
                     break;
                 }
             }
@@ -133,7 +144,14 @@ export class DocumentUtil {
             let quoteInfo = forwardQuoteMap.get(c);
             if (typeof quoteInfo !== 'undefined') {
                 if (quoteStack.length === 0) {
-                    if (quoteInfo[1]) { --pos1; }
+                    // Include the previous character while it is a quote character and is included at start.
+                    while (quoteInfo[1] && cursorStart > 0) {
+                        --cursorStart;
+                        if (cursorStart === 0) { break; }
+                        c = text[cursorStart - 1];
+                        quoteInfo = forwardQuoteMap.get(c);
+                        if (typeof quoteInfo === 'undefined') { break; }
+                    }
                     break;
                 } else if (quoteStack[0] === c) {
                     quoteStack.pop();
@@ -149,14 +167,22 @@ export class DocumentUtil {
 
         // Move forward
         quoteStack = [];
-        for (; pos2 < textLength; ++pos2) {
-            const c = text[pos2];
+        for (; cursorEnd < textLength; ++cursorEnd) {
+            // Check if the following character should be included.
+            let c = text[cursorEnd];
             if (c === '\n' && terminateAtNewlines) { break; }
 
             if (quoteStack.length === 0) {
-                const terminatorInfo = terminatorMap.get(c);
+                let terminatorInfo = terminatorMap.get(c);
                 if (typeof terminatorInfo !== 'undefined') {
-                    if (terminatorInfo[1]) { ++pos2; }
+                    // Include the following character while it is a terminator character and is included at end.
+                    while (terminatorInfo[1] && cursorEnd < textLength) {
+                        ++cursorEnd;
+                        if (cursorEnd === textLength) { break; }
+                        c = text[cursorEnd];
+                        terminatorInfo = terminatorMap.get(c);
+                        if (typeof terminatorInfo === 'undefined') { break; }
+                    }
                     break;
                 }
             }
@@ -164,7 +190,14 @@ export class DocumentUtil {
             let quoteInfo = backwardQuoteMap.get(c);
             if (typeof quoteInfo !== 'undefined') {
                 if (quoteStack.length === 0) {
-                    if (quoteInfo[1]) { ++pos2; }
+                    // Include the following character while it is a quote character (per backwardQuoteMap, as in the lookup above) and is included at end.
+                    while (quoteInfo[1] && cursorEnd < textLength) {
+                        ++cursorEnd;
+                        if (cursorEnd === textLength) { break; }
+                        c = text[cursorEnd];
+                        quoteInfo = backwardQuoteMap.get(c);
+                        if (typeof quoteInfo === 'undefined') { break; }
+                    }
                     break;
                 } else if (quoteStack[0] === c) {
                     quoteStack.pop();
@@ -179,13 +212,13 @@ export class DocumentUtil {
         }
 
         // Trim whitespace
-        for (; pos1 < startLength && this._isWhitespace(text[pos1]); ++pos1) { /* NOP */ }
-        for (; pos2 > textEndAnchor && this._isWhitespace(text[pos2 - 1]); --pos2) { /* NOP */ }
+        for (; cursorStart < startLength && this._isWhitespace(text[cursorStart]); ++cursorStart) { /* NOP */ }
+        for (; cursorEnd > textEndAnchor && this._isWhitespace(text[cursorEnd - 1]); --cursorEnd) { /* NOP */ }
 
         // Result
         return {
-            text: text.substring(pos1, pos2),
-            offset: startLength - pos1
+            text: text.substring(cursorStart, cursorEnd),
+            offset: startLength - cursorStart
         };
     }
 
diff --git a/test/data/html/test-document1.html b/test/data/html/test-document1.html
index 964d12c0..d66e459d 100644
--- a/test/data/html/test-document1.html
+++ b/test/data/html/test-document1.html
@@ -91,6 +91,38 @@
         <span>ありがとございます。ありがとございます。</span>
     </div>
 
+    <div
+        class="test"
+        data-test-type="scan"
+        data-element-from-point-selector="span"
+        data-caret-range-from-point-selector="span"
+        data-start-node-selector="span"
+        data-start-offset="4"
+        data-end-node-selector="span"
+        data-end-offset="4"
+        data-result-type="TextSourceRange"
+        data-sentence-scan-extent="100"
+        data-sentence="ありがとございます。!?"
+    >
+        <span>ありがとございます。!?ありがとございます。!?</span>
+    </div>
+
+    <div
+        class="test"
+        data-test-type="scan"
+        data-element-from-point-selector="span"
+        data-caret-range-from-point-selector="span"
+        data-start-node-selector="span"
+        data-start-offset="4"
+        data-end-node-selector="span"
+        data-end-offset="4"
+        data-result-type="TextSourceRange"
+        data-sentence-scan-extent="100"
+        data-sentence="ありがとございます!!!"
+    >
+        <span>ありがとございます!!!ありがとございます!!!</span>
+    </div>
+
     <div
         class="test"
         data-test-type="scan"
-- 
cgit v1.2.3