summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: toasted-nutbread <toasted-nutbread@users.noreply.github.com>, 2021-01-09 19:02:51 -0500
committer: GitHub <noreply@github.com>, 2021-01-09 19:02:51 -0500
commit: d698911bc9bf7cb431bce939412131a90f24ee45 (patch)
tree: affa1b6f5b10180c3aa65ba33160775cff26c856
parent: 11e9eb2295ebcbbc6e560e4a46c550c979c23500 (diff)
Refactor sentence scanning (#1213)
* Update tests
* Update extractSentence implementation
* Remove old extractSentence implementation
* Optimize maps/sets
-rw-r--r-- ext/mixed/js/document-util.js | 113
-rw-r--r-- test/data/html/test-document1.html | 48
2 files changed, 115 insertions, 46 deletions
diff --git a/ext/mixed/js/document-util.js b/ext/mixed/js/document-util.js
index 59313314..46ed321e 100644
--- a/ext/mixed/js/document-util.js
+++ b/ext/mixed/js/document-util.js
@@ -24,6 +24,20 @@
class DocumentUtil {
constructor() {
this._transparentColorPattern = /rgba\s*\([^)]*,\s*0(?:\.0+)?\s*\)/;
+
+ const quoteArray = [
+ ['「', '」'],
+ ['『', '』'],
+ ['\'', '\''],
+ ['"', '"']
+ ];
+ this._terminatorSet = new Set(['…', '。', '.', '.', '?', '?', '!', '!']);
+ this._startQuoteMap = new Map();
+ this._endQuoteMap = new Map();
+ for (const [char1, char2] of quoteArray) {
+ this._startQuoteMap.set(char1, char2);
+ this._endQuoteMap.set(char2, char1);
+ }
}
getRangeFromPoint(x, y, deepContentScan) {
@@ -64,72 +78,79 @@ class DocumentUtil {
}
extractSentence(source, extent, layoutAwareScan) {
- const quotesFwd = {'「': '」', '『': '』', "'": "'", '"': '"'};
- const quotesBwd = {'」': '「', '』': '『', "'": "'", '"': '"'};
- const terminators = '…。..??!!';
-
- const sourceLocal = source.clone();
- const position = sourceLocal.setStartOffset(extent, layoutAwareScan);
- sourceLocal.setEndOffset(extent * 2 - position, layoutAwareScan, true);
- const content = sourceLocal.text();
-
+ const terminatorSet = this._terminatorSet;
+ const startQuoteMap = this._startQuoteMap;
+ const endQuoteMap = this._endQuoteMap;
+
+ // Scan text
+ source = source.clone();
+ const startLength = source.setStartOffset(extent, layoutAwareScan);
+ const endLength = source.setEndOffset(extent * 2 - startLength, layoutAwareScan, true);
+ const text = source.text();
+ const textLength = text.length;
+ const textEndAnchor = textLength - endLength;
+ let pos1 = startLength;
+ let pos2 = textEndAnchor;
+
+ // Move backward
let quoteStack = [];
+ for (; pos1 > 0; --pos1) {
+ const c = text[pos1 - 1];
+ if (c === '\n') { break; }
- let startPos = 0;
- for (let i = position; i >= startPos; --i) {
- const c = content[i];
-
- if (c === '\n') {
- startPos = i + 1;
+ if (quoteStack.length === 0 && terminatorSet.has(c)) {
break;
}
- if (quoteStack.length === 0 && (terminators.includes(c) || c in quotesFwd)) {
- startPos = i + 1;
- break;
- }
-
- if (quoteStack.length > 0 && c === quoteStack[0]) {
- quoteStack.pop();
- } else if (c in quotesBwd) {
- quoteStack.unshift(quotesBwd[c]);
+ let otherQuote = startQuoteMap.get(c);
+ if (typeof otherQuote !== 'undefined') {
+ if (quoteStack.length === 0) {
+ break;
+ } else if (quoteStack[0] === c) {
+ quoteStack.pop();
+ }
+ } else {
+ otherQuote = endQuoteMap.get(c);
+ if (typeof otherQuote !== 'undefined') {
+ quoteStack.unshift(otherQuote);
+ }
}
}
+ // Move forward
quoteStack = [];
+ for (; pos2 < textLength; ++pos2) {
+ const c = text[pos2];
+ if (c === '\n') { break; }
- let endPos = content.length;
- for (let i = position; i <= endPos; ++i) {
- const c = content[i];
-
- if (c === '\n') {
- endPos = i + 1;
+ if (quoteStack.length === 0 && terminatorSet.has(c)) {
+ ++pos2;
break;
}
- if (quoteStack.length === 0) {
- if (terminators.includes(c)) {
- endPos = i + 1;
- break;
- } else if (c in quotesBwd) {
- endPos = i;
+ let otherQuote = endQuoteMap.get(c);
+ if (typeof otherQuote !== 'undefined') {
+ if (quoteStack.length === 0) {
break;
+ } else if (quoteStack[0] === c) {
+ quoteStack.pop();
+ }
+ } else {
+ otherQuote = startQuoteMap.get(c);
+ if (typeof otherQuote !== 'undefined') {
+ quoteStack.unshift(otherQuote);
}
- }
-
- if (quoteStack.length > 0 && c === quoteStack[0]) {
- quoteStack.pop();
- } else if (c in quotesFwd) {
- quoteStack.unshift(quotesFwd[c]);
}
}
- const text = content.substring(startPos, endPos);
- const padding = text.length - text.replace(/^\s+/, '').length;
+ // Trim whitespace
+ for (; pos1 < startLength && this._isWhitespace(text[pos1]); ++pos1) { /* NOP */ }
+ for (; pos2 > textEndAnchor && this._isWhitespace(text[pos2 - 1]); --pos2) { /* NOP */ }
+ // Result
return {
- text: text.trim(),
- offset: position - startPos - padding
+ text: text.substring(pos1, pos2),
+ offset: startLength - pos1
};
}
diff --git a/test/data/html/test-document1.html b/test/data/html/test-document1.html
index 98a6fb44..37dbb017 100644
--- a/test/data/html/test-document1.html
+++ b/test/data/html/test-document1.html
@@ -46,6 +46,54 @@
<div
class="test"
data-test-type="scan"
+ data-element-from-point-selector="span"
+ data-caret-range-from-point-selector="span"
+ data-start-node-selector="span"
+ data-start-offset="16"
+ data-end-node-selector="span"
+ data-end-offset="16"
+ data-result-type="TextSourceRange",
+ data-sentence-extent="100"
+ data-sentence="心配して「くださって」、ありがと「ございます」"
+ >
+ <span>真白「心配して「くださって」、ありがと「ございます」」</span>
+ </div>
+
+ <div
+ class="test"
+ data-test-type="scan"
+ data-element-from-point-selector="span"
+ data-caret-range-from-point-selector="span"
+ data-start-node-selector="span"
+ data-start-offset="4"
+ data-end-node-selector="span"
+ data-end-offset="4"
+ data-result-type="TextSourceRange",
+ data-sentence-extent="100"
+ data-sentence="ありがとございます。"
+ >
+ <span>ありがとございます。ありがとございます。</span>
+ </div>
+
+ <div
+ class="test"
+ data-test-type="scan"
+ data-element-from-point-selector="span"
+ data-caret-range-from-point-selector="span"
+ data-start-node-selector="span"
+ data-start-offset="14"
+ data-end-node-selector="span"
+ data-end-offset="14"
+ data-result-type="TextSourceRange",
+ data-sentence-extent="100"
+ data-sentence="ありがとございます。"
+ >
+ <span>ありがとございます。ありがとございます。</span>
+ </div>
+
+ <div
+ class="test"
+ data-test-type="scan"
data-element-from-point-selector="input"
data-caret-range-from-point-selector="input"
data-start-node-selector="input"