summaryrefslogtreecommitdiff
path: root/ext/mixed/js/document-util.js
diff options
context:
space:
mode:
authortoasted-nutbread <toasted-nutbread@users.noreply.github.com>2021-01-09 23:10:55 -0500
committerGitHub <noreply@github.com>2021-01-09 23:10:55 -0500
commit083da93142ec6302021ee1c29428121b54fc9e68 (patch)
treec07c7e3ae808682e127603303f4698056204abd7 /ext/mixed/js/document-util.js
parentda1e1e5c5b4dc20cc6aa46b51c55a496d094ed5c (diff)
Refactor sentence parsing (#1215)
* Rename sentenceExtent with sentenceScanExtent * Update TextScanner.setOptions * Change function argument order * Rename quote map variables * Fix edge case quote handling * Update terminator maps to support character inclusion
Diffstat (limited to 'ext/mixed/js/document-util.js')
-rw-r--r--ext/mixed/js/document-util.js71
1 files changed, 43 insertions, 28 deletions
diff --git a/ext/mixed/js/document-util.js b/ext/mixed/js/document-util.js
index 46ed321e..647cbedc 100644
--- a/ext/mixed/js/document-util.js
+++ b/ext/mixed/js/document-util.js
@@ -31,12 +31,16 @@ class DocumentUtil {
['\'', '\''],
['"', '"']
];
- this._terminatorSet = new Set(['…', '。', '.', '.', '?', '?', '!', '!']);
- this._startQuoteMap = new Map();
- this._endQuoteMap = new Map();
+ const terminatorString = '…。..??!!';
+ this._terminatorMap = new Map();
+ for (const char of terminatorString) {
+ this._terminatorMap.set(char, [false, true]);
+ }
+ this._forwardQuoteMap = new Map();
+ this._backwardQuoteMap = new Map();
for (const [char1, char2] of quoteArray) {
- this._startQuoteMap.set(char1, char2);
- this._endQuoteMap.set(char2, char1);
+ this._forwardQuoteMap.set(char1, [char2, false]);
+ this._backwardQuoteMap.set(char2, [char1, false]);
}
}
@@ -77,10 +81,10 @@ class DocumentUtil {
}
}
- extractSentence(source, extent, layoutAwareScan) {
- const terminatorSet = this._terminatorSet;
- const startQuoteMap = this._startQuoteMap;
- const endQuoteMap = this._endQuoteMap;
+ extractSentence(source, layoutAwareScan, extent) {
+ const terminatorMap = this._terminatorMap;
+ const forwardQuoteMap = this._forwardQuoteMap;
+ const backwardQuoteMap = this._backwardQuoteMap;
// Scan text
source = source.clone();
@@ -98,22 +102,28 @@ class DocumentUtil {
const c = text[pos1 - 1];
if (c === '\n') { break; }
- if (quoteStack.length === 0 && terminatorSet.has(c)) {
- break;
+ if (quoteStack.length === 0) {
+ const terminatorInfo = terminatorMap.get(c);
+ if (typeof terminatorInfo !== 'undefined') {
+ if (terminatorInfo[0]) { --pos1; }
+ break;
+ }
}
- let otherQuote = startQuoteMap.get(c);
- if (typeof otherQuote !== 'undefined') {
+ let quoteInfo = forwardQuoteMap.get(c);
+ if (typeof quoteInfo !== 'undefined') {
if (quoteStack.length === 0) {
+ if (quoteInfo[1]) { --pos1; }
break;
} else if (quoteStack[0] === c) {
quoteStack.pop();
+ continue;
}
- } else {
- otherQuote = endQuoteMap.get(c);
- if (typeof otherQuote !== 'undefined') {
- quoteStack.unshift(otherQuote);
- }
+ }
+
+ quoteInfo = backwardQuoteMap.get(c);
+ if (typeof quoteInfo !== 'undefined') {
+ quoteStack.unshift(quoteInfo[0]);
}
}
@@ -123,23 +133,28 @@ class DocumentUtil {
const c = text[pos2];
if (c === '\n') { break; }
- if (quoteStack.length === 0 && terminatorSet.has(c)) {
- ++pos2;
- break;
+ if (quoteStack.length === 0) {
+ const terminatorInfo = terminatorMap.get(c);
+ if (typeof terminatorInfo !== 'undefined') {
+ if (terminatorInfo[1]) { ++pos2; }
+ break;
+ }
}
- let otherQuote = endQuoteMap.get(c);
- if (typeof otherQuote !== 'undefined') {
+ let quoteInfo = backwardQuoteMap.get(c);
+ if (typeof quoteInfo !== 'undefined') {
if (quoteStack.length === 0) {
+ if (quoteInfo[1]) { ++pos2; }
break;
} else if (quoteStack[0] === c) {
quoteStack.pop();
+ continue;
}
- } else {
- otherQuote = startQuoteMap.get(c);
- if (typeof otherQuote !== 'undefined') {
- quoteStack.unshift(otherQuote);
- }
+ }
+
+ quoteInfo = forwardQuoteMap.get(c);
+ if (typeof quoteInfo !== 'undefined') {
+ quoteStack.unshift(quoteInfo[0]);
}
}