diff options
author | toasted-nutbread <toasted-nutbread@users.noreply.github.com> | 2021-01-09 19:02:51 -0500 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-01-09 19:02:51 -0500 |
commit | d698911bc9bf7cb431bce939412131a90f24ee45 (patch) | |
tree | affa1b6f5b10180c3aa65ba33160775cff26c856 | |
parent | 11e9eb2295ebcbbc6e560e4a46c550c979c23500 (diff) |
Refactor sentence scanning (#1213)
* Update tests
* Update extractSentence implementation
* Remove old extractSentence implementation
* Optimize maps/sets
-rw-r--r-- | ext/mixed/js/document-util.js | 113 | ||||
-rw-r--r-- | test/data/html/test-document1.html | 48 |
2 files changed, 115 insertions, 46 deletions
diff --git a/ext/mixed/js/document-util.js b/ext/mixed/js/document-util.js index 59313314..46ed321e 100644 --- a/ext/mixed/js/document-util.js +++ b/ext/mixed/js/document-util.js @@ -24,6 +24,20 @@ class DocumentUtil { constructor() { this._transparentColorPattern = /rgba\s*\([^)]*,\s*0(?:\.0+)?\s*\)/; + + const quoteArray = [ + ['「', '」'], + ['『', '』'], + ['\'', '\''], + ['"', '"'] + ]; + this._terminatorSet = new Set(['…', '。', '.', '.', '?', '?', '!', '!']); + this._startQuoteMap = new Map(); + this._endQuoteMap = new Map(); + for (const [char1, char2] of quoteArray) { + this._startQuoteMap.set(char1, char2); + this._endQuoteMap.set(char2, char1); + } } getRangeFromPoint(x, y, deepContentScan) { @@ -64,72 +78,79 @@ class DocumentUtil { } extractSentence(source, extent, layoutAwareScan) { - const quotesFwd = {'「': '」', '『': '』', "'": "'", '"': '"'}; - const quotesBwd = {'」': '「', '』': '『', "'": "'", '"': '"'}; - const terminators = '…。..??!!'; - - const sourceLocal = source.clone(); - const position = sourceLocal.setStartOffset(extent, layoutAwareScan); - sourceLocal.setEndOffset(extent * 2 - position, layoutAwareScan, true); - const content = sourceLocal.text(); - + const terminatorSet = this._terminatorSet; + const startQuoteMap = this._startQuoteMap; + const endQuoteMap = this._endQuoteMap; + + // Scan text + source = source.clone(); + const startLength = source.setStartOffset(extent, layoutAwareScan); + const endLength = source.setEndOffset(extent * 2 - startLength, layoutAwareScan, true); + const text = source.text(); + const textLength = text.length; + const textEndAnchor = textLength - endLength; + let pos1 = startLength; + let pos2 = textEndAnchor; + + // Move backward let quoteStack = []; + for (; pos1 > 0; --pos1) { + const c = text[pos1 - 1]; + if (c === '\n') { break; } - let startPos = 0; - for (let i = position; i >= startPos; --i) { - const c = content[i]; - - if (c === '\n') { - startPos = i + 1; + if (quoteStack.length === 0 && 
terminatorSet.has(c)) { break; } - if (quoteStack.length === 0 && (terminators.includes(c) || c in quotesFwd)) { - startPos = i + 1; - break; - } - - if (quoteStack.length > 0 && c === quoteStack[0]) { - quoteStack.pop(); - } else if (c in quotesBwd) { - quoteStack.unshift(quotesBwd[c]); + let otherQuote = startQuoteMap.get(c); + if (typeof otherQuote !== 'undefined') { + if (quoteStack.length === 0) { + break; + } else if (quoteStack[0] === c) { + quoteStack.pop(); + } + } else { + otherQuote = endQuoteMap.get(c); + if (typeof otherQuote !== 'undefined') { + quoteStack.unshift(otherQuote); + } } } + // Move forward quoteStack = []; + for (; pos2 < textLength; ++pos2) { + const c = text[pos2]; + if (c === '\n') { break; } - let endPos = content.length; - for (let i = position; i <= endPos; ++i) { - const c = content[i]; - - if (c === '\n') { - endPos = i + 1; + if (quoteStack.length === 0 && terminatorSet.has(c)) { + ++pos2; break; } - if (quoteStack.length === 0) { - if (terminators.includes(c)) { - endPos = i + 1; - break; - } else if (c in quotesBwd) { - endPos = i; + let otherQuote = endQuoteMap.get(c); + if (typeof otherQuote !== 'undefined') { + if (quoteStack.length === 0) { break; + } else if (quoteStack[0] === c) { + quoteStack.pop(); + } + } else { + otherQuote = startQuoteMap.get(c); + if (typeof otherQuote !== 'undefined') { + quoteStack.unshift(otherQuote); } - } - - if (quoteStack.length > 0 && c === quoteStack[0]) { - quoteStack.pop(); - } else if (c in quotesFwd) { - quoteStack.unshift(quotesFwd[c]); } } - const text = content.substring(startPos, endPos); - const padding = text.length - text.replace(/^\s+/, '').length; + // Trim whitespace + for (; pos1 < startLength && this._isWhitespace(text[pos1]); ++pos1) { /* NOP */ } + for (; pos2 > textEndAnchor && this._isWhitespace(text[pos2 - 1]); --pos2) { /* NOP */ } + // Result return { - text: text.trim(), - offset: position - startPos - padding + text: text.substring(pos1, pos2), + offset: startLength 
- pos1 }; } diff --git a/test/data/html/test-document1.html b/test/data/html/test-document1.html index 98a6fb44..37dbb017 100644 --- a/test/data/html/test-document1.html +++ b/test/data/html/test-document1.html @@ -46,6 +46,54 @@ <div class="test" data-test-type="scan" + data-element-from-point-selector="span" + data-caret-range-from-point-selector="span" + data-start-node-selector="span" + data-start-offset="16" + data-end-node-selector="span" + data-end-offset="16" + data-result-type="TextSourceRange", + data-sentence-extent="100" + data-sentence="心配して「くださって」、ありがと「ございます」" + > + <span>真白「心配して「くださって」、ありがと「ございます」」</span> + </div> + + <div + class="test" + data-test-type="scan" + data-element-from-point-selector="span" + data-caret-range-from-point-selector="span" + data-start-node-selector="span" + data-start-offset="4" + data-end-node-selector="span" + data-end-offset="4" + data-result-type="TextSourceRange", + data-sentence-extent="100" + data-sentence="ありがとございます。" + > + <span>ありがとございます。ありがとございます。</span> + </div> + + <div + class="test" + data-test-type="scan" + data-element-from-point-selector="span" + data-caret-range-from-point-selector="span" + data-start-node-selector="span" + data-start-offset="14" + data-end-node-selector="span" + data-end-offset="14" + data-result-type="TextSourceRange", + data-sentence-extent="100" + data-sentence="ありがとございます。" + > + <span>ありがとございます。ありがとございます。</span> + </div> + + <div + class="test" + data-test-type="scan" data-element-from-point-selector="input" data-caret-range-from-point-selector="input" data-start-node-selector="input" |