From 658e5ddff13f4ec392dc110004635e22d468525a Mon Sep 17 00:00:00 2001 From: siikamiika Date: Wed, 14 Aug 2019 22:54:42 +0300 Subject: ignore zero-width non-joiner fixes #179 --- ext/fg/js/document.js | 2 +- ext/fg/js/frontend.js | 13 +++++++++++-- ext/fg/js/source.js | 19 +++++++++++++++++-- ext/mixed/js/display.js | 3 ++- 4 files changed, 31 insertions(+), 6 deletions(-) diff --git a/ext/fg/js/document.js b/ext/fg/js/document.js index f58a64fc..0697edb3 100644 --- a/ext/fg/js/document.js +++ b/ext/fg/js/document.js @@ -116,7 +116,7 @@ function docSentenceExtract(source, extent) { const sourceLocal = source.clone(); const position = sourceLocal.setStartOffset(extent); sourceLocal.setEndOffset(position + extent); - const content = sourceLocal.text(); + const {text: content} = sourceLocal.text(); let quoteStack = []; diff --git a/ext/fg/js/frontend.js b/ext/fg/js/frontend.js index bd652f3b..afb182d4 100644 --- a/ext/fg/js/frontend.js +++ b/ext/fg/js/frontend.js @@ -311,10 +311,18 @@ class Frontend { async searchTerms(textSource) { textSource.setEndOffset(this.options.scanning.length); - const {definitions, length} = await apiTermsFind(textSource.text()); + const {text, strippedIndices} = textSource.text(); + let {definitions, length} = await apiTermsFind(text); if (definitions.length === 0) { return false; } + for (let index of strippedIndices) { + if (index < length) { + length++; + } else { + break; + } + } textSource.setEndOffset(length); @@ -338,7 +346,8 @@ class Frontend { async searchKanji(textSource) { textSource.setEndOffset(1); - const definitions = await apiKanjiFind(textSource.text()); + const {text} = textSource.text(); + const definitions = await apiKanjiFind(text); if (definitions.length === 0) { return false; } diff --git a/ext/fg/js/source.js b/ext/fg/js/source.js index 664dbec7..cd8f63fd 100644 --- a/ext/fg/js/source.js +++ b/ext/fg/js/source.js @@ -16,6 +16,9 @@ * along with this program. If not, see . */ +// \u200c (Zero-width non-joiner) appears on Google Docs from Chrome 76 onwards +const IGNORE_TEXT_PATTERN = /\u200c/g; + /* * TextSourceRange @@ -32,7 +35,13 @@ class TextSourceRange { } text() { - return this.content; + let strippedIndices = []; + const text = this.content.replace(IGNORE_TEXT_PATTERN, (match, offset) => { + strippedIndices.push(offset); + return ''; + }); + + return {text, strippedIndices}; } setEndOffset(length) { @@ -195,7 +204,13 @@ class TextSourceElement { } text() { - return this.content; + let strippedIndices = []; + const text = this.content.replace(IGNORE_TEXT_PATTERN, (match, offset) => { + strippedIndices.push(offset); + return ''; + }); + + return {text, strippedIndices}; } setEndOffset(length) { diff --git a/ext/mixed/js/display.js b/ext/mixed/js/display.js index 3bb78fe1..6b9c295b 100644 --- a/ext/mixed/js/display.js +++ b/ext/mixed/js/display.js @@ -83,7 +83,8 @@ class Display { const textSource = docRangeFromPoint({x: e.clientX, y: e.clientY}); textSource.setEndOffset(this.options.scanning.length); - const {definitions, length} = await apiTermsFind(textSource.text()); + const {text} = textSource.text(); + const {definitions, length} = await apiTermsFind(text); if (definitions.length === 0) { return false; } -- cgit v1.2.3 From 0a9b673e27de04c499e06680d36e804a6a43f673 Mon Sep 17 00:00:00 2001 From: siikamiika Date: Thu, 15 Aug 2019 15:28:30 +0300 Subject: reimplement ignored chars inside source.js only --- ext/fg/js/document.js | 2 +- ext/fg/js/frontend.js | 13 ++------- ext/fg/js/source.js | 72 ++++++++++++++++++++++++++++++++----------------- ext/mixed/js/display.js | 3 +-- 4 files changed, 51 insertions(+), 39 deletions(-) diff --git a/ext/fg/js/document.js b/ext/fg/js/document.js index 0697edb3..f58a64fc 100644 --- a/ext/fg/js/document.js +++ b/ext/fg/js/document.js @@ -116,7 +116,7 @@ function docSentenceExtract(source, extent) { const sourceLocal = source.clone(); const position = sourceLocal.setStartOffset(extent); sourceLocal.setEndOffset(position + extent); - const {text: content} = sourceLocal.text(); + const content = sourceLocal.text(); let quoteStack = []; diff --git a/ext/fg/js/frontend.js b/ext/fg/js/frontend.js index afb182d4..bd652f3b 100644 --- a/ext/fg/js/frontend.js +++ b/ext/fg/js/frontend.js @@ -311,18 +311,10 @@ class Frontend { async searchTerms(textSource) { textSource.setEndOffset(this.options.scanning.length); - const {text, strippedIndices} = textSource.text(); - let {definitions, length} = await apiTermsFind(text); + const {definitions, length} = await apiTermsFind(textSource.text()); if (definitions.length === 0) { return false; } - for (let index of strippedIndices) { - if (index < length) { - length++; - } else { - break; - } - } textSource.setEndOffset(length); @@ -346,8 +338,7 @@ class Frontend { async searchKanji(textSource) { textSource.setEndOffset(1); - const {text} = textSource.text(); - const definitions = await apiKanjiFind(text); + const definitions = await apiKanjiFind(textSource.text()); if (definitions.length === 0) { return false; } diff --git a/ext/fg/js/source.js b/ext/fg/js/source.js index cd8f63fd..cbf0ce7d 100644 --- a/ext/fg/js/source.js +++ b/ext/fg/js/source.js @@ -17,7 +17,7 @@ */ // \u200c (Zero-width non-joiner) appears on Google Docs from Chrome 76 onwards -const IGNORE_TEXT_PATTERN = /\u200c/g; +const IGNORE_TEXT_PATTERN = /\u200c/; /* @@ -35,13 +35,7 @@ class TextSourceRange { } text() { - let strippedIndices = []; - const text = this.content.replace(IGNORE_TEXT_PATTERN, (match, offset) => { - strippedIndices.push(offset); - return ''; - }); - - return {text, strippedIndices}; + return this.content; } setEndOffset(length) { @@ -133,11 +127,23 @@ class TextSourceRange { static seekForwardHelper(node, state) { if (node.nodeType === 3 && node.parentElement && TextSourceRange.shouldEnter(node.parentElement)) { const offset = state.node === node ? state.offset : 0; - const remaining = node.length - offset; - const consumed = Math.min(remaining, state.remainder); - state.content = state.content + node.nodeValue.substring(offset, offset + consumed); + + let consumed = 0; + let stripped = 0; + while (state.remainder - consumed > 0) { + const currentChar = node.nodeValue[offset + consumed + stripped]; + if (!currentChar) { + break; + } else if (currentChar.match(IGNORE_TEXT_PATTERN)) { + stripped++; + } else { + consumed++; + state.content += currentChar; + } + } + state.node = node; - state.offset = offset + consumed; + state.offset = offset + consumed + stripped; state.remainder -= consumed; } else if (TextSourceRange.shouldEnter(node)) { for (let i = 0; i < node.childNodes.length; ++i) { @@ -170,11 +176,23 @@ class TextSourceRange { static seekBackwardHelper(node, state) { if (node.nodeType === 3 && node.parentElement && TextSourceRange.shouldEnter(node.parentElement)) { const offset = state.node === node ? state.offset : node.length; - const remaining = offset; - const consumed = Math.min(remaining, state.remainder); - state.content = node.nodeValue.substring(offset - consumed, offset) + state.content; + + let consumed = 0; + let stripped = 0; + while (state.remainder - consumed > 0) { + const currentChar = node.nodeValue[offset - consumed - stripped]; // negative indices are undefined in JS + if (!currentChar) { + break; + } else if (currentChar.match(IGNORE_TEXT_PATTERN)) { + stripped++; + } else { + consumed++; + state.content = currentChar + state.content; + } + } + state.node = node; - state.offset = offset - consumed; + state.offset = offset - consumed - stripped; state.remainder -= consumed; } else if (TextSourceRange.shouldEnter(node)) { for (let i = node.childNodes.length - 1; i >= 0; --i) { @@ -204,13 +222,7 @@ class TextSourceElement { } text() { - let strippedIndices = []; - const text = this.content.replace(IGNORE_TEXT_PATTERN, (match, offset) => { - strippedIndices.push(offset); - return ''; - }); - - return {text, strippedIndices}; + return this.content; } setEndOffset(length) { @@ -226,8 +238,18 @@ class TextSourceElement { break; } - this.content = this.content || ''; - this.content = this.content.substring(0, length); + let consumed = 0; + let content = ''; + for (let currentChar of this.content) { + if (consumed >= length) { + break; + } else if (!currentChar.match(IGNORE_TEXT_PATTERN)) { + consumed++; + content += currentChar; + } + } + + this.content = content; return this.content.length; } diff --git a/ext/mixed/js/display.js b/ext/mixed/js/display.js index 6b9c295b..3bb78fe1 100644 --- a/ext/mixed/js/display.js +++ b/ext/mixed/js/display.js @@ -83,8 +83,7 @@ class Display { const textSource = docRangeFromPoint({x: e.clientX, y: e.clientY}); textSource.setEndOffset(this.options.scanning.length); - const {text} = textSource.text(); - const {definitions, length} = await apiTermsFind(text); + const {definitions, length} = await apiTermsFind(textSource.text()); if (definitions.length === 0) { return false; } -- cgit v1.2.3 From 609dbf6a819a736818e9caa03893850f14453d84 Mon Sep 17 00:00:00 2001 From: siikamiika Date: Thu, 15 Aug 2019 15:35:23 +0300 Subject: ensure that content is iterable --- ext/fg/js/source.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ext/fg/js/source.js b/ext/fg/js/source.js index cbf0ce7d..bbf00e30 100644 --- a/ext/fg/js/source.js +++ b/ext/fg/js/source.js @@ -240,7 +240,7 @@ class TextSourceElement { let consumed = 0; let content = ''; - for (let currentChar of this.content) { + for (let currentChar of this.content || '') { if (consumed >= length) { break; } else if (!currentChar.match(IGNORE_TEXT_PATTERN)) { -- cgit v1.2.3