diff options
author | siikamiika <siikamiika@users.noreply.github.com> | 2020-04-17 01:32:01 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-04-17 01:32:01 +0300 |
commit | 8c16a6e580bfdd70e27df1816ca90807062cf9b5 (patch) | |
tree | 25f551dfaa68427cd873fbd822abf0f2bfe09b76 | |
parent | e6053eefa594524c7adcec4986a5a2c499adf192 (diff) | |
parent | 619df42aedaa8da97d0a16d539b7211349143a0a (diff) |
Merge pull request #456 from siikamiika/parse-text-refactor
Parse text refactor
-rw-r--r-- | ext/bg/js/backend.js | 123 | ||||
-rw-r--r-- | ext/bg/js/japanese.js | 24 | ||||
-rw-r--r-- | ext/bg/js/mecab.js | 31 | ||||
-rw-r--r-- | ext/bg/js/search-query-parser-generator.js | 14 | ||||
-rw-r--r-- | ext/bg/js/search-query-parser.js | 29 | ||||
-rw-r--r-- | ext/mixed/js/api.js | 4 | ||||
-rw-r--r-- | test/test-japanese.js | 64 |
7 files changed, 148 insertions, 141 deletions
diff --git a/ext/bg/js/backend.js b/ext/bg/js/backend.js index eb1f3e7b..2265c1a9 100644 --- a/ext/bg/js/backend.js +++ b/ext/bg/js/backend.js @@ -84,7 +84,6 @@ class Backend { ['kanjiFind', {handler: this._onApiKanjiFind.bind(this), async: true}], ['termsFind', {handler: this._onApiTermsFind.bind(this), async: true}], ['textParse', {handler: this._onApiTextParse.bind(this), async: true}], - ['textParseMecab', {handler: this._onApiTextParseMecab.bind(this), async: true}], ['definitionAdd', {handler: this._onApiDefinitionAdd.bind(this), async: true}], ['definitionsAddable', {handler: this._onApiDefinitionsAddable.bind(this), async: true}], ['noteView', {handler: this._onApiNoteView.bind(this), async: true}], @@ -314,6 +313,60 @@ class Backend { return await this.dictionaryImporter.import(this.database, archiveSource, onProgress, details); } + async _textParseScanning(text, options) { + const results = []; + while (text.length > 0) { + const term = []; + const [definitions, sourceLength] = await this.translator.findTerms( + 'simple', + text.substring(0, options.scanning.length), + {}, + options + ); + if (definitions.length > 0 && sourceLength > 0) { + dictTermsSort(definitions); + const {expression, reading} = definitions[0]; + const source = text.substring(0, sourceLength); + for (const {text: text2, furigana} of jp.distributeFuriganaInflected(expression, reading, source)) { + const reading2 = jp.convertReading(text2, furigana, options.parsing.readingMode); + term.push({text: text2, reading: reading2}); + } + text = text.substring(source.length); + } else { + const reading = jp.convertReading(text[0], '', options.parsing.readingMode); + term.push({text: text[0], reading}); + text = text.substring(1); + } + results.push(term); + } + return results; + } + + async _textParseMecab(text, options) { + const results = []; + const rawResults = await this.mecab.parseText(text); + for (const [mecabName, parsedLines] of Object.entries(rawResults)) { + const result = []; + for (const parsedLine of parsedLines) { + for (const {expression, reading, source} of parsedLine) { + const term = []; + for (const {text: text2, furigana} of jp.distributeFuriganaInflected( + expression.length > 0 ? expression : source, + jp.convertKatakanaToHiragana(reading), + source + )) { + const reading2 = jp.convertReading(text2, furigana, options.parsing.readingMode); + term.push({text: text2, reading: reading2}); + } + result.push(term); + } + result.push([{text: '\n', reading: ''}]); + } + results.push([mecabName, result]); + } + return results; + } + // Message handlers _onApiYomichanCoreReady(_params, sender) { @@ -405,61 +458,27 @@ class Backend { async _onApiTextParse({text, optionsContext}) { const options = this.getOptions(optionsContext); const results = []; - while (text.length > 0) { - const term = []; - const [definitions, sourceLength] = await this.translator.findTerms( - 'simple', - text.substring(0, options.scanning.length), - {}, - options - ); - if (definitions.length > 0) { - dictTermsSort(definitions); - const {expression, reading} = definitions[0]; - const source = text.substring(0, sourceLength); - for (const {text: text2, furigana} of jp.distributeFuriganaInflected(expression, reading, source)) { - const reading2 = jp.convertReading(text2, furigana, options.parsing.readingMode); - term.push({text: text2, reading: reading2}); - } - text = text.substring(source.length); - } else { - const reading = jp.convertReading(text[0], null, options.parsing.readingMode); - term.push({text: text[0], reading}); - text = text.substring(1); - } - results.push(term); + + if (options.parsing.enableScanningParser) { + results.push({ + source: 'scanning-parser', + id: 'scan', + content: await this._textParseScanning(text, options) + }); } - return results; - } - async _onApiTextParseMecab({text, optionsContext}) { - const options = this.getOptions(optionsContext); - const results = []; - const rawResults = await this.mecab.parseText(text); - for (const [mecabName, parsedLines] of Object.entries(rawResults)) { - const result = []; - for (const parsedLine of parsedLines) { - for (const {expression, reading, source} of parsedLine) { - const term = []; - if (expression !== null && reading !== null) { - for (const {text: text2, furigana} of jp.distributeFuriganaInflected( - expression, - jp.convertKatakanaToHiragana(reading), - source - )) { - const reading2 = jp.convertReading(text2, furigana, options.parsing.readingMode); - term.push({text: text2, reading: reading2}); - } - } else { - const reading2 = jp.convertReading(source, null, options.parsing.readingMode); - term.push({text: source, reading: reading2}); - } - result.push(term); - } - result.push([{text: '\n'}]); + if (options.parsing.enableMecabParser) { + const mecabResults = await this._textParseMecab(text, options); + for (const [mecabDictName, mecabDictResults] of mecabResults) { + results.push({ + source: 'mecab', + dictionary: mecabDictName, + id: `mecab-${mecabDictName}`, + content: mecabDictResults + }); } - results.push([mecabName, result]); } + return results; } diff --git a/ext/bg/js/japanese.js b/ext/bg/js/japanese.js index 5fef27a7..ac81acb5 100644 --- a/ext/bg/js/japanese.js +++ b/ext/bg/js/japanese.js @@ -124,25 +124,25 @@ return wanakana.toRomaji(text); } - function convertReading(expressionFragment, readingFragment, readingMode) { + function convertReading(expression, reading, readingMode) { switch (readingMode) { case 'hiragana': - return convertKatakanaToHiragana(readingFragment || ''); + return convertKatakanaToHiragana(reading); case 'katakana': - return convertHiraganaToKatakana(readingFragment || ''); + return convertHiraganaToKatakana(reading); case 'romaji': - if (readingFragment) { - return convertToRomaji(readingFragment); + if (reading) { + return convertToRomaji(reading); } else { - if (isStringEntirelyKana(expressionFragment)) { - return convertToRomaji(expressionFragment); + if (isStringEntirelyKana(expression)) { + return convertToRomaji(expression); } } - return readingFragment; + return reading; case 'none': - return null; + return ''; default: - return readingFragment; + return reading; } } @@ -300,7 +300,7 @@ const readingLeft = reading2.substring(group.text.length); const segs = segmentize(readingLeft, groups.splice(1)); if (segs) { - return [{text: group.text}].concat(segs); + return [{text: group.text, furigana: ''}].concat(segs); } } } else { @@ -368,7 +368,7 @@ } if (stemLength !== source.length) { - output.push({text: source.substring(stemLength)}); + output.push({text: source.substring(stemLength), furigana: ''}); } return output; diff --git a/ext/bg/js/mecab.js b/ext/bg/js/mecab.js index cd6e6c57..597dceae 100644 --- a/ext/bg/js/mecab.js +++ b/ext/bg/js/mecab.js @@ -40,7 +40,36 @@ class Mecab { } async parseText(text) { - return await this.invoke('parse_text', {text}); + const rawResults = await this.invoke('parse_text', {text}); + // { + // 'mecab-name': [ + // // line1 + // [ + // {str expression: 'expression', str reading: 'reading', str source: 'source'}, + // {str expression: 'expression2', str reading: 'reading2', str source: 'source2'} + // ], + // line2, + // ... + // ], + // 'mecab-name2': [...] + // } + const results = {}; + for (const [mecabName, parsedLines] of Object.entries(rawResults)) { + const result = []; + for (const parsedLine of parsedLines) { + const line = []; + for (const {expression, reading, source} of parsedLine) { + line.push({ + expression: expression || '', + reading: reading || '', + source: source || '' + }); + } + result.push(line); + } + results[mecabName] = result; + } + return results; } startListener() { diff --git a/ext/bg/js/search-query-parser-generator.js b/ext/bg/js/search-query-parser-generator.js index 390841c1..9e7ff8aa 100644 --- a/ext/bg/js/search-query-parser-generator.js +++ b/ext/bg/js/search-query-parser-generator.js @@ -36,7 +36,7 @@ class QueryParserGenerator { const termContainer = this._templateHandler.instantiate(preview ? 'term-preview' : 'term'); for (const segment of term) { if (!segment.text.trim()) { continue; } - if (!segment.reading || !segment.reading.trim()) { + if (!segment.reading.trim()) { termContainer.appendChild(this.createSegmentText(segment.text)); } else { termContainer.appendChild(this.createSegment(segment)); @@ -71,7 +71,17 @@ class QueryParserGenerator { for (const parseResult of parseResults) { const optionContainer = this._templateHandler.instantiate('select-option'); optionContainer.value = parseResult.id; - optionContainer.textContent = parseResult.name; + switch (parseResult.source) { + case 'scanning-parser': + optionContainer.textContent = 'Scanning parser'; + break; + case 'mecab': + optionContainer.textContent = `MeCab: ${parseResult.dictionary}`; + break; + default: + optionContainer.textContent = 'Unrecognized dictionary'; + break; + } optionContainer.defaultSelected = selectedParser === parseResult.id; selectContainer.appendChild(optionContainer); } diff --git a/ext/bg/js/search-query-parser.js b/ext/bg/js/search-query-parser.js index f90c68ee..eb3b681c 100644 --- a/ext/bg/js/search-query-parser.js +++ b/ext/bg/js/search-query-parser.js @@ -21,7 +21,6 @@ * apiOptionsSet * apiTermsFind * apiTextParse - * apiTextParseMecab * docSentenceExtract */ @@ -128,7 +127,7 @@ class QueryParser extends TextScanner { this.setPreview(text); - this.parseResults = await this.parseText(text); + this.parseResults = await apiTextParse(text, this.getOptionsContext()); this.refreshSelectedParser(); this.renderParserSelect(); @@ -137,33 +136,11 @@ class QueryParser extends TextScanner { this.setSpinnerVisible(false); } - async parseText(text) { - const results = []; - if (this.options.parsing.enableScanningParser) { - results.push({ - name: 'Scanning parser', - id: 'scan', - parsedText: await apiTextParse(text, this.getOptionsContext()) - }); - } - if (this.options.parsing.enableMecabParser) { - const mecabResults = await apiTextParseMecab(text, this.getOptionsContext()); - for (const [mecabDictName, mecabDictResults] of mecabResults) { - results.push({ - name: `MeCab: ${mecabDictName}`, - id: `mecab-${mecabDictName}`, - parsedText: mecabDictResults - }); - } - } - return results; - } - setPreview(text) { const previewTerms = []; for (let i = 0, ii = text.length; i < ii; i += 2) { const tempText = text.substring(i, i + 2); - previewTerms.push([{text: tempText}]); + previewTerms.push([{text: tempText, reading: ''}]); } this.queryParser.textContent = ''; this.queryParser.appendChild(this.queryParserGenerator.createParseResult(previewTerms, true)); @@ -183,6 +160,6 @@ class QueryParser extends TextScanner { const parseResult = this.getParseResult(); this.queryParser.textContent = ''; if (!parseResult) { return; } - this.queryParser.appendChild(this.queryParserGenerator.createParseResult(parseResult.parsedText)); + this.queryParser.appendChild(this.queryParserGenerator.createParseResult(parseResult.content)); } } diff --git a/ext/mixed/js/api.js b/ext/mixed/js/api.js index 50b285a5..30c08347 100644 --- a/ext/mixed/js/api.js +++ b/ext/mixed/js/api.js @@ -44,10 +44,6 @@ function apiTextParse(text, optionsContext) { return _apiInvoke('textParse', {text, optionsContext}); } -function apiTextParseMecab(text, optionsContext) { - return _apiInvoke('textParseMecab', {text, optionsContext}); -} - function apiKanjiFind(text, optionsContext) { return _apiInvoke('kanjiFind', {text, optionsContext}); } diff --git a/test/test-japanese.js b/test/test-japanese.js index 89e41c36..87efdfad 100644 --- a/test/test-japanese.js +++ b/test/test-japanese.js @@ -176,19 +176,19 @@ function testConvertReading() { [['アリガトウ', 'アリガトウ', 'hiragana'], 'ありがとう'], [['アリガトウ', 'アリガトウ', 'katakana'], 'アリガトウ'], [['アリガトウ', 'アリガトウ', 'romaji'], 'arigatou'], - [['アリガトウ', 'アリガトウ', 'none'], null], + [['アリガトウ', 'アリガトウ', 'none'], ''], [['アリガトウ', 'アリガトウ', 'default'], 'アリガトウ'], [['ありがとう', 'ありがとう', 'hiragana'], 'ありがとう'], [['ありがとう', 'ありがとう', 'katakana'], 'アリガトウ'], [['ありがとう', 'ありがとう', 'romaji'], 'arigatou'], - [['ありがとう', 'ありがとう', 'none'], null], + [['ありがとう', 'ありがとう', 'none'], ''], [['ありがとう', 'ありがとう', 'default'], 'ありがとう'], [['有り難う', 'ありがとう', 'hiragana'], 'ありがとう'], [['有り難う', 'ありがとう', 'katakana'], 'アリガトウ'], [['有り難う', 'ありがとう', 'romaji'], 'arigatou'], - [['有り難う', 'ありがとう', 'none'], null], + [['有り難う', 'ありがとう', 'none'], ''], [['有り難う', 'ありがとう', 'default'], 'ありがとう'], // Cases with falsy readings @@ -196,44 +196,20 @@ function testConvertReading() { [['ありがとう', '', 'hiragana'], ''], [['ありがとう', '', 'katakana'], ''], [['ありがとう', '', 'romaji'], 'arigatou'], - [['ありがとう', '', 'none'], null], + [['ありがとう', '', 'none'], ''], [['ありがとう', '', 'default'], ''], - [['ありがとう', null, 'hiragana'], ''], - [['ありがとう', null, 'katakana'], ''], - [['ありがとう', null, 'romaji'], 'arigatou'], - [['ありがとう', null, 'none'], null], - [['ありがとう', null, 'default'], null], - - [['ありがとう', void 0, 'hiragana'], ''], - [['ありがとう', void 0, 'katakana'], ''], - [['ありがとう', void 0, 'romaji'], 'arigatou'], - [['ありがとう', void 0, 'none'], null], - [['ありがとう', void 0, 'default'], void 0], - // Cases with falsy readings and kanji expressions [['有り難う', '', 'hiragana'], ''], [['有り難う', '', 'katakana'], ''], [['有り難う', '', 'romaji'], ''], - [['有り難う', '', 'none'], null], - [['有り難う', '', 'default'], ''], - - [['有り難う', null, 'hiragana'], ''], - [['有り難う', null, 'katakana'], ''], - [['有り難う', null, 'romaji'], null], - [['有り難う', null, 'none'], null], - [['有り難う', null, 'default'], null], - - [['有り難う', void 0, 'hiragana'], ''], - [['有り難う', void 0, 'katakana'], ''], - [['有り難う', void 0, 'romaji'], void 0], - [['有り難う', void 0, 'none'], null], - [['有り難う', void 0, 'default'], void 0] + [['有り難う', '', 'none'], ''], + [['有り難う', '', 'default'], ''] ]; - for (const [[expressionFragment, readingFragment, readingMode], expected] of data) { - assert.strictEqual(jp.convertReading(expressionFragment, readingFragment, readingMode), expected); + for (const [[expression, reading, readingMode], expected] of data) { + assert.strictEqual(jp.convertReading(expression, reading, readingMode), expected); } } @@ -303,9 +279,9 @@ function testDistributeFurigana() { ['有り難う', 'ありがとう'], [ {text: '有', furigana: 'あ'}, - {text: 'り'}, + {text: 'り', furigana: ''}, {text: '難', furigana: 'がと'}, - {text: 'う'} + {text: 'う', furigana: ''} ] ], [ @@ -317,23 +293,23 @@ function testDistributeFurigana() { [ ['お祝い', 'おいわい'], [ - {text: 'お'}, + {text: 'お', furigana: ''}, {text: '祝', furigana: 'いわ'}, - {text: 'い'} + {text: 'い', furigana: ''} ] ], [ ['美味しい', 'おいしい'], [ {text: '美味', furigana: 'おい'}, - {text: 'しい'} + {text: 'しい', furigana: ''} ] ], [ ['食べ物', 'たべもの'], [ {text: '食', furigana: 'た'}, - {text: 'べ'}, + {text: 'べ', furigana: ''}, {text: '物', furigana: 'もの'} ] ], @@ -341,9 +317,9 @@ function testDistributeFurigana() { ['試し切り', 'ためしぎり'], [ {text: '試', furigana: 'ため'}, - {text: 'し'}, + {text: 'し', furigana: ''}, {text: '切', furigana: 'ぎ'}, - {text: 'り'} + {text: 'り', furigana: ''} ] ], // Ambiguous @@ -373,16 +349,16 @@ function testDistributeFuriganaInflected() { ['美味しい', 'おいしい', '美味しかた'], [ {text: '美味', furigana: 'おい'}, - {text: 'し'}, - {text: 'かた'} + {text: 'し', furigana: ''}, + {text: 'かた', furigana: ''} ] ], [ ['食べる', 'たべる', '食べた'], [ {text: '食', furigana: 'た'}, - {text: 'べ'}, - {text: 'た'} + {text: 'べ', furigana: ''}, + {text: 'た', furigana: ''} ] ] ]; |