diff options
Diffstat (limited to 'ext/js/language')
-rw-r--r-- | ext/js/language/deinflector.js | 96 | ||||
-rw-r--r-- | ext/js/language/dictionary-database.js | 484 | ||||
-rw-r--r-- | ext/js/language/dictionary-importer.js | 407 | ||||
-rw-r--r-- | ext/js/language/translator.js | 1397 |
4 files changed, 2384 insertions, 0 deletions
// diff --git a/ext/js/language/deinflector.js b/ext/js/language/deinflector.js new file mode 100644 index 00000000..8fee3f01 --- /dev/null +++ b/ext/js/language/deinflector.js @@ -0,0 +1,96 @@
/*
 * Copyright (C) 2016-2021 Yomichan Authors
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */


/**
 * Expands a source term into every candidate base form that can be reached by
 * repeatedly applying suffix-replacement rules.
 */
class Deinflector {
    /**
     * @param reasons An object mapping a reason name to an iterable of
     *   {kanaIn, kanaOut, rulesIn, rulesOut} variants. It is normalized once
     *   here so that deinflect() only deals with tuples and bit flags.
     */
    constructor(reasons) {
        this.reasons = Deinflector.normalizeReasons(reasons);
    }

    /**
     * Produces the list of deinflection candidates for a term.
     * The first entry is always the unmodified source; every later entry is
     * derived from an earlier one, and newly derived entries are themselves
     * expanded (the list is walked while it grows).
     * @param source The text to deinflect.
     * @param rawSource The original text, copied onto every result.
     * @returns An array of {source, rawSource, term, rules, reasons, databaseDefinitions}.
     */
    deinflect(source, rawSource) {
        const createEntry = (term, rules, reasons) => ({
            source,
            rawSource,
            term,
            rules,
            reasons,
            databaseDefinitions: []
        });

        const output = [createEntry(source, 0, [])];
        let cursor = 0;
        while (cursor < output.length) {
            const current = output[cursor++];
            for (const [reasonName, variantList] of this.reasons) {
                for (const [suffixIn, suffixOut, flagsIn, flagsOut] of variantList) {
                    // An entry with no rule flags accepts any variant; otherwise
                    // at least one flag has to be shared with the variant's input flags.
                    const flagsCompatible = (current.rules === 0 || (current.rules & flagsIn) !== 0);
                    if (!flagsCompatible) { continue; }
                    if (!current.term.endsWith(suffixIn)) { continue; }

                    // Never produce an empty term.
                    const stemLength = current.term.length - suffixIn.length;
                    if (stemLength + suffixOut.length <= 0) { continue; }

                    output.push(createEntry(
                        current.term.substring(0, stemLength) + suffixOut,
                        flagsOut,
                        [reasonName, ...current.reasons]
                    ));
                }
            }
        }
        return output;
    }

    /**
     * Converts the raw reasons object into an array of
     * [reason, [[kanaIn, kanaOut, rulesInFlags, rulesOutFlags], ...]] pairs.
     * @param reasons The raw reasons object.
     * @returns The normalized array.
     */
    static normalizeReasons(reasons) {
        return Object.entries(reasons).map(([reasonName, reasonInfo]) => {
            const variantList = [...reasonInfo].map(({kanaIn, kanaOut, rulesIn, rulesOut}) => [
                kanaIn,
                kanaOut,
                Deinflector.rulesToRuleFlags(rulesIn),
                Deinflector.rulesToRuleFlags(rulesOut)
            ]);
            return [reasonName, variantList];
        });
    }

    /**
     * Combines a list of rule names into a single bit mask.
     * Names that are not present in Deinflector.ruleTypes contribute nothing.
     * @param rules An iterable of rule name strings.
     * @returns The OR of the flags of all recognized rule names.
     */
    static rulesToRuleFlags(rules) {
        const knownFlags = Deinflector.ruleTypes;
        let flags = 0;
        for (const ruleName of rules) {
            const bits = knownFlags.get(ruleName);
            if (typeof bits !== 'undefined') {
                flags |= bits;
            }
        }
        return flags;
    }
}

Deinflector.ruleTypes = new Map([
    ['v1',    0b00000001], // Verb ichidan
    ['v5',    0b00000010], // Verb godan
    ['vs',    0b00000100], // Verb suru
    ['vk',    0b00001000], // Verb kuru
    ['vz',    0b00010000], // Verb zuru
    ['adj-i', 0b00100000], // Adjective i
    ['iru',   0b01000000] // Intermediate -iru endings for progressive or perfect tense
]);
// diff --git a/ext/js/language/dictionary-database.js b/ext/js/language/dictionary-database.js new file mode 100644 index 00000000..b363ed25 --- /dev/null +++ b/ext/js/language/dictionary-database.js @@ -0,0 +1,484 @@ +/* + * Copyright (C) 2016-2021 Yomichan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>.
// + */

/* global
 * Database
 */

/**
 * Wrapper around the IndexedDB-backed Database helper which stores imported
 * dictionary data (terms, kanji, metadata, tags and media) and provides bulk
 * lookup functions for it.
 */
class DictionaryDatabase {
    constructor() {
        this._db = new Database();
        this._dbName = 'dict';
    }

    // Public

    /**
     * Opens the underlying database, declaring every schema version up to the
     * current one (60).
     */
    async prepare() {
        await this._db.open(
            this._dbName,
            60,
            [
                {
                    version: 20,
                    stores: {
                        terms: {
                            primaryKey: {keyPath: 'id', autoIncrement: true},
                            indices: ['dictionary', 'expression', 'reading']
                        },
                        kanji: {
                            primaryKey: {autoIncrement: true},
                            indices: ['dictionary', 'character']
                        },
                        tagMeta: {
                            primaryKey: {autoIncrement: true},
                            indices: ['dictionary']
                        },
                        dictionaries: {
                            primaryKey: {autoIncrement: true},
                            indices: ['title', 'version']
                        }
                    }
                },
                {
                    version: 30,
                    stores: {
                        termMeta: {
                            primaryKey: {autoIncrement: true},
                            indices: ['dictionary', 'expression']
                        },
                        kanjiMeta: {
                            primaryKey: {autoIncrement: true},
                            indices: ['dictionary', 'character']
                        },
                        tagMeta: {
                            primaryKey: {autoIncrement: true},
                            indices: ['dictionary', 'name']
                        }
                    }
                },
                {
                    version: 40,
                    stores: {
                        terms: {
                            primaryKey: {keyPath: 'id', autoIncrement: true},
                            indices: ['dictionary', 'expression', 'reading', 'sequence']
                        }
                    }
                },
                {
                    version: 50,
                    stores: {
                        terms: {
                            primaryKey: {keyPath: 'id', autoIncrement: true},
                            indices: ['dictionary', 'expression', 'reading', 'sequence', 'expressionReverse', 'readingReverse']
                        }
                    }
                },
                {
                    version: 60,
                    stores: {
                        media: {
                            primaryKey: {keyPath: 'id', autoIncrement: true},
                            indices: ['dictionary', 'path']
                        }
                    }
                }
            ]
        );
    }

    /**
     * Closes the underlying database.
     */
    async close() {
        this._db.close();
    }

    /**
     * @returns true if the underlying database is currently open.
     */
    isPrepared() {
        return this._db.isOpen();
    }

    /**
     * Deletes the whole database and re-opens a fresh one.
     * @returns true if the deletion succeeded, false if it failed (the failure
     *   is logged and the database is re-opened regardless).
     * @throws If the database is in the middle of being opened.
     */
    async purge() {
        if (this._db.isOpening()) {
            throw new Error('Cannot purge database while opening');
        }
        if (this._db.isOpen()) {
            this._db.close();
        }
        let result = false;
        try {
            await Database.deleteDatabase(this._dbName);
            result = true;
        } catch (e) {
            yomichan.logError(e);
        }
        await this.prepare();
        return result;
    }

    /**
     * Deletes every record belonging to a dictionary from all object stores.
     * @param dictionaryName The title of the dictionary to delete.
     * @param progressSettings Optional object of the form {rate}; progress is
     *   reported every `rate` processed records and once when all are done.
     * @param onProgress Optional callback invoked with the (shared, mutated)
     *   progress object {count, processed, storeCount, storesProcesed}.
     */
    async deleteDictionary(dictionaryName, progressSettings=null, onProgress=null) {
        const targets = [
            ['dictionaries', 'title'],
            ['kanji', 'dictionary'],
            ['kanjiMeta', 'dictionary'],
            ['terms', 'dictionary'],
            ['termMeta', 'dictionary'],
            ['tagMeta', 'dictionary'],
            ['media', 'dictionary']
        ];

        // Both progress arguments are optional; omitting them previously threw.
        const rate = (typeof progressSettings === 'object' && progressSettings !== null ? progressSettings.rate : void 0);
        const reportProgress = (typeof onProgress === 'function' ? onProgress : () => {});

        // NOTE: the misspelled 'storesProcesed' key is observable by callers and is kept as-is.
        const progressData = {
            count: 0,
            processed: 0,
            storeCount: targets.length,
            storesProcesed: 0
        };

        const filterKeys = (keys) => {
            ++progressData.storesProcesed;
            progressData.count += keys.length;
            reportProgress(progressData);
            return keys;
        };
        const onProgress2 = () => {
            const processed = progressData.processed + 1;
            progressData.processed = processed;
            if ((processed % rate) === 0 || processed === progressData.count) {
                reportProgress(progressData);
            }
        };

        const promises = [];
        for (const [objectStoreName, indexName] of targets) {
            const query = IDBKeyRange.only(dictionaryName);
            const promise = this._db.bulkDelete(objectStoreName, indexName, query, filterKeys, onProgress2);
            promises.push(promise);
        }
        await Promise.all(promises);
    }

    /**
     * Looks up terms by expression and by reading.
     * @param termList Array of strings to look up.
     * @param dictionaries A collection with a has() method; only rows whose
     *   dictionary it contains are returned.
     * @param wildcard Falsy for exact matching; 'prefix' matches against the
     *   reversed indices; any other truthy value performs a range match
     *   starting at the term.
     * @returns A Promise of term objects; each row is returned at most once
     *   (de-duplicated by id) and carries the index of the input that found it.
     */
    findTermsBulk(termList, dictionaries, wildcard) {
        return new Promise((resolve, reject) => {
            const results = [];
            const count = termList.length;
            if (count === 0) {
                resolve(results);
                return;
            }

            const visited = new Set();
            const useWildcard = !!wildcard;
            const prefixWildcard = wildcard === 'prefix';

            const transaction = this._db.transaction(['terms'], 'readonly');
            const terms = transaction.objectStore('terms');
            const index1 = terms.index(prefixWildcard ? 'expressionReverse' : 'expression');
            const index2 = terms.index(prefixWildcard ? 'readingReverse' : 'reading');

            // Two queries (expression + reading) are issued per input term.
            const count2 = count * 2;
            let completeCount = 0;
            for (let i = 0; i < count; ++i) {
                const inputIndex = i;
                const term = prefixWildcard ? stringReverse(termList[i]) : termList[i];
                const query = useWildcard ? IDBKeyRange.bound(term, `${term}\uffff`, false, false) : IDBKeyRange.only(term);

                const onGetAll = (rows) => {
                    for (const row of rows) {
                        if (dictionaries.has(row.dictionary) && !visited.has(row.id)) {
                            visited.add(row.id);
                            results.push(this._createTerm(row, inputIndex));
                        }
                    }
                    if (++completeCount >= count2) {
                        resolve(results);
                    }
                };

                this._db.getAll(index1, query, onGetAll, reject);
                this._db.getAll(index2, query, onGetAll, reject);
            }
        });
    }

    /**
     * Looks up terms which match both an expression and a reading exactly.
     * @param termList Array of expressions.
     * @param readingList Array of readings, parallel to termList.
     * @param dictionaries A collection with a has() method used to filter rows.
     * @returns A Promise of term objects.
     */
    findTermsExactBulk(termList, readingList, dictionaries) {
        return new Promise((resolve, reject) => {
            const results = [];
            const count = termList.length;
            if (count === 0) {
                resolve(results);
                return;
            }

            const transaction = this._db.transaction(['terms'], 'readonly');
            const terms = transaction.objectStore('terms');
            const index = terms.index('expression');

            let completeCount = 0;
            for (let i = 0; i < count; ++i) {
                const inputIndex = i;
                const reading = readingList[i];
                const query = IDBKeyRange.only(termList[i]);

                const onGetAll = (rows) => {
                    for (const row of rows) {
                        if (row.reading === reading && dictionaries.has(row.dictionary)) {
                            results.push(this._createTerm(row, inputIndex));
                        }
                    }
                    if (++completeCount >= count) {
                        resolve(results);
                    }
                };

                this._db.getAll(index, query, onGetAll, reject);
            }
        });
    }

    /**
     * Looks up terms by sequence number within a single dictionary.
     * @param sequenceList Array of sequence values.
     * @param mainDictionary The only dictionary title whose rows are returned.
     * @returns A Promise of term objects.
     */
    findTermsBySequenceBulk(sequenceList, mainDictionary) {
        return new Promise((resolve, reject) => {
            const results = [];
            const count = sequenceList.length;
            if (count === 0) {
                resolve(results);
                return;
            }

            const transaction = this._db.transaction(['terms'], 'readonly');
            const terms = transaction.objectStore('terms');
            const index = terms.index('sequence');

            let completeCount = 0;
            for (let i = 0; i < count; ++i) {
                const inputIndex = i;
                const query = IDBKeyRange.only(sequenceList[i]);

                const onGetAll = (rows) => {
                    for (const row of rows) {
                        if (row.dictionary === mainDictionary) {
                            results.push(this._createTerm(row, inputIndex));
                        }
                    }
                    if (++completeCount >= count) {
                        resolve(results);
                    }
                };

                this._db.getAll(index, query, onGetAll, reject);
            }
        });
    }

    /** Looks up term metadata by expression; see _findGenericBulk(). */
    findTermMetaBulk(termList, dictionaries) {
        return this._findGenericBulk('termMeta', 'expression', termList, dictionaries, this._createTermMeta.bind(this));
    }

    /** Looks up kanji by character; see _findGenericBulk(). */
    findKanjiBulk(kanjiList, dictionaries) {
        return this._findGenericBulk('kanji', 'character', kanjiList, dictionaries, this._createKanji.bind(this));
    }

    /** Looks up kanji metadata by character; see _findGenericBulk(). */
    findKanjiMetaBulk(kanjiList, dictionaries) {
        return this._findGenericBulk('kanjiMeta', 'character', kanjiList, dictionaries, this._createKanjiMeta.bind(this));
    }

    /**
     * Finds the tag metadata entry with the given name in the given dictionary.
     * @param name The tag name.
     * @param title The dictionary title.
     * @returns The result of Database.find() with a default value of null.
     */
    findTagForTitle(name, title) {
        const query = IDBKeyRange.only(name);
        return this._db.find('tagMeta', 'name', query, (row) => (row.dictionary === title), null);
    }

    /**
     * Fetches media entries.
     * @param targets Array of {path, dictionaryName} objects.
     * @returns A Promise of an array parallel to targets; each slot is the
     *   matching media row (with an added index) or null if none was found.
     */
    getMedia(targets) {
        return new Promise((resolve, reject) => {
            const count = targets.length;
            const results = new Array(count).fill(null);
            if (count === 0) {
                resolve(results);
                return;
            }

            let completeCount = 0;
            const transaction = this._db.transaction(['media'], 'readonly');
            const objectStore = transaction.objectStore('media');
            const index = objectStore.index('path');

            for (let i = 0; i < count; ++i) {
                const inputIndex = i;
                const {path, dictionaryName} = targets[i];
                const query = IDBKeyRange.only(path);

                const onGetAll = (rows) => {
                    for (const row of rows) {
                        if (row.dictionary !== dictionaryName) { continue; }
                        results[inputIndex] = this._createMedia(row, inputIndex);
                    }
                    if (++completeCount >= count) {
                        resolve(results);
                    }
                };

                this._db.getAll(index, query, onGetAll, reject);
            }
        });
    }

    /**
     * @returns A Promise of every row in the 'dictionaries' object store.
     */
    getDictionaryInfo() {
        return new Promise((resolve, reject) => {
            const transaction = this._db.transaction(['dictionaries'], 'readonly');
            const objectStore = transaction.objectStore('dictionaries');
            this._db.getAll(objectStore, null, resolve, reject);
        });
    }

    /**
     * Counts the records of each data store.
     * @param dictionaryNames Array of dictionary titles to count records for.
     * @param getTotal Whether to also count all records regardless of dictionary.
     * @returns A Promise of {total, counts}: `counts` has one object per
     *   dictionary name mapping store name to count; `total` has the same
     *   shape for the whole database, or is null when getTotal is falsy.
     */
    getDictionaryCounts(dictionaryNames, getTotal) {
        return new Promise((resolve, reject) => {
            const targets = [
                ['kanji', 'dictionary'],
                ['kanjiMeta', 'dictionary'],
                ['terms', 'dictionary'],
                ['termMeta', 'dictionary'],
                ['tagMeta', 'dictionary'],
                ['media', 'dictionary']
            ];
            const objectStoreNames = targets.map(([objectStoreName]) => objectStoreName);
            const transaction = this._db.transaction(objectStoreNames, 'readonly');
            const databaseTargets = targets.map(([objectStoreName, indexName]) => {
                const objectStore = transaction.objectStore(objectStoreName);
                const index = objectStore.index(indexName);
                return {objectStore, index};
            });

            // The total group (if requested) comes first, followed by one group per dictionary.
            const countTargets = [];
            if (getTotal) {
                for (const {objectStore} of databaseTargets) {
                    countTargets.push([objectStore, null]);
                }
            }
            for (const dictionaryName of dictionaryNames) {
                const query = IDBKeyRange.only(dictionaryName);
                for (const {index} of databaseTargets) {
                    countTargets.push([index, query]);
                }
            }

            const onCountComplete = (results) => {
                const resultCount = results.length;
                const targetCount = targets.length;
                const counts = [];
                for (let i = 0; i < resultCount; i += targetCount) {
                    const countGroup = {};
                    for (let j = 0; j < targetCount; ++j) {
                        countGroup[targets[j][0]] = results[i + j];
                    }
                    counts.push(countGroup);
                }
                const total = getTotal ? counts.shift() : null;
                resolve({total, counts});
            };

            this._db.bulkCount(countTargets, onCountComplete, reject);
        });
    }

    /**
     * @param title A dictionary title.
     * @returns true if Database.find() returns something other than undefined
     *   for the title in the 'dictionaries' store.
     */
    async dictionaryExists(title) {
        const query = IDBKeyRange.only(title);
        const result = await this._db.find('dictionaries', 'title', query);
        return typeof result !== 'undefined';
    }

    /**
     * Forwards to Database.bulkAdd() for the given store and item range.
     */
    bulkAdd(objectStoreName, items, start, count) {
        return this._db.bulkAdd(objectStoreName, items, start, count);
    }

    // Private

    /**
     * Shared implementation of the exact-match bulk lookups.
     * @param objectStoreName The object store to query.
     * @param indexName The index to match indexValueList against.
     * @param indexValueList Array of values to look up.
     * @param dictionaries A collection with a has() method used to filter rows.
     * @param createResult Function (row, inputIndex) => result.
     * @returns A Promise of the created results.
     */
    _findGenericBulk(objectStoreName, indexName, indexValueList, dictionaries, createResult) {
        return new Promise((resolve, reject) => {
            const results = [];
            const count = indexValueList.length;
            if (count === 0) {
                resolve(results);
                return;
            }

            const transaction = this._db.transaction([objectStoreName], 'readonly');
            const terms = transaction.objectStore(objectStoreName);
            const index = terms.index(indexName);

            let completeCount = 0;
            for (let i = 0; i < count; ++i) {
                const inputIndex = i;
                const query = IDBKeyRange.only(indexValueList[i]);

                const onGetAll = (rows) => {
                    for (const row of rows) {
                        if (dictionaries.has(row.dictionary)) {
                            results.push(createResult(row, inputIndex));
                        }
                    }
                    if (++completeCount >= count) {
                        resolve(results);
                    }
                };

                this._db.getAll(index, query, onGetAll, reject);
            }
        });
    }

    /**
     * Converts a 'terms' row into the public term structure.
     * Definition tags fall back to the row's `tags` field, and a missing
     * sequence is reported as -1.
     */
    _createTerm(row, index) {
        return {
            index,
            expression: row.expression,
            reading: row.reading,
            definitionTags: this._splitField(row.definitionTags || row.tags || ''),
            termTags: this._splitField(row.termTags || ''),
            rules: this._splitField(row.rules),
            glossary: row.glossary,
            score: row.score,
            dictionary: row.dictionary,
            id: row.id,
            sequence: typeof row.sequence === 'undefined' ? -1 : row.sequence
        };
    }

    /** Converts a 'kanji' row into the public kanji structure. */
    _createKanji(row, index) {
        return {
            index,
            character: row.character,
            onyomi: this._splitField(row.onyomi),
            kunyomi: this._splitField(row.kunyomi),
            tags: this._splitField(row.tags),
            glossary: row.meanings,
            stats: row.stats,
            dictionary: row.dictionary
        };
    }

    _createTermMeta({expression, mode, data, dictionary}, index) {
        return {expression, mode, data, dictionary, index};
    }

    _createKanjiMeta({character, mode, data, dictionary}, index) {
        return {character, mode, data, dictionary, index};
    }

    /** Returns a shallow copy of a media row with the input index added. */
    _createMedia(row, index) {
        return Object.assign({}, row, {index});
    }

    /**
     * Splits a space-separated field into its parts.
     * @param field The stored field value.
     * @returns An array of parts; empty for an empty string and also for a
     *   missing or non-string value (which previously threw a TypeError).
     */
    _splitField(field) {
        return (typeof field === 'string' && field.length > 0) ? field.split(' ') : [];
    }
}
// diff --git a/ext/js/language/dictionary-importer.js b/ext/js/language/dictionary-importer.js new file mode 100644 index 00000000..4cb608db --- /dev/null +++ b/ext/js/language/dictionary-importer.js @@ -0,0 +1,407 @@ +/* + * Copyright (C) 2020-2021 Yomichan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>.
// + */

/* global
 * JSZip
 * JsonSchemaValidator
 * MediaUtility
 */

/**
 * Reads a dictionary archive, validates its contents against the JSON schemas
 * and writes the converted data into a DictionaryDatabase.
 */
class DictionaryImporter {
    constructor() {
        this._schemas = new Map();
        this._jsonSchemaValidator = new JsonSchemaValidator();
        this._mediaUtility = new MediaUtility();
    }

    /**
     * Imports a dictionary archive into the database.
     * @param dictionaryDatabase A prepared DictionaryDatabase instance.
     * @param archiveSource The archive data, passed to JSZip.loadAsync().
     * @param details Optional object; `prefixWildcardsSupported` enables storage
     *   of reversed expression/reading strings.
     * @param onProgress Optional callback invoked as onProgress(total, loadedCount)
     *   after every chunk is written.
     * @returns {result, errors}, where `result` is the stored dictionary summary
     *   and `errors` lists the errors raised while writing data chunks.
     * @throws If the database is unusable, the archive is invalid, the dictionary
     *   already exists, or the dictionary summary cannot be stored.
     */
    async importDictionary(dictionaryDatabase, archiveSource, details, onProgress) {
        if (!dictionaryDatabase) {
            throw new Error('Invalid database');
        }
        if (!dictionaryDatabase.isPrepared()) {
            throw new Error('Database is not ready');
        }

        const hasOnProgress = (typeof onProgress === 'function');

        // Read archive
        const archive = await JSZip.loadAsync(archiveSource);

        // Read and validate index
        const indexFileName = 'index.json';
        const indexFile = archive.files[indexFileName];
        if (!indexFile) {
            throw new Error('No dictionary index found in archive');
        }

        const index = JSON.parse(await indexFile.async('string'));

        const indexSchema = await this._getSchema('/data/schemas/dictionary-index-schema.json');
        this._validateJsonSchema(index, indexSchema, indexFileName);

        const dictionaryTitle = index.title;
        const version = index.format || index.version;

        if (!dictionaryTitle || !index.revision) {
            throw new Error('Unrecognized dictionary format');
        }

        // Verify database is not already imported
        if (await dictionaryDatabase.dictionaryExists(dictionaryTitle)) {
            throw new Error('Dictionary is already imported');
        }

        // Data format converters
        const convertTermBankEntry = (entry) => {
            if (version === 1) {
                const [expression, reading, definitionTags, rules, score, ...glossary] = entry;
                return {expression, reading, definitionTags, rules, score, glossary};
            } else {
                const [expression, reading, definitionTags, rules, score, glossary, sequence, termTags] = entry;
                return {expression, reading, definitionTags, rules, score, glossary, sequence, termTags};
            }
        };

        const convertTermMetaBankEntry = (entry) => {
            const [expression, mode, data] = entry;
            return {expression, mode, data};
        };

        const convertKanjiBankEntry = (entry) => {
            if (version === 1) {
                const [character, onyomi, kunyomi, tags, ...meanings] = entry;
                return {character, onyomi, kunyomi, tags, meanings};
            } else {
                const [character, onyomi, kunyomi, tags, meanings, stats] = entry;
                return {character, onyomi, kunyomi, tags, meanings, stats};
            }
        };

        const convertKanjiMetaBankEntry = (entry) => {
            const [character, mode, data] = entry;
            return {character, mode, data};
        };

        const convertTagBankEntry = (entry) => {
            const [name, category, order, notes, score] = entry;
            return {name, category, order, notes, score};
        };

        // Archive file reading
        // Files are numbered from 1; reading stops at the first missing number.
        const readFileSequence = async (fileNameFormat, convertEntry, schema) => {
            const results = [];
            for (let i = 1; true; ++i) {
                const fileName = fileNameFormat.replace(/\?/, `${i}`);
                const file = archive.files[fileName];
                if (!file) { break; }

                const entries = JSON.parse(await file.async('string'));
                this._validateJsonSchema(entries, schema, fileName);

                for (let entry of entries) {
                    entry = convertEntry(entry);
                    entry.dictionary = dictionaryTitle;
                    results.push(entry);
                }
            }
            return results;
        };

        // Load schemas
        const dataBankSchemaPaths = this._getDataBankSchemaPaths(version);
        const dataBankSchemas = await Promise.all(dataBankSchemaPaths.map((path) => this._getSchema(path)));

        // Load data
        const termList      = await readFileSequence('term_bank_?.json',       convertTermBankEntry,      dataBankSchemas[0]);
        const termMetaList  = await readFileSequence('term_meta_bank_?.json',  convertTermMetaBankEntry,  dataBankSchemas[1]);
        const kanjiList     = await readFileSequence('kanji_bank_?.json',      convertKanjiBankEntry,     dataBankSchemas[2]);
        const kanjiMetaList = await readFileSequence('kanji_meta_bank_?.json', convertKanjiMetaBankEntry, dataBankSchemas[3]);
        const tagList       = await readFileSequence('tag_bank_?.json',        convertTagBankEntry,       dataBankSchemas[4]);

        // Old tags
        const indexTagMeta = index.tagMeta;
        if (typeof indexTagMeta === 'object' && indexTagMeta !== null) {
            for (const name of Object.keys(indexTagMeta)) {
                const {category, order, notes, score} = indexTagMeta[name];
                tagList.push({name, category, order, notes, score});
            }
        }

        // Prefix wildcard support
        // `details` may be omitted entirely.
        const prefixWildcardsSupported = !!(details && details.prefixWildcardsSupported);
        if (prefixWildcardsSupported) {
            for (const entry of termList) {
                entry.expressionReverse = stringReverse(entry.expression);
                entry.readingReverse = stringReverse(entry.reading);
            }
        }

        // Extended data support
        const extendedDataContext = {
            archive,
            media: new Map()
        };
        for (const entry of termList) {
            const glossaryList = entry.glossary;
            for (let i = 0, ii = glossaryList.length; i < ii; ++i) {
                const glossary = glossaryList[i];
                if (typeof glossary !== 'object' || glossary === null) { continue; }
                glossaryList[i] = await this._formatDictionaryTermGlossaryObject(glossary, extendedDataContext, entry);
            }
        }

        const media = [...extendedDataContext.media.values()];

        // Add dictionary
        const summary = this._createSummary(dictionaryTitle, version, index, {prefixWildcardsSupported});

        // Awaited so that a failure rejects the import instead of becoming an
        // unhandled rejection, and so no data is written before its summary.
        await dictionaryDatabase.bulkAdd('dictionaries', [summary], 0, 1);

        // Add data
        const errors = [];
        // Includes media so that loadedCount never exceeds the reported total.
        const total = (
            termList.length +
            termMetaList.length +
            kanjiList.length +
            kanjiMetaList.length +
            tagList.length +
            media.length
        );
        let loadedCount = 0;
        const maxTransactionLength = 1000;

        const bulkAdd = async (objectStoreName, entries) => {
            const ii = entries.length;
            for (let i = 0; i < ii; i += maxTransactionLength) {
                const count = Math.min(maxTransactionLength, ii - i);

                try {
                    await dictionaryDatabase.bulkAdd(objectStoreName, entries, i, count);
                } catch (e) {
                    // Best effort: a failed chunk is recorded and the import continues.
                    errors.push(e);
                }

                loadedCount += count;
                if (hasOnProgress) {
                    onProgress(total, loadedCount);
                }
            }
        };

        await bulkAdd('terms', termList);
        await bulkAdd('termMeta', termMetaList);
        await bulkAdd('kanji', kanjiList);
        await bulkAdd('kanjiMeta', kanjiMetaList);
        await bulkAdd('tagMeta', tagList);
        await bulkAdd('media', media);

        return {result: summary, errors};
    }

    /**
     * Builds the record stored in the 'dictionaries' object store.
     * Optional string fields of the index are copied only when present, and
     * `details` is merged in last.
     */
    _createSummary(dictionaryTitle, version, index, details) {
        const summary = {
            title: dictionaryTitle,
            revision: index.revision,
            sequenced: index.sequenced,
            version
        };

        const {author, url, description, attribution} = index;
        if (typeof author === 'string') { summary.author = author; }
        if (typeof url === 'string') { summary.url = url; }
        if (typeof description === 'string') { summary.description = description; }
        if (typeof attribution === 'string') { summary.attribution = attribution; }

        Object.assign(summary, details);

        return summary;
    }

    /**
     * Returns the schema for a path, fetching it at most once per instance
     * (the pending promise itself is cached).
     */
    async _getSchema(fileName) {
        let schemaPromise = this._schemas.get(fileName);
        if (typeof schemaPromise !== 'undefined') {
            return schemaPromise;
        }

        schemaPromise = this._fetchJsonAsset(fileName);
        this._schemas.set(fileName, schemaPromise);
        return schemaPromise;
    }

    /**
     * Validates a value against a schema.
     * @throws An error created by _formatSchemaError() if validation fails.
     */
    _validateJsonSchema(value, schema, fileName) {
        try {
            this._jsonSchemaValidator.validate(value, schema);
        } catch (e) {
            throw this._formatSchemaError(e, fileName);
        }
    }

    /**
     * Wraps a validator error in an error naming the offending file and paths;
     * the original error is kept on the `data` property.
     */
    _formatSchemaError(e, fileName) {
        const valuePathString = this._getSchemaErrorPathString(e.info.valuePath, 'dictionary');
        const schemaPathString = this._getSchemaErrorPathString(e.info.schemaPath, 'schema');

        const e2 = new Error(`Dictionary has invalid data in '${fileName}' for value '${valuePathString}', validated against '${schemaPathString}': ${e.message}`);
        e2.data = e;

        return e2;
    }

    /**
     * Joins path parts into a string such as `dictionary.a[0].b`.
     * @param infoList Iterable of arrays whose first item is a path part;
     *   strings are joined with '.', numbers are written as '[n]', other
     *   types are ignored.
     * @param base The prefix of the resulting string.
     */
    _getSchemaErrorPathString(infoList, base='') {
        let result = base;
        for (const [part] of infoList) {
            switch (typeof part) {
                case 'string':
                    if (result.length > 0) {
                        result += '.';
                    }
                    result += part;
                    break;
                case 'number':
                    result += `[${part}]`;
                    break;
            }
        }
        return result;
    }

    /**
     * @returns The schema paths for [termBank, termMetaBank, kanjiBank,
     *   kanjiMetaBank, tagBank]; only version 1 uses the v1 term/kanji schemas.
     */
    _getDataBankSchemaPaths(version) {
        const termBank = (
            version === 1 ?
            '/data/schemas/dictionary-term-bank-v1-schema.json' :
            '/data/schemas/dictionary-term-bank-v3-schema.json'
        );
        const termMetaBank = '/data/schemas/dictionary-term-meta-bank-v3-schema.json';
        const kanjiBank = (
            version === 1 ?
            '/data/schemas/dictionary-kanji-bank-v1-schema.json' :
            '/data/schemas/dictionary-kanji-bank-v3-schema.json'
        );
        const kanjiMetaBank = '/data/schemas/dictionary-kanji-meta-bank-v3-schema.json';
        const tagBank = '/data/schemas/dictionary-tag-bank-v3-schema.json';

        return [termBank, termMetaBank, kanjiBank, kanjiMetaBank, tagBank];
    }

    /**
     * Converts a structured glossary item into its stored form.
     * @throws If the item's type is not 'text' or 'image'.
     */
    async _formatDictionaryTermGlossaryObject(data, context, entry) {
        switch (data.type) {
            case 'text':
                return data.text;
            case 'image':
                return await this._formatDictionaryTermGlossaryImage(data, context, entry);
            default:
                throw new Error(`Unhandled data type: ${data.type}`);
        }
    }

    /**
     * Converts an image glossary item into its stored form and registers the
     * image content in context.media (once per path).
     * Every reference to the same path yields the same normalized structure;
     * previously a repeated path returned the raw, unconverted item.
     * @returns {type, path, width, height, ...} where width/height are the
     *   image's natural size and the optional preferredWidth, preferredHeight,
     *   title, description and pixelated come from the glossary item.
     */
    async _formatDictionaryTermGlossaryImage(data, context, entry) {
        const {path, width: preferredWidth, height: preferredHeight, title, description, pixelated} = data;

        let mediaData = context.media.get(path);
        if (typeof mediaData === 'undefined') {
            mediaData = await this._createImageMediaData(path, context, entry);
            context.media.set(path, mediaData);
        }
        const {width, height} = mediaData;

        // Create new data
        const newData = {
            type: 'image',
            path,
            width,
            height
        };
        if (typeof preferredWidth === 'number') { newData.preferredWidth = preferredWidth; }
        if (typeof preferredHeight === 'number') { newData.preferredHeight = preferredHeight; }
        if (typeof title === 'string') { newData.title = title; }
        if (typeof description === 'string') { newData.description = description; }
        if (typeof pixelated === 'boolean') { newData.pixelated = pixelated; }

        return newData;
    }

    /**
     * Loads an image from the archive and creates its 'media' store record.
     * @throws If the file is missing, its media type cannot be determined from
     *   the file name, or the image fails to load.
     */
    async _createImageMediaData(path, context, entry) {
        const dictionary = entry.dictionary;

        let errorSource = entry.expression;
        if (entry.reading.length > 0) {
            errorSource += ` (${entry.reading})`;
        }

        const file = context.archive.file(path);
        if (file === null) {
            throw new Error(`Could not find image at path ${JSON.stringify(path)} for ${errorSource}`);
        }

        const content = await file.async('base64');
        const mediaType = this._mediaUtility.getImageMediaTypeFromFileName(path);
        if (mediaType === null) {
            throw new Error(`Could not determine media type for image at path ${JSON.stringify(path)} for ${errorSource}`);
        }

        let image;
        try {
            image = await this._loadImageBase64(mediaType, content);
        } catch (e) {
            const error = new Error(`Could not load image at path ${JSON.stringify(path)} for ${errorSource}`);
            error.data = e; // Keep the underlying cause, as _formatSchemaError() does
            throw error;
        }

        return {
            dictionary,
            path,
            mediaType,
            width: image.naturalWidth,
            height: image.naturalHeight,
            content
        };
    }

    /**
     * Fetches and parses a JSON file bundled with the extension.
     * @throws If the response status is not OK.
     */
    async _fetchJsonAsset(url) {
        const response = await fetch(chrome.runtime.getURL(url), {
            method: 'GET',
            mode: 'no-cors',
            cache: 'default',
            credentials: 'omit',
            redirect: 'follow',
            referrerPolicy: 'no-referrer'
        });
        if (!response.ok) {
            throw new Error(`Failed to fetch ${url}: ${response.status}`);
        }
        return await response.json();
    }

    /**
     * Attempts to load an image using a base64 encoded content and a media type.
     * @param mediaType The media type for the image content.
     * @param content The binary content for the image, encoded in base64.
     * @returns A Promise which resolves with an HTMLImageElement instance on
     *   successful load, otherwise an error is thrown.
     */
    _loadImageBase64(mediaType, content) {
        return new Promise((resolve, reject) => {
            const image = new Image();
            const eventListeners = new EventListenerCollection();
            eventListeners.addEventListener(image, 'load', () => {
                eventListeners.removeAllEventListeners();
                resolve(image);
            }, false);
            eventListeners.addEventListener(image, 'error', () => {
                eventListeners.removeAllEventListeners();
                reject(new Error('Image failed to load'));
            }, false);
            image.src = `data:${mediaType};base64,${content}`;
        });
    }
}
// diff --git a/ext/js/language/translator.js b/ext/js/language/translator.js new file mode 100644 index 00000000..729c8294 --- /dev/null +++ b/ext/js/language/translator.js @@ -0,0 +1,1397 @@ +/* + * Copyright (C) 2016-2021 Yomichan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +/* global + * Deinflector + * TextSourceMap + */ + +/** + * Class which finds term and kanji definitions for text. + */ +class Translator { + /** + * Creates a new Translator instance. + * @param database An instance of DictionaryDatabase. + */ + constructor({japaneseUtil, database}) { + this._japaneseUtil = japaneseUtil; + this._database = database; + this._deinflector = null; + this._tagCache = new Map(); + this._stringComparer = new Intl.Collator('en-US'); // Invariant locale + } + + /** + * Initializes the instance for use.
The public API should not be used until + * this function has been called. + * @param deinflectionReasons The raw deinflections reasons data that the Deinflector uses. + */ + prepare(deinflectionReasons) { + this._deinflector = new Deinflector(deinflectionReasons); + } + + /** + * Clears the database tag cache. This should be executed if the database is changed. + */ + clearDatabaseCaches() { + this._tagCache.clear(); + } + + /** + * Finds term definitions for the given text. + * @param mode The mode to use for finding terms, which determines the format of the resulting array. + * One of: 'group', 'merge', 'split', 'simple' + * @param text The text to find terms for. + * @param options An object using the following structure: + * { + * wildcard: (enum: null, 'prefix', 'suffix'), + * mainDictionary: (string), + * alphanumeric: (boolean), + * convertHalfWidthCharacters: (enum: 'false', 'true', 'variant'), + * convertNumericCharacters: (enum: 'false', 'true', 'variant'), + * convertAlphabeticCharacters: (enum: 'false', 'true', 'variant'), + * convertHiraganaToKatakana: (enum: 'false', 'true', 'variant'), + * convertKatakanaToHiragana: (enum: 'false', 'true', 'variant'), + * collapseEmphaticSequences: (enum: 'false', 'true', 'full'), + * textReplacements: [ + * (null or [ + * {pattern: (RegExp), replacement: (string)} + * ... + * ]) + * ... + * ], + * enabledDictionaryMap: (Map of [ + * (string), + * { + * priority: (number), + * allowSecondarySearches: (boolean) + * } + * ]) + * } + * @returns An array of [definitions, textLength]. The structure of each definition depends on the + * mode parameter, see the _create?TermDefinition?() functions for structure details. 
+ */ + async findTerms(mode, text, options) { + switch (mode) { + case 'group': + return await this._findTermsGrouped(text, options); + case 'merge': + return await this._findTermsMerged(text, options); + case 'split': + return await this._findTermsSplit(text, options); + case 'simple': + return await this._findTermsSimple(text, options); + default: + return [[], 0]; + } + } + + /** + * Finds kanji definitions for the given text. + * @param text The text to find kanji definitions for. This string can be of any length, + * but is typically just one character, which is a single kanji. If the string is multiple + * characters long, each character will be searched in the database. + * @param options An object using the following structure: + * { + * enabledDictionaryMap: (Map of [ + * (string), + * { + * priority: (number) + * } + * ]) + * } + * @returns An array of definitions. See the _createKanjiDefinition() function for structure details. + */ + async findKanji(text, options) { + const {enabledDictionaryMap} = options; + const kanjiUnique = new Set(); + for (const c of text) { + kanjiUnique.add(c); + } + + const databaseDefinitions = await this._database.findKanjiBulk([...kanjiUnique], enabledDictionaryMap); + if (databaseDefinitions.length === 0) { return []; } + + this._sortDatabaseDefinitionsByIndex(databaseDefinitions); + + const definitions = []; + for (const {character, onyomi, kunyomi, tags, glossary, stats, dictionary} of databaseDefinitions) { + const expandedStats = await this._expandStats(stats, dictionary); + const expandedTags = await this._expandTags(tags, dictionary); + this._sortTags(expandedTags); + + const definition = this._createKanjiDefinition(character, dictionary, onyomi, kunyomi, glossary, expandedTags, expandedStats); + definitions.push(definition); + } + + await this._buildKanjiMeta(definitions, enabledDictionaryMap); + + return definitions; + } + + // Find terms core functions + + async _findTermsSimple(text, options) { + const 
{enabledDictionaryMap} = options; + const [definitions, length] = await this._findTermsInternal(text, enabledDictionaryMap, options); + this._sortDefinitions(definitions, false); + return [definitions, length]; + } + + async _findTermsSplit(text, options) { + const {enabledDictionaryMap} = options; + const [definitions, length] = await this._findTermsInternal(text, enabledDictionaryMap, options); + await this._buildTermMeta(definitions, enabledDictionaryMap); + this._sortDefinitions(definitions, true); + return [definitions, length]; + } + + async _findTermsGrouped(text, options) { + const {enabledDictionaryMap} = options; + const [definitions, length] = await this._findTermsInternal(text, enabledDictionaryMap, options); + + const groupedDefinitions = this._groupTerms(definitions, enabledDictionaryMap); + await this._buildTermMeta(groupedDefinitions, enabledDictionaryMap); + this._sortDefinitions(groupedDefinitions, false); + + for (const definition of groupedDefinitions) { + this._flagRedundantDefinitionTags(definition.definitions); + } + + return [groupedDefinitions, length]; + } + + async _findTermsMerged(text, options) { + const {mainDictionary, enabledDictionaryMap} = options; + const secondarySearchDictionaryMap = this._getSecondarySearchDictionaryMap(enabledDictionaryMap); + + const [definitions, length] = await this._findTermsInternal(text, enabledDictionaryMap, options); + const {sequencedDefinitions, unsequencedDefinitions} = await this._getSequencedDefinitions(definitions, mainDictionary, enabledDictionaryMap); + const definitionsMerged = []; + const usedDefinitions = new Set(); + + for (const {sourceDefinitions, relatedDefinitions} of sequencedDefinitions) { + const result = await this._getMergedDefinition( + sourceDefinitions, + relatedDefinitions, + unsequencedDefinitions, + secondarySearchDictionaryMap, + usedDefinitions + ); + definitionsMerged.push(result); + } + + const unusedDefinitions = unsequencedDefinitions.filter((definition) => 
!usedDefinitions.has(definition)); + for (const groupedDefinition of this._groupTerms(unusedDefinitions, enabledDictionaryMap)) { + const {reasons, score, expression, reading, source, rawSource, sourceTerm, furiganaSegments, termTags, definitions: definitions2} = groupedDefinition; + const termDetailsList = [this._createTermDetails(sourceTerm, expression, reading, furiganaSegments, termTags)]; + const compatibilityDefinition = this._createMergedTermDefinition( + source, + rawSource, + this._convertTermDefinitionsToMergedGlossaryTermDefinitions(definitions2), + [expression], + [reading], + termDetailsList, + reasons, + score + ); + definitionsMerged.push(compatibilityDefinition); + } + + await this._buildTermMeta(definitionsMerged, enabledDictionaryMap); + this._sortDefinitions(definitionsMerged, false); + + for (const definition of definitionsMerged) { + this._flagRedundantDefinitionTags(definition.definitions); + } + + return [definitionsMerged, length]; + } + + // Find terms internal implementation + + async _findTermsInternal(text, enabledDictionaryMap, options) { + const {alphanumeric, wildcard} = options; + text = this._getSearchableText(text, alphanumeric); + if (text.length === 0) { + return [[], 0]; + } + + const deinflections = ( + wildcard ? 
+ await this._findTermWildcard(text, enabledDictionaryMap, wildcard) : + await this._findTermDeinflections(text, enabledDictionaryMap, options) + ); + + let maxLength = 0; + const definitions = []; + for (const {databaseDefinitions, source, rawSource, term, reasons} of deinflections) { + if (databaseDefinitions.length === 0) { continue; } + maxLength = Math.max(maxLength, rawSource.length); + for (const databaseDefinition of databaseDefinitions) { + const definition = await this._createTermDefinitionFromDatabaseDefinition(databaseDefinition, source, rawSource, term, reasons, enabledDictionaryMap); + definitions.push(definition); + } + } + + this._removeDuplicateDefinitions(definitions); + return [definitions, maxLength]; + } + + async _findTermWildcard(text, enabledDictionaryMap, wildcard) { + const databaseDefinitions = await this._database.findTermsBulk([text], enabledDictionaryMap, wildcard); + if (databaseDefinitions.length === 0) { + return []; + } + + return [{ + source: text, + rawSource: text, + term: text, + rules: 0, + reasons: [], + databaseDefinitions + }]; + } + + async _findTermDeinflections(text, enabledDictionaryMap, options) { + const deinflections = this._getAllDeinflections(text, options); + + if (deinflections.length === 0) { + return []; + } + + const uniqueDeinflectionTerms = []; + const uniqueDeinflectionArrays = []; + const uniqueDeinflectionsMap = new Map(); + for (const deinflection of deinflections) { + const term = deinflection.term; + let deinflectionArray = uniqueDeinflectionsMap.get(term); + if (typeof deinflectionArray === 'undefined') { + deinflectionArray = []; + uniqueDeinflectionTerms.push(term); + uniqueDeinflectionArrays.push(deinflectionArray); + uniqueDeinflectionsMap.set(term, deinflectionArray); + } + deinflectionArray.push(deinflection); + } + + const databaseDefinitions = await this._database.findTermsBulk(uniqueDeinflectionTerms, enabledDictionaryMap, null); + + for (const databaseDefinition of databaseDefinitions) { + 
const definitionRules = Deinflector.rulesToRuleFlags(databaseDefinition.rules); + for (const deinflection of uniqueDeinflectionArrays[databaseDefinition.index]) { + const deinflectionRules = deinflection.rules; + if (deinflectionRules === 0 || (definitionRules & deinflectionRules) !== 0) { + deinflection.databaseDefinitions.push(databaseDefinition); + } + } + } + + return deinflections; + } + + _getAllDeinflections(text, options) { + const textOptionVariantArray = [ + this._getTextReplacementsVariants(options), + this._getTextOptionEntryVariants(options.convertHalfWidthCharacters), + this._getTextOptionEntryVariants(options.convertNumericCharacters), + this._getTextOptionEntryVariants(options.convertAlphabeticCharacters), + this._getTextOptionEntryVariants(options.convertHiraganaToKatakana), + this._getTextOptionEntryVariants(options.convertKatakanaToHiragana), + this._getCollapseEmphaticOptions(options) + ]; + + const jp = this._japaneseUtil; + const deinflections = []; + const used = new Set(); + for (const [textReplacements, halfWidth, numeric, alphabetic, katakana, hiragana, [collapseEmphatic, collapseEmphaticFull]] of this._getArrayVariants(textOptionVariantArray)) { + let text2 = text; + const sourceMap = new TextSourceMap(text2); + if (textReplacements !== null) { + text2 = this._applyTextReplacements(text2, sourceMap, textReplacements); + } + if (halfWidth) { + text2 = jp.convertHalfWidthKanaToFullWidth(text2, sourceMap); + } + if (numeric) { + text2 = jp.convertNumericToFullWidth(text2); + } + if (alphabetic) { + text2 = jp.convertAlphabeticToKana(text2, sourceMap); + } + if (katakana) { + text2 = jp.convertHiraganaToKatakana(text2); + } + if (hiragana) { + text2 = jp.convertKatakanaToHiragana(text2); + } + if (collapseEmphatic) { + text2 = jp.collapseEmphaticSequences(text2, collapseEmphaticFull, sourceMap); + } + + for (let i = text2.length; i > 0; --i) { + const text2Substring = text2.substring(0, i); + if (used.has(text2Substring)) { break; } + 
used.add(text2Substring); + const rawSource = sourceMap.source.substring(0, sourceMap.getSourceLength(i)); + for (const deinflection of this._deinflector.deinflect(text2Substring, rawSource)) { + deinflections.push(deinflection); + } + } + } + return deinflections; + } + + async _getSequencedDefinitions(definitions, mainDictionary, enabledDictionaryMap) { + const sequenceList = []; + const sequencedDefinitionMap = new Map(); + const sequencedDefinitions = []; + const unsequencedDefinitions = []; + for (const definition of definitions) { + const {sequence, dictionary} = definition; + if (mainDictionary === dictionary && sequence >= 0) { + let sequencedDefinition = sequencedDefinitionMap.get(sequence); + if (typeof sequencedDefinition === 'undefined') { + sequencedDefinition = { + sourceDefinitions: [], + relatedDefinitions: [], + relatedDefinitionIds: new Set() + }; + sequencedDefinitionMap.set(sequence, sequencedDefinition); + sequencedDefinitions.push(sequencedDefinition); + sequenceList.push(sequence); + } + sequencedDefinition.sourceDefinitions.push(definition); + sequencedDefinition.relatedDefinitions.push(definition); + sequencedDefinition.relatedDefinitionIds.add(definition.id); + } else { + unsequencedDefinitions.push(definition); + } + } + + if (sequenceList.length > 0) { + const databaseDefinitions = await this._database.findTermsBySequenceBulk(sequenceList, mainDictionary); + for (const databaseDefinition of databaseDefinitions) { + const {relatedDefinitions, relatedDefinitionIds} = sequencedDefinitions[databaseDefinition.index]; + const {id} = databaseDefinition; + if (relatedDefinitionIds.has(id)) { continue; } + + const {source, rawSource, sourceTerm} = relatedDefinitions[0]; + const definition = await this._createTermDefinitionFromDatabaseDefinition(databaseDefinition, source, rawSource, sourceTerm, [], enabledDictionaryMap); + relatedDefinitions.push(definition); + } + } + + for (const {relatedDefinitions} of sequencedDefinitions) { + 
this._sortDefinitionsById(relatedDefinitions); + } + + return {sequencedDefinitions, unsequencedDefinitions}; + } + + async _getMergedSecondarySearchResults(expressionsMap, secondarySearchDictionaryMap) { + if (secondarySearchDictionaryMap.size === 0) { + return []; + } + + const expressionList = []; + const readingList = []; + for (const [expression, readingMap] of expressionsMap.entries()) { + for (const reading of readingMap.keys()) { + expressionList.push(expression); + readingList.push(reading); + } + } + + const databaseDefinitions = await this._database.findTermsExactBulk(expressionList, readingList, secondarySearchDictionaryMap); + this._sortDatabaseDefinitionsByIndex(databaseDefinitions); + + const definitions = []; + for (const databaseDefinition of databaseDefinitions) { + const source = expressionList[databaseDefinition.index]; + const definition = await this._createTermDefinitionFromDatabaseDefinition(databaseDefinition, source, source, source, [], secondarySearchDictionaryMap); + definitions.push(definition); + } + + return definitions; + } + + async _getMergedDefinition(sourceDefinitions, relatedDefinitions, unsequencedDefinitions, secondarySearchDictionaryMap, usedDefinitions) { + const {reasons, source, rawSource} = sourceDefinitions[0]; + const score = this._getMaxDefinitionScore(sourceDefinitions); + const termInfoMap = new Map(); + const glossaryDefinitions = []; + const glossaryDefinitionGroupMap = new Map(); + + this._mergeByGlossary(relatedDefinitions, glossaryDefinitionGroupMap); + this._addUniqueTermInfos(relatedDefinitions, termInfoMap); + + let secondaryDefinitions = await this._getMergedSecondarySearchResults(termInfoMap, secondarySearchDictionaryMap); + secondaryDefinitions = [...unsequencedDefinitions, ...secondaryDefinitions]; + + this._removeUsedDefinitions(secondaryDefinitions, termInfoMap, usedDefinitions); + this._removeDuplicateDefinitions(secondaryDefinitions); + + this._mergeByGlossary(secondaryDefinitions, 
glossaryDefinitionGroupMap); + + const allExpressions = new Set(); + const allReadings = new Set(); + for (const {expressions, readings} of glossaryDefinitionGroupMap.values()) { + for (const expression of expressions) { allExpressions.add(expression); } + for (const reading of readings) { allReadings.add(reading); } + } + + for (const {expressions, readings, definitions} of glossaryDefinitionGroupMap.values()) { + const glossaryDefinition = this._createMergedGlossaryTermDefinition( + source, + rawSource, + definitions, + expressions, + readings, + allExpressions, + allReadings + ); + glossaryDefinitions.push(glossaryDefinition); + } + + this._sortDefinitions(glossaryDefinitions, true); + + const termDetailsList = this._createTermDetailsListFromTermInfoMap(termInfoMap); + + return this._createMergedTermDefinition( + source, + rawSource, + glossaryDefinitions, + [...allExpressions], + [...allReadings], + termDetailsList, + reasons, + score + ); + } + + _removeUsedDefinitions(definitions, termInfoMap, usedDefinitions) { + for (let i = 0, ii = definitions.length; i < ii; ++i) { + const definition = definitions[i]; + const {expression, reading} = definition; + const expressionMap = termInfoMap.get(expression); + if ( + typeof expressionMap !== 'undefined' && + typeof expressionMap.get(reading) !== 'undefined' + ) { + usedDefinitions.add(definition); + } else { + definitions.splice(i, 1); + --i; + --ii; + } + } + } + + _getUniqueDefinitionTags(definitions) { + const definitionTagsMap = new Map(); + for (const {definitionTags} of definitions) { + for (const tag of definitionTags) { + const {name} = tag; + if (definitionTagsMap.has(name)) { continue; } + definitionTagsMap.set(name, this._cloneTag(tag)); + } + } + return [...definitionTagsMap.values()]; + } + + _removeDuplicateDefinitions(definitions) { + const definitionGroups = new Map(); + for (let i = 0, ii = definitions.length; i < ii; ++i) { + const definition = definitions[i]; + const {id} = definition; + const 
existing = definitionGroups.get(id); + if (typeof existing === 'undefined') { + definitionGroups.set(id, [i, definition]); + continue; + } + + let removeIndex = i; + if (definition.source.length > existing[1].source.length) { + definitionGroups.set(id, [i, definition]); + removeIndex = existing[0]; + } + + definitions.splice(removeIndex, 1); + --i; + --ii; + } + } + + _flagRedundantDefinitionTags(definitions) { + let lastDictionary = null; + let lastPartOfSpeech = ''; + const removeCategoriesSet = new Set(); + + for (const {dictionary, definitionTags} of definitions) { + const partOfSpeech = this._createMapKey(this._getTagNamesWithCategory(definitionTags, 'partOfSpeech')); + + if (lastDictionary !== dictionary) { + lastDictionary = dictionary; + lastPartOfSpeech = ''; + } + + if (lastPartOfSpeech === partOfSpeech) { + removeCategoriesSet.add('partOfSpeech'); + } else { + lastPartOfSpeech = partOfSpeech; + } + + if (removeCategoriesSet.size > 0) { + this._flagTagsWithCategoryAsRedundant(definitionTags, removeCategoriesSet); + removeCategoriesSet.clear(); + } + } + } + + _groupTerms(definitions) { + const groups = new Map(); + for (const definition of definitions) { + const key = this._createMapKey([definition.source, definition.expression, definition.reading, ...definition.reasons]); + let groupDefinitions = groups.get(key); + if (typeof groupDefinitions === 'undefined') { + groupDefinitions = []; + groups.set(key, groupDefinitions); + } + + groupDefinitions.push(definition); + } + + const results = []; + for (const groupDefinitions of groups.values()) { + this._sortDefinitions(groupDefinitions, true); + const definition = this._createGroupedTermDefinition(groupDefinitions); + results.push(definition); + } + + return results; + } + + _mergeByGlossary(definitions, glossaryDefinitionGroupMap) { + for (const definition of definitions) { + const {expression, reading, dictionary, glossary, id} = definition; + + const key = this._createMapKey([dictionary, ...glossary]); + 
let group = glossaryDefinitionGroupMap.get(key); + if (typeof group === 'undefined') { + group = { + expressions: new Set(), + readings: new Set(), + definitions: [], + definitionIds: new Set() + }; + glossaryDefinitionGroupMap.set(key, group); + } + + const {definitionIds} = group; + if (definitionIds.has(id)) { continue; } + definitionIds.add(id); + group.expressions.add(expression); + group.readings.add(reading); + group.definitions.push(definition); + } + } + + _addUniqueTermInfos(definitions, termInfoMap) { + for (const {expression, reading, sourceTerm, furiganaSegments, termTags} of definitions) { + let readingMap = termInfoMap.get(expression); + if (typeof readingMap === 'undefined') { + readingMap = new Map(); + termInfoMap.set(expression, readingMap); + } + + let termInfo = readingMap.get(reading); + if (typeof termInfo === 'undefined') { + termInfo = { + sourceTerm, + furiganaSegments, + termTagsMap: new Map() + }; + readingMap.set(reading, termInfo); + } + + const {termTagsMap} = termInfo; + for (const tag of termTags) { + const {name} = tag; + if (termTagsMap.has(name)) { continue; } + termTagsMap.set(name, this._cloneTag(tag)); + } + } + } + + _convertTermDefinitionsToMergedGlossaryTermDefinitions(definitions) { + const convertedDefinitions = []; + for (const definition of definitions) { + const {source, rawSource, expression, reading} = definition; + const expressions = new Set([expression]); + const readings = new Set([reading]); + const convertedDefinition = this._createMergedGlossaryTermDefinition(source, rawSource, [definition], expressions, readings, expressions, readings); + convertedDefinitions.push(convertedDefinition); + } + return convertedDefinitions; + } + + // Metadata building + + async _buildTermMeta(definitions, enabledDictionaryMap) { + const addMetadataTargetInfo = (targetMap1, target, parents) => { + let {expression, reading} = target; + if (!reading) { reading = expression; } + + let targetMap2 = targetMap1.get(expression); + if 
(typeof targetMap2 === 'undefined') { + targetMap2 = new Map(); + targetMap1.set(expression, targetMap2); + } + + let targets = targetMap2.get(reading); + if (typeof targets === 'undefined') { + targets = new Set([target, ...parents]); + targetMap2.set(reading, targets); + } else { + targets.add(target); + for (const parent of parents) { + targets.add(parent); + } + } + }; + + const targetMap = new Map(); + const definitionsQueue = definitions.map((definition) => ({definition, parents: []})); + while (definitionsQueue.length > 0) { + const {definition, parents} = definitionsQueue.shift(); + const childDefinitions = definition.definitions; + if (Array.isArray(childDefinitions)) { + for (const definition2 of childDefinitions) { + definitionsQueue.push({definition: definition2, parents: [...parents, definition]}); + } + } else { + addMetadataTargetInfo(targetMap, definition, parents); + } + + for (const target of definition.expressions) { + addMetadataTargetInfo(targetMap, target, []); + } + } + const targetMapEntries = [...targetMap.entries()]; + const uniqueExpressions = targetMapEntries.map(([expression]) => expression); + + const metas = await this._database.findTermMetaBulk(uniqueExpressions, enabledDictionaryMap); + for (const {expression, mode, data, dictionary, index} of metas) { + const targetMap2 = targetMapEntries[index][1]; + for (const [reading, targets] of targetMap2) { + switch (mode) { + case 'freq': + { + const frequencyData = this._getTermFrequencyData(expression, reading, dictionary, data); + if (frequencyData === null) { continue; } + for (const {frequencies} of targets) { frequencies.push(frequencyData); } + } + break; + case 'pitch': + { + const pitchData = await this._getPitchData(expression, reading, dictionary, data); + if (pitchData === null) { continue; } + for (const {pitches} of targets) { pitches.push(pitchData); } + } + break; + } + } + } + } + + async _buildKanjiMeta(definitions, enabledDictionaryMap) { + const kanjiList = []; + for 
(const {character} of definitions) { + kanjiList.push(character); + } + + const metas = await this._database.findKanjiMetaBulk(kanjiList, enabledDictionaryMap); + for (const {character, mode, data, dictionary, index} of metas) { + switch (mode) { + case 'freq': + { + const frequencyData = this._getKanjiFrequencyData(character, dictionary, data); + definitions[index].frequencies.push(frequencyData); + } + break; + } + } + } + + async _expandTags(names, dictionary) { + const tagMetaList = await this._getTagMetaList(names, dictionary); + const results = []; + for (let i = 0, ii = tagMetaList.length; i < ii; ++i) { + const meta = tagMetaList[i]; + const name = names[i]; + const {category, notes, order, score} = (meta !== null ? meta : {}); + const tag = this._createTag(name, category, notes, order, score, dictionary, false); + results.push(tag); + } + return results; + } + + async _expandStats(items, dictionary) { + const names = Object.keys(items); + const tagMetaList = await this._getTagMetaList(names, dictionary); + + const statsGroups = new Map(); + for (let i = 0; i < names.length; ++i) { + const name = names[i]; + const meta = tagMetaList[i]; + if (meta === null) { continue; } + + const {category, notes, order, score} = meta; + let group = statsGroups.get(category); + if (typeof group === 'undefined') { + group = []; + statsGroups.set(category, group); + } + + const value = items[name]; + const stat = this._createKanjiStat(name, category, notes, order, score, dictionary, value); + group.push(stat); + } + + const stats = {}; + for (const [category, group] of statsGroups.entries()) { + this._sortKanjiStats(group); + stats[category] = group; + } + return stats; + } + + async _getTagMetaList(names, dictionary) { + const tagMetaList = []; + let cache = this._tagCache.get(dictionary); + if (typeof cache === 'undefined') { + cache = new Map(); + this._tagCache.set(dictionary, cache); + } + + for (const name of names) { + const base = this._getNameBase(name); + + let 
tagMeta = cache.get(base); + if (typeof tagMeta === 'undefined') { + tagMeta = await this._database.findTagForTitle(base, dictionary); + cache.set(base, tagMeta); + } + + tagMetaList.push(tagMeta); + } + + return tagMetaList; + } + + _getTermFrequencyData(expression, reading, dictionary, data) { + let frequency = data; + const hasReading = (data !== null && typeof data === 'object'); + if (hasReading) { + if (data.reading !== reading) { return null; } + frequency = data.frequency; + } + return {dictionary, expression, reading, hasReading, frequency}; + } + + _getKanjiFrequencyData(character, dictionary, data) { + return {dictionary, character, frequency: data}; + } + + async _getPitchData(expression, reading, dictionary, data) { + if (data.reading !== reading) { return null; } + + const pitches = []; + for (let {position, tags} of data.pitches) { + tags = Array.isArray(tags) ? await this._expandTags(tags, dictionary) : []; + pitches.push({position, tags}); + } + + return {expression, reading, dictionary, pitches}; + } + + // Simple helpers + + _scoreToTermFrequency(score) { + if (score > 0) { + return 'popular'; + } else if (score < 0) { + return 'rare'; + } else { + return 'normal'; + } + } + + _getNameBase(name) { + const pos = name.indexOf(':'); + return (pos >= 0 ? 
name.substring(0, pos) : name); + } + + _getSearchableText(text, allowAlphanumericCharacters) { + if (allowAlphanumericCharacters) { + return text; + } + + const jp = this._japaneseUtil; + let newText = ''; + for (const c of text) { + if (!jp.isCodePointJapanese(c.codePointAt(0))) { + break; + } + newText += c; + } + return newText; + } + + _getTextOptionEntryVariants(value) { + switch (value) { + case 'true': return [true]; + case 'variant': return [false, true]; + default: return [false]; + } + } + + _getCollapseEmphaticOptions(options) { + const collapseEmphaticOptions = [[false, false]]; + switch (options.collapseEmphaticSequences) { + case 'true': + collapseEmphaticOptions.push([true, false]); + break; + case 'full': + collapseEmphaticOptions.push([true, false], [true, true]); + break; + } + return collapseEmphaticOptions; + } + + _getTextReplacementsVariants(options) { + return options.textReplacements; + } + + _getSecondarySearchDictionaryMap(enabledDictionaryMap) { + const secondarySearchDictionaryMap = new Map(); + for (const [dictionary, details] of enabledDictionaryMap.entries()) { + if (!details.allowSecondarySearches) { continue; } + secondarySearchDictionaryMap.set(dictionary, details); + } + return secondarySearchDictionaryMap; + } + + _getDictionaryPriority(dictionary, enabledDictionaryMap) { + const info = enabledDictionaryMap.get(dictionary); + return typeof info !== 'undefined' ? 
info.priority : 0; + } + + _getTagNamesWithCategory(tags, category) { + const results = []; + for (const tag of tags) { + if (tag.category !== category) { continue; } + results.push(tag.name); + } + results.sort(); + return results; + } + + _flagTagsWithCategoryAsRedundant(tags, removeCategoriesSet) { + for (const tag of tags) { + if (removeCategoriesSet.has(tag.category)) { + tag.redundant = true; + } + } + } + + _getUniqueDictionaryNames(definitions) { + const uniqueDictionaryNames = new Set(); + for (const {dictionaryNames} of definitions) { + for (const dictionaryName of dictionaryNames) { + uniqueDictionaryNames.add(dictionaryName); + } + } + return [...uniqueDictionaryNames]; + } + + _getUniqueTermTags(definitions) { + const newTermTags = []; + if (definitions.length <= 1) { + for (const {termTags} of definitions) { + for (const tag of termTags) { + newTermTags.push(this._cloneTag(tag)); + } + } + } else { + const tagsSet = new Set(); + let checkTagsMap = false; + for (const {termTags} of definitions) { + for (const tag of termTags) { + const key = this._getTagMapKey(tag); + if (checkTagsMap && tagsSet.has(key)) { continue; } + tagsSet.add(key); + newTermTags.push(this._cloneTag(tag)); + } + checkTagsMap = true; + } + } + return newTermTags; + } + + *_getArrayVariants(arrayVariants) { + const ii = arrayVariants.length; + + let total = 1; + for (let i = 0; i < ii; ++i) { + total *= arrayVariants[i].length; + } + + for (let a = 0; a < total; ++a) { + const variant = []; + let index = a; + for (let i = 0; i < ii; ++i) { + const entryVariants = arrayVariants[i]; + variant.push(entryVariants[index % entryVariants.length]); + index = Math.floor(index / entryVariants.length); + } + yield variant; + } + } + + _areSetsEqual(set1, set2) { + if (set1.size !== set2.size) { + return false; + } + + for (const value of set1) { + if (!set2.has(value)) { + return false; + } + } + + return true; + } + + _getSetIntersection(set1, set2) { + const result = []; + for (const value 
of set1) { + if (set2.has(value)) { + result.push(value); + } + } + return result; + } + + // Reduction functions + + _getTermTagsScoreSum(termTags) { + let result = 0; + for (const {score} of termTags) { + result += score; + } + return result; + } + + _getSourceTermMatchCountSum(definitions) { + let result = 0; + for (const {sourceTermExactMatchCount} of definitions) { + result += sourceTermExactMatchCount; + } + return result; + } + + _getMaxDefinitionScore(definitions) { + let result = Number.MIN_SAFE_INTEGER; + for (const {score} of definitions) { + if (score > result) { result = score; } + } + return result; + } + + _getMaxDictionaryPriority(definitions) { + let result = Number.MIN_SAFE_INTEGER; + for (const {dictionaryPriority} of definitions) { + if (dictionaryPriority > result) { result = dictionaryPriority; } + } + return result; + } + + // Common data creation and cloning functions + + _cloneTag(tag) { + const {name, category, notes, order, score, dictionary, redundant} = tag; + return this._createTag(name, category, notes, order, score, dictionary, redundant); + } + + _getTagMapKey(tag) { + const {name, category, notes} = tag; + return this._createMapKey([name, category, notes]); + } + + _createMapKey(array) { + return JSON.stringify(array); + } + + _createTag(name, category, notes, order, score, dictionary, redundant) { + return { + name, + category: (typeof category === 'string' && category.length > 0 ? category : 'default'), + notes: (typeof notes === 'string' ? notes : ''), + order: (typeof order === 'number' ? order : 0), + score: (typeof score === 'number' ? score : 0), + dictionary: (typeof dictionary === 'string' ? dictionary : null), + redundant + }; + } + + _createKanjiStat(name, category, notes, order, score, dictionary, value) { + return { + name, + category: (typeof category === 'string' && category.length > 0 ? category : 'default'), + notes: (typeof notes === 'string' ? notes : ''), + order: (typeof order === 'number' ? 
order : 0), + score: (typeof score === 'number' ? score : 0), + dictionary: (typeof dictionary === 'string' ? dictionary : null), + value + }; + } + + _createKanjiDefinition(character, dictionary, onyomi, kunyomi, glossary, tags, stats) { + return { + type: 'kanji', + character, + dictionary, + onyomi, + kunyomi, + glossary, + tags, + stats, + frequencies: [] + }; + } + + async _createTermDefinitionFromDatabaseDefinition(databaseDefinition, source, rawSource, sourceTerm, reasons, enabledDictionaryMap) { + const {expression, reading, definitionTags, termTags, glossary, score, dictionary, id, sequence} = databaseDefinition; + const dictionaryPriority = this._getDictionaryPriority(dictionary, enabledDictionaryMap); + const termTagsExpanded = await this._expandTags(termTags, dictionary); + const definitionTagsExpanded = await this._expandTags(definitionTags, dictionary); + + this._sortTags(definitionTagsExpanded); + this._sortTags(termTagsExpanded); + + const furiganaSegments = this._japaneseUtil.distributeFurigana(expression, reading); + const termDetailsList = [this._createTermDetails(sourceTerm, expression, reading, furiganaSegments, termTagsExpanded)]; + const sourceTermExactMatchCount = (sourceTerm === expression ? 
1 : 0); + + return { + type: 'term', + id, + source, + rawSource, + sourceTerm, + reasons, + score, + sequence, + dictionary, + dictionaryPriority, + dictionaryNames: [dictionary], + expression, + reading, + expressions: termDetailsList, + furiganaSegments, + glossary, + definitionTags: definitionTagsExpanded, + termTags: termTagsExpanded, + // definitions + frequencies: [], + pitches: [], + // only + sourceTermExactMatchCount + }; + } + + _createGroupedTermDefinition(definitions) { + const {expression, reading, furiganaSegments, reasons, source, rawSource, sourceTerm} = definitions[0]; + const score = this._getMaxDefinitionScore(definitions); + const dictionaryPriority = this._getMaxDictionaryPriority(definitions); + const dictionaryNames = this._getUniqueDictionaryNames(definitions); + const termTags = this._getUniqueTermTags(definitions); + const termDetailsList = [this._createTermDetails(sourceTerm, expression, reading, furiganaSegments, termTags)]; + const sourceTermExactMatchCount = (sourceTerm === expression ? 
1 : 0); + return { + type: 'termGrouped', + // id + source, + rawSource, + sourceTerm, + reasons: [...reasons], + score, + // sequence + dictionary: dictionaryNames[0], + dictionaryPriority, + dictionaryNames, + expression, + reading, + expressions: termDetailsList, + furiganaSegments, // Contains duplicate data + // glossary + // definitionTags + termTags, + definitions, // type: 'term' + frequencies: [], + pitches: [], + // only + sourceTermExactMatchCount + }; + } + + _createMergedTermDefinition(source, rawSource, definitions, expressions, readings, termDetailsList, reasons, score) { + const dictionaryPriority = this._getMaxDictionaryPriority(definitions); + const sourceTermExactMatchCount = this._getSourceTermMatchCountSum(definitions); + const dictionaryNames = this._getUniqueDictionaryNames(definitions); + return { + type: 'termMerged', + // id + source, + rawSource, + // sourceTerm + reasons, + score, + // sequence + dictionary: dictionaryNames[0], + dictionaryPriority, + dictionaryNames, + expression: expressions, + reading: readings, + expressions: termDetailsList, + // furiganaSegments + // glossary + // definitionTags + // termTags + definitions, // type: 'termMergedByGlossary' + frequencies: [], + pitches: [], + // only + sourceTermExactMatchCount + }; + } + + _createMergedGlossaryTermDefinition(source, rawSource, definitions, expressions, readings, allExpressions, allReadings) { + const only = []; + if (!this._areSetsEqual(expressions, allExpressions)) { + only.push(...this._getSetIntersection(expressions, allExpressions)); + } + if (!this._areSetsEqual(readings, allReadings)) { + only.push(...this._getSetIntersection(readings, allReadings)); + } + + const sourceTermExactMatchCount = this._getSourceTermMatchCountSum(definitions); + const dictionaryNames = this._getUniqueDictionaryNames(definitions); + + const termInfoMap = new Map(); + this._addUniqueTermInfos(definitions, termInfoMap); + const termDetailsList = 
this._createTermDetailsListFromTermInfoMap(termInfoMap); + + const definitionTags = this._getUniqueDefinitionTags(definitions); + this._sortTags(definitionTags); + + const {glossary} = definitions[0]; + const score = this._getMaxDefinitionScore(definitions); + const dictionaryPriority = this._getMaxDictionaryPriority(definitions); + return { + type: 'termMergedByGlossary', + // id + source, + rawSource, + // sourceTerm + reasons: [], + score, + // sequence + dictionary: dictionaryNames[0], + dictionaryPriority, + dictionaryNames, + expression: [...expressions], + reading: [...readings], + expressions: termDetailsList, + // furiganaSegments + glossary: [...glossary], + definitionTags, + // termTags + definitions, // type: 'term'; contains duplicate data + frequencies: [], + pitches: [], + only, + sourceTermExactMatchCount + }; + } + + _createTermDetailsListFromTermInfoMap(termInfoMap) { + const termDetailsList = []; + for (const [expression, readingMap] of termInfoMap.entries()) { + for (const [reading, {termTagsMap, sourceTerm, furiganaSegments}] of readingMap.entries()) { + const termTags = [...termTagsMap.values()]; + this._sortTags(termTags); + termDetailsList.push(this._createTermDetails(sourceTerm, expression, reading, furiganaSegments, termTags)); + } + } + return termDetailsList; + } + + _createTermDetails(sourceTerm, expression, reading, furiganaSegments, termTags) { + const termFrequency = this._scoreToTermFrequency(this._getTermTagsScoreSum(termTags)); + return { + sourceTerm, + expression, + reading, + furiganaSegments, // Contains duplicate data + termTags, + termFrequency, + frequencies: [], + pitches: [] + }; + } + + // Sorting functions + + _sortTags(tags) { + if (tags.length <= 1) { return; } + const stringComparer = this._stringComparer; + tags.sort((v1, v2) => { + const i = v1.order - v2.order; + if (i !== 0) { return i; } + + return stringComparer.compare(v1.name, v2.name); + }); + } + + _sortDefinitions(definitions, useDictionaryPriority) { + if 
(definitions.length <= 1) { return; } + const stringComparer = this._stringComparer; + const compareFunction1 = (v1, v2) => { + let i = v2.source.length - v1.source.length; + if (i !== 0) { return i; } + + i = v1.reasons.length - v2.reasons.length; + if (i !== 0) { return i; } + + i = v2.sourceTermExactMatchCount - v1.sourceTermExactMatchCount; + if (i !== 0) { return i; } + + i = v2.score - v1.score; + if (i !== 0) { return i; } + + const expression1 = v1.expression; + const expression2 = v2.expression; + if (typeof expression1 !== 'string' || typeof expression2 !== 'string') { return 0; } // Skip if either is not a string (array) + + i = expression2.length - expression1.length; + if (i !== 0) { return i; } + + return stringComparer.compare(expression1, expression2); + }; + const compareFunction2 = (v1, v2) => { + const i = v2.dictionaryPriority - v1.dictionaryPriority; + return (i !== 0) ? i : compareFunction1(v1, v2); + }; + definitions.sort(useDictionaryPriority ? compareFunction2 : compareFunction1); + } + + _sortDatabaseDefinitionsByIndex(definitions) { + if (definitions.length <= 1) { return; } + definitions.sort((a, b) => a.index - b.index); + } + + _sortDefinitionsById(definitions) { + if (definitions.length <= 1) { return; } + definitions.sort((a, b) => a.id - b.id); + } + + _sortKanjiStats(stats) { + if (stats.length <= 1) { return; } + const stringComparer = this._stringComparer; + stats.sort((v1, v2) => { + const i = v1.order - v2.order; + if (i !== 0) { return i; } + + return stringComparer.compare(v1.notes, v2.notes); + }); + } + + // Regex functions + + _applyTextReplacements(text, sourceMap, replacements) { + for (const {pattern, replacement} of replacements) { + text = this._applyTextReplacement(text, sourceMap, pattern, replacement); + } + return text; + } + + _applyTextReplacement(text, sourceMap, pattern, replacement) { + const isGlobal = pattern.global; + if (isGlobal) { pattern.lastIndex = 0; } + for (let loop = true; loop; loop = isGlobal) { 
+ const match = pattern.exec(text); + if (match === null) { break; } + + const matchText = match[0]; + const index = match.index; + const actualReplacement = this._applyMatchReplacement(replacement, match); + const actualReplacementLength = actualReplacement.length; + const delta = actualReplacementLength - (matchText.length > 0 ? matchText.length : -1); + + text = `${text.substring(0, index)}${actualReplacement}${text.substring(index + matchText.length)}`; + pattern.lastIndex += delta; + + if (actualReplacementLength > 0) { + sourceMap.combine(Math.max(0, index - 1), matchText.length); + sourceMap.insert(index, ...(new Array(actualReplacementLength).fill(0))); + } else { + sourceMap.combine(index, matchText.length); + } + } + return text; + } + + _applyMatchReplacement(replacement, match) { + const pattern = /\$(?:\$|&|`|'|(\d\d?)|<([^>]*)>)/g; + return replacement.replace(pattern, (g0, g1, g2) => { + if (typeof g1 !== 'undefined') { + const matchIndex = Number.parseInt(g1, 10); + if (matchIndex >= 1 && matchIndex <= match.length) { + return match[matchIndex]; + } + } else if (typeof g2 !== 'undefined') { + const {groups} = match; + if (typeof groups === 'object' && groups !== null && Object.prototype.hasOwnProperty.call(groups, g2)) { + return groups[g2]; + } + } else { + switch (g0) { + case '$': return '$'; + case '&': return match[0]; + case '`': return replacement.substring(0, match.index); + case '\'': return replacement.substring(match.index + g0.length); + } + } + return g0; + }); + } +} |