diff options
Diffstat (limited to 'ext/js/language/sandbox/dictionary-data-util.js')
-rw-r--r-- | ext/js/language/sandbox/dictionary-data-util.js | 299 |
1 files changed, 299 insertions, 0 deletions
/*
 * Copyright (C) 2020-2021  Yomichan Authors
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 */

/**
 * Stateless helpers that regroup the tag, frequency and pitch data attached to
 * dictionary entries. Every method is static and reaches its sibling helpers
 * through `this`, so the methods must be invoked on the class itself
 * (`DictionaryDataUtil.method(...)`) rather than as detached function references.
 */
class DictionaryDataUtil {
    /**
     * Collects the tags of every headword of an entry into a flat list.
     *
     * When the entry has more than one headword, tags sharing the same
     * `name`, `category`, `notes` and `dictionary` are merged into a single item
     * listing every headword index that carries the tag. With a single headword
     * no merging is attempted and every tag yields its own item.
     *
     * @param {{headwords: Array<{tags: Iterable<object>}>}} dictionaryEntry Entry whose headword tags are grouped.
     * @returns {Array<{tag: object, headwordIndices: number[]}>} One item per (merged) tag, in first-seen order.
     */
    static groupTermTags(dictionaryEntry) {
        const {headwords} = dictionaryEntry;
        const headwordCount = headwords.length;
        const uniqueCheck = (headwordCount > 1);
        // Maps a serialized tag identity to the position of its item in `results`.
        const resultsIndexMap = new Map();
        const results = [];
        for (let i = 0; i < headwordCount; ++i) {
            const {tags} = headwords[i];
            for (const tag of tags) {
                if (uniqueCheck) {
                    const {name, category, notes, dictionary} = tag;
                    const key = this._createMapKey([name, category, notes, dictionary]);
                    const index = resultsIndexMap.get(key);
                    if (typeof index !== 'undefined') {
                        const {headwordIndices} = results[index];
                        // `i` only ever increases, so comparing against the last stored index is
                        // enough to avoid listing a headword twice when it repeats a tag.
                        if (headwordIndices[headwordIndices.length - 1] !== i) {
                            headwordIndices.push(i);
                        }
                        continue;
                    }
                    resultsIndexMap.set(key, results.length);
                }

                results.push({tag, headwordIndices: [i]});
            }
        }
        return results;
    }

    /**
     * Groups the term frequency records of an entry by dictionary and then by
     * term/reading pair, collecting the distinct frequency values of each pair.
     * A record whose `hasReading` is falsy is grouped under a `null` reading.
     *
     * @param {{headwords: Array<{term: string, reading: string}>, frequencies: Iterable<{headwordIndex: number, dictionary: string, hasReading: boolean, frequency: *}>}} dictionaryEntry
     *     Entry providing the headwords and the frequency records that refer to them by index.
     * @returns {Array<{dictionary: string, frequencies: Array<{term: string, reading: ?string, values: Array}>}>}
     *     One group per dictionary, in first-seen order.
     */
    static groupTermFrequencies(dictionaryEntry) {
        const {headwords, frequencies} = dictionaryEntry;

        const map1 = new Map();
        for (const {headwordIndex, dictionary, hasReading, frequency} of frequencies) {
            const {term, reading} = headwords[headwordIndex];

            let map2 = map1.get(dictionary);
            if (typeof map2 === 'undefined') {
                map2 = new Map();
                map1.set(dictionary, map2);
            }

            const readingKey = hasReading ? reading : null;
            const key = this._createMapKey([term, readingKey]);
            let frequencyData = map2.get(key);
            if (typeof frequencyData === 'undefined') {
                frequencyData = {term, reading: readingKey, values: new Set()};
                map2.set(key, frequencyData);
            }

            // A Set keeps each distinct value once.
            frequencyData.values.add(frequency);
        }
        return this._createFrequencyGroupsFromMap(map1);
    }

    /**
     * Groups kanji frequency records by dictionary and then by character,
     * collecting the distinct frequency values of each character.
     *
     * @param {Iterable<{dictionary: string, character: string, frequency: *}>} frequencies Frequency records to group.
     * @returns {Array<{dictionary: string, frequencies: Array<{character: string, values: Array}>}>}
     *     One group per dictionary, in first-seen order.
     */
    static groupKanjiFrequencies(frequencies) {
        const map1 = new Map();
        for (const {dictionary, character, frequency} of frequencies) {
            let map2 = map1.get(dictionary);
            if (typeof map2 === 'undefined') {
                map2 = new Map();
                map1.set(dictionary, map2);
            }

            let frequencyData = map2.get(character);
            if (typeof frequencyData === 'undefined') {
                frequencyData = {character, values: new Set()};
                map2.set(character, frequencyData);
            }

            frequencyData.values.add(frequency);
        }
        return this._createFrequencyGroupsFromMap(map1);
    }

    /**
     * Builds, per dictionary, the list of distinct pitch descriptions of an entry.
     *
     * Pitches of the same dictionary that share reading, position, nasal
     * positions, devoice positions and tag list (see `_areTagListsEqual`) are
     * merged, and the terms they were found on are accumulated in `terms`.
     * `exclusiveTerms` is filled with those terms when they are not the complete
     * set of headword terms; `exclusiveReadings` receives the reading when the
     * headwords have more than one distinct reading.
     *
     * @param {{headwords: Array<{term: string, reading: string}>, pronunciations: Iterable<{headwordIndex: number, dictionary: string, pitches: Iterable<object>}>}} dictionaryEntry
     *     Entry providing the headwords and the pronunciation records that refer to them by index.
     * @returns {Array<{dictionary: string, pitches: Array<object>}>} One group per dictionary, in first-seen order.
     */
    static getPitchAccentInfos(dictionaryEntry) {
        const {headwords, pronunciations} = dictionaryEntry;

        const allTerms = new Set();
        const allReadings = new Set();
        for (const {term, reading} of headwords) {
            allTerms.add(term);
            allReadings.add(reading);
        }

        const pitchAccentInfoMap = new Map();
        for (const {headwordIndex, dictionary, pitches} of pronunciations) {
            const {term, reading} = headwords[headwordIndex];
            let dictionaryPitchAccentInfoList = pitchAccentInfoMap.get(dictionary);
            if (typeof dictionaryPitchAccentInfoList === 'undefined') {
                dictionaryPitchAccentInfoList = [];
                pitchAccentInfoMap.set(dictionary, dictionaryPitchAccentInfoList);
            }
            for (const {position, nasalPositions, devoicePositions, tags} of pitches) {
                let pitchAccentInfo = this._findExistingPitchAccentInfo(
                    reading,
                    position,
                    nasalPositions,
                    devoicePositions,
                    tags,
                    dictionaryPitchAccentInfoList
                );
                if (pitchAccentInfo === null) {
                    pitchAccentInfo = {
                        terms: new Set(),
                        reading,
                        position,
                        nasalPositions,
                        devoicePositions,
                        tags,
                        exclusiveTerms: [],
                        exclusiveReadings: []
                    };
                    dictionaryPitchAccentInfoList.push(pitchAccentInfo);
                }
                pitchAccentInfo.terms.add(term);
            }
        }

        const multipleReadings = (allReadings.size > 1);
        for (const dictionaryPitchAccentInfoList of pitchAccentInfoMap.values()) {
            for (const pitchAccentInfo of dictionaryPitchAccentInfoList) {
                const {terms, reading, exclusiveTerms, exclusiveReadings} = pitchAccentInfo;
                if (!this._areSetsEqual(terms, allTerms)) {
                    exclusiveTerms.push(...this._getSetIntersection(terms, allTerms));
                }
                if (multipleReadings) {
                    exclusiveReadings.push(reading);
                }
                // The Set was only needed while accumulating; expose a plain array.
                pitchAccentInfo.terms = [...terms];
            }
        }

        const results2 = [];
        for (const [dictionary, pitches] of pitchAccentInfoMap.entries()) {
            results2.push({dictionary, pitches});
        }
        return results2;
    }

    /**
     * Classifies a term from the summed `score` of its tags.
     *
     * @param {Iterable<{score: number}>} termTags Tags whose scores are added together.
     * @returns {string} `'popular'` for a positive total, `'rare'` for a negative total, `'normal'` otherwise.
     */
    static getTermFrequency(termTags) {
        let totalScore = 0;
        for (const {score} of termTags) {
            totalScore += score;
        }
        if (totalScore > 0) {
            return 'popular';
        } else if (totalScore < 0) {
            return 'rare';
        } else {
            return 'normal';
        }
    }

    /**
     * Lists the terms and/or readings that tell a subset of headwords apart from
     * the full headword set.
     *
     * Terms are listed when the subset's terms differ from `allTermsSet`, and
     * readings when the subset's readings differ from `allReadingsSet`. A reading
     * identical to one of the subset's terms is omitted when terms are listed, so
     * the same string is not reported twice.
     *
     * @param {Array<{term: string, reading: string}>} headwords All headwords of the entry.
     * @param {Iterable<number>} headwordIndices Indices into `headwords` forming the subset.
     * @param {Set<string>} allTermsSet Every term of the entry.
     * @param {Set<string>} allReadingsSet Every reading of the entry.
     * @returns {string[]} Disambiguating strings; empty when both sets hold at most one value.
     */
    static getDisambiguations(headwords, headwordIndices, allTermsSet, allReadingsSet) {
        if (allTermsSet.size <= 1 && allReadingsSet.size <= 1) { return []; }

        const terms = new Set();
        const readings = new Set();
        for (const headwordIndex of headwordIndices) {
            const {term, reading} = headwords[headwordIndex];
            terms.add(term);
            readings.add(reading);
        }

        const disambiguations = [];
        const addTerms = !this._areSetsEqual(terms, allTermsSet);
        const addReadings = !this._areSetsEqual(readings, allReadingsSet);
        if (addTerms) {
            disambiguations.push(...this._getSetIntersection(terms, allTermsSet));
        }
        if (addReadings) {
            if (addTerms) {
                // Drop readings that were already emitted as terms.
                for (const term of terms) {
                    readings.delete(term);
                }
            }
            disambiguations.push(...this._getSetIntersection(readings, allReadingsSet));
        }
        return disambiguations;
    }

    /**
     * Tests whether a list of word class codes contains a verb or adjective code
     * (`v1`, `v5`, `vk`, `vz`, `adj-i`, `vs`), rejecting the case where `vs`
     * appears together with `n`.
     *
     * @param {Iterable<string>} wordClasses Word class codes to inspect.
     * @returns {boolean} `true` when a verb/adjective code is present and the list does not contain both `vs` and `n`.
     */
    static isNonNounVerbOrAdjective(wordClasses) {
        let isVerbOrAdjective = false;
        let isSuruVerb = false;
        let isNoun = false;
        for (const wordClass of wordClasses) {
            switch (wordClass) {
                case 'v1':
                case 'v5':
                case 'vk':
                case 'vz':
                case 'adj-i':
                    isVerbOrAdjective = true;
                    break;
                case 'vs':
                    isVerbOrAdjective = true;
                    isSuruVerb = true;
                    break;
                case 'n':
                    isNoun = true;
                    break;
            }
        }
        return isVerbOrAdjective && !(isSuruVerb && isNoun);
    }

    // Private

    /**
     * Flattens a dictionary -> (key -> frequency data) map into an array of
     * groups. Each frequency data object is reused as-is, with its `values` Set
     * replaced in place by an array of the same values.
     *
     * @param {Map<string, Map<string, {values: Set}>>} map Two-level map built by the grouping methods.
     * @returns {Array<{dictionary: string, frequencies: Array<object>}>} Groups in map insertion order.
     */
    static _createFrequencyGroupsFromMap(map) {
        const results = [];
        for (const [dictionary, map2] of map.entries()) {
            const frequencies = [];
            for (const frequencyData of map2.values()) {
                frequencyData.values = [...frequencyData.values];
                frequencies.push(frequencyData);
            }
            results.push({dictionary, frequencies});
        }
        return results;
    }

    /**
     * Finds the first pitch info in a list that matches the given reading,
     * position, nasal positions, devoice positions and tags.
     *
     * @param {string} reading Reading to match.
     * @param {*} position Position to match (compared with `===`).
     * @param {Array} nasalPositions Nasal positions to match element by element.
     * @param {Array} devoicePositions Devoice positions to match element by element.
     * @param {Array<{name: string, dictionary: string}>} tags Tags to match by name and dictionary.
     * @param {Array<object>} pitchAccentInfoList Candidates to search.
     * @returns {?object} The matching pitch info, or `null` when there is none.
     */
    static _findExistingPitchAccentInfo(reading, position, nasalPositions, devoicePositions, tags, pitchAccentInfoList) {
        for (const pitchInfo of pitchAccentInfoList) {
            if (
                pitchInfo.reading === reading &&
                pitchInfo.position === position &&
                this._areArraysEqual(pitchInfo.nasalPositions, nasalPositions) &&
                this._areArraysEqual(pitchInfo.devoicePositions, devoicePositions) &&
                this._areTagListsEqual(pitchInfo.tags, tags)
            ) {
                return pitchInfo;
            }
        }
        return null;
    }

    /**
     * Compares two arrays for equal length and strictly equal elements at every index.
     *
     * @param {Array} array1 First array.
     * @param {Array} array2 Second array.
     * @returns {boolean} `true` when both arrays hold the same values in the same order.
     */
    static _areArraysEqual(array1, array2) {
        const ii = array1.length;
        if (ii !== array2.length) { return false; }
        for (let i = 0; i < ii; ++i) {
            if (array1[i] !== array2[i]) { return false; }
        }
        return true;
    }

    /**
     * Compares two tag lists position by position. Only the `name` and
     * `dictionary` of each tag take part in the comparison.
     *
     * @param {Array<{name: string, dictionary: string}>} tagList1 First tag list.
     * @param {Array<{name: string, dictionary: string}>} tagList2 Second tag list.
     * @returns {boolean} `true` when both lists have the same length and matching tags at every index.
     */
    static _areTagListsEqual(tagList1, tagList2) {
        const ii = tagList1.length;
        if (tagList2.length !== ii) { return false; }

        for (let i = 0; i < ii; ++i) {
            const tag1 = tagList1[i];
            const tag2 = tagList2[i];
            if (tag1.name !== tag2.name || tag1.dictionary !== tag2.dictionary) {
                return false;
            }
        }

        return true;
    }

    /**
     * Tests whether two sets contain exactly the same values.
     *
     * @param {Set} set1 First set.
     * @param {Set} set2 Second set.
     * @returns {boolean} `true` when the sets have equal size and every value of `set1` is in `set2`.
     */
    static _areSetsEqual(set1, set2) {
        if (set1.size !== set2.size) {
            return false;
        }

        for (const value of set1) {
            if (!set2.has(value)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Returns the values of `set1` that are also present in `set2`, in the
     * iteration order of `set1`.
     *
     * @param {Set} set1 Set whose values are filtered.
     * @param {Set} set2 Set used for the membership test.
     * @returns {Array} Values common to both sets.
     */
    static _getSetIntersection(set1, set2) {
        const result = [];
        for (const value of set1) {
            if (set2.has(value)) {
                result.push(value);
            }
        }
        return result;
    }

    /**
     * Serializes an array of key parts into a single string usable as a Map key.
     *
     * @param {Array} array Key parts.
     * @returns {string} JSON representation of `array`.
     */
    static _createMapKey(array) {
        return JSON.stringify(array);
    }
}