/* * Copyright (C) 2024 Yomitan Authors * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see https://www.gnu.org/licenses/. */ import {removeArabicScriptDiacritics} from './ar/arabic-text-preprocessors.js'; import {eszettPreprocessor} from './de/german-text-preprocessors.js'; import {germanTransforms} from './de/german-transforms.js'; import {englishTransforms} from './en/english-transforms.js'; import {spanishTransforms} from './es/spanish-transforms.js'; import { alphabeticToHiragana, alphanumericWidthVariants, collapseEmphaticSequences, convertHalfWidthCharacters, convertHiraganaToKatakana, normalizeCombiningCharacters, } from './ja/japanese-text-preprocessors.js'; import {japaneseTransforms} from './ja/japanese-transforms.js'; import {isStringPartiallyJapanese} from './ja/japanese.js'; import {disassembleHangul, reassembleHangul} from './ko/korean-text-processors.js'; import {koreanTransforms} from './ko/korean-transforms.js'; import {latinTransforms} from './la/latin-transforms.js'; import {removeRussianDiacritics, yoToE} from './ru/russian-text-preprocessors.js'; import {oldIrishTransforms} from './sga/old-irish-transforms.js'; import {removeSerboCroatianAccentMarks} from './sh/serbo-croatian-text-preprocessors.js'; import {albanianTransforms} from './sq/albanian-transforms.js'; import {capitalizeFirstLetter, decapitalize, removeAlphabeticDiacritics} from './text-processors.js'; import {normalizeDiacritics} from './vi/viet-text-preprocessors.js'; 
import {isStringPartiallyChinese, normalizePinyin} from './zh/chinese.js';

/**
 * Capitalization-related text preprocessors shared by many descriptors below.
 * Some descriptors reference this object directly; others spread it into a
 * larger preprocessor set alongside language-specific entries.
 */
const capitalizationPreprocessors = {
    decapitalize,
    capitalizeFirstLetter,
};

/**
 * Static table of language descriptors, one entry per language.
 *
 * Every entry carries `iso`, `iso639_3`, `name` and `exampleText`. The
 * remaining properties (`textPreprocessors`, `textPostprocessors`,
 * `languageTransforms`, `isTextLookupWorthy`, `readingNormalizer`) are only
 * present on the languages that define them.
 *
 * The entry order is preserved as-is because it determines the insertion
 * order of `languageDescriptorMap`.
 * @type {import('language-descriptors').LanguageDescriptorAny[]}
 */
const languageDescriptors = [
    {
        iso: 'ar',
        iso639_3: 'ara',
        name: 'Arabic',
        exampleText: 'قَرَأَ',
        textPreprocessors: {
            removeArabicScriptDiacritics,
        },
    },
    {
        iso: 'de',
        iso639_3: 'deu',
        name: 'German',
        exampleText: 'gelesen',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            eszettPreprocessor,
        },
        languageTransforms: germanTransforms,
    },
    {
        iso: 'el',
        iso639_3: 'ell',
        name: 'Greek',
        exampleText: 'διαβάζω',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'en',
        iso639_3: 'eng',
        name: 'English',
        exampleText: 'read',
        textPreprocessors: capitalizationPreprocessors,
        languageTransforms: englishTransforms,
    },
    {
        iso: 'es',
        iso639_3: 'spa',
        name: 'Spanish',
        exampleText: 'leer',
        textPreprocessors: capitalizationPreprocessors,
        languageTransforms: spanishTransforms,
    },
    {
        iso: 'fa',
        iso639_3: 'fas',
        name: 'Persian',
        exampleText: 'خواندن',
        textPreprocessors: {
            removeArabicScriptDiacritics,
        },
    },
    {
        iso: 'fi',
        iso639_3: 'fin',
        name: 'Finnish',
        exampleText: 'lukea',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'fr',
        iso639_3: 'fra',
        name: 'French',
        exampleText: 'lire',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'grc',
        iso639_3: 'grc',
        name: 'Ancient Greek',
        exampleText: 'γράφω',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeAlphabeticDiacritics,
        },
    },
    {
        iso: 'hu',
        iso639_3: 'hun',
        name: 'Hungarian',
        exampleText: 'olvasni',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'id',
        iso639_3: 'ind',
        name: 'Indonesian',
        exampleText: 'membaca',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'it',
        iso639_3: 'ita',
        name: 'Italian',
        exampleText: 'leggere',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'la',
        iso639_3: 'lat',
        name: 'Latin',
        exampleText: 'legere',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeAlphabeticDiacritics,
        },
        languageTransforms: latinTransforms,
    },
    {
        iso: 'lo',
        iso639_3: 'lao',
        name: 'Lao',
        exampleText: 'ອ່ານ',
    },
    {
        iso: 'ja',
        iso639_3: 'jpn',
        name: 'Japanese',
        exampleText: '読め',
        isTextLookupWorthy: isStringPartiallyJapanese,
        textPreprocessors: {
            convertHalfWidthCharacters,
            alphabeticToHiragana,
            normalizeCombiningCharacters,
            alphanumericWidthVariants,
            convertHiraganaToKatakana,
            collapseEmphaticSequences,
        },
        languageTransforms: japaneseTransforms,
    },
    {
        iso: 'km',
        iso639_3: 'khm',
        name: 'Khmer',
        exampleText: 'អាន',
    },
    {
        iso: 'ko',
        iso639_3: 'kor',
        name: 'Korean',
        exampleText: '읽어',
        textPreprocessors: {
            disassembleHangul,
        },
        textPostprocessors: {
            reassembleHangul,
        },
        languageTransforms: koreanTransforms,
    },
    {
        iso: 'mn',
        iso639_3: 'mon',
        name: 'Mongolian',
        exampleText: 'унших',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'nl',
        iso639_3: 'nld',
        name: 'Dutch',
        exampleText: 'lezen',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'pl',
        iso639_3: 'pol',
        name: 'Polish',
        exampleText: 'czytacie',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'pt',
        iso639_3: 'por',
        name: 'Portuguese',
        exampleText: 'ler',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'ro',
        iso639_3: 'ron',
        name: 'Romanian',
        exampleText: 'citit',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeAlphabeticDiacritics,
        },
    },
    {
        iso: 'ru',
        iso639_3: 'rus',
        name: 'Russian',
        exampleText: 'читать',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            yoToE,
            removeRussianDiacritics,
        },
    },
    {
        iso: 'sga',
        iso639_3: 'sga',
        name: 'Old Irish',
        exampleText: 'légaid',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeAlphabeticDiacritics,
        },
        languageTransforms: oldIrishTransforms,
    },
    {
        iso: 'sh',
        iso639_3: 'hbs',
        name: 'Serbo-Croatian',
        exampleText: 'čitaše',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            removeSerboCroatianAccentMarks,
        },
    },
    {
        iso: 'sq',
        iso639_3: 'sqi',
        name: 'Albanian',
        exampleText: 'ndihmojme',
        textPreprocessors: capitalizationPreprocessors,
        languageTransforms: albanianTransforms,
    },
    {
        iso: 'sv',
        iso639_3: 'swe',
        name: 'Swedish',
        exampleText: 'läsa',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'th',
        iso639_3: 'tha',
        name: 'Thai',
        exampleText: 'อ่าน',
    },
    {
        iso: 'tr',
        iso639_3: 'tur',
        name: 'Turkish',
        exampleText: 'okuyor',
        textPreprocessors: capitalizationPreprocessors,
    },
    {
        iso: 'vi',
        iso639_3: 'vie',
        name: 'Vietnamese',
        exampleText: 'đọc',
        textPreprocessors: {
            ...capitalizationPreprocessors,
            normalizeDiacritics,
        },
    },
    {
        iso: 'yue',
        iso639_3: 'yue',
        name: 'Cantonese',
        exampleText: '讀',
    },
    {
        iso: 'zh',
        iso639_3: 'zho',
        name: 'Chinese',
        exampleText: '读',
        isTextLookupWorthy: isStringPartiallyChinese,
        readingNormalizer: normalizePinyin,
    },
];

/**
 * Language descriptors keyed by their `iso` value, inserted in the same order
 * as they appear in `languageDescriptors`. Were two entries ever to share an
 * `iso`, the later one would replace the earlier one.
 * @type {Map<string, import('language-descriptors').LanguageDescriptorAny>}
 */
export const languageDescriptorMap = new Map(
    languageDescriptors.map((descriptor) => [descriptor.iso, descriptor]),
);