diff options
| author | James Maa <jmaa@berkeley.edu> | 2024-05-23 15:23:10 -0700 | 
|---|---|---|
| committer | GitHub <noreply@github.com> | 2024-05-23 22:23:10 +0000 | 
| commit | d6aa6737821f5db61e932714322f2401f86b5200 (patch) | |
| tree | a8d95dab4c7f6ebe1140bb894a919bc666761ab6 | |
| parent | bbb19669c27a4216ae11937650da173165e72978 (diff) | |
Basic Spanish Transforms (#908)
* Spanish transforms
* Add more Spanish transforms
* Address comments
* Fix types
* Undo prefix change

| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | .eslintrc.json | 1 |
| -rw-r--r-- | ext/js/core/log.js | 2 |
| -rw-r--r-- | ext/js/language/es/spanish-transforms.js | 171 |
| -rw-r--r-- | ext/js/language/language-descriptors.js | 4 |
| -rw-r--r-- | ext/js/language/language-transformer.js | 2 |
| -rw-r--r-- | ext/js/language/language-transforms.js | 18 |
| -rw-r--r-- | test/language/spanish-transforms.test.js | 100 |
| -rw-r--r-- | types/ext/language-transformer-internal.d.ts | 2 |
| -rw-r--r-- | types/ext/language-transformer.d.ts | 2 |
9 files changed, 297 insertions, 5 deletions
| diff --git a/.eslintrc.json b/.eslintrc.json index 65ae89a6..ea741e5a 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -643,6 +643,7 @@                  "ext/js/language/de/german-text-preprocessors.js",                  "ext/js/language/de/german-transforms.js",                  "ext/js/language/en/english-transforms.js", +                "ext/js/language/es/spanish-transforms.js",                  "ext/js/language/ja/japanese-text-preprocessors.js",                  "ext/js/language/ja/japanese-transforms.js",                  "ext/js/language/ja/japanese-wanakana.js", diff --git a/ext/js/core/log.js b/ext/js/core/log.js index 8401cc2b..cb714e70 100644 --- a/ext/js/core/log.js +++ b/ext/js/core/log.js @@ -73,7 +73,7 @@ class Logger extends EventDispatcher {       */      logGenericError(error, level, context) {          if (typeof context === 'undefined') { -            context = {url: location.href}; +            context = typeof location === 'undefined' ? {url: 'unknown'} : {url: location.href};          }          let errorString; diff --git a/ext/js/language/es/spanish-transforms.js b/ext/js/language/es/spanish-transforms.js new file mode 100644 index 00000000..cf145f6a --- /dev/null +++ b/ext/js/language/es/spanish-transforms.js @@ -0,0 +1,171 @@ +/* + * Copyright (C) 2024  Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <https://www.gnu.org/licenses/>. 
+ */ + +import {suffixInflection, wholeWordInflection} from '../language-transforms.js'; + +const ACCENTS = new Map([ +    ['a', 'á'], +    ['e', 'é'], +    ['i', 'í'], +    ['o', 'ó'], +    ['u', 'ú'] +]); + + +/** + * @param {string} char + * @returns {string} + */ +function addAccent(char) { +    return ACCENTS.get(char) || char; +} + +/** @type {import('language-transformer').LanguageTransformDescriptor} */ +export const spanishTransforms = { +    language: 'es', +    conditions: { +        v: { +            name: 'Verb', +            isDictionaryForm: true, +            subConditions: ['v_ar', 'v_er', 'v_ir'] +        }, +        v_ar: { +            name: '-ar verb', +            isDictionaryForm: true +        }, +        v_er: { +            name: '-er verb', +            isDictionaryForm: true +        }, +        v_ir: { +            name: '-ir verb', +            isDictionaryForm: true +        }, +        n: { +            name: 'Noun', +            isDictionaryForm: true, +            subConditions: ['ns', 'np'] +        }, +        np: { +            name: 'Noun plural', +            isDictionaryForm: true +        }, +        ns: { +            name: 'Noun singular', +            isDictionaryForm: true +        }, +        adj: { +            name: 'Adjective', +            isDictionaryForm: true +        } +    }, +    transforms: [ +        { +            name: 'plural', +            description: 'Plural form of a noun', +            rules: [ +                suffixInflection('s', '', ['np'], ['ns']), +                suffixInflection('es', '', ['np'], ['ns']), +                suffixInflection('ces', 'z', ['np'], ['ns']), // 'lápices' -> lápiz +                ...[...'aeiou'].map((v) => suffixInflection(`${v}ses`, `${addAccent(v)}s`, ['np'], ['ns'])), // 'autobuses' -> autobús +                ...[...'aeiou'].map((v) => suffixInflection(`${v}nes`, `${addAccent(v)}n`, ['np'], ['ns'])) // 'canciones' -> canción +            ] +        }, +        { 
+            name: 'feminine adjective', +            description: 'feminine form of an adjective', +            rules: [ +                suffixInflection('a', 'o', ['adj'], ['adj']) +            ] +        }, +        { +            name: 'present indicative', +            description: 'Present indicative form of a verb', +            rules: [ +                // -ar verbs +                suffixInflection('o', 'ar', ['v'], ['v']), +                suffixInflection('as', 'ar', ['v'], ['v']), +                suffixInflection('a', 'ar', ['v'], ['v']), +                suffixInflection('amos', 'ar', ['v'], ['v']), +                suffixInflection('áis', 'ar', ['v'], ['v']), +                suffixInflection('an', 'ar', ['v'], ['v']), +                // -er verbs +                suffixInflection('o', 'er', ['v'], ['v']), +                suffixInflection('es', 'er', ['v'], ['v']), +                suffixInflection('e', 'er', ['v'], ['v']), +                suffixInflection('emos', 'er', ['v'], ['v']), +                suffixInflection('éis', 'er', ['v'], ['v']), +                suffixInflection('en', 'er', ['v'], ['v']), +                // -ir verbs +                suffixInflection('o', 'ir', ['v'], ['v']), +                suffixInflection('es', 'ir', ['v'], ['v']), +                suffixInflection('e', 'ir', ['v'], ['v']), +                suffixInflection('imos', 'ir', ['v'], ['v']), +                suffixInflection('ís', 'ir', ['v'], ['v']), +                suffixInflection('en', 'ir', ['v'], ['v']), +                // -tener verbs +                suffixInflection('tengo', 'tener', ['v'], ['v']), +                suffixInflection('tienes', 'tener', ['v'], ['v']), +                suffixInflection('tiene', 'tener', ['v'], ['v']), +                suffixInflection('tenemos', 'tener', ['v'], ['v']), +                suffixInflection('tenéis', 'tener', ['v'], ['v']), +                suffixInflection('tienen', 'tener', ['v'], ['v']), +                // 
Verbs with Irregular Yo Forms +                // -guir, -ger, or -gir verbs +                suffixInflection('go', 'guir', ['v'], ['v']), +                suffixInflection('jo', 'ger', ['v'], ['v']), +                suffixInflection('jo', 'gir', ['v'], ['v']), +                suffixInflection('aigo', 'aer', ['v'], ['v']), +                suffixInflection('zco', 'cer', ['v'], ['v']), +                suffixInflection('zco', 'cir', ['v'], ['v']), +                suffixInflection('hago', 'hacer', ['v'], ['v']), +                suffixInflection('pongo', 'poner', ['v'], ['v']), +                suffixInflection('lgo', 'lir', ['v'], ['v']), +                suffixInflection('lgo', 'ler', ['v'], ['v']), +                wholeWordInflection('quepo', 'caber', ['v'], ['v']), +                wholeWordInflection('doy', 'dar', ['v'], ['v']), +                wholeWordInflection('sé', 'saber', ['v'], ['v']), +                wholeWordInflection('veo', 'ver', ['v'], ['v']), +                // Ser, estar, ir, haber +                wholeWordInflection('soy', 'ser', ['v'], ['v']), +                wholeWordInflection('eres', 'ser', ['v'], ['v']), +                wholeWordInflection('es', 'ser', ['v'], ['v']), +                wholeWordInflection('somos', 'ser', ['v'], ['v']), +                wholeWordInflection('sois', 'ser', ['v'], ['v']), +                wholeWordInflection('son', 'ser', ['v'], ['v']), +                wholeWordInflection('estoy', 'estar', ['v'], ['v']), +                wholeWordInflection('estás', 'estar', ['v'], ['v']), +                wholeWordInflection('está', 'estar', ['v'], ['v']), +                wholeWordInflection('estamos', 'estar', ['v'], ['v']), +                wholeWordInflection('estáis', 'estar', ['v'], ['v']), +                wholeWordInflection('están', 'estar', ['v'], ['v']), +                wholeWordInflection('voy', 'ir', ['v'], ['v']), +                wholeWordInflection('vas', 'ir', ['v'], ['v']), +                
wholeWordInflection('va', 'ir', ['v'], ['v']), +                wholeWordInflection('vamos', 'ir', ['v'], ['v']), +                wholeWordInflection('vais', 'ir', ['v'], ['v']), +                wholeWordInflection('van', 'ir', ['v'], ['v']), +                wholeWordInflection('he', 'haber', ['v'], ['v']), +                wholeWordInflection('has', 'haber', ['v'], ['v']), +                wholeWordInflection('ha', 'haber', ['v'], ['v']), +                wholeWordInflection('hemos', 'haber', ['v'], ['v']), +                wholeWordInflection('habéis', 'haber', ['v'], ['v']), +                wholeWordInflection('han', 'haber', ['v'], ['v']) +            ] +        } +    ] +}; diff --git a/ext/js/language/language-descriptors.js b/ext/js/language/language-descriptors.js index baf53f81..3a78aff5 100644 --- a/ext/js/language/language-descriptors.js +++ b/ext/js/language/language-descriptors.js @@ -19,6 +19,7 @@ import {removeArabicScriptDiacritics} from './ar/arabic-text-preprocessors.js';  import {eszettPreprocessor} from './de/german-text-preprocessors.js';  import {germanTransforms} from './de/german-transforms.js';  import {englishTransforms} from './en/english-transforms.js'; +import {spanishTransforms} from './es/spanish-transforms.js';  import {      alphabeticToHiragana,      alphanumericWidthVariants, @@ -78,7 +79,8 @@ const languageDescriptors = [          iso: 'es',          name: 'Spanish',          exampleText: 'acabar de', -        textPreprocessors: capitalizationPreprocessors +        textPreprocessors: capitalizationPreprocessors, +        languageTransforms: spanishTransforms      },      {          iso: 'fa', diff --git a/ext/js/language/language-transformer.js b/ext/js/language/language-transformer.js index 47f31b5f..f859ebf2 100644 --- a/ext/js/language/language-transformer.js +++ b/ext/js/language/language-transformer.js @@ -132,7 +132,7 @@ export class LanguageTransformer {                      const isCycle = trace.some((frame) => 
frame.transform === name && frame.ruleIndex === j && frame.text === text);                      if (isCycle) { -                        log.warn(new Error(`Cycle detected in transform[${name}] rule[${j}] for text: ${text}`)); +                        log.warn(new Error(`Cycle detected in transform[${name}] rule[${j}] for text: ${text}\nTrace: ${JSON.stringify(trace)}`));                          continue;                      } diff --git a/ext/js/language/language-transforms.js b/ext/js/language/language-transforms.js index ee8af88b..f3e36560 100644 --- a/ext/js/language/language-transforms.js +++ b/ext/js/language/language-transforms.js @@ -52,3 +52,21 @@ export function prefixInflection(inflectedPrefix, deinflectedPrefix, conditionsI          conditionsOut      };  } + +/** + * @param {string} inflectedWord + * @param {string} deinflectedWord + * @param {string[]} conditionsIn + * @param {string[]} conditionsOut + * @returns {import('language-transformer').Rule} + */ +export function wholeWordInflection(inflectedWord, deinflectedWord, conditionsIn, conditionsOut) { +    const regex = new RegExp('^' + inflectedWord + '$'); +    return { +        type: 'wholeWord', +        isInflected: regex, +        deinflect: () => deinflectedWord, +        conditionsIn, +        conditionsOut +    }; +} diff --git a/test/language/spanish-transforms.test.js b/test/language/spanish-transforms.test.js new file mode 100644 index 00000000..7a6ab729 --- /dev/null +++ b/test/language/spanish-transforms.test.js @@ -0,0 +1,100 @@ +/* + * Copyright (C) 2024  Yomitan Authors + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program.  If not, see <https://www.gnu.org/licenses/>. + */ + +import {spanishTransforms} from '../../ext/js/language/es/spanish-transforms.js'; +import {LanguageTransformer} from '../../ext/js/language/language-transformer.js'; +import {testLanguageTransformer} from '../fixtures/language-transformer-test.js'; + +const tests = [ +    { +        category: 'nouns', +        valid: true, +        tests: [ +            {term: 'gato', source: 'gatos', rule: 'ns', reasons: ['plural']}, +            {term: 'sofá', source: 'sofás', rule: 'ns', reasons: ['plural']}, +            {term: 'tisú', source: 'tisús', rule: 'ns', reasons: ['plural']}, +            {term: 'tisú', source: 'tisúes', rule: 'ns', reasons: ['plural']}, +            {term: 'autobús', source: 'autobuses', rule: 'ns', reasons: ['plural']}, +            {term: 'ciudad', source: 'ciudades', rule: 'ns', reasons: ['plural']}, +            {term: 'clic', source: 'clics', rule: 'ns', reasons: ['plural']}, +            {term: 'sí', source: 'síes', rule: 'ns', reasons: ['plural']}, +            {term: 'zigzag', source: 'zigzags', rule: 'ns', reasons: ['plural']}, +            {term: 'luz', source: 'luces', rule: 'ns', reasons: ['plural']}, +            {term: 'canción', source: 'canciones', rule: 'ns', reasons: ['plural']} +        ] +    }, +    { +        category: 'feminine adjectives', +        valid: true, +        tests: [ +            {term: 'rojo', source: 'roja', rule: 'adj', reasons: ['feminine adjective']} +        ] +    }, +    { +        category: 'present indicative verbs', +        valid: true, +        tests: [ +            {term: 'hablar', source: 'hablo', 
rule: 'v', reasons: ['present indicative']}, +            {term: 'hablar', source: 'hablas', rule: 'v', reasons: ['present indicative']}, +            {term: 'hablar', source: 'habla', rule: 'v', reasons: ['present indicative']}, +            {term: 'hablar', source: 'hablamos', rule: 'v', reasons: ['present indicative']}, +            {term: 'hablar', source: 'habláis', rule: 'v', reasons: ['present indicative']}, +            {term: 'hablar', source: 'hablan', rule: 'v', reasons: ['present indicative']}, +            {term: 'comer', source: 'como', rule: 'v', reasons: ['present indicative']}, +            {term: 'comer', source: 'comes', rule: 'v', reasons: ['present indicative']}, +            {term: 'comer', source: 'come', rule: 'v', reasons: ['present indicative']}, +            {term: 'comer', source: 'comemos', rule: 'v', reasons: ['present indicative']}, +            {term: 'comer', source: 'coméis', rule: 'v', reasons: ['present indicative']}, +            {term: 'comer', source: 'comen', rule: 'v', reasons: ['present indicative']}, +            {term: 'vivir', source: 'vivo', rule: 'v', reasons: ['present indicative']}, +            {term: 'vivir', source: 'vives', rule: 'v', reasons: ['present indicative']}, +            {term: 'vivir', source: 'vive', rule: 'v', reasons: ['present indicative']}, +            {term: 'vivir', source: 'vivimos', rule: 'v', reasons: ['present indicative']}, +            {term: 'vivir', source: 'vivís', rule: 'v', reasons: ['present indicative']}, +            {term: 'vivir', source: 'viven', rule: 'v', reasons: ['present indicative']}, +            {term: 'tener', source: 'tengo', rule: 'v', reasons: ['present indicative']}, +            {term: 'tener', source: 'tienes', rule: 'v', reasons: ['present indicative']}, +            {term: 'tener', source: 'tiene', rule: 'v', reasons: ['present indicative']}, +            {term: 'tener', source: 'tenemos', rule: 'v', reasons: ['present indicative']}, +            {term: 
'tener', source: 'tenéis', rule: 'v', reasons: ['present indicative']}, +            {term: 'tener', source: 'tienen', rule: 'v', reasons: ['present indicative']}, +            {term: 'exigir', source: 'exijo', rule: 'v', reasons: ['present indicative']}, +            {term: 'extinguir', source: 'extingo', rule: 'v', reasons: ['present indicative']}, +            {term: 'escoger', source: 'escojo', rule: 'v', reasons: ['present indicative']}, +            {term: 'caber', source: 'quepo', rule: 'v', reasons: ['present indicative']}, +            {term: 'caer', source: 'caigo', rule: 'v', reasons: ['present indicative']}, +            {term: 'conocer', source: 'conozco', rule: 'v', reasons: ['present indicative']}, +            {term: 'dar', source: 'doy', rule: 'v', reasons: ['present indicative']}, +            {term: 'hacer', source: 'hago', rule: 'v', reasons: ['present indicative']}, +            {term: 'poner', source: 'pongo', rule: 'v', reasons: ['present indicative']}, +            {term: 'saber', source: 'sé', rule: 'v', reasons: ['present indicative']}, +            {term: 'salir', source: 'salgo', rule: 'v', reasons: ['present indicative']}, +            {term: 'traducir', source: 'traduzco', rule: 'v', reasons: ['present indicative']}, +            {term: 'traer', source: 'traigo', rule: 'v', reasons: ['present indicative']}, +            {term: 'valer', source: 'valgo', rule: 'v', reasons: ['present indicative']}, +            {term: 'ver', source: 'veo', rule: 'v', reasons: ['present indicative']}, +            {term: 'ser', source: 'soy', rule: 'v', reasons: ['present indicative']}, +            {term: 'estar', source: 'estoy', rule: 'v', reasons: ['present indicative']}, +            {term: 'ir', source: 'voy', rule: 'v', reasons: ['present indicative']}, +            {term: 'haber', source: 'he', rule: 'v', reasons: ['present indicative']} +        ] +    } +]; + +const languageTransformer = new LanguageTransformer(); 
+languageTransformer.addDescriptor(spanishTransforms); +testLanguageTransformer(languageTransformer, tests); diff --git a/types/ext/language-transformer-internal.d.ts b/types/ext/language-transformer-internal.d.ts index 9ae412d3..a2b18b35 100644 --- a/types/ext/language-transformer-internal.d.ts +++ b/types/ext/language-transformer-internal.d.ts @@ -22,7 +22,7 @@ export type Transform = {  };  export type Rule = { -    type: 'suffix' | 'prefix' | 'other'; +    type: 'suffix' | 'prefix' | 'wholeWord' | 'other';      isInflected: RegExp;      deinflect: (inflectedWord: string) => string;      conditionsIn: number; diff --git a/types/ext/language-transformer.d.ts b/types/ext/language-transformer.d.ts index 95da602d..02457523 100644 --- a/types/ext/language-transformer.d.ts +++ b/types/ext/language-transformer.d.ts @@ -55,7 +55,7 @@ export type TransformI18n = {  };  export type Rule = { -    type: 'suffix' | 'prefix' | 'other'; +    type: 'suffix' | 'prefix' | 'wholeWord' | 'other';      isInflected: RegExp;      deinflect: (inflectedWord: string) => string;      conditionsIn: string[]; |