/*
 * Copyright (C) 2024 Yomitan Authors
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

// Provenance: ext/js/language/en/english-transforms.js, introduced by commit
// d2e9841f96ebff61d4a5c26a322484f6268115f1 ("expand deinflection format (#745)").

import {prefixInflection, suffixInflection} from '../language-transforms.js';

/**
 * Builds one suffix rule per consonant for inflections that double the final
 * consonant of the stem before the suffix (e.g. 'stopped' -> 'stop').
 * Each rule maps `<c><c><suffix>` back to `<c>`.
 * @param {string} consonants Consonants to generate rules for, one character each.
 * @param {string} suffix The inflectional suffix that follows the doubled consonant.
 * @param {string[]} conditionsIn
 * @param {string[]} conditionsOut
 * @returns {import('language-transformer').SuffixRule[]}
 */
function doubledConsonantInflection(consonants, suffix, conditionsIn, conditionsOut) {
    return [...consonants].map((consonant) => (
        suffixInflection(`${consonant}${consonant}${suffix}`, consonant, conditionsIn, conditionsOut)
    ));
}

// Simple past: regular '-ed' patterns followed by a few irregular '-aid' verbs.
const pastSuffixInflections = [
    suffixInflection('ed', '', ['v'], ['v']), // 'walked'
    suffixInflection('ed', 'e', ['v'], ['v']), // 'hoped'
    suffixInflection('ied', 'y', ['v'], ['v']), // 'tried'
    suffixInflection('cked', 'c', ['v'], ['v']), // 'frolicked'
    ...doubledConsonantInflection('bdgklmnprstz', 'ed', ['v'], ['v']),

    suffixInflection('laid', 'lay', ['v'], ['v']),
    suffixInflection('paid', 'pay', ['v'], ['v']),
    suffixInflection('said', 'say', ['v'], ['v'])
];

// Present participle ('-ing') patterns.
const ingSuffixInflections = [
    suffixInflection('ing', '', ['v'], ['v']), // 'walking'
    suffixInflection('ing', 'e', ['v'], ['v']), // 'driving'
    suffixInflection('ying', 'ie', ['v'], ['v']), // 'lying'
    suffixInflection('cking', 'c', ['v'], ['v']), // 'panicking'
    ...doubledConsonantInflection('bdgklmnprstz', 'ing', ['v'], ['v'])
];

// Third person singular present ('-s') patterns.
const thirdPersonSgPresentSuffixInflections = [
    suffixInflection('s', '', ['v'], ['v']), // 'walks'
    suffixInflection('es', '', ['v'], ['v']), // 'teaches'
    suffixInflection('ies', 'y', ['v'], ['v']) // 'tries'
];

// NOTE: the order of these lists is significant. They are joined into regex
// alternations below, and alternation tries the options left to right.
const phrasalVerbParticles = ['aboard', 'about', 'above', 'across', 'ahead', 'alongside', 'apart', 'around', 'aside', 'astray', 'away', 'back', 'before', 'behind', 'below', 'beneath', 'besides', 'between', 'beyond', 'by', 'close', 'down', 'east', 'west', 'north', 'south', 'eastward', 'westward', 'northward', 'southward', 'forward', 'backward', 'backwards', 'forwards', 'home', 'in', 'inside', 'instead', 'near', 'off', 'on', 'opposite', 'out', 'outside', 'over', 'overhead', 'past', 'round', 'since', 'through', 'throughout', 'together', 'under', 'underneath', 'up', 'within', 'without'];
const phrasalVerbPrepositions = ['aback', 'about', 'above', 'across', 'after', 'against', 'ahead', 'along', 'among', 'apart', 'around', 'as', 'aside', 'at', 'away', 'back', 'before', 'behind', 'below', 'between', 'beyond', 'by', 'down', 'even', 'for', 'forth', 'forward', 'from', 'in', 'into', 'it', 'of', 'off', 'on', 'one', 'onto', 'open', 'out', 'over', 'past', 'round', 'through', 'to', 'together', 'toward', 'towards', 'under', 'up', 'upon', 'way', 'with', 'without'];

const particlesDisjunction = phrasalVerbParticles.join('|');
// The Set removes words that appear in both lists while keeping first-seen order.
const phrasalVerbWordSet = new Set([...phrasalVerbParticles, ...phrasalVerbPrepositions]);
const phrasalVerbWordDisjunction = [...phrasalVerbWordSet].join('|');

// Matches the " <object> " span between the verb and its particle. The object may
// not itself contain a standalone particle/preposition word. Built once here
// instead of on every deinflect() call; it is non-global, so it carries no
// lastIndex state between calls.
const interposedObjectPattern = new RegExp(`(?<=\\w) (?:(?!\\b(${phrasalVerbWordDisjunction})\\b).)+ (?=(?:${particlesDisjunction}))`);

/**
 * Rule that removes an object placed between a phrasal verb and its particle,
 * e.g. 'look something up' -> 'look up'.
 * @type {import('language-transformer').Rule}
 */
const phrasalVerbInterposedObjectRule = {
    type: 'other',
    isInflected: new RegExp(`^\\w* (?:(?!\\b(${phrasalVerbWordDisjunction})\\b).)+ (?:${particlesDisjunction})`),
    deinflect: (term) => term.replace(interposedObjectPattern, ' '),
    conditionsIn: [],
    conditionsOut: ['v']
};

/**
 * Creates a rule that deinflects the verb part of a phrasal verb, i.e. a word
 * ending in `inflected` that is followed by a space and a particle/preposition.
 * Both arguments are interpolated into regular expressions unescaped, so
 * `inflected` must already be a valid regex fragment.
 * @param {string} inflected Inflected ending of the verb (regex source fragment).
 * @param {string} deinflected Text that replaces the inflected ending.
 * @returns {import('language-transformer').Rule}
 */
function createPhrasalVerbInflection(inflected, deinflected) {
    // Compile both patterns once per rule rather than once per deinflect() call.
    const isInflected = new RegExp(`^\\w*${inflected} (?:${phrasalVerbWordDisjunction})`);
    // The previous pattern began with an empty lookbehind `(?<=)`, which always
    // succeeds; dropping it does not change what is matched.
    const inflectedEnding = new RegExp(`${inflected}(?= (?:${phrasalVerbWordDisjunction}))`);
    return {
        type: 'other',
        isInflected,
        deinflect: (term) => term.replace(inflectedEnding, deinflected),
        conditionsIn: [],
        conditionsOut: ['v_phr']
    };
}

/**
 * Derives phrasal-verb rules from a list of suffix rules, so that e.g. the
 * 'ed' -> '' rule also applies to the verb in 'looked up'.
 * Rules without a `deinflected` value are skipped.
 * @param {import('language-transformer').SuffixRule[]} sourceRules
 * @returns {import('language-transformer').Rule[]}
 */
function createPhrasalVerbInflectionsFromSuffixInflections(sourceRules) {
    return sourceRules.flatMap(({isInflected, deinflected}) => {
        if (typeof deinflected === 'undefined') { return []; }
        // Recover the bare suffix by stripping only the trailing end anchor.
        // NOTE(review): assumes suffixInflection builds `isInflected` as the
        // suffix followed by '$' - confirm against language-transforms.js.
        const inflectedSuffix = isInflected.source.replace(/\$$/, '');
        return [createPhrasalVerbInflection(inflectedSuffix, deinflected)];
    });
}

/**
 * Deinflection rules for English: the part-of-speech conditions and the list of
 * transforms (each a named group of rules) that map inflected forms back
 * towards dictionary forms.
 * @type {import('language-transformer').LanguageTransformDescriptor}
 */
export const englishTransforms = {
    language: 'en',
    conditions: {
        v_any: {
            name: 'Verb',
            isDictionaryForm: false,
            subConditions: ['v', 'v_irr', 'v_phr']
        },
        v: {
            name: 'Regular verb',
            isDictionaryForm: true
        },
        v_irr: {
            name: 'Irregular verb',
            isDictionaryForm: true
        },
        v_phr: {
            name: 'Phrasal verb',
            isDictionaryForm: true
        },
        n: {
            name: 'Noun',
            isDictionaryForm: true,
            subConditions: ['np', 'ns']
        },
        np: {
            name: 'Noun plural',
            isDictionaryForm: true
        },
        ns: {
            name: 'Noun singular',
            isDictionaryForm: true
        },
        adj: {
            name: 'Adjective',
            isDictionaryForm: true
        },
        adv: {
            name: 'Adverb',
            isDictionaryForm: true
        }
    },
    transforms: [
        {
            name: 'plural',
            description: 'Plural form of a noun',
            rules: [
                suffixInflection('s', '', ['np'], ['ns']),
                suffixInflection('es', '', ['np'], ['ns']),
                suffixInflection('ies', 'y', ['np'], ['ns']),
                suffixInflection('ves', 'fe', ['np'], ['ns']),
                suffixInflection('ves', 'f', ['np'], ['ns'])
            ]
        },
        {
            name: 'possessive',
            description: 'Possessive form of a noun',
            rules: [
                suffixInflection('\'s', '', ['n'], ['n']),
                suffixInflection('s\'', 's', ['n'], ['n'])
            ]
        },
        {
            name: 'past',
            description: 'Simple past tense of a verb',
            rules: [
                ...pastSuffixInflections,
                ...createPhrasalVerbInflectionsFromSuffixInflections(pastSuffixInflections)
            ]
        },
        {
            name: 'ing',
            description: 'Present participle of a verb',
            rules: [
                ...ingSuffixInflections,
                ...createPhrasalVerbInflectionsFromSuffixInflections(ingSuffixInflections)
            ]
        },
        {
            name: '3rd pers. sing. pres',
            description: 'Third person singular present tense of a verb',
            rules: [
                ...thirdPersonSgPresentSuffixInflections,
                ...createPhrasalVerbInflectionsFromSuffixInflections(thirdPersonSgPresentSuffixInflections)
            ]
        },
        {
            name: 'interposed object',
            description: 'Phrasal verb with interposed object',
            rules: [
                phrasalVerbInterposedObjectRule
            ]
        },
        {
            name: 'archaic',
            description: 'Archaic form of a word',
            rules: [
                suffixInflection('\'d', 'ed', ['v'], ['v'])
            ]
        },
        {
            name: 'adverb',
            description: 'Adverb form of an adjective',
            rules: [
                suffixInflection('ly', '', ['adv'], ['adj'])
            ]
        },
        {
            name: 'comparative',
            description: 'Comparative form of an adjective',
            rules: [
                suffixInflection('er', '', ['adj'], ['adj']), // 'faster'
                suffixInflection('er', 'e', ['adj'], ['adj']), // 'nicer'
                suffixInflection('ier', 'y', ['adj'], ['adj']), // 'happier'
                ...doubledConsonantInflection('bdgmnt', 'er', ['adj'], ['adj'])
            ]
        },
        {
            name: 'superlative',
            description: 'Superlative form of an adjective',
            rules: [
                suffixInflection('est', '', ['adj'], ['adj']), // 'fastest'
                suffixInflection('est', 'e', ['adj'], ['adj']), // 'nicest'
                suffixInflection('iest', 'y', ['adj'], ['adj']), // 'happiest'
                ...doubledConsonantInflection('bdgmnt', 'est', ['adj'], ['adj'])
            ]
        },
        {
            name: 'dropped g',
            description: 'Dropped g in -ing form of a verb',
            rules: [
                suffixInflection('in\'', 'ing', ['v'], ['v'])
            ]
        },
        {
            name: '-y',
            description: 'Adjective formed from a verb or noun',
            rules: [
                suffixInflection('y', '', ['adj'], ['n', 'v']), // 'dirty', 'pushy'
                suffixInflection('y', 'e', ['adj'], ['n', 'v']), // 'hazy'
                ...doubledConsonantInflection('glmnprst', 'y', [], ['n', 'v']) // 'baggy', 'saggy'
            ]
        },
        {
            name: 'un-',
            description: 'Negative form of an adjective, adverb, or verb',
            rules: [
                // 'un-' is attached to the front of the word ('unhappy', 'undo'), so
                // this must be a prefix rule. It was previously declared with
                // suffixInflection, which targets words *ending* in 'un' instead.
                prefixInflection('un', '', ['adj', 'adv', 'v'], ['adj', 'adv', 'v'])
            ]
        },
        {
            name: 'going-to future',
            description: 'Going-to future tense of a verb',
            rules: [
                prefixInflection('going to ', '', ['v'], ['v'])
            ]
        },
        {
            name: 'will future',
            description: 'Will-future tense of a verb',
            rules: [
                prefixInflection('will ', '', ['v'], ['v'])
            ]
        },
        {
            name: 'imperative negative',
            description: 'Negative imperative form of a verb',
            rules: [
                prefixInflection('don\'t ', '', ['v'], ['v']),
                prefixInflection('do not ', '', ['v'], ['v'])
            ]
        }
    ]
};