diff options
| -rwxr-xr-x | util/compile.py | 219 | 
1 files changed, 135 insertions, 84 deletions
| diff --git a/util/compile.py b/util/compile.py index 790ebfc7..7510aa9b 100755 --- a/util/compile.py +++ b/util/compile.py @@ -18,80 +18,114 @@  import codecs +import json  import optparse  import os.path  import re  PARSED_TAGS = { -    'P',       # common word -    'adj',     # former adjective classification (being removed) -    'adj-f',   # noun or verb acting prenominally (other than the above) -    'adj-i',   # adjective (keiyoushi) -    'adj-na',  # adjectival nouns or quasi-adjectives (keiyodoshi) -    'adj-no',  # nouns which may take the genitive case particle `no' -    'adj-pn',  # pre-noun adjectival (rentaishi) -    'adj-t',   # `taru' adjective -    'adv',     # adverb (fukushi) -    'adv-n',   # adverbial noun -    'adv-to',  # adverb taking the `to' particle -    'aux',     # auxiliary -    'aux-adj', # auxiliary adjective -    'aux-v',   # auxiliary verb -    'c',       # company name -    'conj',    # conjunction -    'ctr',     # counter -    'exp',     # Expressions (phrases, clauses, etc.) -    'f',       # female given name -    'g',       # given name, as-yet not classified by sex -    'h',       # full (usually family plus given) name of a particular person -    'int',     # interjection (kandoushi) -    'iv',      # irregular verb -    'm',       # male given name -    'n',       # noun (common) (futsuumeishi) -    'n-adv',   # adverbial noun (fukushitekimeishi) -    'n-pref',  # noun, used as a prefix -    'n-suf',   # noun, used as a suffix -    'n-t',     # noun (temporal) (jisoumeishi) -    'num',     # numeric -    'p',       # place-name -    'pn',      # pronoun -    'pr',      # product name -    'pref' ,   # prefix -    'prt',     # particle -    's',       # surname -    'st',      # stations -    'suf',     # suffix -    'u',       # person name, either given or surname, as-yet unclassified -    'v1',      # Ichidan verb -    'v2a-s',   # Nidan verb with 'u' ending (archaic) -    'v4h',     # Yodan verb with `hu/fu' ending (archaic) -    'v4r',     # Yodan verb with `ru' ending (archaic) -    'v5',      # Godan verb (not completely classified) -    'v5aru',   # Godan verb - -aru special class -    'v5b',     # Godan verb with `bu' ending -    'v5g',     # Godan verb with `gu' ending -    'v5k',     # Godan verb with `ku' ending -    'v5k-s',   # Godan verb - iku/yuku special class -    'v5m',     # Godan verb with `mu' ending -    'v5n',     # Godan verb with `nu' ending -    'v5r',     # Godan verb with `ru' ending -    'v5r-i',   # Godan verb with `ru' ending (irregular verb) -    'v5s',     # Godan verb with `su' ending -    'v5t',     # Godan verb with `tsu' ending -    'v5u',     # Godan verb with `u' ending -    'v5u-s',   # Godan verb with `u' ending (special class) -    'v5uru',   # Godan verb - uru old class verb (old form of Eru) -    'v5z',     # Godan verb with `zu' ending -    'vi',      # intransitive verb -    'vk',      # kuru verb - special class -    'vn',      # irregular nu verb -    'vs',      # noun or participle which takes the aux. verb suru -    'vs-c',    # su verb - precursor to the modern suru -    'vs-i',    # suru verb - irregular -    'vs-s',    # suru verb - special class -    'vt',      # transitive verb -    'vz',      # Ichidan verb - zuru verb - (alternative form of -jiru verbs) +    'Buddh':   'Buddhist term', +    'MA':      'martial arts term', +    'X':       'rude or X-rated term', +    'abbr':    'abbreviation', +    'adj':     'former adjective classification (being removed)', +    'adj-f':   'noun or verb acting prenominally (other than the above)', +    'adj-i':   'adjective (keiyoushi)', +    'adj-na':  'adjectival nouns or quasi-adjectives (keiyodoshi)', +    'adj-no':  'nouns which may take the genitive case particle "no"', +    'adj-pn':  'pre-noun adjectival (rentaishi)', +    'adj-t':   '"taru" adjective', +    'adv':     'adverb (fukushi)', +    'adv-n':   'adverbial noun', +    'adv-to':  'adverb taking the "to" particle', +    'arch':    'archaism', +    'ateji':   'ateji (phonetic) reading', +    'aux':     'auxiliary', +    'aux-adj': 'auxiliary adjective', +    'aux-v':   'auxiliary verb', +    'chn':     'children\'s language', +    'col':     'colloquialism', +    'comp':    'computer terminology', +    'conj':    'conjunction', +    'ctr':     'counter', +    'derog':   'derogatory term', +    'eK':      'exclusively kanji', +    'ek':      'exclusively kana', +    'exp':     'Expressions (phrases, clauses, etc.)', +    'fam':     'familiar language', +    'fem':     'female term or language', +    'food':    'food term', +    'geom':    'geometry term', +    'gikun':   'gikun (meaning) reading', +    'gram':    'grammatical term', +    'hon':     'honorific or respectful (sonkeigo) language', +    'hum':     'humble (kenjougo) language', +    'iK':      'word containing irregular kanji usage', +    'id':      'idiomatic expression', +    'ik':      'word containing irregular kana usage', +    'int':     'interjection (kandoushi)', +    'io':      'irregular okurigana usage', +    'iv':      'irregular verb', +    'ling':    'linguistics terminology', +    'm-sl':    'manga slang', +    'male':    'male term or language', +    'male-sl': 'male slang', +    'math':    'mathematics', +    'mil':     'military', +    'n':       'noun (common) (futsuumeishi)', +    'n-adv':   'adverbial noun (fukushitekimeishi)', +    'n-pref':  'noun, used as a prefix', +    'n-suf':   'noun, used as a suffix', +    'n-t':     'noun (temporal) (jisoumeishi)', +    'num':     'numeric', +    'oK':      'word containing out-dated kanji', +    'obs':     'obsolete term', +    'obsc':    'obscure term', +    'ok':      'out-dated or obsolete kana usage', +    'on-mim':  'onomatopoeic or mimetic word', +    'physics': 'physics terminology', +    'pn':      'pronoun', +    'poet':    'poetical term', +    'pol':     'polite (teineigo) language', +    'pref':    'prefix', +    'prt':     'particle', +    'rare':    'rare (now replaced by "obsc")', +    'sens':    'sensitive word', +    'sl':      'slang', +    'suf':     'suffix', +    'uK':      'word usually written using kanji alone', +    'uk':      'word usually written using kana alone', +    'v1':      'Ichidan verb', +    'v2a-s':   'Nidan verb with "u" ending (archaic)', +    'v4h':     'Yodan verb with "hu/fu" ending (archaic)', +    'v4r':     'Yodan verb with "ru" ending (archaic)', +    'v5':      'Godan verb (not completely classified)', +    'v5aru':   'Godan verb - -aru special class', +    'v5b':     'Godan verb with "bu" ending', +    'v5g':     'Godan verb with "gu" ending', +    'v5k':     'Godan verb with "ku" ending', +    'v5k-s':   'Godan verb - iku/yuku special class', +    'v5m':     'Godan verb with "mu" ending', +    'v5n':     'Godan verb with "nu" ending', +    'v5r':     'Godan verb with "ru" ending', +    'v5r-i':   'Godan verb with "ru" ending (irregular verb)', +    'v5s':     'Godan verb with "su" ending', +    'v5t':     'Godan verb with "tsu" ending', +    'v5u':     'Godan verb with "u" ending', +    'v5u-s':   'Godan verb with "u" ending (special class)', +    'v5uru':   'Godan verb - uru old class verb (old form of Eru)', +    'v5z':     'Godan verb with "zu" ending', +    'vi':      'intransitive verb', +    'vk':      'kuru verb - special class', +    'vn':      'irregular nu verb', +    'vs':      'noun or participle which takes the aux. verb suru', +    'vs-c':    'su verb - precursor to the modern suru', +    'vs-i':    'suru verb - irregular', +    'vs-s':    'suru verb - special class', +    'vt':      'transitive ver', +    'vulg':    'vulgar expression or word', +    'vz':      'Ichidan verb - zuru verb - (alternative form of -jiru verbs)',  } @@ -128,32 +162,49 @@ def parse_edict(path):      for line in load_definitions(path):          segments = line.split('/') -        expression = segments[0].split(' ') -        term = expression[0] -        match = re.search('\[([^\]]+)\]', expression[1]) -        reading = '' if match is None else match.group(1) - -        glossary = '; '.join(filter(lambda x: len(x) > 0, segments[1:])) -        glossary = re.sub('\(\d+\)\s*', '', glossary) +        exp_parts = segments[0].split(' ') +        expression = exp_parts[0] +        reading_match = re.search('\[([^\]]+)\]', exp_parts[1]) +        reading = None if reading_match is None else reading_match.group(1) +        defs = []          tags = [] -        for group in re.findall('\(([^\)\]]+)\)', glossary): -            tags.extend(group.split(',')) -        tags = set(tags).intersection(PARSED_TAGS) -        tags = ' '.join(tags) +        for index, dfn in enumerate(filter(None, segments[1:])): +            dfn_match = re.search(r'^((?:\((?:[\w\-\,\:]*)*\)\s+)*)(.*)$', dfn) +            gloss = dfn_match.group(2) + +            if index == 0: +                tags_raw = set(filter(None, re.split(r'[\s\(\),]', dfn_match.group(1)))) +                tags = tags_raw.intersection(set(PARSED_TAGS.keys())) + +            if index == 0 or len(dfn_match.group(1)) > 0: +                defs.append([gloss]) +            else: +                defs[-1].append(gloss) + +        result = [expression, reading, ' '.join(tags)] +        result += map(lambda x: '; '.join(x), defs) + +        results.append(result) -        results.append((term, reading, glossary, tags)) +    indices = {} +    for i, d in enumerate(results): +        for key in d[:2]: +            if key is not None: +                values = indices.get(key, []) +                values.append(i) +                indices[key] = values -    return results[1:] +    return {'defs': results, 'indices': indices}  def build_dict(output_dir, input_file, parser):      if input_file is not None:          base = os.path.splitext(os.path.basename(input_file))[0] -        with codecs.open(os.path.join(output_dir, base) + '.csv', 'wb', encoding='utf-8') as fp: -            for d in parser(input_file): -                fp.write('\t'.join(d) + '\n') +        with open(os.path.join(output_dir, base) + '.json', 'w') as fp: +             # json.dump(parser(input_file), fp, sort_keys=True, indent=4, separators=(',', ': ')) +             json.dump(parser(input_file), fp, separators=(',', ':'))  def build(dict_dir, kanjidic, edict, enamdict): |