#!/usr/bin/env python

# Copyright (C) 2016  Alex Yatskov <alex@foosoft.net>
# Author: Alex Yatskov <alex@foosoft.net>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""Convert EUC-JP encoded dictionary files into compact JSON files.

``parse_kanji_dic`` and ``parse_edict`` turn the text lines of a dictionary
file into plain Python structures; ``build_dict`` serializes the result next
to the requested output directory.
"""

import codecs
import json
import optparse
import os.path
import re


# Tags that are kept when parsing definitions; any tag not listed here is
# dropped by parse_edict().
PARSED_TAGS = {
    'Buddh':   'Buddhist term',
    'MA':      'martial arts term',
    'X':       'rude or X-rated term',
    'abbr':    'abbreviation',
    'adj':     'former adjective classification (being removed)',
    'adj-f':   'noun or verb acting prenominally (other than the above)',
    'adj-i':   'adjective (keiyoushi)',
    'adj-na':  'adjectival nouns or quasi-adjectives (keiyodoshi)',
    'adj-no':  'nouns which may take the genitive case particle "no"',
    'adj-pn':  'pre-noun adjectival (rentaishi)',
    'adj-t':   '"taru" adjective',
    'adv':     'adverb (fukushi)',
    'adv-n':   'adverbial noun',
    'adv-to':  'adverb taking the "to" particle',
    'arch':    'archaism',
    'ateji':   'ateji (phonetic) reading',
    'aux':     'auxiliary',
    'aux-adj': 'auxiliary adjective',
    'aux-v':   'auxiliary verb',
    'c':       'company name',
    'chn':     'children\'s language',
    'col':     'colloquialism',
    'comp':    'computer terminology',
    'conj':    'conjunction',
    'ctr':     'counter',
    'derog':   'derogatory term',
    'eK':      'exclusively kanji',
    'ek':      'exclusively kana',
    'exp':     'Expressions (phrases, clauses, etc.)',
    'f':       'female given name',
    'fam':     'familiar language',
    'fem':     'female term or language',
    'food':    'food term',
    'g':       'given name, as-yet not classified by sex',
    'geom':    'geometry term',
    'gikun':   'gikun (meaning) reading',
    'gram':    'grammatical term',
    'h':       'full (usually family plus given) name of a particular person',
    'hon':     'honorific or respectful (sonkeigo) language',
    'hum':     'humble (kenjougo) language',
    'iK':      'word containing irregular kanji usage',
    'id':      'idiomatic expression',
    'ik':      'word containing irregular kana usage',
    'int':     'interjection (kandoushi)',
    'io':      'irregular okurigana usage',
    'iv':      'irregular verb',
    'ling':    'linguistics terminology',
    'm':       'male given name',
    'm-sl':    'manga slang',
    'male':    'male term or language',
    'male-sl': 'male slang',
    'math':    'mathematics',
    'mil':     'military',
    'n':       'noun (common) (futsuumeishi)',
    'n-adv':   'adverbial noun (fukushitekimeishi)',
    'n-pref':  'noun, used as a prefix',
    'n-suf':   'noun, used as a suffix',
    'n-t':     'noun (temporal) (jisoumeishi)',
    'num':     'numeric',
    'oK':      'word containing out-dated kanji',
    'obs':     'obsolete term',
    'obsc':    'obscure term',
    'ok':      'out-dated or obsolete kana usage',
    'on-mim':  'onomatopoeic or mimetic word',
    'P':       'popular term',
    'p':       'place-name',
    'physics': 'physics terminology',
    'pn':      'pronoun',
    'poet':    'poetical term',
    'pol':     'polite (teineigo) language',
    'pr':      'product name',
    'pref':    'prefix',
    'prt':     'particle',
    'rare':    'rare (now replaced by "obsc")',
    's':       'surname',
    'sens':    'sensitive word',
    'sl':      'slang',
    'st':      'stations',
    'suf':     'suffix',
    'u':       'person name, either given or surname, as-yet unclassified',
    'uK':      'word usually written using kanji alone',
    'uk':      'word usually written using kana alone',
    'v1':      'Ichidan verb',
    'v2a-s':   'Nidan verb with "u" ending (archaic)',
    'v4h':     'Yodan verb with "hu/fu" ending (archaic)',
    'v4r':     'Yodan verb with "ru" ending (archaic)',
    'v5':      'Godan verb (not completely classified)',
    # The specific "v5*" classes below are intentionally disabled:
    # fixup_godan_verbs() folds every tag starting with "v5" into plain "v5".
    # 'v5aru':   'Godan verb - -aru special class',
    # 'v5b':     'Godan verb with "bu" ending',
    # 'v5g':     'Godan verb with "gu" ending',
    # 'v5k':     'Godan verb with "ku" ending',
    # 'v5k-s':   'Godan verb - iku/yuku special class',
    # 'v5m':     'Godan verb with "mu" ending',
    # 'v5n':     'Godan verb with "nu" ending',
    # 'v5r':     'Godan verb with "ru" ending',
    # 'v5r-i':   'Godan verb with "ru" ending (irregular verb)',
    # 'v5s':     'Godan verb with "su" ending',
    # 'v5t':     'Godan verb with "tsu" ending',
    # 'v5u':     'Godan verb with "u" ending',
    # 'v5u-s':   'Godan verb with "u" ending (special class)',
    # 'v5uru':   'Godan verb - uru old class verb (old form of Eru)',
    # 'v5z':     'Godan verb with "zu" ending',
    'vi':      'intransitive verb',
    'vk':      'kuru verb - special class',
    'vn':      'irregular nu verb',
    'vs':      'noun or participle which takes the aux. verb suru',
    'vs-c':    'su verb - precursor to the modern suru',
    'vs-i':    'suru verb - irregular',
    'vs-s':    'suru verb - special class',
    'vt':      'transitive verb',
    'vulg':    'vulgar expression or word',
    'vz':      'Ichidan verb - zuru verb - (alternative form of -jiru verbs)',
}

# Patterns are compiled once here instead of on every parsed line.
# "{...}" groups holding the glosses of a kanji entry.
_BRACED_GLOSS = re.compile(r'\{([^\}]+)\}')
# "[...]" group holding the reading of a dictionary entry.
_BRACKETED_READING = re.compile(r'\[([^\]]+)\]')
# Leading run of "(...)" groups (group 1) followed by the gloss (group 2).
# The character class is quantified only once: nesting a second "*" around it
# matches the same text but can backtrack exponentially on an unclosed "(".
_DEFINITION = re.compile(r'^((?:\([\w\-\,\:]*\)\s*)*)(.*)$')
# Characters separating individual tags inside the leading "(...)" groups.
_TAG_SEPARATORS = re.compile(r'[\s\(\),]')


def is_hiragana(c):
    """Return True if the code point of `c` lies in [0x3040, 0x30a0)."""
    return 0x3040 <= ord(c) < 0x30a0


def is_katakana(c):
    """Return True if the code point of `c` lies in [0x30a0, 0x3100)."""
    return 0x30a0 <= ord(c) < 0x3100


def load_definitions(path):
    """Read an EUC-JP encoded file and return its meaningful lines.

    Empty lines and lines starting with '#' are discarded.  The result is a
    list, so it can be iterated more than once.
    """
    print('Parsing "{0}"...'.format(path))
    with codecs.open(path, encoding='euc-jp') as fp:
        lines = fp.read().splitlines()
    return [line for line in lines if line and not line.startswith('#')]


def parse_kanji_dic(path):
    """Parse a kanji dictionary file.

    Returns a dict mapping the first whitespace-separated field of each line
    (the character) to a tuple ``(kunyomi, onyomi, glossary)``:

    * kunyomi  -- space-joined fields containing at least one hiragana
                  character, or None when there are none;
    * onyomi   -- space-joined fields containing at least one katakana
                  character, or None when there are none;
    * glossary -- the contents of all "{...}" groups joined with '; '.
    """
    results = {}
    for line in load_definitions(path):
        segments = line.split()
        if not segments:
            # Whitespace-only line: there is no character to key on.
            continue

        character, fields = segments[0], segments[1:]
        kunyomi = ' '.join(f for f in fields if any(is_hiragana(c) for c in f))
        onyomi = ' '.join(f for f in fields if any(is_katakana(c) for c in f))
        glossary = '; '.join(_BRACED_GLOSS.findall(line))
        results[character] = (kunyomi or None, onyomi or None, glossary)

    return results


def fixup_godan_verbs(tags):
    """Return `tags` as a set with every tag starting with 'v5' replaced by 'v5'."""
    return {'v5' if tag.startswith('v5') else tag for tag in tags}


def _parse_edict_line(line):
    """Convert one dictionary line into ``[expression, reading, tags, def, ...]``."""
    segments = line.split('/')
    exp_parts = segments[0].split(' ')
    expression = exp_parts[0]

    # The reading is looked up in the second space-separated field only; an
    # entry without such a field (or without brackets in it) has no reading.
    reading_match = _BRACKETED_READING.search(exp_parts[1]) if len(exp_parts) > 1 else None
    reading = None if reading_match is None else reading_match.group(1)

    defs = []
    tags = set()
    for dfn in segments[1:]:
        if not dfn:
            continue

        dfn_match = _DEFINITION.search(dfn)
        prefix = dfn_match.group(1)
        gloss = dfn_match.group(2).strip()
        if not gloss:
            # Definitions without gloss text are dropped, tags included.
            continue

        tags_raw = set(filter(None, _TAG_SEPARATORS.split(prefix)))
        tags_raw = fixup_godan_verbs(tags_raw)
        tags |= tags_raw.intersection(PARSED_TAGS)

        # A leading "(...)" group starts a new definition; a bare gloss is
        # appended to the previous one.  Testing `not defs` rather than the
        # position of the definition keeps this safe when earlier definitions
        # of the entry were skipped for having no gloss.
        if not defs or prefix:
            defs.append([gloss])
        else:
            defs[-1].append(gloss)

    # Tags are sorted so the generated output does not depend on set ordering.
    result = [expression, reading, ' '.join(sorted(tags))]
    result.extend('; '.join(glosses) for glosses in defs)
    return result


def parse_edict(path):
    """Parse a dictionary file whose lines have '/'-separated definitions.

    Returns ``{'defs': rows, 'indices': lookup}`` where each row is
    ``[expression, reading, tags, definition, ...]`` (`reading` may be None,
    `tags` is a space-separated string of the recognized tags in sorted
    order) and `lookup` maps every expression and every non-None reading to
    the list of row positions in which it occurs.
    """
    results = [_parse_edict_line(line) for line in load_definitions(path)]

    indices = {}
    for i, row in enumerate(results):
        for key in row[:2]:
            if key is not None:
                indices.setdefault(key, []).append(i)

    return {'defs': results, 'indices': indices}


def build_dict(output_dir, input_file, parser):
    """Parse `input_file` with `parser` and write the result as compact JSON.

    The output file is placed in `output_dir` and named after `input_file`
    with its extension replaced by '.json'.  Nothing happens when
    `input_file` is None.
    """
    if input_file is None:
        return

    base = os.path.splitext(os.path.basename(input_file))[0]
    with open(os.path.join(output_dir, base) + '.json', 'w') as fp:
        # Human-readable alternative, handy while debugging:
        # json.dump(parser(input_file), fp, sort_keys=True, indent=4, separators=(',', ': '))
        json.dump(parser(input_file), fp, separators=(',', ':'))


def build(dict_dir, kanjidic, edict, enamdict):
    """Build the JSON file for each given dictionary inside `dict_dir`.

    Arguments that are None are skipped (see build_dict).
    """
    build_dict(dict_dir, kanjidic, parse_kanji_dic)
    build_dict(dict_dir, edict, parse_edict)
    build_dict(dict_dir, enamdict, parse_edict)


def main():
    """Command-line entry point.

    The first positional argument is the output directory; without it the
    usage text is printed instead.
    """
    parser = optparse.OptionParser()
    parser.add_option('--kanjidic', dest='kanjidic')
    parser.add_option('--edict', dest='edict')
    parser.add_option('--enamdict', dest='enamdict')

    options, args = parser.parse_args()

    if not args:
        parser.print_help()
    else:
        build(args[0], options.kanjidic, options.edict, options.enamdict)


if __name__ == '__main__':
    main()