diff options
author | Alex Yatskov <alex@foosoft.net> | 2016-03-31 20:03:39 -0700 |
---|---|---|
committer | Alex Yatskov <alex@foosoft.net> | 2016-03-31 20:03:39 -0700 |
commit | 7eadff3457690074c5c0140a6e9ffd6164021176 (patch) | |
tree | ad8ba8c31cba11f54ca8cab186d1d36090e070c0 /util/compile.py | |
parent | b97e75ba32781341c221f549780f3444d0916714 (diff) |
Moving large files to CSV format, deleting unused kradfile
Diffstat (limited to 'util/compile.py')
-rwxr-xr-x | util/compile.py | 51 |
1 file changed, 11 insertions, 40 deletions
diff --git a/util/compile.py b/util/compile.py index 485537dc..790ebfc7 100755 --- a/util/compile.py +++ b/util/compile.py @@ -18,7 +18,6 @@ import codecs -import json import optparse import os.path import re @@ -111,7 +110,7 @@ def load_definitions(path): def parse_kanji_dic(path): - results = {} + results = [] for line in load_definitions(path): segments = line.split() @@ -119,32 +118,20 @@ def parse_kanji_dic(path): kunyomi = ', '.join(filter(lambda x: filter(is_hiragana, x), segments[1:])) onyomi = ', '.join(filter(lambda x: filter(is_katakana, x), segments[1:])) glossary = '; '.join(re.findall('\{([^\}]+)\}', line)) - results[character] = (kunyomi, onyomi, glossary) - - return results - - -def parse_krad_file(path): - results = {} - - for line in load_definitions(path): - segments = line.split(' ') - character = segments[0] - radicals = ' '.join(segments[2:]) - results[character] = radicals; + results.append((character, kunyomi, onyomi, glossary)) return results def parse_edict(path): - defs = [] + results = [] for line in load_definitions(path): segments = line.split('/') expression = segments[0].split(' ') term = expression[0] match = re.search('\[([^\]]+)\]', expression[1]) - reading = None if match is None else match.group(1) + reading = '' if match is None else match.group(1) glossary = '; '.join(filter(lambda x: len(x) > 0, segments[1:])) glossary = re.sub('\(\d+\)\s*', '', glossary) @@ -156,30 +143,21 @@ def parse_edict(path): tags = set(tags).intersection(PARSED_TAGS) tags = ' '.join(tags) - defs.append((term, reading, glossary, tags)) - - indices = {} - for i, d in enumerate(defs): - for key in d[:2]: - if key is not None: - values = indices.get(key, []) - values.append(i) - indices[key] = values + results.append((term, reading, glossary, tags)) - return {'defs': defs, 'indices': indices} + return results[1:] def build_dict(output_dir, input_file, parser): if input_file is not None: base = os.path.splitext(os.path.basename(input_file))[0] - with 
open(os.path.join(output_dir, base) + '.json', 'w') as fp: - # json.dump(parser(input_file), fp, sort_keys=True, indent=4, separators=(',', ': ')) - json.dump(parser(input_file), fp) + with codecs.open(os.path.join(output_dir, base) + '.csv', 'wb', encoding='utf-8') as fp: + for d in parser(input_file): + fp.write('\t'.join(d) + '\n') -def build(dict_dir, kanjidic, kradfile, edict, enamdict): +def build(dict_dir, kanjidic, edict, enamdict): build_dict(dict_dir, kanjidic, parse_kanji_dic) - build_dict(dict_dir, kradfile, parse_krad_file) build_dict(dict_dir, edict, parse_edict) build_dict(dict_dir, enamdict, parse_edict) @@ -187,7 +165,6 @@ def build(dict_dir, kanjidic, kradfile, edict, enamdict): def main(): parser = optparse.OptionParser() parser.add_option('--kanjidic', dest='kanjidic') - parser.add_option('--kradfile', dest='kradfile') parser.add_option('--edict', dest='edict') parser.add_option('--enamdict', dest='enamdict') @@ -196,13 +173,7 @@ def main(): if len(args) == 0: parser.print_help() else: - build( - args[0], - options.kanjidic, - options.kradfile, - options.edict, - options.enamdict - ) + build(args[0], options.kanjidic, options.edict, options.enamdict) if __name__ == '__main__': |