about | summary | refs | log | tree | commit | diff
path: root/util/compile.py
diff options
context:
space:
mode:
author Alex Yatskov <alex@foosoft.net> 2016-04-12 20:17:40 -0700
committer Alex Yatskov <alex@foosoft.net> 2016-04-12 20:17:40 -0700
commit cfab4c31eca220ecbab1096b8d11ba7d0d45ed26 (patch)
tree 97c9ac2acc45f05770036f0729b713f79ab9e44e /util/compile.py
parent bf7c476a72753cd3449a092fc784eb2b63bd378c (diff)
Updating dictionary format again
Diffstat (limited to 'util/compile.py')
-rwxr-xr-x util/compile.py 219
1 files changed, 135 insertions, 84 deletions
diff --git a/util/compile.py b/util/compile.py
index 790ebfc7..7510aa9b 100755
--- a/util/compile.py
+++ b/util/compile.py
@@ -18,80 +18,114 @@
import codecs
+import json
import optparse
import os.path
import re
PARSED_TAGS = {
- 'P', # common word
- 'adj', # former adjective classification (being removed)
- 'adj-f', # noun or verb acting prenominally (other than the above)
- 'adj-i', # adjective (keiyoushi)
- 'adj-na', # adjectival nouns or quasi-adjectives (keiyodoshi)
- 'adj-no', # nouns which may take the genitive case particle `no'
- 'adj-pn', # pre-noun adjectival (rentaishi)
- 'adj-t', # `taru' adjective
- 'adv', # adverb (fukushi)
- 'adv-n', # adverbial noun
- 'adv-to', # adverb taking the `to' particle
- 'aux', # auxiliary
- 'aux-adj', # auxiliary adjective
- 'aux-v', # auxiliary verb
- 'c', # company name
- 'conj', # conjunction
- 'ctr', # counter
- 'exp', # Expressions (phrases, clauses, etc.)
- 'f', # female given name
- 'g', # given name, as-yet not classified by sex
- 'h', # full (usually family plus given) name of a particular person
- 'int', # interjection (kandoushi)
- 'iv', # irregular verb
- 'm', # male given name
- 'n', # noun (common) (futsuumeishi)
- 'n-adv', # adverbial noun (fukushitekimeishi)
- 'n-pref', # noun, used as a prefix
- 'n-suf', # noun, used as a suffix
- 'n-t', # noun (temporal) (jisoumeishi)
- 'num', # numeric
- 'p', # place-name
- 'pn', # pronoun
- 'pr', # product name
- 'pref' , # prefix
- 'prt', # particle
- 's', # surname
- 'st', # stations
- 'suf', # suffix
- 'u', # person name, either given or surname, as-yet unclassified
- 'v1', # Ichidan verb
- 'v2a-s', # Nidan verb with 'u' ending (archaic)
- 'v4h', # Yodan verb with `hu/fu' ending (archaic)
- 'v4r', # Yodan verb with `ru' ending (archaic)
- 'v5', # Godan verb (not completely classified)
- 'v5aru', # Godan verb - -aru special class
- 'v5b', # Godan verb with `bu' ending
- 'v5g', # Godan verb with `gu' ending
- 'v5k', # Godan verb with `ku' ending
- 'v5k-s', # Godan verb - iku/yuku special class
- 'v5m', # Godan verb with `mu' ending
- 'v5n', # Godan verb with `nu' ending
- 'v5r', # Godan verb with `ru' ending
- 'v5r-i', # Godan verb with `ru' ending (irregular verb)
- 'v5s', # Godan verb with `su' ending
- 'v5t', # Godan verb with `tsu' ending
- 'v5u', # Godan verb with `u' ending
- 'v5u-s', # Godan verb with `u' ending (special class)
- 'v5uru', # Godan verb - uru old class verb (old form of Eru)
- 'v5z', # Godan verb with `zu' ending
- 'vi', # intransitive verb
- 'vk', # kuru verb - special class
- 'vn', # irregular nu verb
- 'vs', # noun or participle which takes the aux. verb suru
- 'vs-c', # su verb - precursor to the modern suru
- 'vs-i', # suru verb - irregular
- 'vs-s', # suru verb - special class
- 'vt', # transitive verb
- 'vz', # Ichidan verb - zuru verb - (alternative form of -jiru verbs)
+ 'Buddh': 'Buddhist term',
+ 'MA': 'martial arts term',
+ 'X': 'rude or X-rated term',
+ 'abbr': 'abbreviation',
+ 'adj': 'former adjective classification (being removed)',
+ 'adj-f': 'noun or verb acting prenominally (other than the above)',
+ 'adj-i': 'adjective (keiyoushi)',
+ 'adj-na': 'adjectival nouns or quasi-adjectives (keiyodoshi)',
+ 'adj-no': 'nouns which may take the genitive case particle "no"',
+ 'adj-pn': 'pre-noun adjectival (rentaishi)',
+ 'adj-t': '"taru" adjective',
+ 'adv': 'adverb (fukushi)',
+ 'adv-n': 'adverbial noun',
+ 'adv-to': 'adverb taking the "to" particle',
+ 'arch': 'archaism',
+ 'ateji': 'ateji (phonetic) reading',
+ 'aux': 'auxiliary',
+ 'aux-adj': 'auxiliary adjective',
+ 'aux-v': 'auxiliary verb',
+ 'chn': 'children\'s language',
+ 'col': 'colloquialism',
+ 'comp': 'computer terminology',
+ 'conj': 'conjunction',
+ 'ctr': 'counter',
+ 'derog': 'derogatory term',
+ 'eK': 'exclusively kanji',
+ 'ek': 'exclusively kana',
+ 'exp': 'Expressions (phrases, clauses, etc.)',
+ 'fam': 'familiar language',
+ 'fem': 'female term or language',
+ 'food': 'food term',
+ 'geom': 'geometry term',
+ 'gikun': 'gikun (meaning) reading',
+ 'gram': 'grammatical term',
+ 'hon': 'honorific or respectful (sonkeigo) language',
+ 'hum': 'humble (kenjougo) language',
+ 'iK': 'word containing irregular kanji usage',
+ 'id': 'idiomatic expression',
+ 'ik': 'word containing irregular kana usage',
+ 'int': 'interjection (kandoushi)',
+ 'io': 'irregular okurigana usage',
+ 'iv': 'irregular verb',
+ 'ling': 'linguistics terminology',
+ 'm-sl': 'manga slang',
+ 'male': 'male term or language',
+ 'male-sl': 'male slang',
+ 'math': 'mathematics',
+ 'mil': 'military',
+ 'n': 'noun (common) (futsuumeishi)',
+ 'n-adv': 'adverbial noun (fukushitekimeishi)',
+ 'n-pref': 'noun, used as a prefix',
+ 'n-suf': 'noun, used as a suffix',
+ 'n-t': 'noun (temporal) (jisoumeishi)',
+ 'num': 'numeric',
+ 'oK': 'word containing out-dated kanji',
+ 'obs': 'obsolete term',
+ 'obsc': 'obscure term',
+ 'ok': 'out-dated or obsolete kana usage',
+ 'on-mim': 'onomatopoeic or mimetic word',
+ 'physics': 'physics terminology',
+ 'pn': 'pronoun',
+ 'poet': 'poetical term',
+ 'pol': 'polite (teineigo) language',
+ 'pref': 'prefix',
+ 'prt': 'particle',
+ 'rare': 'rare (now replaced by "obsc")',
+ 'sens': 'sensitive word',
+ 'sl': 'slang',
+ 'suf': 'suffix',
+ 'uK': 'word usually written using kanji alone',
+ 'uk': 'word usually written using kana alone',
+ 'v1': 'Ichidan verb',
+ 'v2a-s': 'Nidan verb with "u" ending (archaic)',
+ 'v4h': 'Yodan verb with "hu/fu" ending (archaic)',
+ 'v4r': 'Yodan verb with "ru" ending (archaic)',
+ 'v5': 'Godan verb (not completely classified)',
+ 'v5aru': 'Godan verb - -aru special class',
+ 'v5b': 'Godan verb with "bu" ending',
+ 'v5g': 'Godan verb with "gu" ending',
+ 'v5k': 'Godan verb with "ku" ending',
+ 'v5k-s': 'Godan verb - iku/yuku special class',
+ 'v5m': 'Godan verb with "mu" ending',
+ 'v5n': 'Godan verb with "nu" ending',
+ 'v5r': 'Godan verb with "ru" ending',
+ 'v5r-i': 'Godan verb with "ru" ending (irregular verb)',
+ 'v5s': 'Godan verb with "su" ending',
+ 'v5t': 'Godan verb with "tsu" ending',
+ 'v5u': 'Godan verb with "u" ending',
+ 'v5u-s': 'Godan verb with "u" ending (special class)',
+ 'v5uru': 'Godan verb - uru old class verb (old form of Eru)',
+ 'v5z': 'Godan verb with "zu" ending',
+ 'vi': 'intransitive verb',
+ 'vk': 'kuru verb - special class',
+ 'vn': 'irregular nu verb',
+ 'vs': 'noun or participle which takes the aux. verb suru',
+ 'vs-c': 'su verb - precursor to the modern suru',
+ 'vs-i': 'suru verb - irregular',
+ 'vs-s': 'suru verb - special class',
+ 'vt': 'transitive verb',
+ 'vulg': 'vulgar expression or word',
+ 'vz': 'Ichidan verb - zuru verb - (alternative form of -jiru verbs)',
}
@@ -128,32 +162,49 @@ def parse_edict(path):
for line in load_definitions(path):
segments = line.split('/')
- expression = segments[0].split(' ')
- term = expression[0]
- match = re.search('\[([^\]]+)\]', expression[1])
- reading = '' if match is None else match.group(1)
-
- glossary = '; '.join(filter(lambda x: len(x) > 0, segments[1:]))
- glossary = re.sub('\(\d+\)\s*', '', glossary)
+ exp_parts = segments[0].split(' ')
+ expression = exp_parts[0]
+ reading_match = re.search('\[([^\]]+)\]', exp_parts[1])
+ reading = None if reading_match is None else reading_match.group(1)
+ defs = []
tags = []
- for group in re.findall('\(([^\)\]]+)\)', glossary):
- tags.extend(group.split(','))
- tags = set(tags).intersection(PARSED_TAGS)
- tags = ' '.join(tags)
+ for index, dfn in enumerate(filter(None, segments[1:])):
+ dfn_match = re.search(r'^((?:\((?:[\w\-\,\:]*)*\)\s+)*)(.*)$', dfn)
+ gloss = dfn_match.group(2)
+
+ if index == 0:
+ tags_raw = set(filter(None, re.split(r'[\s\(\),]', dfn_match.group(1))))
+ tags = tags_raw.intersection(set(PARSED_TAGS.keys()))
+
+ if index == 0 or len(dfn_match.group(1)) > 0:
+ defs.append([gloss])
+ else:
+ defs[-1].append(gloss)
+
+ result = [expression, reading, ' '.join(tags)]
+ result += map(lambda x: '; '.join(x), defs)
+
+ results.append(result)
- results.append((term, reading, glossary, tags))
+ indices = {}
+ for i, d in enumerate(results):
+ for key in d[:2]:
+ if key is not None:
+ values = indices.get(key, [])
+ values.append(i)
+ indices[key] = values
- return results[1:]
+ return {'defs': results, 'indices': indices}
def build_dict(output_dir, input_file, parser):
if input_file is not None:
base = os.path.splitext(os.path.basename(input_file))[0]
- with codecs.open(os.path.join(output_dir, base) + '.csv', 'wb', encoding='utf-8') as fp:
- for d in parser(input_file):
- fp.write('\t'.join(d) + '\n')
+ with open(os.path.join(output_dir, base) + '.json', 'w') as fp:
+ # json.dump(parser(input_file), fp, sort_keys=True, indent=4, separators=(',', ': '))
+ json.dump(parser(input_file), fp, separators=(',', ':'))
def build(dict_dir, kanjidic, edict, enamdict):