Updating dictionary format again

author: Alex Yatskov <alex@foosoft.net> 2016-04-12 20:17:40 -0700
committer: Alex Yatskov <alex@foosoft.net> 2016-04-12 20:17:40 -0700
commit: cfab4c31eca220ecbab1096b8d11ba7d0d45ed26 (patch)
tree: 97c9ac2acc45f05770036f0729b713f79ab9e44e /util
parent: bf7c476a72753cd3449a092fc784eb2b63bd378c (diff)
1 file changed, 135 insertions, 84 deletions
diff --git a/util/compile.py b/util/compile.py
index 790ebfc7..7510aa9b 100755
--- a/util/compile.py
+++ b/util/compile.py
@@ -18,80 +18,114 @@
 
 
 import codecs
+import json
 import optparse
 import os.path
 import re
 
 
 PARSED_TAGS = {
-    'P',       # common word
-    'adj',     # former adjective classification (being removed)
-    'adj-f',   # noun or verb acting prenominally (other than the above)
-    'adj-i',   # adjective (keiyoushi)
-    'adj-na',  # adjectival nouns or quasi-adjectives (keiyodoshi)
-    'adj-no',  # nouns which may take the genitive case particle `no'
-    'adj-pn',  # pre-noun adjectival (rentaishi)
-    'adj-t',   # `taru' adjective
-    'adv',     # adverb (fukushi)
-    'adv-n',   # adverbial noun
-    'adv-to',  # adverb taking the `to' particle
-    'aux',     # auxiliary
-    'aux-adj', # auxiliary adjective
-    'aux-v',   # auxiliary verb
-    'c',       # company name
-    'conj',    # conjunction
-    'ctr',     # counter
-    'exp',     # Expressions (phrases, clauses, etc.)
-    'f',       # female given name
-    'g',       # given name, as-yet not classified by sex
-    'h',       # full (usually family plus given) name of a particular person
-    'int',     # interjection (kandoushi)
-    'iv',      # irregular verb
-    'm',       # male given name
-    'n',       # noun (common) (futsuumeishi)
-    'n-adv',   # adverbial noun (fukushitekimeishi)
-    'n-pref',  # noun, used as a prefix
-    'n-suf',   # noun, used as a suffix
-    'n-t',     # noun (temporal) (jisoumeishi)
-    'num',     # numeric
-    'p',       # place-name
-    'pn',      # pronoun
-    'pr',      # product name
-    'pref' ,   # prefix
-    'prt',     # particle
-    's',       # surname
-    'st',      # stations
-    'suf',     # suffix
-    'u',       # person name, either given or surname, as-yet unclassified
-    'v1',      # Ichidan verb
-    'v2a-s',   # Nidan verb with 'u' ending (archaic)
-    'v4h',     # Yodan verb with `hu/fu' ending (archaic)
-    'v4r',     # Yodan verb with `ru' ending (archaic)
-    'v5',      # Godan verb (not completely classified)
-    'v5aru',   # Godan verb - -aru special class
-    'v5b',     # Godan verb with `bu' ending
-    'v5g',     # Godan verb with `gu' ending
-    'v5k',     # Godan verb with `ku' ending
-    'v5k-s',   # Godan verb - iku/yuku special class
-    'v5m',     # Godan verb with `mu' ending
-    'v5n',     # Godan verb with `nu' ending
-    'v5r',     # Godan verb with `ru' ending
-    'v5r-i',   # Godan verb with `ru' ending (irregular verb)
-    'v5s',     # Godan verb with `su' ending
-    'v5t',     # Godan verb with `tsu' ending
-    'v5u',     # Godan verb with `u' ending
-    'v5u-s',   # Godan verb with `u' ending (special class)
-    'v5uru',   # Godan verb - uru old class verb (old form of Eru)
-    'v5z',     # Godan verb with `zu' ending
-    'vi',      # intransitive verb
-    'vk',      # kuru verb - special class
-    'vn',      # irregular nu verb
-    'vs',      # noun or participle which takes the aux. verb suru
-    'vs-c',    # su verb - precursor to the modern suru
-    'vs-i',    # suru verb - irregular
-    'vs-s',    # suru verb - special class
-    'vt',      # transitive verb
-    'vz',      # Ichidan verb - zuru verb - (alternative form of -jiru verbs)
+    'Buddh':   'Buddhist term',
+    'MA':      'martial arts term',
+    'X':       'rude or X-rated term',
+    'abbr':    'abbreviation',
+    'adj':     'former adjective classification (being removed)',
+    'adj-f':   'noun or verb acting prenominally (other than the above)',
+    'adj-i':   'adjective (keiyoushi)',
+    'adj-na':  'adjectival nouns or quasi-adjectives (keiyodoshi)',
+    'adj-no':  'nouns which may take the genitive case particle "no"',
+    'adj-pn':  'pre-noun adjectival (rentaishi)',
+    'adj-t':   '"taru" adjective',
+    'adv':     'adverb (fukushi)',
+    'adv-n':   'adverbial noun',
+    'adv-to':  'adverb taking the "to" particle',
+    'arch':    'archaism',
+    'ateji':   'ateji (phonetic) reading',
+    'aux':     'auxiliary',
+    'aux-adj': 'auxiliary adjective',
+    'aux-v':   'auxiliary verb',
+    'chn':     'children\'s language',
+    'col':     'colloquialism',
+    'comp':    'computer terminology',
+    'conj':    'conjunction',
+    'ctr':     'counter',
+    'derog':   'derogatory term',
+    'eK':      'exclusively kanji',
+    'ek':      'exclusively kana',
+    'exp':     'Expressions (phrases, clauses, etc.)',
+    'fam':     'familiar language',
+    'fem':     'female term or language',
+    'food':    'food term',
+    'geom':    'geometry term',
+    'gikun':   'gikun (meaning) reading',
+    'gram':    'grammatical term',
+    'hon':     'honorific or respectful (sonkeigo) language',
+    'hum':     'humble (kenjougo) language',
+    'iK':      'word containing irregular kanji usage',
+    'id':      'idiomatic expression',
+    'ik':      'word containing irregular kana usage',
+    'int':     'interjection (kandoushi)',
+    'io':      'irregular okurigana usage',
+    'iv':      'irregular verb',
+    'ling':    'linguistics terminology',
+    'm-sl':    'manga slang',
+    'male':    'male term or language',
+    'male-sl': 'male slang',
+    'math':    'mathematics',
+    'mil':     'military',
+    'n':       'noun (common) (futsuumeishi)',
+    'n-adv':   'adverbial noun (fukushitekimeishi)',
+    'n-pref':  'noun, used as a prefix',
+    'n-suf':   'noun, used as a suffix',
+    'n-t':     'noun (temporal) (jisoumeishi)',
+    'num':     'numeric',
+    'oK':      'word containing out-dated kanji',
+    'obs':     'obsolete term',
+    'obsc':    'obscure term',
+    'ok':      'out-dated or obsolete kana usage',
+    'on-mim':  'onomatopoeic or mimetic word',
+    'physics': 'physics terminology',
+    'pn':      'pronoun',
+    'poet':    'poetical term',
+    'pol':     'polite (teineigo) language',
+    'pref':    'prefix',
+    'prt':     'particle',
+    'rare':    'rare (now replaced by "obsc")',
+    'sens':    'sensitive word',
+    'sl':      'slang',
+    'suf':     'suffix',
+    'uK':      'word usually written using kanji alone',
+    'uk':      'word usually written using kana alone',
+    'v1':      'Ichidan verb',
+    'v2a-s':   'Nidan verb with "u" ending (archaic)',
+    'v4h':     'Yodan verb with "hu/fu" ending (archaic)',
+    'v4r':     'Yodan verb with "ru" ending (archaic)',
+    'v5':      'Godan verb (not completely classified)',
+    'v5aru':   'Godan verb - -aru special class',
+    'v5b':     'Godan verb with "bu" ending',
+    'v5g':     'Godan verb with "gu" ending',
+    'v5k':     'Godan verb with "ku" ending',
+    'v5k-s':   'Godan verb - iku/yuku special class',
+    'v5m':     'Godan verb with "mu" ending',
+    'v5n':     'Godan verb with "nu" ending',
+    'v5r':     'Godan verb with "ru" ending',
+    'v5r-i':   'Godan verb with "ru" ending (irregular verb)',
+    'v5s':     'Godan verb with "su" ending',
+    'v5t':     'Godan verb with "tsu" ending',
+    'v5u':     'Godan verb with "u" ending',
+    'v5u-s':   'Godan verb with "u" ending (special class)',
+    'v5uru':   'Godan verb - uru old class verb (old form of Eru)',
+    'v5z':     'Godan verb with "zu" ending',
+    'vi':      'intransitive verb',
+    'vk':      'kuru verb - special class',
+    'vn':      'irregular nu verb',
+    'vs':      'noun or participle which takes the aux. verb suru',
+    'vs-c':    'su verb - precursor to the modern suru',
+    'vs-i':    'suru verb - irregular',
+    'vs-s':    'suru verb - special class',
+    'vt':      'transitive verb',
+    'vulg':    'vulgar expression or word',
+    'vz':      'Ichidan verb - zuru verb - (alternative form of -jiru verbs)',
 }
 
 
@@ -128,32 +162,49 @@ def parse_edict(path):
     for line in load_definitions(path):
         segments = line.split('/')
 
-        expression = segments[0].split(' ')
-        term = expression[0]
-        match = re.search('\[([^\]]+)\]', expression[1])
-        reading = '' if match is None else match.group(1)
-
-        glossary = '; '.join(filter(lambda x: len(x) > 0, segments[1:]))
-        glossary = re.sub('\(\d+\)\s*', '', glossary)
+        exp_parts = segments[0].split(' ')
+        expression = exp_parts[0]
+        reading_match = re.search('\[([^\]]+)\]', exp_parts[1])
+        reading = None if reading_match is None else reading_match.group(1)
 
+        defs = []
         tags = []
-        for group in re.findall('\(([^\)\]]+)\)', glossary):
-            tags.extend(group.split(','))
 
-        tags = set(tags).intersection(PARSED_TAGS)
-        tags = ' '.join(tags)
+        for index, dfn in enumerate(filter(None, segments[1:])):
+            dfn_match = re.search(r'^((?:\((?:[\w\-\,\:]*)*\)\s+)*)(.*)$', dfn)
+            gloss = dfn_match.group(2)
+
+            if index == 0:
+                tags_raw = set(filter(None, re.split(r'[\s\(\),]', dfn_match.group(1))))
+                tags = tags_raw.intersection(set(PARSED_TAGS.keys()))
+
+            if index == 0 or len(dfn_match.group(1)) > 0:
+                defs.append([gloss])
+            else:
+                defs[-1].append(gloss)
+
+        result = [expression, reading, ' '.join(tags)]
+        result += map(lambda x: '; '.join(x), defs)
+
+        results.append(result)
 
-        results.append((term, reading, glossary, tags))
+    indices = {}
+    for i, d in enumerate(results):
+        for key in d[:2]:
+            if key is not None:
+                values = indices.get(key, [])
+                values.append(i)
+                indices[key] = values
 
-    return results[1:]
+    return {'defs': results, 'indices': indices}
 
 
 def build_dict(output_dir, input_file, parser):
     if input_file is not None:
         base = os.path.splitext(os.path.basename(input_file))[0]
-        with codecs.open(os.path.join(output_dir, base) + '.csv', 'wb', encoding='utf-8') as fp:
-            for d in parser(input_file):
-                fp.write('\t'.join(d) + '\n')
+        with open(os.path.join(output_dir, base) + '.json', 'w') as fp:
+             # json.dump(parser(input_file), fp, sort_keys=True, indent=4, separators=(',', ': '))
+             json.dump(parser(input_file), fp, separators=(',', ':'))
 
 
 def build(dict_dir, kanjidic, edict, enamdict):
author	Alex Yatskov <alex@foosoft.net>	2016-04-12 20:17:40 -0700
committer	Alex Yatskov <alex@foosoft.net>	2016-04-12 20:17:40 -0700
commit	cfab4c31eca220ecbab1096b8d11ba7d0d45ed26 (patch)
tree	97c9ac2acc45f05770036f0729b713f79ab9e44e /util
parent	bf7c476a72753cd3449a092fc784eb2b63bd378c (diff)