diff options
author | Alex Yatskov <alex@foosoft.net> | 2016-04-13 19:41:43 -0700 |
---|---|---|
committer | Alex Yatskov <alex@foosoft.net> | 2016-04-13 19:41:43 -0700 |
commit | 52fca7c5a5179683067970aa59877d00175e2519 (patch) | |
tree | 889fa752577e8102a542a3d5820c1259a77e59b5 | |
parent | 18720aa150734ac1888ca560c6e840d4ce25d00e (diff) |
Fixing dictionary generation
-rwxr-xr-x | util/compile.py | 16 |
1 files changed, 10 insertions, 6 deletions
```diff
diff --git a/util/compile.py b/util/compile.py
index 8d8470dd..1ed299f7 100755
--- a/util/compile.py
+++ b/util/compile.py
@@ -89,6 +89,7 @@ PARSED_TAGS = {
     'obsc': 'obscure term',
     'ok': 'out-dated or obsolete kana usage',
     'on-mim': 'onomatopoeic or mimetic word',
+    'P': 'popular term',
     'p': 'place-name',
     'physics': 'physics terminology',
     'pn': 'pronoun',
@@ -177,15 +178,18 @@ def parse_edict(path):
         reading = None if reading_match is None else reading_match.group(1)
 
         defs = []
-        tags = []
+        tags = set()
 
         for index, dfn in enumerate(filter(None, segments[1:])):
-            dfn_match = re.search(r'^((?:\((?:[\w\-\,\:]*)*\)\s+)*)(.*)$', dfn)
-            gloss = dfn_match.group(2)
+            dfn_match = re.search(r'^((?:\((?:[\w\-\,\:]*)*\)\s*)*)(.*)$', dfn)
 
-            if index == 0:
-                tags_raw = set(filter(None, re.split(r'[\s\(\),]', dfn_match.group(1))))
-                tags = tags_raw.intersection(set(PARSED_TAGS.keys()))
+            gloss = dfn_match.group(2).strip()
+            if len(gloss) == 0:
+                continue
+
+            tags_raw = set(filter(None, re.split(r'[\s\(\),]', dfn_match.group(1))))
+            tags_raw = tags_raw.intersection(set(PARSED_TAGS.keys()))
+            tags = tags.union(tags_raw)
 
             if index == 0 or len(dfn_match.group(1)) > 0:
                 defs.append([gloss])
```