author     Alex Yatskov <alex@foosoft.net>    2016-03-20 11:43:28 -0700
committer  Alex Yatskov <alex@foosoft.net>    2016-03-20 11:43:28 -0700
commit     fbb763559425744e33d909c550994a36afe66b3e (patch)
tree       74004da11c676e5df732444c8c10403ebbd32708
parent     23327d7e357c358654ac302966819808d2bb2215 (diff)
Adding converted json dictionaries
-rw-r--r--  .gitattributes   |  2
-rwxr-xr-x  build_dict.sh    |  6
-rwxr-xr-x  util/compile.py  | 82
3 files changed, 34 insertions, 56 deletions
diff --git a/.gitattributes b/.gitattributes
index c4d068a1..297d349d 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,2 +1,2 @@
util/data/* filter=lfs diff=lfs merge=lfs -text
-ext/jp/data/rules.json filter=lfs diff=lfs merge=lfs -text
+ext/jp/data/* filter=lfs diff=lfs merge=lfs -text
diff --git a/build_dict.sh b/build_dict.sh
index 5dfad9b1..a13b4ed7 100755
--- a/build_dict.sh
+++ b/build_dict.sh
@@ -3,7 +3,7 @@
KANJIDIC=util/data/kanjidic
EDICT=util/data/edict
ENAMDICT=util/data/enamdict
-DICT=ext/jp/data/dict.json
+KRADFILE=util/data/kradfile
+DICT_DIR=ext/jp/data
-[ -f $DICT ] && rm $DICT
-util/compile.py --kanjidic $KANJIDIC --edict $EDICT $DICT --enamdict $ENAMDICT
+util/compile.py --kanjidic $KANJIDIC --kradfile $KRADFILE --edict $EDICT --enamdict $ENAMDICT $DICT_DIR
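Note on the new interface: compile.py no longer takes a single output file (the old dict.json, which was actually opened as a SQLite database), but an output directory, and writes one JSON file per input, named after the input's basename. A minimal sketch of that mapping, using the paths from build_dict.sh above; the resulting filenames are inferred from build_dict() in the compile.py diff below, not stated explicitly in the commit:

    import os.path

    DICT_DIR = 'ext/jp/data'
    INPUTS = ['util/data/kanjidic', 'util/data/kradfile',
              'util/data/edict', 'util/data/enamdict']

    # Mirrors the naming rule in build_dict(): <dict_dir>/<input basename>.json
    for path in INPUTS:
        base = os.path.splitext(os.path.basename(path))[0]
        print(os.path.join(DICT_DIR, base + '.json'))

Run against these inputs, this would list ext/jp/data/kanjidic.json, kradfile.json, edict.json and enamdict.json, with enamdict reusing the edict parser.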
diff --git a/util/compile.py b/util/compile.py
index 7b793ca6..0fd63236 100755
--- a/util/compile.py
+++ b/util/compile.py
@@ -1,7 +1,7 @@
#!/usr/bin/env python
-# -*- coding: utf-8 -*-
-# Copyright (C) 2013 Alex Yatskov
+# Copyright (C) 2016 Alex Yatskov <alex@foosoft.net>
+# Author: Alex Yatskov <alex@foosoft.net>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
@@ -18,11 +18,10 @@
import codecs
+import json
import optparse
-import os
+import os.path
import re
-import sqlite3
-import sys
PARSED_TAGS = {
@@ -97,44 +96,38 @@ PARSED_TAGS = {
}
-def isHiragana(c):
+def is_hiragana(c):
return 0x3040 <= ord(c) < 0x30a0
-def isKatakana(c):
+def is_katakana(c):
return 0x30a0 <= ord(c) < 0x3100
-def loadDefinitions(path):
- print 'Parsing "{0}"...'.format(path)
+def load_definitions(path):
+ print('Parsing "{0}"...'.format(path))
with codecs.open(path, encoding='euc-jp') as fp:
return filter(lambda x: x and x[0] != '#', fp.read().splitlines())
-def parseKanjiDic(path):
- results = list()
+def parse_kanji_dic(path):
+ results = []
- for line in loadDefinitions(path):
+ for line in load_definitions(path):
segments = line.split()
character = segments[0]
- kunyomi = ', '.join(filter(lambda x: filter(isHiragana, x), segments[1:]))
- onyomi = ', '.join(filter(lambda x: filter(isKatakana, x), segments[1:]))
+ kunyomi = ', '.join(filter(lambda x: filter(is_hiragana, x), segments[1:]))
+ onyomi = ', '.join(filter(lambda x: filter(is_katakana, x), segments[1:]))
glossary = '; '.join(re.findall('\{([^\}]+)\}', line))
results.append((character, kunyomi, onyomi, glossary))
return results
-def writeKanjiDic(cursor, values):
- cursor.execute('DROP TABLE IF EXISTS Kanji')
- cursor.execute('CREATE TABLE Kanji(character TEXT, kunyomi TEXT, onyomi TEXT, glossary TEXT)')
- cursor.executemany('INSERT INTO Kanji VALUES(?, ?, ?, ?)', values)
+def parse_krad_file(path):
+ results = []
-
-def parseKradFile(path):
- results = list()
-
- for line in loadDefinitions(path):
+ for line in load_definitions(path):
segments = line.split(' ')
character = segments[0]
radicals = ' '.join(segments[2:])
@@ -143,16 +136,10 @@ def parseKradFile(path):
return results
-def writeKradFile(cursor, values):
- cursor.execute('DROP TABLE IF EXISTS Radicals')
- cursor.execute('CREATE TABLE Radicals(character TEXT, radicals TEXT)')
- cursor.executemany('INSERT INTO Radicals VALUES(?, ?)', values)
-
+def parse_edict(path):
+ results = []
-def parseEdict(path):
- results = list()
-
- for line in loadDefinitions(path):
+ for line in load_definitions(path):
segments = line.split('/')
expression = segments[0].split(' ')
@@ -164,7 +151,7 @@ def parseEdict(path):
glossary = '; '.join(glossary)
glossary = re.sub('\(\d+\)\s*', str(), glossary)
- tags = list()
+ tags = []
for group in re.findall('\(([^\)\]]+)\)', glossary):
tags.extend(group.split(','))
@@ -176,27 +163,18 @@ def parseEdict(path):
return results
-def writeEdict(cursor, values):
- cursor.execute('DROP TABLE IF EXISTS Terms')
- cursor.execute('CREATE TABLE Terms(expression TEXT, reading TEXT, glossary TEXT, tags TEXT)')
- cursor.executemany('INSERT INTO Terms VALUES(?, ?, ?, ?)', values)
-
-
-def build(path, kanjidic, kradfile, edict, enamdict):
- with sqlite3.connect(path) as db:
- if kanjidic is not None:
- writeKanjiDic(db, parseKanjiDic(kanjidic))
+def build_dict(output_dir, input_file, parser):
+ if input_file is not None:
+ base = os.path.splitext(os.path.basename(input_file))[0]
+ with open(os.path.join(output_dir, base) + '.json', 'w') as fp:
+ json.dump(parser(input_file), fp)
- if kradfile is not None:
- writeKradFile(db, parseKradFile(kradfile))
- terms = []
- if edict is not None:
- terms += parseEdict(edict)
- if enamdict is not None:
- terms += parseEdict(enamdict)
- if len(terms) > 0:
- writeEdict(db, terms)
+def build(dict_dir, kanjidic, kradfile, edict, enamdict):
+ build_dict(dict_dir, kanjidic, parse_kanji_dic)
+ build_dict(dict_dir, kradfile, parse_krad_file)
+ build_dict(dict_dir, edict, parse_edict)
+ build_dict(dict_dir, enamdict, parse_edict)
def main():
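For completeness, a hedged sketch of consuming one of the converted dictionaries; this loader is not part of the commit. It assumes the record shape implied by parse_kanji_dic() above, i.e. each entry serializes to [character, kunyomi, onyomi, glossary], and that the filename follows the basename rule in build_dict():

    import json

    # Hypothetical consumer of the converted kanjidic dictionary.
    with open('ext/jp/data/kanjidic.json') as fp:
        entries = json.load(fp)

    # Each entry is expected to be [character, kunyomi, onyomi, glossary].
    for character, kunyomi, onyomi, glossary in entries[:10]:
        print('{0}: kun {1} / on {2} -- {3}'.format(character, kunyomi, onyomi, glossary))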