Moving large files to CSV format, deleting unused kradfile

author: Alex Yatskov <alex@foosoft.net> 2016-03-31 20:03:39 -0700
committer: Alex Yatskov <alex@foosoft.net> 2016-03-31 20:03:39 -0700
commit: 7eadff3457690074c5c0140a6e9ffd6164021176 (patch)
tree: ad8ba8c31cba11f54ca8cab186d1d36090e070c0
parent: b97e75ba32781341c221f549780f3444d0916714 (diff)
4 files changed, 52 insertions, 85 deletions
diff --git a/build_dict.sh b/build_dict.sh
index a13b4ed7..42eed600 100755
--- a/build_dict.sh
+++ b/build_dict.sh
@@ -3,7 +3,6 @@
 KANJIDIC=util/data/kanjidic
 EDICT=util/data/edict
 ENAMDICT=util/data/enamdict
-KRADFILE=util/data/kradfile
-DICT_DIR=ext/jp/data
+DICT_DIR=ext/bg/data
 
-util/compile.py --kanjidic $KANJIDIC --kradfile $KRADFILE --edict $EDICT --enamdict $ENAMDICT $DICT_DIR
+util/compile.py --kanjidic $KANJIDIC --edict $EDICT --enamdict $ENAMDICT $DICT_DIR
diff --git a/ext/bg/dictionary.js b/ext/bg/dictionary.js
index eff54890..30c34687 100644
--- a/ext/bg/dictionary.js
+++ b/ext/bg/dictionary.js
@@ -19,43 +19,33 @@
 
 class Dictionary {
     constructor() {
-        this.termDicts  = [];
-        this.kanjiDicts = [];
-    }
-
-    addTermDict(termDict) {
-        this.termDicts.push(termDict);
-    }
+        this.terms       = [];
+        this.termIndices = {};
 
-    addKanjiDict(kanjiDict) {
-        this.kanjiDicts.push(kanjiDict);
+        this.kanji        = [];
+        this.kanjiIndices = {};
     }
 
-
-    findTerm(term) {
-        let results = [];
-        for (let dict of this.termDicts) {
-            results = results.concat(this.findTermInDict(term, dict));
+    addTermDict(terms) {
+        let index = this.terms.length;
+        for (const [e, r, g, t] of terms) { // for...of yields the row tuples; for...in would yield key strings
+            this.storeIndex(this.termIndices, e, index);
+            this.storeIndex(this.termIndices, r, index++);
+            this.terms.push([e, r, g, t]);
         }
-
-        return results;
     }
 
-    findKanji(kanji) {
-        const results = [];
-        for (let dict of this.kanjiDicts) {
-            const result = this.findKanjiInDict(kanji, dict);
-            if (result !== null) {
-                results.push(result);
-            }
+    addKanjiDict(kanji) {
+        let index = this.kanji.length;
+        for (const [c, k, o, g] of kanji) { // for...of yields the row tuples; for...in would yield key strings
+            this.storeIndex(this.kanjiIndices, c, index++);
+            this.kanji.push([c, k, o, g]);
         }
-
-        return results;
     }
 
-    findTermInDict(term, dict) {
-        return (dict.indices[term] || []).map(index => {
-            const [e, r, g, t] = dict.defs[index];
+    findTerm(term) {
+        return (this.termIndices[term] || []).map(index => {
+            const [e, r, g, t] = this.terms[index];
             return {
                 id:         index,
                 expression: e,
@@ -66,19 +56,24 @@ class Dictionary {
         });
     }
 
-    findKanjiInDict(kanji, dict) {
-        const def = dict.defs[kanji];
-        if (def === null) {
-            return null;
-        }
+    findKanji(kanji) {
+        return (this.kanjiIndices[kanji] || []).map(index => {
+            const [c, k, o, g] = this.kanji[index]; // look the row up by its stored index
+            return {
+                id:        kanji.charCodeAt(0),
+                character: c,
+                kunyomi:   k,
+                onyomi:    o,
+                glossary:  g
+            };
+        });
+    }
 
-        const [c, k, o, g] = def;
-        return {
-            id:        kanji.charCodeAt(0),
-            character: c,
-            kunyomi:   k,
-            onyomi:    o,
-            glossary:  g
-        };
+    storeIndex(indices, term, index) {
+        if (term.length > 0) { // empty keys (e.g. a missing reading) are not indexed
+            const values = indices[term] || [];
+            values.push(index); // record the row position in the caller-supplied table
+            indices[term] = values;
+        }
     }
 }
diff --git a/ext/client.js b/ext/client.js
index 2d9a470f..1c8c0a9f 100644
--- a/ext/client.js
+++ b/ext/client.js
@@ -27,7 +27,9 @@ class Client {
         this.popup.classList.add('yomichan-popup');
         this.popup.addEventListener('mousedown', (e) => e.stopPropagation());
         this.popup.addEventListener('scroll', (e) => e.stopPropagation());
-        document.body.appendChild(this.popup);
+
+        const base = document.body.appendChild('div');
+        base.createShadowRoot().appendChild(this.popup);
 
         chrome.runtime.onMessage.addListener(this.onMessage.bind(this));
         window.addEventListener('mousedown', this.onMouseDown.bind(this));
diff --git a/util/compile.py b/util/compile.py
index 485537dc..790ebfc7 100755
--- a/util/compile.py
+++ b/util/compile.py
@@ -18,7 +18,6 @@
 
 
 import codecs
-import json
 import optparse
 import os.path
 import re
@@ -111,7 +110,7 @@ def load_definitions(path):
 
 
 def parse_kanji_dic(path):
-    results = {}
+    results = []
 
     for line in load_definitions(path):
         segments = line.split()
@@ -119,32 +118,20 @@ def parse_kanji_dic(path):
         kunyomi = ', '.join(filter(lambda x: filter(is_hiragana, x), segments[1:]))
         onyomi = ', '.join(filter(lambda x: filter(is_katakana, x), segments[1:]))
         glossary = '; '.join(re.findall('\{([^\}]+)\}', line))
-        results[character] = (kunyomi, onyomi, glossary)
-
-    return results
-
-
-def parse_krad_file(path):
-    results = {}
-
-    for line in load_definitions(path):
-        segments = line.split(' ')
-        character = segments[0]
-        radicals = ' '.join(segments[2:])
-        results[character] = radicals;
+        results.append((character, kunyomi, onyomi, glossary))
 
     return results
 
 
 def parse_edict(path):
-    defs = []
+    results = []
     for line in load_definitions(path):
         segments = line.split('/')
 
         expression = segments[0].split(' ')
         term = expression[0]
         match = re.search('\[([^\]]+)\]', expression[1])
-        reading = None if match is None else match.group(1)
+        reading = '' if match is None else match.group(1)
 
         glossary = '; '.join(filter(lambda x: len(x) > 0, segments[1:]))
         glossary = re.sub('\(\d+\)\s*', '', glossary)
@@ -156,30 +143,21 @@ def parse_edict(path):
         tags = set(tags).intersection(PARSED_TAGS)
         tags = ' '.join(tags)
 
-        defs.append((term, reading, glossary, tags))
-
-    indices = {}
-    for i, d in enumerate(defs):
-        for key in d[:2]:
-            if key is not None:
-                values = indices.get(key, [])
-                values.append(i)
-                indices[key] = values
+        results.append((term, reading, glossary, tags))
 
-    return {'defs': defs, 'indices': indices}
+    return results[1:]
 
 
 def build_dict(output_dir, input_file, parser):
     if input_file is not None:
         base = os.path.splitext(os.path.basename(input_file))[0]
-        with open(os.path.join(output_dir, base) + '.json', 'w') as fp:
-            # json.dump(parser(input_file), fp, sort_keys=True, indent=4, separators=(',', ': '))
-            json.dump(parser(input_file), fp)
+        with codecs.open(os.path.join(output_dir, base) + '.csv', 'wb', encoding='utf-8') as fp:
+            for d in parser(input_file):
+                fp.write('\t'.join(d) + '\n')
 
 
-def build(dict_dir, kanjidic, kradfile, edict, enamdict):
+def build(dict_dir, kanjidic, edict, enamdict):
     build_dict(dict_dir, kanjidic, parse_kanji_dic)
-    build_dict(dict_dir, kradfile, parse_krad_file)
     build_dict(dict_dir, edict, parse_edict)
     build_dict(dict_dir, enamdict, parse_edict)
 
@@ -187,7 +165,6 @@ def build(dict_dir, kanjidic, kradfile, edict, enamdict):
 def main():
     parser = optparse.OptionParser()
     parser.add_option('--kanjidic', dest='kanjidic')
-    parser.add_option('--kradfile', dest='kradfile')
     parser.add_option('--edict', dest='edict')
     parser.add_option('--enamdict', dest='enamdict')
 
@@ -196,13 +173,7 @@ def main():
     if len(args) == 0:
         parser.print_help()
     else:
-        build(
-            args[0],
-            options.kanjidic,
-            options.kradfile,
-            options.edict,
-            options.enamdict
-        )
+        build(args[0], options.kanjidic, options.edict, options.enamdict)
 
 
 if __name__ == '__main__':
author	Alex Yatskov <alex@foosoft.net>	2016-03-31 20:03:39 -0700
committer	Alex Yatskov <alex@foosoft.net>	2016-03-31 20:03:39 -0700
commit	7eadff3457690074c5c0140a6e9ffd6164021176 (patch)
tree	ad8ba8c31cba11f54ca8cab186d1d36090e070c0
parent	b97e75ba32781341c221f549780f3444d0916714 (diff)