From c998e1c0477d51c886f9e4246e102dec4d7ef8dd Mon Sep 17 00:00:00 2001
From: lonkaars <loek@pipeframe.xyz>
Date: Thu, 29 Jun 2023 11:33:23 +0200
Subject: add jmdict importer to repo

---
 db/dict/template.sql     | 113 +++++++++++++++++++++++++++++++++++++++++++++++
 db/dict/template.sql.m4  | 104 -------------------------------------------
 db/makefile              |  14 ++++--
 import/jmdict/.gitignore |   4 ++
 import/jmdict/jmdict.ts  |  64 +++++++++++++++++++++++++++
 import/jmdict/makefile   |  13 ++++++
 import/readme.md         |  17 +++++++
 import/util.ts           |   0
 language/tags.ts         |  16 ++++++-
 util/string.ts           |  13 +-----
 10 files changed, 238 insertions(+), 120 deletions(-)
 create mode 100644 db/dict/template.sql
 delete mode 100644 db/dict/template.sql.m4
 create mode 100644 import/jmdict/.gitignore
 create mode 100644 import/jmdict/jmdict.ts
 create mode 100644 import/jmdict/makefile
 create mode 100644 import/readme.md
 create mode 100644 import/util.ts

diff --git a/db/dict/template.sql b/db/dict/template.sql
new file mode 100644
index 0000000..1a07252
--- /dev/null
+++ b/db/dict/template.sql
@@ -0,0 +1,113 @@
+-- create temporary ingest table (filled by the dictionary content pasted in below)
+drop table if exists ingest;
+-- TODO: ingest pitch-accent dictionaries
+-- TODO: ingest alternate writings (space-separated)
+create temporary table ingest(
+	-- term fields
+	expression text not null, -- kanji of term (e.g. 読み込む)
+	reading text not null, -- reading of term (e.g. よみこむ)
+	term_tags text not null default '', -- space-separated *term* tags, merged if term already exists in DB
+
+	-- definition fields
+	glossary_sort int not null default 1, -- order of multiple meanings (glossaries)
+	glossary text null default null, -- glossary content (support for basic HTML markup/styling)
+	glossary_tags text null default null -- space-separated tags for this single glossary entry
+);
+
+-- #DICTIONARY_CONTENT_BEGIN
+-- this template is 'rendered' by pasting a .dict.sql file in between these
+-- DICTIONARY_CONTENT markers. the makefile can render these using the
+-- following m4 code (called using m4 -P template.sql < any.dict.sql):
+--
+-- m4_undivert(`/dev/stdin')
+--
+-- this breaks when the first line of the input file is not a comment or empty
+-- line, so the makefile accounts for this by concatenating an empty line with
+-- the dict first. the runtime typescript dictionary importer handles this by
+-- not calling m4 for this.
+-- #DICTIONARY_CONTENT_END
+
+-- create dict id (:dict and :lang are parameters that have to be set by the dictionary content)
+insert into dict (tag, language) values ('dict:' || :dict, :lang);
+
+-- add terms (NOTE(review): ingest holds one row per glossary, so a term with several glossaries is selected more than once -- confirm that term deduplicates)
+insert into term (expression, reading)
+select expression, reading
+from ingest;
+
+-- add definitions, each linked to its term and to the dict created above
+insert into definition (term_id, sort, glossary, dict_id)
+select
+	term.id,
+	ingest.glossary_sort,
+	ingest.glossary,
+	(select id from dict where tag = 'dict:' || :dict) -- id of the dict row inserted above
+from ingest
+join term on term.expression = ingest.expression and term.reading = ingest.reading;
+
+-- create map of term_id and tag code (one row per tag in ingest.term_tags)
+drop table if exists term_tag_map;
+create temporary table term_tag_map (term_id, tag);
+insert into term_tag_map
+with tag_map(term_id, temp, tag) as ( -- temp: tags still to be split off, tag: tag split off last
+	select -- start rows: all tags of an ingest row in temp, no tag split off yet
+		(select id from term where expression is ingest.expression and reading is ingest.reading),
+		term_tags || ' ', -- trailing space so every tag is followed by a space
+		''
+	from ingest
+	union
+	select -- following rows: move the first tag of temp into tag
+		term_id,
+		substr(temp, instr(temp, ' ') + 1), -- everything after the first space
+		substr(temp, 0, instr(temp, ' ')) -- text before the first space
+	from tag_map
+	where length(temp) > 1 -- stop when temp holds no further tag
+)
+select term_id, replace(tag, ' ', '')
+from tag_map
+where length(tag) > 0; -- drop the start rows and empty tags
+
+-- create map of definition_id and tag code (one row per tag in ingest.glossary_tags)
+drop table if exists definition_tag_map;
+create temporary table definition_tag_map (definition_id, tag);
+insert into definition_tag_map
+with tag_map(definition_id, temp, tag) as ( -- temp: tags still to be split off, tag: tag split off last
+	select -- start rows: all tags of an ingest row in temp, no tag split off yet
+		(select id from definition where glossary is ingest.glossary), -- NOTE(review): looked up by glossary text only, so equal glossaries of other terms or dicts may match -- confirm
+		glossary_tags || ' ', -- trailing space so every tag is followed by a space
+		''
+	from ingest
+	union
+	select -- following rows: move the first tag of temp into tag
+		definition_id,
+		substr(temp, instr(temp, ' ') + 1), -- everything after the first space
+		substr(temp, 0, instr(temp, ' ')) -- text before the first space
+	from tag_map
+	where length(temp) > 1 -- stop when temp holds no further tag
+)
+select definition_id, replace(tag, ' ', '')
+from tag_map
+where length(tag) > 0; -- drop the start rows and empty tags
+
+-- make sure tags exist (the union yields every tag code of this import once)
+insert into tag (code) -- NOTE(review): presumably codes already present in tag are skipped by a constraint on that table -- confirm
+select tag from term_tag_map
+union
+select tag from definition_tag_map;
+
+-- add tags to terms (tag codes are resolved to tag ids)
+insert into term_tag (term_id, tag_id)
+select
+	term_id,
+	tag.id
+from term_tag_map
+join tag on tag.code = term_tag_map.tag;
+
+-- add tags to definitions (tag codes are resolved to tag ids)
+insert into definition_tag (definition_id, tag_id)
+select
+	definition_id,
+	tag.id
+from definition_tag_map
+join tag on tag.code = definition_tag_map.tag;
+
diff --git a/db/dict/template.sql.m4 b/db/dict/template.sql.m4
deleted file mode 100644
index 00de413..0000000
--- a/db/dict/template.sql.m4
+++ /dev/null
@@ -1,104 +0,0 @@
--- create temporary ingest table
-drop table if exists ingest;
--- TODO: ingest pitch-accent dictionaries
--- TODO: ingest alternate writings (space-separated)
-create temporary table ingest(
-	-- term fields
-	expression text not null, -- kanji of term (e.g. 読み込む)
-	reading text not null, -- reading of term (e.g. よみこむ)
-	term_tags text not null default '', -- space-separated *term* tags, merged if term already exists in DB
-
-	-- definition fields
-	glossary_sort int not null default 1, -- order of multiple meanings (glossaries)
-	glossary text null default null, -- glossary content (support for basic HTML markup/styling)
-	glossary_tags text null default null -- add tags to single glossary entry
-);
-
-include(`/dev/stdin')dnl --'
--- the apostrophe is so my editor highlighting keeps working if I force the
--- filetype to sql instead of m4
-
--- create dict id
-insert into dict (tag, language) values ('dict:' || :dict, :lang);
-
--- add terms
-insert into term (expression, reading)
-select expression, reading
-from ingest;
-
--- add definitions
-insert into definition (term_id, sort, glossary, dict_id)
-select
-	term.id,
-	ingest.glossary_sort,
-	ingest.glossary,
-	(select id from dict where tag = 'dict:' || :dict)
-from ingest
-join term on term.expression = ingest.expression and term.reading = ingest.reading;
-
--- create map of term_id and tag code
-drop table if exists term_tag_map;
-create temporary table term_tag_map (term_id, tag);
-insert into term_tag_map
-with tag_map(term_id, temp, tag) as (
-	select
-		(select id from term where expression is ingest.expression and reading is ingest.reading),
-		term_tags || ' ',
-		''
-	from ingest
-	union
-	select
-		term_id,
-		`substr'(temp, instr(temp, ' ') + 1),
-		`substr'(temp, 0, instr(temp, ' '))
-	from tag_map
-	where length(temp) > 1
-)
-select term_id, replace(tag, ' ', '')
-from tag_map
-where length(tag) > 0;
-
--- create map of definition_id and tag code
-drop table if exists definition_tag_map;
-create temporary table definition_tag_map (definition_id, tag);
-insert into definition_tag_map
-with tag_map(definition_id, temp, tag) as (
-	select
-		(select id from definition where glossary is ingest.glossary),
-		glossary_tags || ' ',
-		''
-	from ingest
-	union
-	select
-		definition_id,
-		`substr'(temp, instr(temp, ' ') + 1),
-		`substr'(temp, 0, instr(temp, ' '))
-	from tag_map
-	where length(temp) > 1
-)
-select definition_id, replace(tag, ' ', '')
-from tag_map
-where length(tag) > 0;
-
--- make sure tags exist
-insert into tag (code)
-select tag from term_tag_map
-union
-select tag from definition_tag_map;
-
--- add tags to terms
-insert into term_tag (term_id, tag_id)
-select
-	term_id,
-	tag.id
-from term_tag_map
-join tag on tag.code = term_tag_map.tag;
-
--- add tags to definitions
-insert into definition_tag (definition_id, tag_id)
-select
-	definition_id,
-	tag.id
-from definition_tag_map
-join tag on tag.code = definition_tag_map.tag;
-
diff --git a/db/makefile b/db/makefile
index c1e527e..88d4bba 100644
--- a/db/makefile
+++ b/db/makefile
@@ -2,7 +2,13 @@ SQL = sqlite3
 DICT_DB = dict.db
 USER_DB = user.db
 
-DICT_TEMPLATE = dict/template.sql.m4
+DICT_TEMPLATE = dict/template.sql
+
+# comment out any of these lines to exclude that dictionary from the default DB
+DEFAULT_DICTS += dict/test_a.sql
+DEFAULT_DICTS += dict/test_b.sql
+DEFAULT_DICTS += dict/test_pitch_accent.sql
+DEFAULT_DICTS += ../import/jmdict/jmdict.sql
 
 .PHONY: clean test
 
@@ -20,7 +26,7 @@ dict/base.sql: dict/reset.sql dict/init.sql dict/deinflections.sql dict/tags.sql
 dict/full.sql: dict/base.sql dict/dict.sql
 	cat $^ > $@
 
-dict/dict.sql: dict/test_a.sql dict/test_b.sql dict/test_pitch_accent.sql dict/jmdict.sql
+dict/dict.sql: $(DEFAULT_DICTS)
 	cat $^ > $@
 
 user/base.sql: user/reset.sql user/init.sql
@@ -30,11 +36,11 @@ user/full.sql: user/base.sql user/root.sql
 	cat $^ > $@
 
 %.sql: %.dict.sql $(DICT_TEMPLATE)
-	m4 $(DICT_TEMPLATE) < $< > $@
+	echo "" | cat - $< | m4 -P $(DICT_TEMPLATE) > $@
 
 # delete generated sql files and database
 clean:
-	$(RM) $(DICT_DB) $(USER_DB) dict/base.sql dict/full.sql dict/dict.sql dict/test_a.sql dict/test_b.sql dict/test_pitch_accent.sql user/base.sql user/full.sql
+	$(RM) $(DICT_DB) $(USER_DB) dict/base.sql dict/full.sql dict/dict.sql $(DEFAULT_DICTS) user/base.sql user/full.sql
 
 test: $(DICT_DB) find.sql
 	./test/find '浮上しました'
diff --git a/import/jmdict/.gitignore b/import/jmdict/.gitignore
new file mode 100644
index 0000000..d4d466d
--- /dev/null
+++ b/import/jmdict/.gitignore
@@ -0,0 +1,4 @@
+jmdict.dict.sql
+jmdict.sql
+jmdict.json
+jmdict.zip
diff --git a/import/jmdict/jmdict.ts b/import/jmdict/jmdict.ts
new file mode 100644
index 0000000..1f391e5
--- /dev/null
+++ b/import/jmdict/jmdict.ts
@@ -0,0 +1,64 @@
+import type { JMdict } from "npm:@scriptin/jmdict-simplified-types";
+
+// converts a jmdict-simplified JSON file (stdin) into a .dict.sql file
+// (stdout) that fills the ingest table of the dictionary template. this
+// script doesn't transfer all information present in the dictionary yet.
+
+const LANG = "eng";
+
+// no simple way to do this on non-unix using Deno.stdin
+const input = await Deno.readFile("/dev/stdin");
+
+const jmdict = JSON.parse(new TextDecoder().decode(input)) as JMdict;
+
+// TODO: more tags
+const tagLookup = {
+	["misc/uk"]: "aux:uk",
+	["class/adv"]: "class:adverb",
+	["class/vs"]: "class:verb:suru",
+	["class/v1"]: "class:verb:ru",
+	["class/v5"]: "class:verb:u",
+	["class/n"]: "class:noun",
+	["class/suf"]: "class:suffix",
+	["class/prt"]: "class:part",
+	["class/exp"]: "class:expr",
+} as { [map: string]: string };
+
+/** @summary quote value as an SQL string literal (single quotes are doubled) */
+const sqlString = (value: string) => `'${value.replaceAll("'", "''")}'`;
+
+// rows are buffered as [tuple, trailing comment] so the , or ; terminator
+// doesn't depend on the last word of the dictionary having a definition in LANG
+const rows: [string, string][] = [];
+
+for (const term of jmdict.words) {
+	// TODO: properly resolve appliesToKanji/appliesToKana
+	const definitions = term.sense
+		.filter(s => s.gloss[0]?.lang === LANG)
+		.map(s => s.gloss.map(g => g.text).join(", "));
+	if (definitions.length === 0) continue;
+	const reading = term.kana[0].text;
+	// words without kanji use their kana writings as expression
+	const writings = term.kanji.length > 0 ? term.kanji : term.kana;
+	const writing = writings[0].text;
+	const otherWritings = writings.filter(e => e.text !== writing).map(e => e.text);
+	// tags of all senses, deduplicated, then mapped to the known tag codes
+	const tags = [...new Set([
+		...term.sense.flatMap(s => s.field).map(i => `field/${i}`),
+		...term.sense.flatMap(s => s.dialect).map(i => `dialect/${i}`),
+		...term.sense.flatMap(s => s.misc).map(i => `misc/${i}`),
+		...term.sense.flatMap(s => s.partOfSpeech).map(i => `class/${i}`),
+	])].filter(i => i in tagLookup).map(i => tagLookup[i]);
+	definitions.forEach((definition, j) => {
+		const fields = [writing, reading, tags.join(" ")].map(sqlString).join(", ");
+		const alts = j === 0 && otherWritings.length > 0 ? ` -- TODO: alts: ${otherWritings.join(", ")}` : "";
+		rows.push([`(${fields}, ${j + 1}, ${sqlString(definition)})`, alts]);
+	});
+}
+
+console.log(`.param set :dict 'jmdict_${LANG}'`);
+console.log(".param set :lang 'en'");
+// TODO: separate term and glossary tags (the insert is skipped when there are no rows)
+if (rows.length > 0) console.log("insert into ingest(expression, reading, term_tags, glossary_sort, glossary) values");
+rows.forEach(([tuple, alts], i) => console.log(`\t${tuple}${i === rows.length - 1 ? ";" : ","}${alts}`));
+
diff --git a/import/jmdict/makefile b/import/jmdict/makefile
new file mode 100644
index 0000000..d19c9af
--- /dev/null
+++ b/import/jmdict/makefile
@@ -0,0 +1,13 @@
+CURL = curl
+UNZIP = unzip
+
+.PHONY: all
+all: jmdict.dict.sql
+.DELETE_ON_ERROR: # remove half-written targets when a recipe fails
+
+jmdict.zip: # -f: fail on HTTP errors instead of saving the error page, -S: still print errors
+	$(CURL) -LsSf 'https://github.com/scriptin/jmdict-simplified/releases/download/3.5.0%2B20230619121907/jmdict-all-3.5.0+20230619121907.json.zip' > $@
+jmdict.json: jmdict.zip
+	$(UNZIP) -p $< > $@
+jmdict.dict.sql: jmdict.json jmdict.ts
+	deno run -A --unstable ./jmdict.ts < $< > $@
diff --git a/import/readme.md b/import/readme.md
new file mode 100644
index 0000000..48adf4b
--- /dev/null
+++ b/import/readme.md
@@ -0,0 +1,17 @@
+# Dictionary imports
+
+This folder contains import scripts for various dictionaries. Publicly
+available dictionaries are downloaded automatically; others have to be
+provided by the user. **None of the dictionary files will be hosted in this
+repository. All dictionaries keep their original license.**
+
+To generate a Yomikun dictionary, run `make` in one of these subdirectories.
+This command will output one or more .dict.sql files, which can be imported
+manually, or included in the default dictionary database file by editing [the
+dictionary makefile](../db/makefile).
+
+## JMdict
+
+[License](https://www.edrdg.org/edrdg/licence.html).
+
+
diff --git a/import/util.ts b/import/util.ts
new file mode 100644
index 0000000..e69de29
diff --git a/language/tags.ts b/language/tags.ts
index 4c1f134..d56ce98 100644
--- a/language/tags.ts
+++ b/language/tags.ts
@@ -5,7 +5,7 @@ export const Tag = {
 		/** @constant verb subgroup */
 		Verb: {
 			/** @constant any verb (fallback for vague dictionaries) */
-			Unspecified: "class:verb",
+			Unspecified: "class:verb", // TODO: deprecate this property and implement verb classifier in ../import/util.ts
 			/** @constant noun that can be conjugated into a verb by adding する */
 			Suru: "class:verb:suru",
 			/**
@@ -100,3 +100,17 @@ export type TokenTag = string; // no way around it
 
 export type TokenTags = Set<TokenTag>;
 
+/** @summary parse concatenated (space-separated) tag string to TokenTags; blank input gives an empty set */
+export function parseTags(input: string): TokenTags {
+	const filteredTags: TokenTag[] = [];
+	for (const tag of input.trim().split(/ +/)) {
+		if (tag.length == 0) continue; // "".split() yields [""], which is not a tag
+		// skip past tense tags after -te and -tari deinflection
+		if (tag == Tag.Inflection.Tense.Past &&
+				filteredTags.anyOf([Tag.Inflection.Suffix.Te, Tag.Inflection.Suffix.Tari])) continue;
+
+		filteredTags.push(tag);
+	}
+	return new Set(filteredTags);
+}
+
diff --git a/util/string.ts b/util/string.ts
index d94f5a3..e0cc5eb 100644
--- a/util/string.ts
+++ b/util/string.ts
@@ -1,4 +1,4 @@
-import { TokenTags, TokenTag, Tag } from "../language/tags.ts";
+import { TokenTags, parseTags } from "../language/tags.ts";
 import JapaneseString from "../language/japanese.ts";
 
 declare global {
@@ -60,15 +60,6 @@ String.prototype.jp = function() {
 
 /** @summary parse concatenated tag string to TokenTags */
 String.prototype.parseTags = function() {
-	var tags = this.replaceAll(/ +/g, " ").trim().split(" ") as TokenTag[];
-	var filteredTags: TokenTag[] = [];
-	for (var tag of tags) {
-		// skip past tense tags after -te and -tari deinflection
-		if (tag == Tag.Inflection.Tense.Past &&
-				filteredTags.anyOf([Tag.Inflection.Suffix.Te, Tag.Inflection.Suffix.Tari])) continue;
-
-		filteredTags.push(tag);
-	}
-	return new Set(filteredTags) as TokenTags;
+	return parseTags(this as string);
 }
 
-- 
cgit v1.2.3