add options to bulk audio script

author: lonkaars <loek@pipeframe.xyz> 2023-10-08 14:15:43 +0200
committer: lonkaars <loek@pipeframe.xyz> 2023-10-08 14:15:43 +0200
commit: 7e0166543e946a7e7a553169a49420e769a53e14 (patch)
tree: ddc8cd5ead92b79c6bdb0852c417510fbe40caa0
parent: 855483401c04f4e741733736de7958b8d88db849 (diff)
3 files changed, 163 insertions, 49 deletions
diff --git a/bulk-audio/bulk-audio.py b/bulk-audio/bulk-audio.py
index 9d34cdc..1027b3e 100755
--- a/bulk-audio/bulk-audio.py
+++ b/bulk-audio/bulk-audio.py
@@ -1,13 +1,22 @@
 #!/bin/python3
-
 import sys
 import subprocess
 import hashlib
 import os
 import re
+import io
 import argparse
 from math import floor, log10
+from time import sleep
+
+real_stdout = sys.stdout
+class TrashFileIO(object):
+  def write(self, x): pass
+trash_out = TrashFileIO()
+
+sys.stdout = trash_out
 import aqt
+sys.stdout = real_stdout
 
 # this function only works for refold-tools sentence mining card template
 pattern = re.compile("^([^[、 【]+)[^【]*(【(.+)】)?")
@@ -30,17 +39,34 @@ def parse_args(argv):
     formatter_class=argparse.ArgumentDefaultsHelpFormatter,
   )
   parser.usage = f"{argv[0]} [options] [anki options]"
-  parser.add_argument("-n", "--note-type", help="note type to add audio to", default="Sentence mining")
+  parser.add_argument("-t", "--note-type", help="note type to add audio to", default="Sentence mining")
   parser.add_argument("-a", "--audio-field", help="field name to modify with audio", default="Audio")
   parser.add_argument("-f", "--filename-prefix", help="download filename prefix", default="refold-tools-")
+  parser.add_argument("-s", "--source-list", help="set source list (see `./get -h`)", default=None)
+  parser.add_argument("-O", "--force-override", help="force override audio field, even if it is not empty", action='store_true')
+  parser.add_argument("-C", "--clear-audio", help="CLEARS ALL AUDIO FIELDS REGARDLESS OF VALUE", action='store_true')
+  parser.add_argument("-n", "--noaudio", help="only modify notes that have \"noaudio\" as AUDIO_FIELD value", action='store_true')
+  parser.add_argument("-i", "--note-id", help="select specific note (specify multiple times to select multiple notes)", action='append', nargs=1, default=[])
+  parser.add_argument("-d", "--dry-run", help="print only, do not edit anything", action='store_true')
   return parser.parse_known_args(argv[1:])
 
 def main():
   options, args = parse_args(sys.argv)
   args.insert(0, sys.argv[0]) # restore first index of argv (QT crashes if argv[] is empty)
 
+  if options.clear_audio:
+    print("Safety delay of 3 seconds (are you sure you want to clear ALL audio fields?)...")
+    print("Press Ctrl+C to cancel")
+    sleep(3)
+
   # forward remaining CLI parameters to Anki
+  sys.stdout = trash_out
   app = aqt._run(args, False)
+  sys.stdout = real_stdout
+  if aqt.mw == None:
+    print("Please close any open Anki windows before running this script!")
+    exit(1)
+
   # load last open profile if no profile was specified on command line (option parsed by Anki)
   if not aqt.mw.pm.name:
     aqt.mw.pm.load(aqt.mw.pm.last_loaded_profile_name())
@@ -50,24 +76,38 @@ def main():
   model = col.models.by_name(options.note_type)
   note_ids = col.models.nids(model)
 
+  # filter list if note ids were specified
+  if len(options.note_id) > 0:
+    filtered_note_ids = [int(arg[0]) for arg in options.note_id]
+    note_ids = [nid for nid in note_ids if nid in filtered_note_ids]
+
+  # filter only "noaudio" cards
+  if options.noaudio:
+    note_ids = [nid for nid in note_ids if col.get_note(nid)[options.audio_field] == "noaudio"]
+
+  if len(note_ids) == 0:
+    print("-- No notes found! (check your filters or note type?) --")
+    exit(1)
+
   edited_notes = 0
 
+  note_index_format = ("{:0" + str(floor(log10(len(note_ids))) + 1) + "d}/{:d}")
   for note_index, note_id in enumerate(note_ids):
     note = col.get_note(note_id)
-    note_index_format = ("{:0" + str(floor(log10(len(note_ids))) + 1) + "d}/{:d}").format(note_index + 1, len(note_ids))
-    print(f"[nid:{note_id}] ({note_index_format}) ", end="")
+    print(f"[nid:{note_id}] ({note_index_format.format(note_index + 1, len(note_ids))}) ", end="")
 
-    # bulk clear audio field (dev only)
-    # note[options.audio_field] = ""
-    # note.flush()
-    # print(f"cleared \"{options.audio_field}\" field!")
-    # continue
+    if options.clear_audio:
+      if not options.dry_run:
+        note[options.audio_field] = ""
+        note.flush()
+      print(f"cleared \"{options.audio_field}\" field!")
+      continue
 
     # autosave deck every 20 cards
     if note_index % 20 == 0: col.save()
 
     # skip any notes that already have audio
-    if len(note[options.audio_field]) > 0:
+    if not options.force_override and len(note[options.audio_field]) > 0:
       print("skipped -> audio field not empty")
       continue
 
@@ -79,10 +119,12 @@ def main():
     print(f"{kanji} ({kana}) ", end="")
 
     # attempt to download audio
-    exit_code, data = get(kanji, kana)
+    exit_code, data = get(kanji, kana, options.source_list)
     if exit_code != 0:
-      note[options.audio_field] = "noaudio"
-      note.flush()
+      if not options.dry_run:
+        note[options.audio_field] = "noaudio"
+        note.flush()
+      edited_notes += 1
       print("skipped -> no recording available, marked as 'noaudio'")
       continue
 
@@ -90,14 +132,16 @@ def main():
     digest = hashlib.sha1(data).hexdigest()
     filename = f"{options.filename_prefix}{digest}.mp3"
     output_path = os.path.join(media_dir, filename)
-    with open(output_path, "wb+") as f:
-      f.write(data)
-      f.close()
+    if not options.dry_run:
+      with open(output_path, "wb+") as f:
+        f.write(data)
+        f.close()
 
     # set audio field to audio filename
     audio_str = f"[sound:{filename}]"
-    note[options.audio_field] = audio_str
-    note.flush()
+    if not options.dry_run:
+      note[options.audio_field] = audio_str
+      note.flush()
     print(f"written audio as {audio_str}")
     edited_notes += 1
 
@@ -107,12 +151,19 @@ def main():
     print("-- Done: no edits --")
   else:
     print(f"-- Done: succesfully edited {edited_notes} note{'' if edited_notes == 1 else 's'} --")
-  print("TODO: circumvent below error message (anki python api problems, notes were edited succesfully though):")
+  # circumvent "Exception ignored in atexit callbackException ignored in sys.unraisablehook"
+  sys.stdout = trash_out
 
 # run ./get to get audio data from stdout
 # returns (exit_code, stdout_data)
-def get(kanji, kana):
-  p = subprocess.run(["./get", kanji, kana], capture_output=True)
+def get(kanji, kana, source_list):
+  args = ["./get"]
+  if source_list != None:
+    args.append("-s")
+    args.append(source_list)
+  args.append(kanji)
+  args.append(kana)
+  p = subprocess.run(args, capture_output=True)
   if p.returncode != 0:
     return (1, None)
   return (0, p.stdout)
diff --git a/bulk-audio/get b/bulk-audio/get
index b7791aa..bc3839d 100755
--- a/bulk-audio/get
+++ b/bulk-audio/get
@@ -1,59 +1,91 @@
 #!/bin/sh
 
-KANJI="$1"
-KANA="$2"
+SCRIPT_NAME="$0"
+SOURCES="lp101,lp101_alt,jisho"
+KANJI=""
+KANA=""
 
-if [ -z "$KANJI" -o -z "$KANA" ]; then
-	cat << EOF
-usage: $0 <kanji> <kana> > <output>
-
-return value is 0 if <output> was succesfully written, 1 if the word could not
-be found. this script searches languagepod101, languagepod101 (alt) and
-jisho.org.
-EOF
-	exit 1
-fi
-
-get_languagepod101() {
+lp101() {
   URL="https://assets.languagepod101.com/dictionary/japanese/audiomp3.php?kanji=$KANJI&kana=$KANA"
   # 52288 is the content-length of the "the audio for this clip is currently
   # not available. it will be recorded and uploaded shortly. thank you for your
   # patience" message (404, but server sends 200 anyways)
   curl -X HEAD -iso - "$URL" | awk '/^Content-length:/ { exit $2 == 52288 }'
-	[ $? -ne 0 ] && return 1
+	[ $? -ne 0 ] && return
 	curl -so - "$URL"
+	exit 0
 }
 
-get_languagepod101_alt() {
+lp101_alt() {
 	HTML="$(curl -s -X POST \
 		"https://www.japanesepod101.com/learningcenter/reference/dictionary_post" \
 		-H "Content-Type: application/x-www-form-urlencoded" \
 		-d "post=dictionary_reference&match_type=exact&search_query=$KANJI&vulgar=true" \
 	)"
-	[ $? -ne 0 ] && return 1
+	[ $? -ne 0 ] && return
 	URL="$(echo "$HTML" | pup "audio source attr{src}" | head -n1)"
-	[ -z "$URL" ] && return 1
+	[ -z "$URL" ] && return
 	curl -so - "$URL"
+	exit 0
 }
 
-get_jisho() {
+jisho() {
 	HTML="$(curl -s "https://jisho.org/search/$KANJI")"
-	[ $? -ne 0 ] && return 1
+	[ $? -ne 0 ] && return
 	URL="$(echo "$HTML" | pup "audio[id=\"audio_$KANJI:$KANA\"] source attr{src}" | head -n1)"
-	[ -z "$URL" ] && return 1
+	[ -z "$URL" ] && return
 	URL="https:$URL"
 	curl -so - "$URL"
+	exit 0
+}
+
+usage() {
+	cat << EOF
+usage: $SCRIPT_NAME [OPTIONS] <KANJI> <KANA> > <OUTPUT>
+
+attempt to download a native Japanese recording of word written as KANJI and
+read as KANA. outputs mp3 to stdout. return value is 0 if OUTPUT was written
+(clip was found), and 1 if no clip could be found.
+
+options:
+	-s <source1[,source2,...]>    set source order (default: $SOURCES)
+	-h                            show help
+
+sources:
+	lp101        JapanesePod101
+	lp101_alt    JapanesePod101 (Alternate)
+	jisho        Jisho.org
+EOF
 }
 
-# get_languagepod101_alt
-# [ $? -eq 0 ] && exit 0
-# 
-# get_jisho
-# [ $? -eq 0 ] && exit 0
+while getopts 'hs:' OPT; do
+	case $OPT in
+		h)
+			usage
+			exit 0
+			;;
+		s)
+			SOURCES="$OPTARG"
+			;;
+		\?|*)
+			usage > /dev/stderr
+			exit 1
+			;;
+	esac
+done
 
-get_languagepod101
-[ $? -eq 0 ] && exit 0
+# invalid argument count
+if [ $(( $# - $OPTIND + 1 )) -ne 2 ]; then
+	usage > /dev/stderr
+	exit 1
+fi
+
+KANJI=${@:$OPTIND:1}
+KANA=${@:$OPTIND+1:1}
+
+$(printf '%s;' "$SOURCES" | sed -z 's/[;,\n]/ ; /g')
 
 # if none were succesful, delete output file and exit with error
 rm -f "$OUTPUT"
 exit 1
+
diff --git a/bulk-audio/readme.md b/bulk-audio/readme.md
new file mode 100644
index 0000000..06e7542
--- /dev/null
+++ b/bulk-audio/readme.md
@@ -0,0 +1,31 @@
+# Bulk audio adder
+
+This is a Python and POSIX shell script that downloads native speaker audio for
+words from the same sources as Yomichan:
+
+- JapanesePod101
+- JapanesePod101 (Alternate)
+- Jisho\.org
+
+No Python dependencies should have to be installed, as this script only relies
+on built-in Python libraries and the `aqt` library, which should get installed
+alongside Anki.
+
+The `./get` script (should) also work on Windows under Git Bash or MSYS2, but
+uses `pup` to parse HTML for the `lp101_alt` and `jisho` sources. Either
+install `pup`, or exclude these sources using the `-s` option.
+
+## Usage
+
+See `./bulk-audio.py -h` for all options. The default options add audio to
+notes with an empty Audio field. If a clip can't be found for the note, the
+Audio field will be set to "noaudio". This script can be customized to work
+with other note types, but works with [my custom Anki card
+template](../anki-card-template) by default.
+
+|command|action|
+|-|-|
+|`./bulk-audio.py`|Download audio for all notes with empty Audio field|
+|`./bulk-audio.py -nO`|Try to download audio again for notes with "noaudio" Audio field|
+|`./bulk-audio.py -C`|Clear all Audio fields|
+
author	lonkaars <loek@pipeframe.xyz>	2023-10-08 14:15:43 +0200
committer	lonkaars <loek@pipeframe.xyz>	2023-10-08 14:15:43 +0200
commit	7e0166543e946a7e7a553169a49420e769a53e14 (patch)
tree	ddc8cd5ead92b79c6bdb0852c417510fbe40caa0
parent	855483401c04f4e741733736de7958b8d88db849 (diff)