diff options
-rwxr-xr-x | bulk-audio/bulk-audio.py | 93 | ||||
-rwxr-xr-x | bulk-audio/get | 88 | ||||
-rw-r--r-- | bulk-audio/readme.md | 31 |
3 files changed, 163 insertions, 49 deletions
diff --git a/bulk-audio/bulk-audio.py b/bulk-audio/bulk-audio.py index 9d34cdc..1027b3e 100755 --- a/bulk-audio/bulk-audio.py +++ b/bulk-audio/bulk-audio.py @@ -1,13 +1,22 @@ #!/bin/python3 - import sys import subprocess import hashlib import os import re +import io import argparse from math import floor, log10 +from time import sleep + +real_stdout = sys.stdout +class TrashFileIO(object): + def write(self, x): pass +trash_out = TrashFileIO() + +sys.stdout = trash_out import aqt +sys.stdout = real_stdout # this function only works for refold-tools sentence mining card template pattern = re.compile("^([^[、 【]+)[^【]*(【(.+)】)?") @@ -30,17 +39,34 @@ def parse_args(argv): formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.usage = f"{argv[0]} [options] [anki options]" - parser.add_argument("-n", "--note-type", help="note type to add audio to", default="Sentence mining") + parser.add_argument("-t", "--note-type", help="note type to add audio to", default="Sentence mining") parser.add_argument("-a", "--audio-field", help="field name to modify with audio", default="Audio") parser.add_argument("-f", "--filename-prefix", help="download filename prefix", default="refold-tools-") + parser.add_argument("-s", "--source-list", help="set source list (see `./get -h`)", default=None) + parser.add_argument("-O", "--force-override", help="force override audio field, even if it is not empty", action='store_true') + parser.add_argument("-C", "--clear-audio", help="CLEARS ALL AUDIO FIELDS REGARDLESS OF VALUE", action='store_true') + parser.add_argument("-n", "--noaudio", help="only modify notes that have \"noaudio\" as AUDIO_FIELD value", action='store_true') + parser.add_argument("-i", "--note-id", help="select specific note (specify multiple times to select multiple notes)", action='append', nargs=1, default=[]) + parser.add_argument("-d", "--dry-run", help="print only, do not edit anything", action='store_true') return parser.parse_known_args(argv[1:]) def main(): options, args = parse_args(sys.argv) args.insert(0, sys.argv[0]) # restore first index of argv (QT crashes if argv[] is empty) + if options.clear_audio: + print("Safety delay of 3 seconds (are you sure you want to clear ALL audio fields?)...") + print("Press Ctrl+C to cancel") + sleep(3) + # forward remaining CLI parameters to Anki + sys.stdout = trash_out app = aqt._run(args, False) + sys.stdout = real_stdout + if aqt.mw == None: + print("Please close any open Anki windows before running this script!") + exit(1) + # load last open profile if no profile was specified on command line (option parsed by Anki) if not aqt.mw.pm.name: aqt.mw.pm.load(aqt.mw.pm.last_loaded_profile_name()) @@ -50,24 +76,38 @@ def main(): model = col.models.by_name(options.note_type) note_ids = col.models.nids(model) + # filter list if note ids were specified + if len(options.note_id) > 0: + filtered_note_ids = [int(arg[0]) for arg in options.note_id] + note_ids = [nid for nid in note_ids if nid in filtered_note_ids] + + # filter only "noaudio" cards + if options.noaudio: + note_ids = [nid for nid in note_ids if col.get_note(nid)[options.audio_field] == "noaudio"] + + if len(note_ids) == 0: + print("-- No notes found! (check your filters or note type?) --") + exit(1) + edited_notes = 0 + note_index_format = ("{:0" + str(floor(log10(len(note_ids))) + 1) + "d}/{:d}") for note_index, note_id in enumerate(note_ids): note = col.get_note(note_id) - note_index_format = ("{:0" + str(floor(log10(len(note_ids))) + 1) + "d}/{:d}").format(note_index + 1, len(note_ids)) - print(f"[nid:{note_id}] ({note_index_format}) ", end="") + print(f"[nid:{note_id}] ({note_index_format.format(note_index + 1, len(note_ids))}) ", end="") - # bulk clear audio field (dev only) - # note[options.audio_field] = "" - # note.flush() - # print(f"cleared \"{options.audio_field}\" field!") - # continue + if options.clear_audio: + if not options.dry_run: + note[options.audio_field] = "" + note.flush() + print(f"cleared \"{options.audio_field}\" field!") + continue # autosave deck every 20 cards if note_index % 20 == 0: col.save() # skip any notes that already have audio - if len(note[options.audio_field]) > 0: + if not options.force_override and len(note[options.audio_field]) > 0: print("skipped -> audio field not empty") continue @@ -79,10 +119,12 @@ def main(): print(f"{kanji} ({kana}) ", end="") # attempt to download audio - exit_code, data = get(kanji, kana) + exit_code, data = get(kanji, kana, options.source_list) if exit_code != 0: - note[options.audio_field] = "noaudio" - note.flush() + if not options.dry_run: + note[options.audio_field] = "noaudio" + note.flush() + edited_notes += 1 print("skipped -> no recording available, marked as 'noaudio'") continue @@ -90,14 +132,16 @@ def main(): digest = hashlib.sha1(data).hexdigest() filename = f"{options.filename_prefix}{digest}.mp3" output_path = os.path.join(media_dir, filename) - with open(output_path, "wb+") as f: - f.write(data) - f.close() + if not options.dry_run: + with open(output_path, "wb+") as f: + f.write(data) + f.close() # set audio field to audio filename audio_str = f"[sound:{filename}]" - note[options.audio_field] = audio_str - note.flush() + if not options.dry_run: + note[options.audio_field] = audio_str + note.flush() print(f"written audio as {audio_str}") edited_notes += 1 @@ -107,12 +151,19 @@ def main(): print("-- Done: no edits --") else: print(f"-- Done: succesfully edited {edited_notes} note{'' if edited_notes == 1 else 's'} --") - print("TODO: circumvent below error message (anki python api problems, notes were edited succesfully though):") + # circumvent "Exception ignored in atexit callbackException ignored in sys.unraisablehook" + sys.stdout = trash_out # run ./get to get audio data from stdout # returns (exit_code, stdout_data) -def get(kanji, kana): - p = subprocess.run(["./get", kanji, kana], capture_output=True) +def get(kanji, kana, source_list): + args = ["./get"] + if source_list != None: + args.append("-s") + args.append(source_list) + args.append(kanji) + args.append(kana) + p = subprocess.run(args, capture_output=True) if p.returncode != 0: return (1, None) return (0, p.stdout) diff --git a/bulk-audio/get b/bulk-audio/get index b7791aa..bc3839d 100755 --- a/bulk-audio/get +++ b/bulk-audio/get @@ -1,59 +1,91 @@ #!/bin/sh -KANJI="$1" -KANA="$2" +SCRIPT_NAME="$0" +SOURCES="lp101,lp101_alt,jisho" +KANJI="" +KANA="" -if [ -z "$KANJI" -o -z "$KANA" ]; then - cat << EOF -usage: $0 <kanji> <kana> > <output> - -return value is 0 if <output> was succesfully written, 1 if the word could not -be found. this script searches languagepod101, languagepod101 (alt) and -jisho.org. -EOF - exit 1 -fi - -get_languagepod101() { +lp101() { URL="https://assets.languagepod101.com/dictionary/japanese/audiomp3.php?kanji=$KANJI&kana=$KANA" # 52288 is the content-length of the "the audio for this clip is currently # not available. it will be recorded and uploaded shortly. thank you for your # patience" message (404, but server sends 200 anyways) curl -X HEAD -iso - "$URL" | awk '/^Content-length:/ { exit $2 == 52288 }' - [ $? -ne 0 ] && return 1 + [ $? -ne 0 ] && return curl -so - "$URL" + exit 0 } -get_languagepod101_alt() { +lp101_alt() { HTML="$(curl -s -X POST \ "https://www.japanesepod101.com/learningcenter/reference/dictionary_post" \ -H "Content-Type: application/x-www-form-urlencoded" \ -d "post=dictionary_reference&match_type=exact&search_query=$KANJI&vulgar=true" \ )" - [ $? -ne 0 ] && return 1 + [ $? -ne 0 ] && return URL="$(echo "$HTML" | pup "audio source attr{src}" | head -n1)" - [ -z "$URL" ] && return 1 + [ -z "$URL" ] && return curl -so - "$URL" + exit 0 } -get_jisho() { +jisho() { HTML="$(curl -s "https://jisho.org/search/$KANJI")" - [ $? -ne 0 ] && return 1 + [ $? -ne 0 ] && return URL="$(echo "$HTML" | pup "audio[id=\"audio_$KANJI:$KANA\"] source attr{src}" | head -n1)" - [ -z "$URL" ] && return 1 + [ -z "$URL" ] && return URL="https:$URL" curl -so - "$URL" + exit 0 +} + +usage() { + cat << EOF +usage: $SCRIPT_NAME [OPTIONS] <KANJI> <KANA> > <OUTPUT> + +attempt to download a native Japanese recording of word written as KANJI and +read as KANA. outputs mp3 to stdout. return value is 0 if OUTPUT was written +(clip was found), and 1 if no clip could be found. + +options: + -s <source1[,source2,...]> set source order (default: $SOURCES) + -h show help + +sources: + lp101 JapanesePod101 + lp101_alt JapanesePod101 (Alternate) + jisho Jisho.org +EOF } -# get_languagepod101_alt -# [ $? -eq 0 ] && exit 0 -# -# get_jisho -# [ $? -eq 0 ] && exit 0 +while getopts 'hs:' OPT; do + case $OPT in + h) + usage + exit 0 + ;; + s) + SOURCES="$OPTARG" + ;; + \?|*) + usage > /dev/stderr + exit 1 + ;; + esac +done -get_languagepod101 -[ $? -eq 0 ] && exit 0 +# invalid argument count +if [ $(( $# - $OPTIND + 1 )) -ne 2 ]; then + usage > /dev/stderr + exit 1 +fi + +KANJI=${@:$OPTIND:1} +KANA=${@:$OPTIND+1:1} + +$(printf '%s;' "$SOURCES" | sed -z 's/[;,\n]/ ; /g') # if none were succesful, delete output file and exit with error rm -f "$OUTPUT" exit 1 + diff --git a/bulk-audio/readme.md b/bulk-audio/readme.md new file mode 100644 index 0000000..06e7542 --- /dev/null +++ b/bulk-audio/readme.md @@ -0,0 +1,31 @@ +# Bulk audio adder + +This is a Python and POSIX shell script that downloads native speaker audio for +words from the same sources as Yomichan: + +- JapanesePod101 +- JapanesePod101 (Alternate) +- Jisho\.org + +No Python dependencies should have to be installed, as this script only relies +on built-in Python libraries and the `aqt` library, which should get installed +alongside Anki. + +The `./get` script (should) also work on Windows under Git Bash or Msys2, but +uses `pup` to parse HTML for the `lp101_alt` source. Disabling this source, or +installing `pup` should work. + +## Usage + +See `./bulk-audio.py -h` for all options. The default options add audio to +notes with an empty Audio field. If a clip can't be found for the note, the +Audio field will be set to "noaudio". This script can be customized to work +with other note types, but works with [my custom anki card +template](../anki-card-template) by default. + +|command|action| +|-|-| +|`./bulk-audio.py`|Download audio for all notes with empty Audio field| +|`./bulk-audio.py -nO`|Try to download audio again for notes with "noaudio" Audio field| +|`./bulk-audio.py -C`|Clear all Audio fields| + |