From 1a3cbc9a16f1d6a12291fa0b1bfde2801f31c70c Mon Sep 17 00:00:00 2001
From: lonkaars
Date: Sat, 7 Oct 2023 19:16:46 +0200
Subject: fix bulk audio adding script

---
 bulk-audio/bulk-audio.py | 143 +++++++++++++++++++++++++++++++++++++++++++++++
 bulk-audio/get           |  27 +++++----
 2 files changed, 159 insertions(+), 11 deletions(-)
 create mode 100755 bulk-audio/bulk-audio.py

diff --git a/bulk-audio/bulk-audio.py b/bulk-audio/bulk-audio.py
new file mode 100755
index 0000000..0ef8b71
--- /dev/null
+++ b/bulk-audio/bulk-audio.py
@@ -0,0 +1,143 @@
+#!/bin/python3
+"""Bulk-fill the audio field of Anki notes with downloaded word recordings.
+
+For every note of type NOTE_TYPE whose AUDIO_FIELD_NAME field is empty, the
+target word is parsed from the note, the `get` script next to this file is
+asked for a recording, and the note is updated with either a "[sound:...]"
+reference to the saved file or the marker "noaudio".
+"""
+
+import sys
+import subprocess
+import hashlib
+import os
+import re
+from math import floor, log10
+
+import aqt
+
+# change these variables
+AUDIO_FILENAME_PREFIX = "refold-tools-"
+# the anki user to which notes of type NOTE_TYPE belong
+ANKI_USER = "ルーク"
+# the note type name of notes that should get audio fields filled in (see Tools > Manage note types)
+NOTE_TYPE = "Sentence mining"
+# field name to be filled with "[sound:...]" or "noaudio"
+AUDIO_FIELD_NAME = "Audio"
+
+# location of the get script, resolved relative to this file so the tool also
+# works when started from another working directory
+GET_SCRIPT = os.path.join(os.path.dirname(os.path.abspath(__file__)), "get")
+
+# this function only works for refold-tools sentence mining card template
+pattern = re.compile(r"^([^[、 【]+)[^【]*(【(.+)】)?")
+
+
+def note2kanji_kana(note):
+    """Return (kanji, kana) parsed from the note's "Target word reading" field.
+
+    The field is expected to start with the written form, optionally followed
+    by the reading between 【】. Without a bracketed reading the written form
+    is used as the reading too. Returns (None, None) if nothing can be parsed.
+    """
+    word = note["Target word reading"]
+    result = pattern.search(word)
+    if result is None:
+        return (None, None)
+    kanji = result.group(1)
+    kana = result.group(3)
+    if kana is None:
+        kana = kanji
+    # strip any "・" characters from the reading
+    kana = kana.replace("・", "")
+    return (kanji, kana)
+
+
+def main():
+    """Fill the audio field of every NOTE_TYPE note that does not have one yet."""
+    # the XDG base directory spec falls back to ~/.local/share when
+    # XDG_DATA_HOME is unset or empty
+    data_home = os.environ.get("XDG_DATA_HOME") or os.path.expanduser("~/.local/share")
+    anki_path = os.path.join(data_home, "Anki2", ANKI_USER)
+    anki_collection = os.path.join(anki_path, "collection.anki2")
+    anki_media = os.path.join(anki_path, "collection.media")
+    col = aqt.Collection(anki_collection)
+
+    model = col.models.by_name(NOTE_TYPE)
+    if model is None:
+        sys.exit(f"note type '{NOTE_TYPE}' not found for anki user '{ANKI_USER}'")
+    note_ids = col.models.nids(model)
+
+    total = len(note_ids)
+    # number of digits in total, used to zero-pad the progress counter
+    index_width = floor(log10(total)) + 1 if total else 1
+
+    for note_index, note_id in enumerate(note_ids):
+        note = col.get_note(note_id)
+        print(f"[{note_index + 1:0{index_width}d}/{total:d}] ", end="")
+
+        # bulk clear audio field (dev only)
+        # note[AUDIO_FIELD_NAME] = ""
+        # note.flush()
+        # print(f"cleared \"{AUDIO_FIELD_NAME}\" field!")
+        # continue
+
+        # autosave deck every 20 cards
+        if note_index % 20 == 0:
+            col.save()
+
+        # skip any notes that already have audio
+        if note[AUDIO_FIELD_NAME]:
+            print("skipped -> audio field not empty")
+            continue
+
+        # parse kanji and kana info from note
+        kanji, kana = note2kanji_kana(note)
+        if kanji is None or kana is None:
+            print("skipped -> can't parse kanji/kana from card")
+            continue
+        # flush so the word is visible while the download is still running
+        print(f"{kanji} ({kana}) ", end="", flush=True)
+
+        # attempt to download audio
+        exit_code, data = get(kanji, kana)
+        if exit_code != 0:
+            note[AUDIO_FIELD_NAME] = "noaudio"
+            note.flush()
+            print("skipped -> no recording available, marked as 'noaudio'")
+            continue
+
+        # save audio if download was successful; the file name is derived from
+        # the sha1 of the data, so identical recordings share one file
+        digest = hashlib.sha1(data).hexdigest()
+        filename = f"{AUDIO_FILENAME_PREFIX}{digest}.mp3"
+        output_path = os.path.join(anki_media, filename)
+        with open(output_path, "wb") as f:
+            f.write(data)
+
+        # set audio field to audio filename
+        audio_str = f"[sound:{filename}]"
+        note[AUDIO_FIELD_NAME] = audio_str
+        note.flush()
+        print(f"written audio as {audio_str}")
+
+    # save collection (and exit)
+    col.save()
+
+
+def get(kanji, kana):
+    """Run the get script for a word and return (exit_code, stdout_data).
+
+    On success exit_code is 0 and stdout_data holds the bytes the script wrote
+    to stdout; on failure (1, None) is returned.
+    """
+    p = subprocess.run([GET_SCRIPT, kanji, kana], capture_output=True)
+    # a zero exit status with no output is not a usable recording either
+    if p.returncode != 0 or not p.stdout:
+        return (1, None)
+    return (0, p.stdout)
+
+
+if __name__ == "__main__":
+    main()
+
diff --git a/bulk-audio/get b/bulk-audio/get
index bd46e6b..b7791aa 100755
--- a/bulk-audio/get
+++ b/bulk-audio/get
@@ -2,11 +2,10 @@
 
 KANJI="$1"
 KANA="$2"
-OUTPUT="$3"
 
-if [ -z "$KANJI" -o -z "$KANA" -o -z "$OUTPUT" ]; then
+if [ -z "$KANJI" -o -z "$KANA" ]; then
 	cat << EOF
-usage: $0 <kanji> <kana> <output>
+usage: $0 <kanji> <kana> > <output>
 
 return value is 0 if <output> was succesfully written, 1 if the word could not
 be found. this script searches languagepod101, languagepod101 (alt) and
@@ -16,7 +15,13 @@ EOF
 fi
 
 get_languagepod101() {
-	curl -so "$OUTPUT" "https://assets.languagepod101.com/dictionary/japanese/audiomp3.php?kanji=$KANJI&kana=$KANA"
+	URL="https://assets.languagepod101.com/dictionary/japanese/audiomp3.php?kanji=$KANJI&kana=$KANA"
+	# 52288 is the content-length of the "the audio for this clip is currently
+	# not available. it will be recorded and uploaded shortly. thank you for your
+	# patience" message (404, but server sends 200 anyways)
+	curl -X HEAD -iso - "$URL" | awk '/^Content-length:/ { exit $2 == 52288 }'
+	[ $? -ne 0 ] && return 1
+	curl -so - "$URL"
 }
 
 get_languagepod101_alt() {
@@ -28,7 +33,7 @@ get_languagepod101_alt() {
 	[ $? -ne 0 ] && return 1
 	URL="$(echo "$HTML" | pup "audio source attr{src}" | head -n1)"
 	[ -z "$URL" ] && return 1
-	curl -so "$OUTPUT" "$URL"
+	curl -so - "$URL"
 }
 
 get_jisho() {
@@ -37,14 +42,14 @@ get_jisho() {
 	URL="$(echo "$HTML" | pup "audio[id=\"audio_$KANJI:$KANA\"] source attr{src}" | head -n1)"
 	[ -z "$URL" ] && return 1
 	URL="https:$URL"
-	curl -so "$OUTPUT" "$URL"
+	curl -so - "$URL"
 }
 
-get_languagepod101_alt
-[ $? -eq 0 ] && exit 0
-
-get_jisho
-[ $? -eq 0 ] && exit 0
+# get_languagepod101_alt
+# [ $? -eq 0 ] && exit 0
+#
+# get_jisho
+# [ $? -eq 0 ] && exit 0
 
 get_languagepod101
 [ $? -eq 0 ] && exit 0
-- 
cgit v1.2.3