ulthar.xyz > Repos

dotfiles

Seven years' worth of accumulated configuration cruft

About Files Commits


          git clone https://ulthar.xyz/repos/dotfiles/dotfiles.git

dotfiles/scripts/.local/bin/dl

Download raw file: scripts/.local/bin/dl

#!/bin/sh
# dl: the script that does most of my data hoarding.

# Stop at the first command that fails outside a tested context.
# NOTE(review): this line also executes when the file is sourced.
set -o errexit

# User-Agent string handed to gallery-dl (desktop Chrome on Windows).
LATEST_CHROME_ON_WINDOWS='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36'

# Print the data file "$1" without its empty or whitespace-only lines
# and without lines whose first character is '#'.
injest() {
	sed -e '/^[[:space:]]*$/d' -e '/^#/d' "$1"
}

########################################################################
### Music From YouTube

# Run ytmu on the URL "$1" with YTMU_YTDLP_EXTRA_OPTS set so that an
# "archived.txt" download archive is requested.
_ytmu() {
	YTMU_YTDLP_EXTRA_OPTS="--download-archive archived.txt" \
		ytmu "$1"
}

# Download every entry listed in ~/media/.data/youtube_music.txt.
#
# Each line that survives injest has the form "<url><TAB><dir>".  <dir>
# is taken relative to ~/media/music, is created when missing, and is
# the working directory in which _ytmu is run for <url>.  The two
# columns may be separated by more than one tab.  Lines without any tab
# are reported on stderr and skipped.
dl_youtube_music() {
	injest ~/media/.data/youtube_music.txt | {
		tab=$(printf '\t')
		# "read" without IFS= trims leading and trailing blanks, so
		# a trailing tab cannot produce an empty directory field.
		while read -r line; do
			case $line in
				*"$tab"*) ;;
				*)
					printf 'dl: skipping malformed line: %s\n' "$line" >&2
					continue
					;;
			esac
			# URL: everything before the FIRST tab.  Directory:
			# everything after the LAST tab.  Extra alignment
			# tabs therefore end up in neither value.
			url=${line%%"$tab"*}
			dir="$HOME/media/music/${line##*"$tab"}"
			mkdir -p "$dir"
			# "&&" keeps _ytmu from running in the wrong
			# directory should the cd fail.
			(cd "$dir" && _ytmu "$url")
		done
	}
}

########################################################################
### Artwork

# Run gallery-dl on a single URL.
#   $1  URL to fetch
#   $2  destination directory; the "archive" file given to
#       --download-archive lives inside it
# The exit status is that of gallery-dl.
gallery_dl_wrapper() {
	url="$1"
	dir="$2"
	# Build the argument list one option per line, then hand it over.
	set -- \
		--ugoira-conv-copy \
		--write-metadata \
		--write-info-json \
		--write-tags \
		--destination "$dir/" \
		--write-unsupported "$HOME/dl_script_gallery-dl_unsupported_urls" \
		--user-agent "$LATEST_CHROME_ON_WINDOWS" \
		--download-archive "$dir/archive" \
		"$url"
	gallery-dl "$@"
}

# Fetch every URL listed in ~/media/.data/art_archive.txt into
# ~/media/gallery-dl via gallery_dl_wrapper.
#
# A failing URL never aborts the run: statuses 4 and 16 are ignored
# quietly, and any other non-zero status is reported on stderr before
# moving on to the next URL.
dl_artwork() {
	injest ~/media/.data/art_archive.txt | while read -r url; do
		status=0
		# Capturing the status through "||" keeps "set -e" from
		# ending the loop on a failed download.
		gallery_dl_wrapper "$url" "$HOME/media/gallery-dl" || status=$?
		case $status in
			0) ;;
			# Author's notes on the statuses that are skipped:
			4) ;;  # blocked by Cloudflare or Instagram
			16) ;; # login required or OAuth expired
			# FIXME Tumblr returns 1 for API rate limit
			# exceeded, handle this!
			# NOTE(review): gallery-dl may combine several error
			# codes into one status (e.g. 20) -- confirm whether
			# such values need their own handling.
			*)
				printf 'dl: gallery-dl exited with status %s for %s\n' \
					"$status" "$url" >&2
				;;
		esac
	done
}

########################################################################
### Torrents

# FIXME: not written yet; for now this is a stub that always succeeds.
dl_torrents() {
	return 0
}

########################################################################
### RFCs

# Sync the "rfcs-text-only" rsync module of rsync.rfc-editor.org into
# ~/media/rfc/.  --delete removes local files that are gone upstream.
dl_rfcs() {
	rsync -avz --delete \
		rsync.rfc-editor.org::rfcs-text-only \
		~/media/rfc/
}

########################################################################
### US Patents

# FIXME Figure out how to grab from https://bulkdata.uspto.gov/
# Until then this is a stub that always succeeds.
dl_patents() {
	return 0
}

########################################################################
### Project Gutenberg

# Sync the "gutenberg-epub" rsync module of aleph.gutenberg.org into
# ~/media/gutenberg.org, deleting local files that are gone upstream.
dl_gutenberg() {
	# Source and destination become the positional parameters.
	set -- aleph.gutenberg.org::gutenberg-epub ~/media/gutenberg.org
	rsync -av --delete "$@"
}

########################################################################
### Bitsavers

# Sync the bitsavers.org rsync tree into ~/media/bitsavers.org,
# deleting local files that are gone upstream.
dl_bitsavers() {
	rsync -av --delete \
		rsync://bitsavers.org:/bitsavers/ \
		~/media/bitsavers.org
}

########################################################################
### Anarchist Library

# Fetch the URL list published at theanarchistlibrary.org/mirror.txt
# and download each listed URL below ~/media/ with wget (-x recreates
# the remote directory layout, -N only refetches newer files).
dl_anarchist_library() {
	# --fail: on an HTTP error print nothing, so an error page is
	#         never piped to wget as if it were the URL list.
	# --location: follow redirects to reach the real list.
	# --silent --show-error: no progress meter, errors still shown.
	curl --fail --silent --show-error --location \
		https://theanarchistlibrary.org/mirror.txt \
		| wget -x -N -i - -P ~/media/
}

########################################################################
### Miscellaneous Sites

# Print the host part of the URL "$1": a leading "http://" and/or
# "https://" is dropped, then everything from the first "/" onwards.
#
# Uses parameter expansion and printf only, so no extra processes are
# started and option-like arguments such as "-n" are printed verbatim.
_get_domain() {
	# Positional parameters are local to the function call, so
	# reusing $1 as scratch space leaks nothing to the caller.
	set -- "${1#http://}"
	set -- "${1#https://}"
	printf '%s\n' "${1%%/*}"
}

# Mirror every URL listed in ~/media/.data/sites_to_archive.txt below
# ~/media/sites/ with a recursive wget.
#
#   $1  optional URL whose host is passed to "wget -D" for ALL sites.
#       When omitted, each site's own host (via _get_domain) is used.
#
# All sites are attempted; the status of the most recent failing wget
# (or 0 when none failed) is returned.
dl_misc_sites() {
	# The explicit subshell lets "exit" set the pipeline status
	# without ever terminating the calling shell.
	injest ~/media/.data/sites_to_archive.txt | (
		rc=0
		while read -r site; do
			domain=$(_get_domain "${1:-$site}")
			wget -r -np -k -p -c -N -D "$domain" \
				-P ~/media/sites/ "$site" || rc=$?
		done
		exit "$rc"
	)
}

########################################################################

# Everything below runs only when the file is executed under the name
# "dl"; when it is sourced from something else this section is skipped,
# which lets the file double as a library.  Uncomment the downloads
# that should run.
case "$(basename "$0")" in
dl)
	set -e
	# dl_youtube_music
	# dl_artwork
	# dl_torrents
	# dl_rfcs
	# dl_patents
	# dl_gutenberg
	# dl_bitsavers
	# dl_anarchist_library
	# dl_misc_sites
	;;
esac