dotfiles/.local/bin/tools/ripper

#!/bin/sh
#A script that checks multiple youtube and bitchute channels for new videos to download via youtube-dl
#This script works considerably faster than just giving youtube-dl a channel URI.
#The YouTube implementation now uses a YoutubeData API v3 key to work more reliably.
#This can be quite quota taxing, as each channel search is 1% of the allotted quota for the day.
#-> checking n YT channels => n% of daily quota required to run this script
#Keep this in mind when running it as a cronjob
#Either insert this key in plain text below at the variable "APIKEY" or do it via ENV vars or a password manager
#Since bitchute still doesn't have an API I'm using lynx to emulate a user.
#This can limit the number of recent videos available. For a whole download of bitchute channels consider other methods first.
#For youtube only the videos returned by a single API search request are considered (the API documents a maximum of 50 results per request). For the rest you can just use youtube-dl itself

# Cron provides only a minimal environment, so the video directory is pinned and exported here.
export XDG_VIDEOS_DIR="$HOME/vids" #TODO ADJUST FOR PERSONAL USE HERE!

# All state lives below the video directory; $HOME/Videos is the fallback
# should XDG_VIDEOS_DIR ever be empty or unset.
# DLLOC:     root directory the downloaded videos are written to
# DLARCHIVE: archive file handed to youtube-dl via --download-archive
# BLACKLIST: patterns handed to "grep -vf" to drop unwanted video URLs
DLLOC="${XDG_VIDEOS_DIR:-$HOME/Videos}"
DLARCHIVE="${XDG_VIDEOS_DIR:-$HOME/Videos}/.downloaded"
BLACKLIST="${XDG_VIDEOS_DIR:-$HOME/Videos}/.blacklist"

#FORMAT OF CHANNELSFILE:
#Youtube: include the channel URI: https://www.youtube.com/channel/<channelId>
#Bitchute: normal channel URI: https://www.bitchute.com/channel/<user>
#Lines starting with '#' will be ignored in this file
CHANNELSFILE="${XDG_VIDEOS_DIR:-$HOME/Videos}/.channels"

# Session bus address and X display, required to display notifications if run as a cronjob.
# Assignment and export stay separate so the exit status of "id -u" is not masked by export.
DBUS_SESSION_BUS_ADDRESS="unix:path=/run/user/$(id -u)/bus"
export DBUS_SESSION_BUS_ADDRESS
DISPLAY=:0.0
export DISPLAY

# YouTube Data API v3 key: the first line of the matching entry in the "pass" password store.
APIKEY="$(pass show 'Misc/Youtube Data API v3' | head -n1)"

# Single-instance guard: "pgrep -c" prints how many processes match "ripper".
# A count above one is treated as "another instance is already running".
running_instances="$(pgrep -c ripper)"
if [ "$running_instances" -gt 1 ]; then
	echo "Ripper already running, exiting new instance..."
	exit
fi

echo "Scanning for new Videos to download"
echo "Scanning on Youtube..."

# Drop every stdin line that matches a pattern in $BLACKLIST.
# Without an existing, non-empty blacklist all lines pass through unchanged;
# "grep -vf" on a missing file would fail and silently discard every URL.
yt_drop_blacklisted() {
	if [ -s "$BLACKLIST" ]; then
		grep -vf "$BLACKLIST"
	else
		cat
	fi
}

# Active (non-comment) YouTube lines of the channels file.
yt_lines="$(grep 'youtube' "$CHANNELSFILE" | grep -v '^#')"
# Channel IDs: cut everything up to ".../channel/" and any trailing path or query (e.g. "/videos").
IDs="$(printf '%s\n' "$yt_lines" | grep '/channel/' | sed -e 's|^.*/channel/||' -e 's|[/?].*$||')"
not_correctly_formatted="$(printf '%s\n' "$yt_lines" | grep -v '/channel/')"
if [ -n "$not_correctly_formatted" ]; then
	echo Please fix the following channel urls to be scannable:
	echo "$not_correctly_formatted"
	echo "They need to be in the 'https://www.youtube.com/channel/...' format"
fi

# Word splitting of $IDs is intended: one channel ID per word.
for channel_id in $IDs; do
	echo "ID: $channel_id"
	# maxResults=50 is the largest page size the search endpoint documents.
	json="$(curl -s "https://www.googleapis.com/youtube/v3/search?key=$APIKEY&channelId=$channel_id&part=snippet,id&order=date&maxResults=50")"
	# Error reasons reported by the API; empty for a successful response
	# ("[]?" keeps jq quiet when there is no error object).
	api_errors="$(printf '%s\n' "$json" | jq -r '.error.errors[]?.reason')"
	case "$api_errors" in
	*quotaExceeded*)
		#Fallback to legacy mode if API quota is exceeded
		echo "YT API Quota exceeded, using fallback"
		# The channel page links to its "videos.xml" feed; the feed is streamed
		# straight into the parser, so no temporary file is needed.
		lynx --dump --nonumbers -listonly "https://www.youtube.com/channel/$channel_id" |
			grep 'videos.xml' |
			xargs curl -s |
			python -c '
import sys
from lxml import etree
# Prefer the binary stream (Python 3); fall back to sys.stdin on Python 2.
feed = getattr(sys.stdin, "buffer", sys.stdin)
# iter(tag) yields only elements with exactly this tag.
for el in etree.parse(feed).iter("{http://www.youtube.com/xml/schemas/2015}videoId"):
    print(el.text)
' |
			sed 's|^|https://www.youtube.com/watch?v=|' |
			yt_drop_blacklisted >> /tmp/todownload$$
		;;
	"")
		# Search results without a videoId yield null and are skipped.
		printf '%s\n' "$json" |
			jq -r '.items[]?.id.videoId | select(. != null)' |
			sed 's|^|https://www.youtube.com/watch?v=|' |
			yt_drop_blacklisted >> /tmp/todownload$$
		;;
	*)
		# Any other API error: report it instead of failing silently.
		echo "YT API error for $channel_id: $api_errors" >&2
		;;
	esac
done
# Turn "youtube <id>" archive entries into watch URLs so they can be compared with the candidates.
grep 'youtube' "$DLARCHIVE" | sed 's|youtube |https://www.youtube.com/watch?v=|' > /tmp/alreadydownloaded$$

echo "Scanning on Bitchute..."
# Active (non-comment) Bitchute channel URLs of the channels file.
bitchute_channels="$(grep 'bitchute' "$CHANNELSFILE" | grep -v '^#')"
# Skip scraping when no channel is configured: xargs may otherwise start lynx once without a URL.
if [ -n "$bitchute_channels" ]; then
	# lynx lists every link of each channel page; keep the video links and drop
	# blacklisted ones. Without an existing, non-empty blacklist everything passes
	# through ("grep -vf" on a missing file would fail and discard every URL).
	printf '%s\n' "$bitchute_channels" |
		xargs -L1 lynx --dump --nonumbers -listonly |
		grep 'bitchute\.com\/video' |
		sort -u |
		if [ -s "$BLACKLIST" ]; then grep -vf "$BLACKLIST"; else cat; fi >> /tmp/todownload$$
fi
# Turn "bitchute <id>" archive entries into video URLs so they can be compared with the candidates.
grep 'bitchute' "$DLARCHIVE" | sed 's|bitchute |https://www.bitchute.com/video/|' >> /tmp/alreadydownloaded$$

# Keep only candidates that are not in the download archive yet.
# -F compares the archive URLs as fixed strings instead of regular expressions.
grep -F -vf /tmp/alreadydownloaded$$ /tmp/todownload$$ | sort -u > /tmp/new_videos$$
rm -f /tmp/alreadydownloaded$$ /tmp/todownload$$
# Count the non-empty lines; "grep -c" prints a bare number ("0" when nothing matches).
number="$(grep -c . /tmp/new_videos$$)"
if [ "$number" -gt 0 ]; then
	plural=""
	[ "$number" -gt 1 ] && plural="s"
	notify-send "Channel Ripper" "$number new video$plural available for download, downloading now."
	echo  "$number new video$plural for download available, downloading now."
	# Per-video notifications only for small batches.
	if [ "$number" -lt 10 ]; then
		# One "uploader/title" line per video, split at the first "/".
		# Reading line by line keeps quotes in titles intact (xargs would choke on them).
		# NOTE(review): relies on youtube-dl replacing "/" inside field values when it
		# builds file names - confirm for the youtube-dl version in use.
		youtube-dl --get-filename -o '%(uploader)s/%(title)s' -a /tmp/new_videos$$ |
			while IFS= read -r entry; do
				notify-send "${entry%%/*}" "${entry#*/}"
			done
	fi
	youtube-dl --hls-prefer-native -i --download-archive "$DLARCHIVE" -f 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best' --add-metadata -o "$DLLOC/%(uploader)s/%(upload_date)s-%(title)s.%(ext)s" -a /tmp/new_videos$$
	notify-send "Channel Ripper" "Finished downloading"
else
	echo "No new videos"
fi
# Remove the work list on both paths so runs without new videos do not litter /tmp.
rm -f /tmp/new_videos$$
made shellcheck happy 2021-01-02 09:37:42 +01:00			`#!/bin/sh`
added docs 2021-01-01 19:22:03 +01:00			`#A script that checks multiple youtube and bitchute channels for new videos to download via youtube-dl`
			`#This script works considerably faster than just giving youtube-dl a channel URI.`
			`#The YouTube implementation now uses a YoutubeData API v3 key to work more reliably.`
added explanation to quota issue with YT to ripper 2021-01-02 08:34:11 +01:00			`#This can be quite quota taxing, as each channel search is 1% of the allotted qutoa for the day.`
			`#-> checking n YT channels => n% of daily quota required to run this script`
			`#Keep this in mind when running it as a cronjob`
added docs 2021-01-01 19:22:03 +01:00			`#Either insert this key in plain text below at the variable "APIKEY" or do it via ENV vars or a password manager`
			`#Since bitchute still doesn't have an API I'm using lynx to emulate a user.`
more doc in ripper 2021-01-01 20:26:19 +01:00			`#This can limit the number of recent videos available. For a whole download of bitchute channels consider other methods first.`
			`#For youtube the videos per channel are limited to the last 500 uploaded videos. For the rest you can just use youtube-dl itself`
added docs 2021-01-01 19:22:03 +01:00
inital commit 2020-07-04 14:23:27 +02:00			`#needed if run as cronjob`
made shellcheck happy 2021-01-02 09:37:42 +01:00			`XDG_VIDEOS_DIR=$HOME/vids #TODO ADJUST FOR PERSONAL USE HERE!`
			`export XDG_VIDEOS_DIR`
Fallback if XDG not present 2021-01-01 19:26:09 +01:00			`DLARCHIVE="${XDG_VIDEOS_DIR:-$HOME/Videos}/.downloaded"`
			`DLLOC="${XDG_VIDEOS_DIR:-$HOME/Videos}"`
			`CHANNELSFILE="${XDG_VIDEOS_DIR:-$HOME/Videos}/.channels"`
added docs 2021-01-01 19:22:03 +01:00			`#FORMAT OF CHANNELSFILE:`
			`#Youtube: include the channel URI: https://www.youtube.com/channel/<channelId>`
			`#Bitchute: normal channel URI: https://www.bitchute.com/channel/<user>`
			`#Lines starting with '#' will be ignored in this file`
Fallback if XDG not present 2021-01-01 19:26:09 +01:00			`BLACKLIST="${XDG_VIDEOS_DIR:-$HOME/Videos}/.blacklist"`
inital commit 2020-07-04 14:23:27 +02:00
			`# Required to display notifications if run as a cronjob:`
made shellcheck happy 2021-01-02 09:37:42 +01:00			`DBUS_SESSION_BUS_ADDRESS=unix:path=/run/user/$(id -u)/bus`
			`export DBUS_SESSION_BUS_ADDRESS`
inital commit 2020-07-04 14:23:27 +02:00			`export DISPLAY=:0.0`

ripper uses YT API now 2021-01-01 17:25:13 +01:00			`APIKEY="$(pass show Misc/Youtube\ Data\ API\ v3 \| head -n1 )"`

made shellcheck happy 2021-01-02 09:37:42 +01:00			`if [ "$(pgrep -c ripper)" -gt 1 ]; then`
fixed ripper 2020-12-26 14:04:07 +01:00			`echo "Ripper already running, exiting new instance..."`
			`exit`
			`fi`

inital commit 2020-07-04 14:23:27 +02:00			`echo "Scanning for new Videos to download"`
added old method as fallback for yt 2021-01-02 15:33:03 +01:00			`echo "Scanning on Youtube..."`
reliable ignorance of commented out lines in channelsfile 2021-01-01 19:12:38 +01:00			`IDs="$( grep 'youtube' "$CHANNELSFILE" \| grep -v '^#' \| grep 'channel' \| sed 's/https:\/\/www\.youtube\.com\/channel\///')"`
			`not_correctly_formatted="$(grep 'youtube' "$CHANNELSFILE" \| grep -v '^#' \| grep -v 'channel')"`
only prints wrong channel format if some channels present 2021-01-02 08:23:29 +01:00			`if [ -n "$not_correctly_formatted" ]; then`
ripper uses YT API now 2021-01-01 17:25:13 +01:00			`echo Please fix the following channel urls to be scannable:`
			`echo "$not_correctly_formatted"`
			`echo "They need to be in the 'https://www.youtube.com/channel/...' format"`
			`fi`
			`for channel_id in $IDs; do`
			`echo "ID: $channel_id"`
added old method as fallback for yt 2021-01-02 15:33:03 +01:00			`json="$(curl -s "https://www.googleapis.com/youtube/v3/search?key=$APIKEY&channelId=$channel_id&part=snippet,id&order=date&maxResults=500")"`
			`#Fallback to legacy mode if API quota is exceeded`
			`if [ "$(echo "$json" \| jq '."error"."errors"[]."reason"')" = '"quotaExceeded"' ];then`
			`echo "YT API Quota exceeded, using fallback"`
yt fallback uses xml parser now 2021-01-02 20:53:03 +01:00			`lynx --dump --nonumbers -listonly "https://www.youtube.com/channel/$channel_id" \| grep 'videos.xml' \| xargs curl -s > /tmp/"${channel_id}.xml"`
			`python -c "from lxml import etree`
			`file=\"/tmp/${channel_id}.xml\"`
			`root = etree.parse(file)`
			`for el in root.iter():`
			`if(el.tag in '{http://www.youtube.com/xml/schemas/2015}videoId'):`
			`print(el.text)" \|`
			`sed 's/^/https:\/\/www\.youtube\.com\/watch\?v=/' \| grep -vf "$BLACKLIST" >> /tmp/todownload$$`
			`rm -f "/tmp/${channel_id}.xml"`
added old method as fallback for yt 2021-01-02 15:33:03 +01:00			`else`
			`echo "$json" \| jq '."items"[].id."videoId"' \| tr -d '"' \| grep -v '^null$'\| sed 's/^/https:\/\/www\.youtube\.com\/watch\?v=/' \| grep -vf "$BLACKLIST" >> /tmp/todownload$$`
			`fi`
ripper uses YT API now 2021-01-01 17:25:13 +01:00			`done`
fixed ripper 2020-12-26 14:04:07 +01:00			`grep 'youtube' "$DLARCHIVE" \| sed 's/youtube /https:\/\/www\.youtube\.com\/watch?v=/' > /tmp/alreadydownloaded$$`
inital commit 2020-07-04 14:23:27 +02:00
ripper uses YT API now 2021-01-01 17:25:13 +01:00			`echo "Scanning on Bitchute..."`
reliable ignorance of commented out lines in channelsfile 2021-01-01 19:12:38 +01:00			`grep 'bitchute' "$CHANNELSFILE" \| grep -v '^#' \| xargs -L1 lynx --dump --nonumbers -listonly \| grep 'bitchute\.com\/video' \| sort -u \| grep -vf "$BLACKLIST" >> /tmp/todownload$$`
fixed ripper 2020-12-26 14:04:07 +01:00			`grep 'bitchute' "$DLARCHIVE" \| sed 's/bitchute /https:\/\/www\.bitchute\.com\/video\//' >> /tmp/alreadydownloaded$$`
inital commit 2020-07-04 14:23:27 +02:00
small improvement to ripper 2020-12-26 15:30:55 +01:00			`grep -vf /tmp/alreadydownloaded$$ /tmp/todownload$$ \| sort -u > /tmp/new_videos$$`
added docs 2021-01-01 19:22:03 +01:00			`rm -f /tmp/alreadydownloaded$$ /tmp/todownload$$`
small improvement to ripper 2020-12-26 15:30:55 +01:00			`number=$(wc -l /tmp/new_videos$$ \| cut -d ' ' -f 1 )`
made shellcheck happy 2021-01-02 09:37:42 +01:00			`if [ "$number" -gt 0 ]; then`
			`[ "$number" -gt 1 ] && plural="s"`
fixed ripper 2020-12-26 14:04:07 +01:00			`notify-send "Channel Ripper" "$number new video$plural available for download, downloading now."`
			`echo "$number new video$plural for download available, downloading now."`
fixed broken pipe error, only print notifs if its not wasting time 2020-12-25 21:54:03 +01:00			`if [ "$number" -lt 10 ];then`
ripper uses YT API now 2021-01-01 17:25:13 +01:00			`youtube-dl --get-filename -o "'%(uploader)s' '%(title)s'" -a /tmp/new_videos$$ \| xargs -L1 notify-send`
fixed broken pipe error, only print notifs if its not wasting time 2020-12-25 21:54:03 +01:00			`fi`
made shellcheck happy 2021-01-02 09:37:42 +01:00			`youtube-dl --hls-prefer-native -i --download-archive "$DLARCHIVE" -f 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best' --add-metadata -o "$DLLOC/%(uploader)s/%(upload_date)s-%(title)s.%(ext)s" -a /tmp/new_videos$$`
fixed ripper 2020-12-26 14:04:07 +01:00			`rm -f /tmp/new_videos$$`
inital commit 2020-07-04 14:23:27 +02:00			`notify-send "Channel Ripper" "Finished downloading"`
			`fi`

made shellcheck happy 2021-01-02 09:37:42 +01:00			`if [ "$number" -eq 0 ]; then`
ripper uses YT API now 2021-01-01 17:25:13 +01:00			`echo "No new videos"`
inital commit 2020-07-04 14:23:27 +02:00			`fi`