AVL/script/gethtml

#!/bin/zsh
# This script is only the initial step in porting the AVL data to Anki.
# It does not work 100% reliably, but it is good enough for me.
# Expect to make manual adjustments to the exported data.
# The implementation of this script is rather ugly.
# Start every run with a fresh, empty cards.csv: drop the file left by a
# previous run (if it is a regular file), then create it again.
if [ -f cards.csv ]; then
	rm cards.csv
fi
touch cards.csv

handle_cleaned_html(){
## Handling of multiple meanings
	meaning_clean_func="$1"
	no_func="$2"
	word_func="$3"
	pof_func="$4"
	if echo "$meaning_clean_func" | grep -q "SUP"; then
		count_func="$(echo "$meaning_clean_func" | grep -c "SUP")"
		echo "Found $count_func multiple meanings for the word $word_func ($pof_func)"

		for j in {1..$count_func}; do
			meaning="$( echo "$meaning_clean_func" | grep "<SUP> $j" | perl -pe "s|<P> <SUP> $j </SUP> <TT> (.*?) </TT> </P>|\1|" )"
			echo MEANING:
			echo "$meaning"
			echo DONE
			[ -f categoriestmp ] && rm categoriestmp
			touch categoriestmp
			[ -f backofcardtmp ] && rm backofcardtmp
			touch backofcardtmp
			printout=false

			while IFS= read -r line; do
				if [ $printout = true ]; then
					if echo "$line" | grep -q "<P> <SUP> $(( j + 1 )) </SUP>" ; then
						printout=false
					else
						echo "$line" >> backofcardtmp
						if echo "$line" | grep -q '<P> <U>'; then
							echo "$line" | perl -pe "s|<P> <U> (.*?) </U>(.*?)$|<P> <U> \1 </U> </P>|" >> categoriestmp
						fi
					fi
				fi
				if echo "$line" | grep -q "SUP"; then
					if echo "$line" |  grep -q "<P> <SUP> $j </SUP>"; then
						printout=true
					fi
				fi
			done <<< $(echo "$meaning_clean_func")
			echo BACKSIDE:
			cat backofcardtmp
			echo CATEGORIES
			cat categoriestmp
			echo DONE
			categories="$(cat categoriestmp | tr '\n' ' ')"
			backside="$(cat backofcardtmp | tr '\n' ' ' )"
			printf "%s.%s;\"%s\";\"%s\";\"%s\";\"%s\";\"%s\"\n" "$no_func" "$j" "$word_func" "$meaning" "$categories" "$backside" "$pof_func" >> cards.csv
		done
	else
		echo "Found only one meaning for the word $word_func ($pof_func)"
		meaning="" #ozdic only provides meaning if there are different ones and the collocations for those different meanings differ.
		[ -f categoriestmp ] && rm categoriestmp
		touch categoriestmp
		while IFS= read -r line; do
			#echo current line:
			#echo "$line"
			echo "$line" | grep "<P> <U> " | perl -pe "s|<P> <U> (.*?) </U>(.*?)$|<P> <U> \1 </U> </P>|" >> categoriestmp
		done <<< $(echo "$meaning_clean")
		backside="$( echo "$meaning_clean_func"  | grep "<P> <U> " | grep -vF '<DIV class="item"><P class="word"><B>'| tr '\n' ' ')"
		#cat categoriestmp
		categories="$(cat categoriestmp | tr '\n' ' ' )"
		printf "%s;\"%s\";\"%s\";\"%s\";\"%s\";\"%s\"\n" "$no_func" "$word_func" "$meaning" "$categories" "$backside" "$pof_func" >> cards.csv

	fi
	#printf "%s;\"%s\";\"%s\"\n" "$no" "$word" "$meaning_clean" >> cards.csv
}


# Main loop: every row of avl.csv is "<number><TAB><word>". For each row the
# word's ozdic.com page is downloaded, split into one section per part of
# speech, and every section is handed to handle_cleaned_html.
# IFS= and -r keep the row exactly as written (no stripping of leading tabs,
# no backslash processing); the "|| [[ -n ... ]]" part still handles a last
# row that has no trailing newline.
while IFS= read -r line || [[ -n "$line" ]]; do
	no="$(printf '%s\n' "$line" | awk 'BEGIN{FS = "\t" } {print $1}' )"
	word="$(printf '%s\n' "$line" | awk 'BEGIN{FS = "\t" } {print $2}' )"
	printf "card:%s\t(%s)\n" "$no" "$word"
	# The fixed line ranges deleted from the top and (via tac) from the bottom
	# presumably strip the site's boilerplate around the entry - TODO confirm
	# against the current page layout. Tabs become spaces and CRs are dropped
	# so the later pattern matches can rely on single-space separators.
	meaning_clean="$(curl --no-progress-meter http://www.ozdic.com/collocation-dictionary/"$word" | sed '6,35d; 38,48d' | tac | sed '6,30d' | tac | tr '\t' ' ' | tr -d '\r')"
	echo "Checking whether there are multiple parts of speech..."
	pof_count="$(printf '%s\n' "$meaning_clean" | grep -c "<P class=\"word\">" )"
	if [ "$pof_count" -gt 1 ]; then
		echo "$pof_count parts of speech found!"
		# Separate the page into one "meaning_clean" per part of speech.
		pofs="$( printf '%s\n' "$meaning_clean" | grep "<DIV class=\"item\"><P class=\"word\"><B>" | perl -pe "s|^.*?<DIV class=\"item\"><P class=\"word\"><B>[a-zA-Z]*? </B><I>([a-zA-Z\.]*?) </I> </P>|\1|g")"
		# Sentinel entry: the last section has no following heading, so it is
		# cut at the first later line containing the text "script".
		pofs="$(printf "%s\nscript" "$pofs" )"
		for (( i = 1; i <= pof_count; i++ )); do
			current_pof="$(printf '%s\n' "$pofs" | sed "${i}q;d")"
			next_pof="$(printf '%s\n' "$pofs" | sed "$((i+1))q;d")"
			echo current_pof: "$current_pof"
			echo next_pof: "$next_pof"
			start="$( printf '<DIV class="item"><P class="word"><B>%s </B><I>%s </I> </P>\n' "$word" "$current_pof" )"
			if [ "$i" -eq "$pof_count" ]; then
				stop="$next_pof"
			else
				stop="$( printf '<DIV class="item"><P class="word"><B>%s </B><I>%s </I> </P>\n' "$word" "$next_pof" )"
			fi
			# NOTE(review): start/stop are used as sed regular expressions with
			# "," as delimiter, so a "." in the part of speech matches any
			# character and a "," in the word would break the expression.
			meaning_clean_temp="$( printf '%s\n' "$meaning_clean" | sed 's,'"$start"',START,')"
			meaning_clean_temp="$( printf '%s\n' "$meaning_clean_temp" | sed 's,'"$stop"',STOP,' )"
			echo MEANINGCLEAN_TEMP:
			printf '%s\n' "$meaning_clean_temp"
			echo DONE
			# Keep only the lines strictly between the START and STOP markers.
			pof_meaning_clean="$(printf '%s\n' "$meaning_clean_temp" | sed -n '/START/,/STOP/ { /START/n; /STOP/!p }' )"
			echo POFMEANINGCLEAN
			printf '%s\n' "$pof_meaning_clean"
			echo DONE

			# Card number becomes "<no>.<index of the part of speech>".
			notemp="$no"'.'"$i"
			handle_cleaned_html "$pof_meaning_clean" "$notemp" "$word" "$current_pof"
		done

	else
		echo "only one part of speech found"
		pof="$( printf '%s\n' "$meaning_clean" | grep "<DIV class=\"item\"><P class=\"word\"><B>" | perl -pe "s|^.*?<DIV class=\"item\"><P class=\"word\"><B>[a-zA-Z]*? </B><I>([a-zA-Z\.]*?) </I> </P>|\1|g")"
		echo POF:"$pof"
		handle_cleaned_html "$meaning_clean" "$no" "$word" "$pof"
	fi

done < avl.csv

# -f: backofcardtmp only exists if some word had several numbered meanings.
rm -f backofcardtmp categoriestmp
updated script for better card layout (categories on front) 2020-09-21 11:44:44 +02:00			`#!/bin/zsh`
added explanation to expect format errors 2020-09-21 11:56:12 +02:00			`# This script is just the initial step in porting the avl data to anki`
			`# It does not work 100% reliable, but it's good enough for me`
			`# Manual adjustments on the exported data should be expected`
			`# This script is rather ugly in its Implementation`
added script used to generate cards 2020-09-18 15:05:47 +02:00			`[ -f cards.csv ] && rm cards.csv`
			`touch cards.csv`
updated script for better card layout (categories on front) 2020-09-21 11:44:44 +02:00
			`handle_cleaned_html(){`
			`## Handling of multiple meanings`
			`meaning_clean_func="$1"`
			`no_func="$2"`
			`word_func="$3"`
			`pof_func="$4"`
			`if echo "$meaning_clean_func" \| grep -q "SUP"; then`
			`count_func="$(echo "$meaning_clean_func" \| grep -c "SUP")"`
			`echo "Found $count_func multiple meanings for the word $word_func ($pof_func)"`

			`for j in {1..$count_func}; do`
			`meaning="$( echo "$meaning_clean_func" \| grep "<SUP> $j" \| perl -pe "s\|<P> <SUP> $j </SUP> <TT> (.*?) </TT> </P>\|\1\|" )"`
			`echo MEANING:`
			`echo "$meaning"`
			`echo DONE`
			`[ -f categoriestmp ] && rm categoriestmp`
			`touch categoriestmp`
			`[ -f backofcardtmp ] && rm backofcardtmp`
			`touch backofcardtmp`
			`printout=false`

			`while IFS= read -r line; do`
			`if [ $printout = true ]; then`
			`if echo "$line" \| grep -q "<P> <SUP> $(( j + 1 )) </SUP>" ; then`
			`printout=false`
			`else`
			`echo "$line" >> backofcardtmp`
			`if echo "$line" \| grep -q '<P> <U>'; then`
			`echo "$line" \| perl -pe "s\|<P> <U> (.?) </U>(.?)$\|<P> <U> \1 </U> </P>\|" >> categoriestmp`
			`fi`
			`fi`
			`fi`
			`if echo "$line" \| grep -q "SUP"; then`
			`if echo "$line" \| grep -q "<P> <SUP> $j </SUP>"; then`
			`printout=true`
			`fi`
			`fi`
			`done <<< $(echo "$meaning_clean_func")`
			`echo BACKSIDE:`
			`cat backofcardtmp`
			`echo CATEGORIES`
			`cat categoriestmp`
			`echo DONE`
			`categories="$(cat categoriestmp \| tr '\n' ' ')"`
			`backside="$(cat backofcardtmp \| tr '\n' ' ' )"`
			`printf "%s.%s;\"%s\";\"%s\";\"%s\";\"%s\";\"%s\"\n" "$no_func" "$j" "$word_func" "$meaning" "$categories" "$backside" "$pof_func" >> cards.csv`
			`done`
			`else`
			`echo "Found only one meaning for the word $word_func ($pof_func)"`
			`meaning="" #ozdic only provides meaning if there are different ones and the collocations for those different meanings differ.`
			`[ -f categoriestmp ] && rm categoriestmp`
			`touch categoriestmp`
			`while IFS= read -r line; do`
			`#echo current line:`
			`#echo "$line"`
			`echo "$line" \| grep "<P> <U> " \| perl -pe "s\|<P> <U> (.?) </U>(.?)$\|<P> <U> \1 </U> </P>\|" >> categoriestmp`
			`done <<< $(echo "$meaning_clean")`
massively reduced unnecessary html in fields 2020-09-21 22:21:02 +02:00			`backside="$( echo "$meaning_clean_func" \| grep "<P> <U> " \| grep -vF '<DIV class="item"><P class="word"><B>'\| tr '\n' ' ')"`
updated script for better card layout (categories on front) 2020-09-21 11:44:44 +02:00			`#cat categoriestmp`
			`categories="$(cat categoriestmp \| tr '\n' ' ' )"`
			`printf "%s;\"%s\";\"%s\";\"%s\";\"%s\";\"%s\"\n" "$no_func" "$word_func" "$meaning" "$categories" "$backside" "$pof_func" >> cards.csv`

			`fi`
			`#printf "%s;\"%s\";\"%s\"\n" "$no" "$word" "$meaning_clean" >> cards.csv`
			`}`



added script used to generate cards 2020-09-18 15:05:47 +02:00			`cat avl.csv \| while read line \|\| [[ -n $line ]];`
			`do`
			`no="$(echo "$line" \| awk 'BEGIN{FS = "\t" } {print $1}' )"`
			`word="$(echo "$line" \| awk 'BEGIN{FS = "\t" } {print $2}' )"`
			`printf "card:%s\t(%s)\n" "$no" "$word"`
updated script for better card layout (categories on front) 2020-09-21 11:44:44 +02:00			`meaning_clean="$(curl --no-progress-meter http://www.ozdic.com/collocation-dictionary/"$word" \| sed '6,35d; 38,48d' \| tac \| sed '6,30d' \| tac \| tr '\t' ' ' \| tr -d '\r')"`
			`#echo MEANINGCLEAN1:`
			`#echo "$meaning_clean"`
			`#echo DONE`
			`echo "Checking whether there are multiple parts of speech..."`
			`pof_count="$(echo "$meaning_clean" \| grep -c "<P class=\"word\">" )"`
			`if [ $pof_count -gt 1 ]; then`
			`echo "$pof_count parts of speech found!"`
			`# seperate into multiple "meaning_clean"`
			`pofs="$( echo "$meaning_clean" \| grep "<DIV class=\"item\"><P class=\"word\"><B>" \| perl -pe "s\|^.?<DIV class=\"item\"><P class=\"word\"><B>[a-zA-Z]? </B><I>([a-zA-Z\.]*?) </I> </P>\|\1\|g")"`
			`#echo "pofs:"`
			`#echo "$pofs"`
			`pofs="$(printf "%s\nscript" "$pofs" )"`
			`for i in {1..$pof_count}; do`
			`current_pof=$( echo "$pofs" \| sed "${i}q;d") #\| sed 's/\./\\./g')`
			`next_pof=$(echo "$pofs" \| sed "$((i+1))q;d") #\| sed 's/\./\\./g')`
			`echo current_pof: $current_pof`
			`echo next_pof: $next_pof`
			`#start="<DIV class=\"item\"><P class=\"word\"><B>$word <\/B><I>$current_pof <\/I> <\/P>"`
			`start="$( printf '<DIV class="item"><P class="word"><B>%s </B><I>%s </I> </P>\n' "$word" "$current_pof" )"`
			`if [ $i -eq $pof_count ]; then`
			`stop="$next_pof"`
			`#echo MEANINGCLEAN_LASTPOF:`
			`#echo "$meaning_clean"`
			`#echo DONE`
			`else`
			`stop="$( printf '<DIV class="item"><P class="word"><B>%s </B><I>%s </I> </P>\n' "$word" "$next_pof" )"`
			`fi`
			`meaning_clean_temp="$( echo "$meaning_clean" \| sed 's,'"$start"',START,')"`
			`meaning_clean_temp="$( echo "$meaning_clean_temp" \| sed 's,'"$stop"',STOP,' )"`
			`echo MEANINGCLEAN_TEMP:`
			`echo "$meaning_clean_temp"`
			`echo DONE`
			`pof_meaning_clean="$(echo "$meaning_clean_temp" \| sed -n '/START/,/STOP/ { /START/n; /STOP/!p }' )"`
			`echo POFMEANINGCLEAN`
			`echo "$pof_meaning_clean"`
			`echo DONE`

			`notemp="$no"'.'"$i"`
			`handle_cleaned_html "$pof_meaning_clean" "$notemp" "$word" "$current_pof"`
			`done`

			`else`
			`echo "only one part of speech found"`
			`pof="$( echo "$meaning_clean" \| grep "<DIV class=\"item\"><P class=\"word\"><B>" \| perl -pe "s\|^.?<DIV class=\"item\"><P class=\"word\"><B>[a-zA-Z]? </B><I>([a-zA-Z\.]*?) </I> </P>\|\1\|g")"`
			`echo POF:"$pof"`
			`handle_cleaned_html "$meaning_clean" "$no" "$word" "$pof"`
			`fi`

added script used to generate cards 2020-09-18 15:05:47 +02:00			`done`
updated script for better card layout (categories on front) 2020-09-21 11:44:44 +02:00
			`rm backofcardtmp categoriestmp`