#!/bin/zsh # This script is just the initial step in porting the avl data to anki # It does not work 100% reliable, but it's good enough for me # Manual adjustments on the exported data should be expected # This script is rather ugly in its Implementation [ -f cards.csv ] && rm cards.csv touch cards.csv handle_cleaned_html(){ ## Handling of multiple meanings meaning_clean_func="$1" no_func="$2" word_func="$3" pof_func="$4" if echo "$meaning_clean_func" | grep -q "SUP"; then count_func="$(echo "$meaning_clean_func" | grep -c "SUP")" echo "Found $count_func multiple meanings for the word $word_func ($pof_func)" for j in {1..$count_func}; do meaning="$( echo "$meaning_clean_func" | grep " $j" | perl -pe "s|

$j (.*?)

|\1|" )" echo MEANING: echo "$meaning" echo DONE [ -f categoriestmp ] && rm categoriestmp touch categoriestmp [ -f backofcardtmp ] && rm backofcardtmp touch backofcardtmp printout=false while IFS= read -r line; do if [ $printout = true ]; then if echo "$line" | grep -q "

$(( j + 1 )) " ; then printout=false else echo "$line" >> backofcardtmp if echo "$line" | grep -q '

'; then echo "$line" | perl -pe "s|

(.*?) (.*?)$|

\1

|" >> categoriestmp fi fi fi if echo "$line" | grep -q "SUP"; then if echo "$line" | grep -q "

$j "; then printout=true fi fi done <<< $(echo "$meaning_clean_func") echo BACKSIDE: cat backofcardtmp echo CATEGORIES cat categoriestmp echo DONE categories="$(cat categoriestmp | tr '\n' ' ')" backside="$(cat backofcardtmp | tr '\n' ' ' )" printf "%s.%s;\"%s\";\"%s\";\"%s\";\"%s\";\"%s\"\n" "$no_func" "$j" "$word_func" "$meaning" "$categories" "$backside" "$pof_func" >> cards.csv done else echo "Found only one meaning for the word $word_func ($pof_func)" meaning="" #ozdic only provides meaning if there are different ones and the collocations for those different meanings differ. [ -f categoriestmp ] && rm categoriestmp touch categoriestmp while IFS= read -r line; do #echo current line: #echo "$line" echo "$line" | grep "

" | perl -pe "s|

(.*?) (.*?)$|

\1

|" >> categoriestmp done <<< $(echo "$meaning_clean") backside="$( echo "$meaning_clean_func" | grep "

" | grep -vF '

'| tr '\n' ' ')" #cat categoriestmp categories="$(cat categoriestmp | tr '\n' ' ' )" printf "%s;\"%s\";\"%s\";\"%s\";\"%s\";\"%s\"\n" "$no_func" "$word_func" "$meaning" "$categories" "$backside" "$pof_func" >> cards.csv fi #printf "%s;\"%s\";\"%s\"\n" "$no" "$word" "$meaning_clean" >> cards.csv } cat avl.csv | while read line || [[ -n $line ]]; do no="$(echo "$line" | awk 'BEGIN{FS = "\t" } {print $1}' )" word="$(echo "$line" | awk 'BEGIN{FS = "\t" } {print $2}' )" printf "card:%s\t(%s)\n" "$no" "$word" meaning_clean="$(curl --no-progress-meter http://www.ozdic.com/collocation-dictionary/"$word" | sed '6,35d; 38,48d' | tac | sed '6,30d' | tac | tr '\t' ' ' | tr -d '\r')" #echo MEANINGCLEAN1: #echo "$meaning_clean" #echo DONE echo "Checking whether there are multiple parts of speech..." pof_count="$(echo "$meaning_clean" | grep -c "

" )" if [ $pof_count -gt 1 ]; then echo "$pof_count parts of speech found!" # seperate into multiple "meaning_clean" pofs="$( echo "$meaning_clean" | grep "

" | perl -pe "s|^.*?

[a-zA-Z]*? ([a-zA-Z\.]*?)

|\1|g")" #echo "pofs:" #echo "$pofs" pofs="$(printf "%s\nscript" "$pofs" )" for i in {1..$pof_count}; do current_pof=$( echo "$pofs" | sed "${i}q;d") #| sed 's/\./\\./g') next_pof=$(echo "$pofs" | sed "$((i+1))q;d") #| sed 's/\./\\./g') echo current_pof: $current_pof echo next_pof: $next_pof #start="

$word <\/B>$current_pof <\/I> <\/P>" start="$( printf '

%s %s

\n' "$word" "$current_pof" )" if [ $i -eq $pof_count ]; then stop="$next_pof" #echo MEANINGCLEAN_LASTPOF: #echo "$meaning_clean" #echo DONE else stop="$( printf '

%s %s

\n' "$word" "$next_pof" )" fi meaning_clean_temp="$( echo "$meaning_clean" | sed 's,'"$start"',START,')" meaning_clean_temp="$( echo "$meaning_clean_temp" | sed 's,'"$stop"',STOP,' )" echo MEANINGCLEAN_TEMP: echo "$meaning_clean_temp" echo DONE pof_meaning_clean="$(echo "$meaning_clean_temp" | sed -n '/START/,/STOP/ { /START/n; /STOP/!p }' )" echo POFMEANINGCLEAN echo "$pof_meaning_clean" echo DONE notemp="$no"'.'"$i" handle_cleaned_html "$pof_meaning_clean" "$notemp" "$word" "$current_pof" done else echo "only one part of speech found" pof="$( echo "$meaning_clean" | grep "

" | perl -pe "s|^.*?

[a-zA-Z]*? ([a-zA-Z\.]*?)

|\1|g")" echo POF:"$pof" handle_cleaned_html "$meaning_clean" "$no" "$word" "$pof" fi done rm backofcardtmp categoriestmp