improved composites script to remove duplicates

This commit is contained in:
Alexander Bocken 2021-02-22 21:11:23 +01:00
parent 98b507245c
commit 5b62e93e11
Signed by: Alexander
GPG Key ID: 1D237BE83F9B05E8
5 changed files with 9042 additions and 0 deletions

3500
composites/composites Normal file

File diff suppressed because it is too large Load Diff

2092
composites/for_import Normal file

File diff suppressed because it is too large Load Diff

57
composites/formatter Executable file
View File

@ -0,0 +1,57 @@
#!/bin/bash
# Input files, resolved relative to the current working directory.
# kanji_ref:     searched for lines starting with "<number><TAB>"; field 2 is taken as the kanji.
# composite_ref: searched for lines of the form "L-<1-4 digits><TAB><kanji>...".
kanji_ref='Kanji.txt'
composite_ref='composites'
# Print the number of '|' characters contained in the first argument.
# Arguments: $1 - string to inspect (may be empty)
# Outputs:   the count as a decimal number on stdout
countpipes(){
  # Keep the scratch variables local so a direct call cannot clobber
  # same-named variables of the caller.
  local var="$1"
  # Delete every character that is not a pipe; the remaining length is the count.
  local res="${var//[^|]/}"
  printf '%s\n' "${#res}"
}
# Print column $2 (1-based, tab-separated) of the rows in $1, joined with '|'.
_join_column() {
  printf '%s\n' "$1" | cut -f"$2" | paste -sd'|' -
}

# For every kanji number 1..2200: look the kanji up in $kanji_ref, collect its
# rows from $composite_ref and print one tab-separated row
#   <number> <field 3 values> <field 4 values> <field 5 values> <field 6 values>
# where the values of each column are joined with '|'.
# Kanji without any composite row are skipped silently.
# NOTE(review): the formatting diagnostics below go to stdout together with the
# data rows, exactly as before; only the new "no kanji" notice goes to stderr.
for i in {1..2200}; do
  kanji="$(grep -P "^$i\t" "$kanji_ref" | cut -f2)"
  # With an empty kanji the pattern below would shrink to "^L-\d{1,4}\t" and
  # match every line of the composites file, so skip such numbers.
  if [ -z "$kanji" ]; then
    echo "no kanji found for no. $i in $kanji_ref" >&2
    continue
  fi

  regexp_composites="^L-\d{1,4}\t$kanji"
  # Drop duplicate rows (same fields 3-6) BEFORE splitting into columns.
  # De-duplicating each column on its own lets the columns drift out of sync
  # as soon as a single value repeats.
  composites_line="$(grep -P "$regexp_composites" "$composite_ref" \
    | awk -F'\t' '!seen[$3 FS $4 FS $5 FS $6]++')"
  #echo "$composites_line"

  ON_readings="$(_join_column "$composites_line" 3)"
  if [ -z "$ON_readings" ]; then
    #echo "No composites found for the kanji $kanji"
    continue
  fi
  composites="$(_join_column "$composites_line" 4)"
  composites_kana="$(_join_column "$composites_line" 5)"
  composites_meaning="$(_join_column "$composites_line" 6)"

  #CHECK FOR CORRECT FORMATTING
  if [ -z "$composites" ]; then
    echo "missing composites for no. $i"
  fi
  if [ -z "$composites_kana" ]; then
    echo "missing composites kana no. $i"
  fi
  # Square brackets in the kana column hint that a different field ended up there.
  if [[ "$composites_kana" == *'['* || "$composites_kana" == *']'* ]]; then
    echo "composite likely in composites_kana for no.$i"
    echo "composite_kana: $composites_kana"
  fi
  if [ -z "$composites_meaning" ]; then
    echo "missing composite meaning for $i"
  fi

  # All four columns must hold the same number of '|'-separated entries.
  n_on="$(countpipes "$ON_readings")"
  n_composites="$(countpipes "$composites")"
  n_kana="$(countpipes "$composites_kana")"
  n_meaning="$(countpipes "$composites_meaning")"
  if [ "$n_on" != "$n_composites" ] \
    || [ "$n_composites" != "$n_kana" ] \
    || [ "$n_kana" != "$n_meaning" ]; then
    echo "line-mismatch for kanji no.$i"
  fi

  #WANTED OUTPUT
  printf '%s\t%s\t%s\t%s\t%s\n' "$i" "$ON_readings" "$composites" "$composites_kana" "$composites_meaning"
  #sleep 0.2
done

3370
composites/readings_index Normal file

File diff suppressed because it is too large Load Diff

23
composites/script Executable file
View File

@ -0,0 +1,23 @@
#!/bin/bash
# For every reading index L-1 .. L-3218, combine the line of kanji_behalten.txt
# that ends in that index with the kanji carrying the index in Kanji.txt, and
# print one tab-separated row:
#   <reading index> <kanji from Kanji.txt> <kanji reading> <composite> <composite kana> <meaning>
# A "Kanji mismatch" line is printed when kanji_behalten.txt names a different
# kanji than Kanji.txt for the same index.
for i in {1..3218}; do
  reading_indexx="L-$i"

  # Fetch the matching line(s) once; every field below is cut from this copy
  # instead of re-scanning kanji_behalten.txt for each field.
  entry="$(grep "${reading_indexx}$" kanji_behalten.txt)"

  # First character of field 4 is the kanji according to kanji_behalten.txt.
  foreign_kanji="$(printf '%s\n' "$entry" | cut -f4 | grep -o '^.')"

  # In Kanji.txt the index is either followed by a comma or ends the line.
  personal_kanji="$(grep "${reading_indexx}," Kanji.txt | cut -f2)"
  if [ -z "$personal_kanji" ]; then
    personal_kanji="$(grep "${reading_indexx}$" Kanji.txt | cut -f2)"
  fi
  #echo "personal: $personal_kanji"
  #echo "foreign: $foreign_kanji"

  if [ -n "$foreign_kanji" ] && [ "$foreign_kanji" != "$personal_kanji" ]; then
    printf 'Kanji mismatch for %s:\tP:%s\tF:%s\n' "$reading_indexx" "$personal_kanji" "$foreign_kanji"
    #continue
  fi

  composite="$(printf '%s\n' "$entry" | cut -f1)"
  meaning_german="$(printf '%s\n' "$entry" | cut -f2)"

  # Field 3 without double quotes; if it starts with the composite, keep only
  # the text after the last space. The prefix is compared literally, so a
  # composite containing '/' or regex metacharacters cannot break the match.
  composite_kana="$(printf '%s\n' "$entry" | cut -f3 | tr -d '"')"
  if [[ "$composite_kana" == "$composite"*' '* ]]; then
    composite_kana="${composite_kana##* }"
  fi
  #echo "composite_kana: $composite_kana"

  # Replace everything up to and including the first [...] group of field 4
  # with the text inside the brackets.
  kanji_kana="$(printf '%s\n' "$entry" | cut -f4 | perl -pe 's/^.*?\[(.*?)\]/$1/')"

  printf '%s\t%s\t%s\t%s\t%s\t%s\n' "$reading_indexx" "$personal_kanji" "$kanji_kana" "$composite" "$composite_kana" "$meaning_german"
  #sleep 2
done;