fix: use python3 for emoji codepoint extraction in font subsetting

grep -oP '.' splits multi-byte emoji into individual bytes when the locale is not UTF-8 (e.g. CI runners with LANG=C), causing pyftsubset to fail on invalid codepoints.
2026-02-17 16:05:51 +01:00
parent eeb3030186
commit 716c6cc6e6
1 changed file with 4 additions and 11 deletions
--- a/scripts/subset-emoji-font.sh
+++ b/scripts/subset-emoji-font.sh
@@ -29,18 +29,11 @@ fi
 EMOJIS="☀✝❄🌷🍂🎄🐇🍽🥫🛒🛍🚆⚡🎉🤝💸❤🖤✅❌🚀⚠✨🔄📋🖼📖🤖🌐🔐🔍🚫"
 # ────────────────────────────────────────────────────────────────────
-# Build Unicode codepoint list from the emoji string
+# Build Unicode codepoint list from the emoji string (Python for reliable Unicode handling)
-UNICODES=""
+UNICODES=$(python3 -c "print(','.join(f'U+{ord(c):04X}' for c in '$EMOJIS'))")
-for char in $(echo "$EMOJIS" | grep -oP '.'); do
+GLYPH_COUNT=$(python3 -c "print(len('$EMOJIS'))")
-	code=$(printf 'U+%04X' "'$char")
-	if [ -n "$UNICODES" ]; then
-		UNICODES="$UNICODES,$code"
-	else
-		UNICODES="$code"
-	fi
-done
-echo "Subsetting NotoColorEmoji with $(echo "$EMOJIS" | grep -oP '.' | wc -l) glyphs..."
+echo "Subsetting NotoColorEmoji with $GLYPH_COUNT glyphs..."
 # Subset to TTF
 pyftsubset "$SRC_FONT" \