#!/bin/bash
# repo-font-audit — quick and dirty script to audit font repartition in a yum
# package repository.
#
# It is slow, it is ugly, and it requires a good network connection.
#
# Provenance: bin/repo-font-audit as added by commit
# ae8f1c812ba247e6e7088599190e4787edfd7b18 (Nicolas Mailhot, Jun 16 2009),
# "Add a quick & dirty script to audit font packaging in a repository".
#
# Usage: repo-font-audit ID URL1
#
# External tools used: repoquery, wget, rpm2cpio, cpio, file, fc-query,
# sha256sum, du, readlink, column, fold, tput, bzip2.
#
# Data files (all '|'-separated, created in a private work directory):
#   $FPL / $PWFL  one line per package:
#                 1 srpm | 2 name | 3 epoch:version-release | 4 arch | 5 size
#   $FFL          one line per font file found in a package:
#                 1 rpm id | 2 file | 3 family | 4 style | 5 format
#                 | 6 libmagic type | 7 size | 8 sha256 | 9 symlink target
#   $FL / $FLNM   the consolidated table every report below is computed from:
#                 1-5 package line | 6 "M" when the package declares font
#                 metadata | 7 file | 8 family | 9 style | 10 format | 11 type
#                 | 12 size | 13 sha256 | 14 symlink target
#                 | 15 rpm id providing the symlink target
#                 $FLNM keeps a single row per (srpm, name, file, checksum).

# Function declarations

# Print the command-line help on stderr and exit with status 1.
usage() {
  /bin/cat >&2 << EOF_USAGE
Usage: $0 ID URL1

With:
— ID: identifier of the package repository to test
— URL1: url of the package repository to test

EOF_USAGE
  exit 1
}

# Print the English value of a localized field of an fc-query dump.
#   $1  field name as printed by fc-query (family, style, fontformat)
#   $2  file holding the fc-query output
# When the dump has a matching "<field>lang" line, the value sitting at the
# position of the "en" language tag is printed; an empty line is printed, and
# a "×" marker sent to stderr, when there is no English label. Without a
# "<field>lang" line the first value is printed.
# FIXME: only extracts info about the first typeface in a TTC file for now
parse_localized_fc_query() {
  local field="$1"
  local file="$2"
  local fieldstring langstring

  # Turn '"a"(s) "b"(s)' into 'a|b|'.
  fieldstring=$(awk -F ':' -v field="$field" \
      '$1 == "\t"field { print $2 ; exit }' "$file" \
    | sed -e 's="(s)="=g' -e 's=" *"=|=g' -e 's= *" *==g')"|"

  if grep -q "^"$'\t'"${field}lang:" "$file" ; then
    langstring=$(awk -F ':' -v field="$field" \
        '$1 == "\t"field"lang" { print $2 ; exit }' "$file" \
      | sed -e 's="(s)="=g' -e 's=" *"=|=g' -e 's= *" *==g')"|"
    # Try to find the English label: drop the leading element of both lists
    # in lockstep until the language list starts with "en" or is exhausted.
    while [[ -n "$langstring" && "${langstring%%|*}" != "en" ]] ; do
      fieldstring=${fieldstring#*|}
      langstring=${langstring#*|}
    done
    # We could hide problems by reporting the first label regardless of its
    # language. But this is an audit script — we do not hide problems
    printf '%s\n' "${fieldstring%%|*}"
    if [[ -z "$fieldstring" ]] ; then echo -ne "\b×" >&2 ; fi
  else
    # Quoted on purpose: the value must not be word-split or glob-expanded.
    printf '%s\n' "${fieldstring%%|*}"
  fi
}

# Wrap stdin to the terminal width minus 4 columns, prefix every line with a
# space, and finish with an empty line.
# NOTE(review): the fold width reserves 4 columns while the visible prefix is
# a single space — confirm the intended indent width.
pretty_indent() {
  local cols line

  # stderr is left untouched so tput can still use it to find the terminal.
  cols=$(tput cols)
  # Without a usable terminal description tput prints nothing; fall back to
  # 80 columns instead of handing fold a negative width.
  if [[ ! "$cols" =~ ^[0-9]+$ ]] || (( cols <= 4 )) ; then
    cols=80
  fi

  fold -s -w "$(( cols - 4 ))" \
    | while read -r line ; do echo " $line" ; done
  echo ""
}

# Compute totals for a consolidated data file ($1).
# Prints "files|rpms|srpms|file size (MiB)|rpm size (MiB)" on stdout.
tally() {
  local t_datafile=$1
  local t_file t_file_size t_rpm t_rpm_size t_srpm

  t_file=$(wc -l < "$t_datafile")
  t_file=${t_file:-0}
  # %.0f always yields a plain integer, "0" included when the file is empty.
  t_file_size=$(awk -F '|' '{ sum += $12 } END { printf "%.0f\n", sum }' \
    "$t_datafile")
  t_file_size=$(( t_file_size / (1024 * 1024) ))

  t_rpm=$(awk -F '|' '{ print $2 "-" $3 "." $4 }' "$t_datafile" \
    | sort | uniq | wc -l)
  # Each package size is counted once, however many files it contributes.
  t_rpm_size=$(awk -F '|' '{ print $2 "-" $3 "." $4 "|" $5 }' "$t_datafile" \
    | sort | uniq \
    | awk -F '|' '{ sum += $2 } END { printf "%.0f\n", sum }')
  t_rpm_size=$(( t_rpm_size / (1024 * 1024) ))

  t_srpm=$(awk -F '|' '{ print $1 }' "$t_datafile" | sort | uniq | wc -l)

  echo "$t_file|$t_rpm|$t_srpm|$t_file_size|$t_rpm_size"
}

# Print a one-sentence, human-readable version of tally for data file $1.
summary() {
  tally "$1" | awk -F '|' '{ print "⇒ " $1 " files (" $4 " MiB) in " $2 \
    " packages (" $5 " MiB) generated from " $3 " source packages." }' \
    | pretty_indent
}

# Print one "key|<tally>" line per distinct value of the last field of $1.
# Uses tmp.ss.csv in the current directory as scratch space.
substats() {
  local ss_datafile="$1"
  local key

  awk -F '|' '{ print $NF }' "$ss_datafile" | sort | uniq \
    | while IFS= read -r key ; do
      echo -n "$key|"
      awk -F '|' -v key="$key" '$NF==key' "$ss_datafile" > tmp.ss.csv
      tally tmp.ss.csv
      rm tmp.ss.csv
    done
}

# Print the summary of data file $1, then its breakdown by font format
# (field 10) and by package architecture (field 4).
# Uses tmp.s.csv in the current directory as scratch space.
stats() {
  local s_datafile="$1"

  summary "$s_datafile"

  (
    echo "Format|Files|rpm|srpm|Files (MiB)|rpm (MiB)"
    awk -F '|' '$10 != "" { print $0 "|" $10 }' "$s_datafile" > tmp.s.csv
    substats tmp.s.csv
  ) | column -t -s '|' | pretty_indent

  (
    # This table is keyed on field 4, the package architecture.
    echo "Arch|Files|rpm|srpm|Files (MiB)|rpm (MiB)"
    awk -F '|' '{ print $0 "|" $4 }' "$s_datafile" > tmp.s.csv
    substats tmp.s.csv
  ) | column -t -s '|' | pretty_indent

  rm tmp.s.csv
}

# Print the distinct package names (field 2) of data file $1 on one line.
list_package_names() {
  awk -F '|' '{ print $2 }' "$1" | sort | uniq \
    | awk -F '|' '{ list = list " " $1 } END { print list }' | pretty_indent
}

# Print "count|rpm id|(srpm)|" for every package of data file $1, highest
# count first. Rows of one package are expected to be adjacent in $1.
count_files_per_package() {
  local datafile="$1"
  local rpm

  awk -F '|' '{ print $2 "-" $3 "." $4 }' "$datafile" | uniq \
    | while IFS= read -r rpm ; do
      awk -F '|' -v rpm="$rpm" '($2 "-" $3 "." $4) == rpm \
        { sum+=1 ; srpm =$1 } END \
        { print sum "|" rpm "|(" srpm ")|" }' "$datafile"
    done | sort -nr | column -t -s '|' | pretty_indent
}

# Remove the private work directory; safe to call more than once.
cleanup() {
  cd "$ORIGDIR" 2> /dev/null || cd /
  if [[ -n "$WORKDIR" ]] ; then rm -fr -- "$WORKDIR" ; fi
}

# End of function declarations

(( $# >= 2 )) || usage

ID=$1
REPOID="$1-fontrepo"
REPOURL=$2
TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%S)
ORIGDIR="$PWD"

# The work directory is kept in WORKDIR rather than TMPDIR so the standard
# environment variable is not overwritten for child processes.
if ! WORKDIR=$(mktemp -d --tmpdir=/tmp font-package-audit-XXXXXXXXXX) ; then
  echo "Could not create a temporary work directory" >&2
  exit 1
fi
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

REPO_ARGS=(--repofrompath="$REPOID,$REPOURL" --repoid="$REPOID")
QUERY_FORMAT="%{sourcerpm}|%{name}|%{epoch}:%{version}-%{release}|%{arch}|%{packagesize}"

FPL="$ID-$TIMESTAMP-font-packages.csv"
PWFL="$ID-$TIMESTAMP-packages-with-fonts.csv"
FFL="$ID-$TIMESTAMP-font-files.csv"
CSL="$ID-$TIMESTAMP-checksums.csv"
FL="$ID-$TIMESTAMP-repo-font-audit.csv"
FLNM="$ID-$TIMESTAMP-repo-font-audit-no-multilib.csv"

cd "$WORKDIR" || exit 1

echo "Searching for packages with font metadata…"
repoquery "${REPO_ARGS[@]}" \
    --qf "$QUERY_FORMAT" \
    --whatprovides "font(*)" 2>/dev/null | sort | uniq \
  > "$FPL"

echo "Searching for packages that include files with common font extensions…"
repoquery "${REPO_ARGS[@]}" \
    --qf "$QUERY_FORMAT" \
    -f '*.ttf' -f '*.otf' -f '*.ttc' \
    -f '*.pfb' -f '*.pfa' -f '*.pcf.gz' 2>/dev/null | sort | uniq \
  > "$PWFL"

echo "Inspecting packages:"
# Start from an empty file list so later steps never hit a missing file.
: > "$FFL"
mkdir "tmp"
cd "tmp" || exit 1
cat "../$FPL" "../$PWFL" | awk -F '|' '{ print $2 "-" $3 "." $4 }' | sort | uniq \
  | while IFS= read -r rpm ; do
    echo -n " – $rpm"
    if ! mkdir "$rpm" || ! cd "$rpm" ; then
      echo -ne " ✗" >&2
      echo ""
      continue
    fi
    echo -n " ◔"
    location=$(repoquery "${REPO_ARGS[@]}" --location "$rpm" 2>/dev/null \
      | head -n 1)
    # Nothing can be inspected without the package: flag it and move on.
    if [[ -z "$location" ]] || ! wget --quiet -O "$rpm.rpm" "$location" ; then
      echo -ne "\b✗" >&2
      echo ""
      cd ..
      rm -fr -- "$rpm"
      continue
    fi
    echo -ne "\b◑"
    rpm2cpio "$rpm.rpm" > "$rpm.cpio"
    echo -ne "\b◕"
    cpio --quiet -it < "$rpm.cpio" \
      | grep -iE '\.((ttf)|(ttc)|(otf)|(pfa)|(pfb)|(pcf)|(pcf\.gz))$' \
      > "$rpm.lst"
    cpio -idm --quiet -E "$rpm.lst" < "$rpm.cpio"
    echo -ne "\b● "
    # Canonical extraction root: readlink -m below returns canonical paths.
    pkgroot=$(pwd -P)
    while IFS= read -r file ; do
      unset target checksum type family style format
      type=$(file -bzh "$file")
      # Keep only the leading description of the libmagic answer.
      magic=$(printf '%s\n' "$type" \
        | sed -e 's+ (\(.*\)++g' \
              -e 's+ `\(.*\)++g' \
              -e 's+,\(.*\)++g' \
              -e 's+\( \)*$++g')
      case "$magic" in
        "TrueType font data")
          echo -n "t"
          ;;
        "TrueType font collection data")
          echo -n "T"
          ;;
        "OpenType font data")
          echo -n "o"
          ;;
        "PostScript Type 1 font text")
          echo -n "P"
          ;;
        "PostScript Type 1 font program data")
          echo -n "p"
          ;;
        "X11 Portable Compiled Font data")
          echo -n "b"
          ;;
        "PostScript document text"|\
        "PostScript document text conforming DSC level 3.0"|\
        "8086 relocatable")
          echo -n "x" >&2
          ;;
        "symbolic link to"|"broken symbolic link to")
          # Express the link target relative to the package root. A literal
          # prefix strip is used because the path holds the package name,
          # which may contain '+' or other regex characters.
          target=$(readlink -m "$file")
          target=${target#"$pkgroot"}
          if [[ "$target" == /usr/share/fonts* ]] ; then
            type="Link"
            echo -n "l"
          else
            type="ignored"
            echo -n "-"
          fi
          ;;
        *)
          type="unknown"
          echo -n "?"
          ;;
      esac
      if [[ "$type" != "unknown" && "$type" != "ignored" ]] ; then
        size=$(du -b "$file" | awk '{ print $1 ; exit }')
        if [[ ! -h "$file" ]] ; then
          checksum=$(sha256sum "$file" | awk '{ print $1 ; exit }')
          if fc-query "$file" 2> /dev/null > "$file.desc" ; then
            family=$(parse_localized_fc_query family "$file.desc")
            style=$(parse_localized_fc_query style "$file.desc")
            format=$(parse_localized_fc_query fontformat "$file.desc")
          else
            echo -ne "\bX" >&2
          fi
        fi
        # cpio lists members as ./path: record them as absolute paths.
        if [[ "$file" == ./* ]] ; then file=${file#.} ; fi
        echo "$rpm|$file|$family|$style|$format|$type|$size|$checksum|$target" >> "../../$FFL"
      fi
    done < "$rpm.lst"
    cd ..
    rm -fr -- "$rpm"
    echo " ♻"
  done
cd ..
rm -fr tmp

echo "Consolidating data…"
: > "$FL"
while IFS= read -r rpmline ; do
  # Whole-line, fixed-string match: package lines are data, not patterns.
  if grep -qxF -- "$rpmline" "$FPL" ; then
    metadata="M"
  else
    metadata=""
  fi
  rpm=$(printf '%s\n' "$rpmline" \
    | awk -F '|' '{ print $2 "-" $3 "." $4 ; exit }')
  awk -F '|' -v rpm="$rpm" \
      '$1 == rpm { print $2 "|" $3 "|" $4 "|" $5 "|" $6 "|" $7 "|" $8 "|" $9 }' \
      "$FFL" \
    | while IFS= read -r fileline ; do
      linktype=$(printf '%s\n' "$fileline" | awk -F '|' '{ print $5 }')
      if [[ "$linktype" == "Link" ]] ; then
        # Look up which package ships the file the symlink points to.
        linktarget=$(printf '%s\n' "$fileline" | awk -F '|' '{ print $8 }')
        source=$(awk -F '|' -v target="$linktarget" \
          '$2 == target { print $1 ; exit }' "$FFL")
      else
        source=""
      fi
      echo "$rpmline|$metadata|$fileline|$source" >> "$FL"
    done
done < "$PWFL"

# Multilib filter: keep the first row of every (srpm, name, file, checksum).
awk -F '|' '$13 != "" { print $1 "|" $2 "|" $7 "|" $13 }' "$FL" | sort | uniq \
  | while IFS= read -r sig ; do
    awk -F '|' -v sig="$sig" \
      '($1 "|" $2 "|" $7 "|" $13) == sig { print $0 ; exit }' "$FL"
  done > "$FLNM"

echo ""
echo "Statistics:"
echo ""

echo "– packages that declare font metadata:"
echo ""

awk -F '|' '$6=="M"' "$FL" > tmp.csv
stats tmp.csv
rm tmp.csv

echo "☛ File size is computed as extracted, while rpm is a compressed \
format." | pretty_indent
echo "☛ Mid-term, files in legacy PCF or Type1 formats need to be converted \
or removed." | pretty_indent

echo "– font files in other packages (we should not find any!)"
echo ""

awk -F '|' '($6 != "M") && ($11 != "Link")' "$FL" > tmp.csv
stats tmp.csv
rm tmp.csv

echo "☛ Bad packaging may result in arched packages or mixed content." \
| pretty_indent

echo ""
echo "Problem report:"
echo ""

# Arch check

echo "– packages that include fonts, but are not noarch:"

awk -F '|' '($11 != "Link") && ($4 != "noarch")' "$FL" > tmp.csv

# Packages that declare font metadata are shown between brackets.
awk -F '|' '{ print $2 "." $4 "|" $6 }' tmp.csv | sort | uniq \
  | awk -F '|' '{ if ( $2 == "M" ) list=(list " [" $1 "]") ;
                  else list=(list " " $1 ) } END \
                { print list }' | pretty_indent

summary tmp.csv
rm tmp.csv

# Install location check

echo "– packages that install fonts outside /usr/share/fonts:"

awk -F '|' '($11 != "Link") && ($7 !~ /^\/usr\/share\/fonts\//)' "$FL" > tmp.csv

list_package_names tmp.csv

summary tmp.csv
rm tmp.csv

echo "☛ Font files need to be installed under the /usr/share/fonts root for \
fontconfig to expose them." | pretty_indent

# Metadata check

echo "– packages that include fonts, but do not declare font metadata:"

awk -F '|' '($11 != "Link") && ($6 != "M")' "$FL" > tmp.csv

list_package_names tmp.csv

summary tmp.csv
rm tmp.csv

echo "☛ Automatic font installation relies on this metadata being present \
to work." | pretty_indent

# Duplication checks

echo "– exact file duplication (ignoring multilib):"
echo "☛ Ignoring multilib to keep it short" | pretty_indent

awk -F '|' '{ print $13 }' "$FLNM" | sort | uniq -d \
  | while IFS= read -r checksum ; do
    awk -F '|' -v checksum="$checksum" '$13==checksum' "$FLNM"
  done > tmp.csv

# tmp.csv is grouped by checksum, so a plain uniq is enough here.
awk -F '|' '{ print $13 }' tmp.csv | uniq \
  | while IFS= read -r checksum ; do
    awk -F '|' -v checksum="$checksum" '$13==checksum \
      { if ( $6 == "M" ) print $7 "|[" $2 "." $4 "]|" "(" $1 ")" ;
        else print $7 "|" $2 "." $4 "|" "(" $1 ")" }' \
      tmp.csv | column -t -s '|' | pretty_indent
  done

summary tmp.csv
rm tmp.csv

echo "– font faces duplicated by different packages:"
echo "☛ Excluding multilib and PCF fonts (because they are pretty much \
hopeless)." | pretty_indent

# Both files must exist even when no duplicated face is found.
: > tmp.csv
: > tmp1.csv
# NOTE(review): faces are selected among non-PCF, non-"Type 1" files, while
# the rows gathered for each face only exclude PCF — confirm this is intended.
awk -F '|' '($8 != "") && ($9 != "") && ($10 != "PCF") && ($10 != "Type 1") \
  { print $2 "-" $3 "." $4 "|" $8 "|" $9 }' "$FLNM" \
  | sort | uniq | awk -F '|' '{ print $2 "|" $3 }' \
  | sort | uniq -d | while IFS= read -r face ; do
    awk -F '|' -v face="$face" \
      '($10 != "PCF") && (($8 "|" $9)==face)' "$FLNM" > tmp1.csv
    packages=$(awk -F '|' '{ if ( $6 == "M" ) print "[" $2 "]" ; \
                             else print $2 }' tmp1.csv \
      | sort | uniq | while IFS= read -r rpm ; do echo -n "$rpm " ; done)
    count=$(awk -F '|' '{ print $2 "-" $3 "." $4 }' tmp1.csv \
      | sort | uniq | wc -l)
    cat tmp1.csv >> tmp.csv
    echo "$count|$face|$packages"
  done | sort -nr | column -t -s '|' | pretty_indent

rm -f tmp1.csv
summary tmp.csv
rm tmp.csv

echo "☛ Face duplication wastes resources \
infrastructure and user side. Very often an upstream that copied some fonts \
will forget to keep them up to date, and the duplication will result in the \
distribution of old buggy data. Even if some duplicate font files are a \
genuine fork with different features from the original, applications won't be \
able to select them relyably because of naming collisions. We should alway \
ship a single version of any font face in a dedicated font package, and use \
fontconfig or symlinks to share it accross packages." | pretty_indent

echo "– font faces duplicated within a package (ignoring legacy formats):"

awk -F '|' '($8 != "") && ($9 != "") && ($10 != "PCF") && ($10 != "Type 1") \
  { print $2 "-" $3 "." $4 "|" $8 "|" $9 }' "$FLNM" \
  | sort | uniq -d | while IFS= read -r sig ; do
    awk -F '|' -v sig="$sig" \
      '($10 != "PCF") && ($10 != "Type 1") && \
       (($2 "-" $3 "." $4 "|" $8 "|" $9 ) == sig)' \
      "$FLNM" ;
  done > tmp.csv

awk -F '|' '{ print $2 "|" $8 "|" $9 "|" $7 }' tmp.csv \
  | column -t -s '|' | pretty_indent

summary tmp.csv
rm tmp.csv

echo "☛ Face duplication within a package is almost certainly a bug, except \
for special symbol font families." | pretty_indent

echo "— packages that mix several font families (ignoring legacy formats):"

awk -F '|' '($8 != "") && ($10 != "PCF") && ($10 != "Type 1")' "$FL" > tmp.csv
awk -F '|' '{ print $2 "-" $3 "." $4 }' tmp.csv | sort | uniq \
  | while IFS= read -r rpm ; do
    # Count the distinct families shipped by this package.
    awk -F '|' -v rpm="$rpm" '(($2 "-" $3 "." $4) == rpm) \
      { print $2 "|" $8 }' tmp.csv | sort | uniq \
      | awk -F '|' '{ sum+=1 ; rpm=$1 } END { if (sum > 1) print sum " " rpm }'
  done | sort -nr | awk '{ list = list " " $2 " (" $1 ")" } END \
    { print list }' | pretty_indent

rm tmp.csv

echo "☛ Reliable font autoinstallation requires shipping only one font family \
per font package. This indicates problems in the packaging or the packaged \
font metadata." | pretty_indent

# Symlink-related checks

echo "– packages that symlink font files:"

awk -F '|' '$11=="Link"' "$FL" > tmp.csv

awk -F '|' '{ print $2 }' tmp.csv | sort | uniq \
  | while IFS= read -r rpm ; do echo -n "$rpm " ; done | pretty_indent

summary tmp.csv
rm tmp.csv

echo " 5 most symlinked packages:"
awk -F '|' '$11=="Link" { print $15 "|" $2 "-" $3 "." $4 }' "$FL" | sort | uniq \
  | awk -F '|' '{ print $1 }' | uniq -c | sort -nr | head -5 | column -t | pretty_indent

echo "☛ Symlinking font files is a way for non-font \
packages to comply with guidelines and avoid duplicating files, but it is \
also a symptom of missing or incomplete fontconfig support in the package. \
Please ask upstream to use fontconfig (possibly, via a higher-level library \
such as pangocairo)." | pretty_indent

echo "– broken symlinks to font files:"

# A link is reported as broken when no inspected package ships its target.
awk -F '|' '($11 == "Link") && ($15 == "")' "$FL" > tmp.csv
awk -F '|' '{ print $7 " → " $14 "|" $2 "-" $3 "." $4 }' tmp.csv \
  | column -t -s "|" | pretty_indent

summary tmp.csv
rm tmp.csv

# Magic and other parsing checks

echo "– packages with font files not identified as such by libmagic:"

awk -F '|' '($11 !~ /font/) && ($11 !~ /Font/) && ($11 != "Link")' "$FL" > tmp.csv

count_files_per_package tmp.csv

summary tmp.csv
rm tmp.csv

echo "☛ Either libmagic has a bug or the files are malformed and need to be \
fixed or dumped." | pretty_indent

echo "– packages with font files fc-query can not parse:"

awk -F '|' '($11 != "Link") && ($10 == "")' "$FL" > tmp.csv

count_files_per_package tmp.csv

summary tmp.csv
rm tmp.csv

echo "☛ Either fontconfig has a bug or the files are malformed and need to be \
fixed or dumped." | pretty_indent

echo "– packages with localized metadata but no English variant:"

awk -F '|' '($10 != "") && (($8 == "") || ($9 == ""))' "$FL" > tmp.csv
awk -F '|' '{ print $7 "|" $2 "-" $3 "." $4 }' tmp.csv \
  | column -t -s '|' | pretty_indent

summary tmp.csv
rm tmp.csv

echo "☛ The font files need to be fixed to declare metadata in English too." \
| pretty_indent


bzip2 -9 "$FL"
mv "$FL.bz2" "$ORIGDIR/"
echo "Raw extracted data : $ORIGDIR/$FL.bz2"
cd "$ORIGDIR"
rm -fr -- "$WORKDIR"
echo "♻"