From ae8f1c812ba247e6e7088599190e4787edfd7b18 Mon Sep 17 00:00:00 2001
From: Nicolas Mailhot <nim@fedoraproject.org>
Date: Jun 16 2009 22:40:10 +0000
Subject:  Add a quick & dirty script to audit font packaging in a repository


---

diff --git a/bin/repo-font-audit b/bin/repo-font-audit
new file mode 100755
index 0000000..725ecdd
--- /dev/null
+++ b/bin/repo-font-audit
@@ -0,0 +1,503 @@
+#!/bin/bash
+# Quick and dirty script to audit font repartition in a yum package repository
+#
+# It is slow, it is ugly, and it requires a good network connection
+
+# Function declarations
+
+usage() {
+/bin/cat >&2 << EOF_USAGE
+Usage: $0 ID URL1
+
+With:
+— ID:   identifier of the package repository to test
+— URL1: url of the package repository to test
+
+EOF_USAGE
+exit 1
+}
+
+
+
+# FIXME: only extracts info about the first typeface in a TTC file for now
+parse_localized_fc_query() {
+  field="$1"
+  file="$2"
+  fieldstring=$(awk -F ':' -v field="$field"  '$1 == "\t"field { print $2 ; exit }' "$file" \
+               | sed 's="(s)="=g' | sed 's=" *"=|=g'| sed 's= *" *==g')"|"
+  default=$(echo $fieldstring | awk -F "|" '{ print $1 }')
+  if $(grep -q "^"$'\t'$field"lang:" "$file") ; then
+    langstring=$(awk -F ':' -v field="$field" \
+                '$1 == "\t"field"lang" { print $2 ; exit }' "$file" \
+                | sed 's="(s)="=g' | sed 's=" *"=|=g'| sed 's= *" *==g')"|"
+    # Try to find the English label
+    while [ "$langstring" != "" -a \
+            "$(echo $langstring | awk -F '|' '{ print $1 }')" != "en" ] ; do
+      fieldstring=$(echo "$fieldstring" | sed 's+\([^|]*\)|\(.*\)+\2+g')
+      langstring=$(echo "$langstring" | sed 's+\([^|]*\)|\(.*\)+\2+g')
+    done
+    # We could hide problems by reporting the first label regardless of its
+    # language. But this is an audit script — we do not hide problems
+    echo "$fieldstring" |  awk -F "|" '{ print $1 }'
+    if [ "$fieldstring" == "" ] ; then  echo -ne "\b×" >&2 ; fi
+  else
+    echo $(echo $fieldstring | awk -F "|" '{ print $1 }')
+  fi
+}
+
+
+pretty_indent() {
+  fold -s -w $(($(tput cols) - 4)) \
+  | while read line ; do echo "    $line" ; done
+  echo ""
+}
+
+tally() {
+t_datafile=$1
+
+t_file=$(cat "$t_datafile" | wc -l)
+t_file_size=$(awk -F '|' '{ sum += $12 } END { print sum }' "$t_datafile")
+t_file_size=$((t_file_size/(1024*1024)))
+
+t_rpm=$(awk -F '|' '{ print $2 "-" $3 "." $4 }' "$t_datafile" | sort | uniq | wc -l)
+t_rpm_size=$(awk -F '|' '{ print $2 "-" $3 "." $4 "|" $5 }' "$t_datafile" | sort | uniq \
+            |awk -F '|' '{ sum += $2 } END { print sum }')
+t_rpm_size=$((t_rpm_size/(1024*1024)))
+
+t_srpm=$(awk -F '|' '{ print $1 }' "$t_datafile" | sort | uniq | wc -l)
+
+echo "$t_file|$t_rpm|$t_srpm|$t_file_size|$t_rpm_size"
+}
+
+summary() {
+
+tally "$1" | awk -F '|' '{ print "⇒ " $1 " files (" $4 " MiB) in " $2 \
+      " packages (" $5 " MiB) generated from " $3 " source packages." }' \
+      | pretty_indent
+}
+
+substats() {
+ss_datafile="$1"
+
+awk -F '|' '{ print $NF }' "$ss_datafile" | sort | uniq \
+  | while read key ; do
+    echo -n "$key|"
+    awk -F '|' -v key="$key" '$NF==key' "$ss_datafile" > tmp.ss.csv
+    tally tmp.ss.csv
+    rm tmp.ss.csv
+  done
+}
+
+stats() {
+s_datafile="$1"
+
+summary "$s_datafile"
+
+(
+echo "Format|Files|rpm|srpm|Files (MiB)|rpm (MiB)"
+awk -F '|' '$10 != "" { print $0 "|" $10 }' "$s_datafile" > tmp.s.csv
+substats tmp.s.csv) | column -t -s '|' | pretty_indent
+
+(echo "Format|Files|rpm|srpm|Files (MiB)|rpm (MiB)"
+awk -F '|' '{ print $0 "|" $4 }' "$s_datafile" > tmp.s.csv
+substats tmp.s.csv)| column -t -s '|' | pretty_indent
+
+rm tmp.s.csv
+}
+
+
+# End of function declarations
+
+[ "$#" -lt "2" ] && usage
+
+ID=$1
+REPOID="$1-fontrepo"
+REPOURL=$2
+TIMESTAMP=$(date -u +%Y-%m-%dT%H:%M:%S)
+TMPDIR=$(mktemp -d --tmpdir=/tmp font-package-audit-XXXXXXXXXX)
+
+FPL="$ID-$TIMESTAMP-font-packages.csv"
+PWFL="$ID-$TIMESTAMP-packages-with-fonts.csv"
+FFL="$ID-$TIMESTAMP-font-files.csv"
+CSL="$ID-$TIMESTAMP-checksums.csv"
+FL="$ID-$TIMESTAMP-repo-font-audit.csv"
+FLNM="$ID-$TIMESTAMP-repo-font-audit-no-multilib.csv"
+
+ORIGDIR="$PWD"
+cd $TMPDIR
+
+echo "Searching for packages with font metadata…"
+repoquery --repofrompath=$REPOID,$REPOURL --repoid=$REPOID \
+          --qf "%{sourcerpm}|%{name}|%{epoch}:%{version}-%{release}|%{arch}|%{packagesize}" \
+          --whatprovides "font(*)" 2>/dev/null | sort | uniq \
+          > "$FPL"
+
+echo "Searching for packages that include files with common font extensions…"
+repoquery --repofrompath=$REPOID,$REPOURL --repoid=$REPOID \
+          --qf "%{sourcerpm}|%{name}|%{epoch}:%{version}-%{release}|%{arch}|%{packagesize}" \
+          -f '*.ttf' -f '*.otf' -f '*.ttc' \
+          -f '*.pfb' -f '*.pfa' -f '*.pcf.gz' 2>/dev/null | sort | uniq \
+          > "$PWFL"
+
+echo "Inspecting packages:"
+rm -f "$FFL"
+mkdir "tmp"
+cd "tmp"
+cat ../$FPL ../$PWFL | awk -F '|' '{ print $2 "-" $3 "." $4 }' | sort | uniq \
+  | while read rpm ; do
+  echo -n " – $rpm"
+  mkdir "$rpm"
+  cd "$rpm"
+  echo -n " ◔"
+  wget --quiet -O "$rpm.rpm" $(repoquery --repofrompath=$REPOID,$REPOURL --repoid=$REPOID --location "$rpm" 2>/dev/null)
+  echo -ne "\b◑"
+  rpm2cpio "$rpm.rpm" > "$rpm.cpio"
+  echo -ne "\b◕"
+  cpio --quiet -it < "$rpm.cpio" \
+    | grep -iE '\.((ttf)|(ttc)|(otf)|(pfa)|(pfb)|(pcf)|(pcf\.gz))$' \
+    > "$rpm.lst"
+  cpio -idm --quiet -E "$rpm.lst" < "$rpm.cpio"
+  echo -ne "\b● "
+  cat "$rpm.lst" | while read file; do
+    unset target checksum type family style format
+    type=$(file -bzh "$file")
+    case $(echo "$type" | sed 's+ (\(.*\)++g' \
+                             | sed 's+ `\(.*\)++g' \
+                             | sed 's+,\(.*\)++g' \
+                             | sed 's+\( \)*$++g' ) in
+      "TrueType font data")
+        echo -n "t"
+        ;;
+      "TrueType font collection data")
+	echo -n "T"
+        ;;
+      "OpenType font data")
+	echo -n "o"
+        ;;
+      "PostScript Type 1 font text")
+        echo -n "P"
+        ;;
+      "PostScript Type 1 font program data")
+	echo -n "p"
+        ;;
+      "X11 Portable Compiled Font data")
+	echo -n "b"
+        ;;
+      "PostScript document text"|\
+      "PostScript document text conforming DSC level 3.0"|\
+      "PostScript document text conforming DSC level 3.0"|\
+      "8086 relocatable")
+        echo -n "x" >&2
+        ;;
+      "symbolic link to"|"broken symbolic link to")
+        target=$(readlink -m "$file" | sed "s+^$PWD++g")
+        if $(echo "$target" | grep -q "^/usr/share/fonts") ; then
+          type="Link"
+          echo -n "l"
+        else
+          type="ignored"
+          echo -n "-"
+        fi
+        ;;
+      *)
+        type="unknown"
+	echo -n "?"
+        ;;
+    esac
+    if [ "$type" != "unknown" -a "$type" != "ignored" ] ; then
+      size=$(du -b "$file" | awk '{ print $1 ; exit }')
+      if [ ! -h "$file" ] ; then
+        checksum=$(sha256sum "$file" | awk '{ print $1 ; exit }')
+        if $(fc-query "$file" 2> /dev/null > "$file.desc") ; then
+          family=$(parse_localized_fc_query family "$file.desc")
+          style=$(parse_localized_fc_query style "$file.desc")
+          format=$(parse_localized_fc_query fontformat "$file.desc")
+        else
+          echo -ne "\bX" >&2
+        fi
+      fi
+      file=$(echo "$file" | sed "s+^./+/+g")
+      echo "$rpm|$file|$family|$style|$format|$type|$size|$checksum|$target" >> "../../$FFL"
+    fi
+  done
+  cd ..
+  rm -fr "$rpm"
+  echo " ♻"
+done
+cd ..
+rm -fr tmp
+
+echo "Consolidating data…"
+rm -f "$FL"
+cat "$PWFL" | while read rpmline; do
+  grep -q "$rpmline" "$FPL" && metadata="M" || metadata=""
+  rpm=$(echo "$rpmline" | awk -F '|' '{ print $2 "-" $3 "." $4 ; exit }')
+  cat "$FFL" \
+     | awk -F '|' -v rpm="$rpm" '$1 == rpm { print $2 "|" $3 "|" $4 "|" $5 "|" $6 "|" $7 "|" $8 "|" $9 }' \
+     | while read fileline; do
+     if [ "$(echo $fileline| awk -F '|' '{ print $5 }')" == "Link" ] ; then
+       source="$(awk -F '|' -v target=""$(echo $fileline| awk -F '|' '{ print $8 }')"" \
+                  '$2 == target { print $1 ; exit }' ""$FFL"" )"
+     else source=""
+     fi
+     echo "$rpmline|$metadata|$fileline|$source" >> "$FL"
+     done
+done
+
+awk -F '|' '$13 != "" { print $1 "|" $2 "|" $7 "|" $13 }' "$FL" | sort | uniq \
+  | while read sig ; do
+    awk -F '|' -v sig="$sig" \
+        '($1 "|" $2 "|" $7 "|" $13) == sig { print $0 ; exit }' "$FL"
+  done > "$FLNM"
+
+echo ""
+echo "Statistics:"
+echo ""
+
+echo "– packages that declare font metadata:"
+echo ""
+
+awk -F '|' '$6=="M"' "$FL" > tmp.csv
+stats tmp.csv
+rm tmp.csv
+
+echo "☛ File size is computed as extracted, while rpm is a compressed \
+format." | pretty_indent
+echo "☛ Mid-term, files in legacy PCF or Type1 formats need to be converted \
+or removed." | pretty_indent
+
+echo "– font files in other packages (we should not find any!)"
+echo ""
+
+awk -F '|' '($6 != "M") && ($11 != "Link")' "$FL" > tmp.csv
+stats tmp.csv
+rm tmp.csv
+
+echo "☛ Bad packaging may result in arched packages or mixed content." \
+| pretty_indent
+
+echo ""
+echo "Problem report:"
+echo ""
+
+# Arch check
+
+echo "– packages that include fonts, but are not noarch:"
+
+awk -F '|' '($11 != "Link") && ($4 != "noarch")' "$FL" > tmp.csv
+
+awk -F '|' '{ print $2 "." $4 "|" $6 }' tmp.csv | sort | uniq \
+  | awk -F '|' '{ if ( $2 == "M" ) list=(list " [" $1 "]") ;
+                              else list=(list " "  $1    ) } END \
+                { print list }' | pretty_indent
+
+summary tmp.csv
+rm tmp.csv
+
+# Install location check
+
+echo "– packages that install fonts outside /usr/share/fonts:"
+
+awk -F '|' '($11 != "Link") && ($7 !~ /^\/usr\/share\/fonts\//)' "$FL" > tmp.csv
+
+awk -F '|' '{ print $2 }' tmp.csv | sort | uniq \
+  | awk -F '|' '{ list = list " " $1 } END { print list }' | pretty_indent
+
+summary tmp.csv
+rm tmp.csv
+
+echo "☛ Font files need to be installed under the /usr/share/fonts root for \
+fontconfig to expose them." | pretty_indent
+
+# Metadata check
+
+echo "– packages that include fonts, but do not declare font metadata:"
+
+awk -F '|' '($11 != "Link") && ($6 != "M")' "$FL" > tmp.csv
+
+awk -F '|' '{ print $2 }' tmp.csv | sort | uniq \
+  | awk -F '|' '{ list = list " " $1 } END { print list }' | pretty_indent
+
+summary tmp.csv
+rm tmp.csv
+
+echo "☛ Automatic font installation relies on this metadata being present \
+to work." | pretty_indent
+
+# Duplication checks
+
+echo "– exact file duplication (ignoring multilib):"
+echo "☛ Ignoring multilib to keep it short" | pretty_indent
+
+awk -F '|' '{ print $13 }' "$FLNM" | sort | uniq -d \
+  | while read checksum ; do
+    awk -F '|' -v checksum="$checksum" '$13==checksum' "$FLNM"
+done > tmp.csv
+
+awk -F '|' '{ print $13 }' tmp.csv | uniq \
+  | while read checksum ; do
+    awk -F '|' -v checksum="$checksum" '$13==checksum \
+        { if ( $6 == "M" ) print $7 "|[" $2 "." $4 "]|" "(" $1 ")" ;
+          else             print $7 "|"  $2 "." $4  "|" "(" $1 ")" }' \
+    tmp.csv | column -t -s '|' | pretty_indent
+  done
+
+summary tmp.csv
+rm tmp.csv
+
+echo "– font faces duplicated by different packages:"
+echo "☛ Excluding multilib and PCF fonts (because they are pretty much \
+hopeless)." | pretty_indent
+
+rm -f tmp.csv
+awk -F '|' '($8 != "") && ($9 != "") && ($10 != "PCF") && ($10 != "Type 1") \
+    { print $2 "-" $3 "." $4 "|" $8 "|" $9 }' "$FLNM" \
+    | sort | uniq | awk -F '|' '{ print $2 "|" $3 }' \
+    | sort | uniq -d | while read face ; do
+    awk -F '|' -v face="$face" \
+        '($10 != "PCF") && (($8 "|" $9)==face)' "$FLNM" > tmp1.csv
+    packages=$(awk -F '|' '{ if ( $6 == "M" ) print "[" $2 "]" ; \
+                             else print $2 }' tmp1.csv \
+               | sort | uniq | while read rpm ; do echo -n "$rpm " ; done)
+    count=$(awk -F '|' '{ print $2 "-" $3 "." $4 }' tmp1.csv \
+            | sort | uniq | wc -l)
+    cat tmp1.csv >> tmp.csv
+    echo "$count|$face|$packages"
+  done | sort -nr | column -t -s '|' | pretty_indent
+
+rm tmp1.csv
+summary tmp.csv
+rm tmp.csv
+
+echo "☛ Face duplication wastes resources \
+infrastructure and user side. Very often an upstream that copied some fonts \
+will forget to keep them up to date, and the duplication will result in the \
+distribution of old buggy data. Even if some duplicate font files are a \
+genuine fork with different features from the original, applications won't be \
+able to select them relyably because of naming collisions. We should alway \
+ship a single version of any font face in a dedicated font package, and use \
+fontconfig or symlinks to share it accross packages." | pretty_indent
+
+echo "– font faces duplicated within a package (ignoring legacy formats):"
+
+awk -F '|' '($8 != "") && ($9 != "") && ($10 != "PCF") && ($10 != "Type 1") \
+    { print $2 "-" $3 "." $4 "|" $8 "|" $9 }' "$FLNM" \
+  | sort | uniq -d | while read sig ; do
+    awk -F '|' -v sig="$sig" \
+        '($10 != "PCF") && ($10 != "Type 1") && \
+         (($2 "-" $3 "." $4 "|" $8 "|" $9 ) == sig)' \
+         "$FLNM" ;
+    done > tmp.csv
+
+awk -F '|' '{ print $2 "|" $8 "|" $9 "|" $7 }' tmp.csv \
+  | column -t -s '|' | pretty_indent
+
+summary tmp.csv
+rm tmp.csv
+
+echo "☛ Face duplication within a package is almost certainly a bug, except \
+for special symbol font families." | pretty_indent
+
+echo "— packages that mix several font families (ignoring legacy formats):"
+
+awk -F '|' '($8 != "") && ($10 != "PCF") && ($10 != "Type 1")' "$FL" > tmp.csv
+awk -F '|' '{ print $2 "-" $3 "." $4 }' tmp.csv | sort | uniq \
+  | while read rpm ; do
+    awk -F '|' -v rpm="$rpm" '(($2 "-" $3 "." $4) == rpm) \
+          { print $2 "|" $8 }' tmp.csv | sort | uniq \
+      | awk -F '|' '{ sum+=1 ; rpm=$1 } END { if (sum > 1) print sum " " rpm  }'
+  done | sort -nr | awk '{ list = list " " $2 " (" $1 ")" } END \
+                         { print list }' | pretty_indent
+
+rm tmp.csv
+
+echo "☛ Reliable font autoinstallation requires shipping only one font family \
+per font package. This indicates problems in the packaging or the packaged \
+font metadata." | pretty_indent
+
+# Symlink-related checks
+
+echo "– packages that symlink font files:"
+
+awk -F '|' '$11=="Link"' "$FL" > tmp.csv
+
+awk -F '|' '{ print $2 }' tmp.csv | sort | uniq \
+  | while read rpm ; do echo -n "$rpm " ; done | pretty_indent
+
+summary tmp.csv
+rm tmp.csv
+
+echo "    5 most symlinked packages:"
+awk -F '|' '$11=="Link" { print $15 "|" $2 "-" $3 "." $4 }' "$FL" | sort | uniq \
+| awk -F '|' '{ print $1 }' | uniq -c | sort -nr | head -5 | column -t | pretty_indent
+
+echo "☛ Symlinking font files is a way for non-font \
+packages to comply with guidelines and avoid duplicating files, but it is \
+also a symptom of missing or incomplete fontconfig support in the package. \
+Please ask upstream to use fontconfig (possibly, via a higher-level library \
+such as pangocairo)." | pretty_indent
+
+echo "– broken symlinks to font files:"
+
+awk -F '|' '($11 == "Link") && ($15 == "")' "$FL" > tmp.csv
+awk -F '|' '{ print $7 " → " $14 "|" $2 "-" $3 "." $4  }' tmp.csv \
+  | column -t -s "|" | pretty_indent
+
+summary tmp.csv
+rm tmp.csv
+
+# Magic and other parsing checks
+
+echo "– packages with font files not identified as such by libmagic:"
+
+awk -F '|' '($11 !~ /font/) && ($11 !~ /Font/) && ($11 != "Link")' "$FL" > tmp.csv
+
+awk -F '|' '{ print $2 "-" $3 "." $4 }' tmp.csv | uniq \
+  | while read rpm ; do
+    awk -F '|' -v rpm="$rpm" '($2 "-" $3 "." $4) == rpm \
+        { sum+=1 ; srpm =$1 } END \
+        { print sum "|" rpm "|(" srpm ")|" }' tmp.csv
+  done | sort -nr | column -t -s '|' | pretty_indent
+
+summary tmp.csv
+rm tmp.csv
+
+echo "☛ Either libmagic has a bug or the files are malformed and need to be \
+fixed or dumped." | pretty_indent
+
+echo "– packages with font files fc-query can not parse:"
+
+awk -F '|' '($11 != "Link") && ($10 == "")' "$FL" > tmp.csv
+
+awk -F '|' '{ print $2 "-" $3 "." $4 }' tmp.csv | uniq \
+  | while read rpm ; do
+    awk -F '|' -v rpm="$rpm" '($2 "-" $3 "." $4) == rpm \
+        { sum+=1 ; srpm =$1 } END \
+        { print sum "|" rpm "|(" srpm ")|" }' tmp.csv
+  done | sort -nr | column -t -s '|' | pretty_indent
+
+summary tmp.csv
+rm tmp.csv
+
+echo "☛ Either fontconfig has a bug or the files are malformed and need to be \
+fixed or dumped." | pretty_indent
+
+echo "– packages with localized metadata but no English variant:"
+
+awk -F '|' '($10 != "") && (($8 == "") || ($9 == ""))' "$FL" > tmp.csv
+awk -F '|' '{ print $7 "|" $2 "-" $3 "." $4 }' tmp.csv \
+  | column -t -s '|' | pretty_indent
+
+summary tmp.csv
+rm tmp.csv
+
+echo "☛ The font files need to be fixed to declare metadata in English too." \
+| pretty_indent
+
+
+bzip2 -9 "$FL"
+mv "$FL.bz2" "$ORIGDIR/"
+echo "Raw extracted data : $ORIGDIR/$FL.bz2"
+cd "$ORIGDIR"
+rm -fr "$TMPDIR"
+echo "♻"
diff --git a/changelog.txt b/changelog.txt
index ee7cdfe..e1e65ac 100644
--- a/changelog.txt
+++ b/changelog.txt
@@ -1,3 +1,4 @@
+1.22 - Add a quick & dirty script to audit font packaging in a repository
      - Fix %_font_pkg macro not to eat the following end of line in the spec
 1.21 - Split naming computation in a separate %_font_pkg_name macro and
        change its logic to handle more corner naming cases. Some fallout in