From 5115258961f3b622be8fff6993835a41443e4d19 Mon Sep 17 00:00:00 2001
From: Matthew Miller
Date: Jan 27 2022 23:15:39 +0000
Subject: wip. find the peak weeks.


---

diff --git a/TODO.md b/TODO.md
index 2f78dc7..a258cef 100644
--- a/TODO.md
+++ b/TODO.md
@@ -17,43 +17,7 @@
 * for the slicer, put the groups in their definitions in the config.toml
 
-* be smarter about which timeseries to make
-
-  * age already includes 0 and 1-4, so having separate ephemeral/persistent
-    views isn't useful
-
-  * age unstacked line chart isn't really useful -- skip.
-
-  * arch stacked chart isn't super useful either -- share is best, line is
-    ok.
-
-    few enough lines that we can probably put ephemeral and persistent
-    on same chart.
-
-  * variant stacked also isn't useful -- share is best, line ok
-
-    but ephemeral vs persistent is a nightmare!
-
-  * for release, all three are good (but maybe present ephemeral and
-    persistent on same line chart?)
-
-  * secondary timeline charts for variants:
-
-    * epel variants without centos linux (or rhel?)
-
-    * fedora variants with just:
-      * desktop variants
-      * server/cloud/iot variants
-      * labs (compneuro, design suite)
-      * the three above, grouped
-      * ostree vs non-ostree (summed!)
-
-    * the grouped one for arch
-
-  * So, that's:
-
-    * age over time — share and stacked (no special handling for ephemeral)
-    * arch over time — share and line (ephemeral on same chart?)
-    * variant over time — share and line (ephemeral separate charts)
-    * release — line, share, stacked (ephemeral on same charts?)
+* secondary timeline charts for variants:
 
   * variant variants!
     * epel without CentOS Linux
@@ -81,9 +45,6 @@
   point, not summed (because that's its most interesting!)
 
 * don't bother with ephemeral/persistent view (age view is enough)
 
-* something is messed up with the old waffle chart code. throw away, start
-  again
-
 * sanatize all values read from config.toml
 
 * useful waffle charts (show current week, maybe average last 2-4):
@@ -97,11 +58,6 @@
 * make animations by week of full [arch,variant,release]
   * maybe of the breakouts too?
 
-* Instead of a hard-coded thing in the plotter, generalize the
-  table and column-name-to-human-term code. Could also be used for formatting
-  "Mate-Compiz" and the like.
-
-
 * change the timeseries "hide" to collect small things into "other"
@@ -124,41 +80,17 @@
   old systems dropping out and being replaced by new ones. (In the latter
   case, we have _fewer_ ephemerial systems than we are currently guessing.)
 
-* clean up the in-triplicate writing for ephemeral, permanent, and all
-
-* add totals for the waffle charts
-
 * skip waffle charts that will never be interesting
 
-* figure out how to estimate chart time better
-
 * once we have more than a year of data, start Fedora chart at 2021-01-01,
   same as epel, because that initial growth curve is not really
   representative of anything but upgrades and all the initial data therefore
   skewed
 
-* add numeric labels to the waffle charts! ("1 square = nnn systems")
-
-* Add Rawhide as a separate table. Needs special handling because it's hard
-  to sort out development on a regular Fedora OS release vs actually running
-  Rawhide.
-
-* something to make colors consistent
-
-* Filtering out obviously ridiculous data should be done before
-  the "dicer" stage, because otherwise it balloons the dataset.
 
 * Change ./run.sh into a makefile, because old-school.
-
-* Rework it so temporary files go in tmpdirs and data goes in var or
-  something (configurable)
-
-
-* Related todo: with the by-release graphs, stop after the release is
-  no longer current.
-
-* Bonus: separate graphs for "which variants tend to persist after EOL"
+* Can we get anything interesting for "which variants tend to persist after EOL"?
 
 * import estimates from old data
@@ -177,6 +109,7 @@
 * fix the code in brotosaurus washer to merge '' to 'none' rather than just
   renaming (works now because there are no natural 'none' entries).
 
+  * map "unknown" to "generic"
 
 * instead of throwing away entries in the washing phase (especially those
diff --git a/brontosaurus-fight.sh b/brontosaurus-fight.sh
new file mode 100755
index 0000000..ba14da8
--- /dev/null
+++ b/brontosaurus-fight.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+#
+# Create a view which only shows the weeks where each release
+# is at its peak. If someone actually is Good At SQL, I would
+# not mind help making this more clear.
+
+sqlite3 db/bronto.db << EOF
+    DROP VIEW IF EXISTS peak;
+    CREATE VIEW peak AS
+        SELECT checkins.week,
+               checkins.dataset,
+               checkins.release,
+               checkins.variant,
+               checkins.arch,
+               checkins.age,
+               checkins.hits
+        FROM checkins
+        INNER JOIN
+            (SELECT week,dataset,release,max(hits)
+             FROM (SELECT week,dataset,release,sum(hits) AS hits
+                   FROM checkins
+                   GROUP BY week,dataset,release
+                   ORDER BY week)
+             GROUP BY dataset,release) AS peaks
+        ON peaks.week = checkins.week AND peaks.release = checkins.release;
+EOF
diff --git a/brontosaurus-plotter.py b/brontosaurus-plotter.py
index c3aa3a8..5eb4401 100755
--- a/brontosaurus-plotter.py
+++ b/brontosaurus-plotter.py
@@ -156,17 +156,17 @@ def main():
     colormappings = defaultdict(OrderedDict)
 
     database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES)
-    cursor = database.cursor()
-
+    # cursor = database.cursor()
+    '''
     for timeseries in config['timeseries']:
         params = config['timeseries_defaults'].copy()
         params.update(timeseries)
 
         query = f"""SELECT week, {params['dataseries']}, SUM(hits) as hits
                     FROM checkins
-                    WHERE dataset=\"{params['dataset']}\"
+                    WHERE dataset =\"{params['dataset']}\"
                     {params['extraselect']}
-                    GROUP BY week,{params['dataseries']}
+                    GROUP BY week, {params['dataseries']}
                     ORDER BY week
                  """
         df = pd.read_sql_query(query, parse_dates='week', con=database)
@@ -178,6 +178,24 @@ def main():
                 dataframe=df.pivot(index='week',
                                    columns=params['dataseries'],
                                    values='hits').astype("Int64"),
             )
+    '''
+
+    # sorry about this.
+    # what it does is: find all the rows from the peak
+    # week for each release.
+    query = """SELECT checkins.week,checkins.dataset,checkins.release,checkins.variant,checkins.arch,checkins.age,checkins.hits from checkins INNER JOIN
+                 (SELECT week,dataset,release,max(hits)
+                  FROM (SELECT week,dataset,release,sum(hits) AS hits
+                        FROM checkins
+                        GROUP BY week,dataset,release
+                        ORDER BY week)
+                  GROUP BY dataset,release) AS peaks
+               ON peaks.week = checkins.week AND peaks.release = checkins.release
+            """
+
+    at_peak = pd.read_sql_query(query, parse_dates='week', con=database)
+    pd.set_option('display.max_rows', len(at_peak))
+    print(at_peak)
 
 
 if __name__ == "__main__":
diff --git a/run.sh b/run.sh
index 71512cf..41cfa7d 100755
--- a/run.sh
+++ b/run.sh
@@ -62,6 +62,14 @@ echo -n "* Scrubbing off the dirt... "
     fi
 echo " shiny!"
 
+echo -n "* Finding the strongest... "
+    ./brontosaurus-fight.sh
+    if [[ $? != 0 ]]; then
+        echo "! Oops."
+        exit 1
+    fi
+echo " rarrhhhhr!"
+
 echo -n "* Sorting the eggs... "
     ./brontosaurus-egg-sorter.py
     if [[ $? != 0 ]]; then
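
A note on the peak-week SQL: the comment at the top of brontosaurus-fight.sh
asks for help making the query clearer. One possible rewrite, sketched below
and untested against the real db/bronto.db, uses a window function instead of
the nested GROUP BY subqueries; SQLite has supported ROW_NUMBER() since 3.25.
Two assumptions in the sketch: the checkins columns are exactly those used in
the patch (week, dataset, release, variant, arch, age, hits), and the join
also matches on dataset, which the current ON clause does not, so check which
behavior is intended before borrowing it.

    DROP VIEW IF EXISTS peak;
    CREATE VIEW peak AS
        WITH weekly AS (
            -- total hits per (dataset, release) for each week
            SELECT week, dataset, release, SUM(hits) AS total_hits
            FROM checkins
            GROUP BY week, dataset, release
        ),
        ranked AS (
            -- rank each release's weeks, busiest week first
            SELECT week, dataset, release,
                   ROW_NUMBER() OVER (
                       PARTITION BY dataset, release
                       ORDER BY total_hits DESC
                   ) AS rn
            FROM weekly
        )
        -- keep only the checkin rows from each release's single peak week
        SELECT c.week, c.dataset, c.release, c.variant, c.arch, c.age, c.hits
        FROM checkins AS c
        JOIN ranked AS r
          ON r.week = c.week
         AND r.dataset = c.dataset
         AND r.release = c.release
        WHERE r.rn = 1;

The same WITH ... SELECT (without the DROP VIEW / CREATE VIEW wrapper) could
also stand in for the hard-coded query string in brontosaurus-plotter.py,
since pd.read_sql_query() will run any statement that returns rows.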