From 5115258961f3b622be8fff6993835a41443e4d19 Mon Sep 17 00:00:00 2001
From: Matthew Miller
Date: Jan 27 2022 23:15:39 +0000
Subject: wip. find the peak weeks.


---

diff --git a/TODO.md b/TODO.md
index 2f78dc7..a258cef 100644
--- a/TODO.md
+++ b/TODO.md
@@ -17,43 +17,7 @@
 * for the slicer, put the groups in their definitions in the config.toml
 
-* be smarter about which timeseries to make
-
-  * age already includes 0 and 1-4, so having separate ephemeral/persistent
-    views isn't useful
-
-  * age unstacked line chart isn't really useful -- skip.
-
-  * arch stacked chart isn't super useful either -- share is best, line is
-    ok.
-
-    few enough lines that we can probably put ephemeral and persistent
-    on same chart.
-
-  * variant stacked also isn't useful -- share is best, line ok
-
-    but ephemeral vs persistent is a nightmare!
-
-  * for release, all three are good (but maybe present ephemeral and
-    persistent on same line chart?)
-
-  * secondary timeline charts for variants:
-
-    * epel variants without centos linux (or rhel?)
-
-    * fedora variants with just:
-      * desktop variants
-      * server/cloud/iot variants
-      * labs (compneuro, design suite)
-      * the three above, grouped
-      * ostree vs non-ostree (summed!)
-
-    * the grouped one for arch
-
-  * So, that's:
-
-    * age over time — share and stacked (no special handling for ephemeral)
-    * arch over time — share and line (ephemeral on same chart?)
-    * variant over time — share and line (ephemeral separate charts)
-    * release — line, share, stacked (ephemeral on same charts?)
+* secondary timeline charts for variants:
 
   * variant variants!
     * epel without CentOS Linux
@@ -81,9 +45,6 @@
   point, not summed (because that's its most interesting!)
 
 * don't bother with ephemeral/persistent view (age view is enough)
 
-* something is messed up with the old waffle chart code. throw away, start
-  again
-
 * sanatize all values read from config.toml
 
 * useful waffle charts (show current week, maybe average last 2-4):
@@ -97,11 +58,6 @@
 * make animations by week of full [arch,variant,release]
   * maybe of the breakouts too?
 
-* Instead of a hard-coded thing in the plotter, generalize the
-  table and column-name-to-human-term code. Could also be used for formatting
-  "Mate-Compiz" and the like.
-
-
 * change the timeseries "hide" to collect small things into "other"
@@ -124,41 +80,17 @@
   old systems dropping out and being replaced by new ones. (In the latter
   case, we have _fewer_ ephemerial systems than we are currently guessing.)
 
-* clean up the in-triplicate writing for ephemeral, permanent, and all
-
-* add totals for the waffle charts
-
 * skip waffle charts that will never be interesting
 
-* figure out how to estimate chart time better
-
 * once we have more than a year of data, start Fedora chart at 2021-01-01,
   same as epel, because that initial growth curve is not really
   representative of anything but upgrades and all the initial data therefore
   skewed
 
-* add numeric labels to the waffle charts! ("1 square = nnn systems")
-
-* Add Rawhide as a separate table. Needs special handling because it's hard
-  to sort out development on a regular Fedora OS release vs actually running
-  Rawhide.
-
-* something to make colors consistent
-
-* Filtering out obviously ridiculous data should be done before
-  the "dicer" stage, because otherwise it balloons the dataset.
 
 * Change ./run.sh into a makefile, because old-school.
-
-* Rework it so temporary files go in tmpdirs and data goes in var or
-  something (configurable)
-
-
-* Related todo: with the by-release graphs, stop after the release is
-  no longer current.
-
-* Bonus: separate graphs for "which variants tend to persist after EOL"
+* Can we get anything interesting for "which variants tend to persist after EOL"?
 
 * import estimates from old data
@@ -177,6 +109,7 @@
 * fix the code in brotosaurus washer to merge '' to 'none' rather than just
   renaming (works now because there are no natural 'none' entries).
 
+  * map "unknown" to "generic"
 
 * instead of throwing away entries in the washing phase (especially those
diff --git a/brontosaurus-fight.sh b/brontosaurus-fight.sh
new file mode 100755
index 0000000..ba14da8
--- /dev/null
+++ b/brontosaurus-fight.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+#
+# Create a view which only shows the weeks where each release
+# is at its peak. If someone actually is Good At SQL, I would
+# not mind help making this more clear.
+
+sqlite3 db/bronto.db << EOF
+    DROP VIEW IF EXISTS peak;
+    CREATE VIEW peak AS
+        SELECT checkins.week,
+               checkins.dataset,
+               checkins.release,
+               checkins.variant,
+               checkins.arch,
+               checkins.age,
+               checkins.hits
+        FROM checkins
+        INNER JOIN
+            (SELECT week,dataset,release,max(hits)
+             FROM (SELECT week,dataset,release,sum(hits) AS hits
+                   FROM checkins
+                   GROUP BY week,dataset,release
+                   ORDER BY week)
+             GROUP BY dataset,release) AS peaks
+        ON peaks.week = checkins.week AND peaks.release = checkins.release;
+EOF
diff --git a/brontosaurus-plotter.py b/brontosaurus-plotter.py
index c3aa3a8..5eb4401 100755
--- a/brontosaurus-plotter.py
+++ b/brontosaurus-plotter.py
@@ -156,17 +156,17 @@ def main():
     colormappings = defaultdict(OrderedDict)
 
     database = sqlite3.connect(DATAFILE, detect_types=sqlite3.PARSE_DECLTYPES)
-    cursor = database.cursor()
-
+    # cursor = database.cursor()
+    '''
     for timeseries in config['timeseries']:
         params = config['timeseries_defaults'].copy()
         params.update(timeseries)
 
         query = f"""SELECT week, {params['dataseries']}, SUM(hits) as hits
                     FROM checkins
-                    WHERE dataset=\"{params['dataset']}\"
+                    WHERE dataset =\"{params['dataset']}\"
                     {params['extraselect']}
-                    GROUP BY week,{params['dataseries']}
+                    GROUP BY week, {params['dataseries']}
                     ORDER BY week
                  """
         df = pd.read_sql_query(query, parse_dates='week', con=database)
@@ -178,6 +178,24 @@ def main():
                 dataframe=df.pivot(index='week',
                                    columns=params['dataseries'],
                                    values='hits').astype("Int64"),
             )
+    '''
+
+    # sorry about this.
+    # what it does is: find all the rows from the peak
+    # week for each release.
+    query = """SELECT checkins.week,checkins.dataset,checkins.release,checkins.variant,checkins.arch,checkins.age,checkins.hits from checkins INNER JOIN
+                 (SELECT week,dataset,release,max(hits)
+                  FROM (SELECT week,dataset,release,sum(hits) AS hits
+                        FROM checkins
+                        GROUP BY week,dataset,release
+                        ORDER BY week)
+                  GROUP BY dataset,release) AS peaks
+               ON peaks.week = checkins.week AND peaks.release = checkins.release
+            """
+
+    at_peak = pd.read_sql_query(query, parse_dates='week', con=database)
+    pd.set_option('display.max_rows', len(at_peak))
+    print(at_peak)
 
 
 if __name__ == "__main__":
diff --git a/run.sh b/run.sh
index 71512cf..41cfa7d 100755
--- a/run.sh
+++ b/run.sh
@@ -62,6 +62,14 @@ echo -n "* Scrubbing off the dirt... "
     fi
 echo " shiny!"
 
+echo -n "* Finding the strongest... "
+    ./brontosaurus-fight.sh
+    if [[ $? != 0 ]]; then
+        echo "! Oops."
+        exit 1
+    fi
+echo " rarrhhhhr!"
+
 echo -n "* Sorting the eggs... "
     ./brontosaurus-egg-sorter.py
     if [[ $? != 0 ]]; then
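
A note on the peak-week SQL: the comment at the top of brontosaurus-fight.sh
asks for help making the query clearer. One possible rewrite, sketched below
and untested against the real db/bronto.db, uses a window function instead of
the nested GROUP BY subqueries; SQLite has supported ROW_NUMBER() since 3.25.
Two assumptions in the sketch: the checkins columns are exactly those used in
the patch (week, dataset, release, variant, arch, age, hits), and the join
also matches on dataset, which the current ON clause does not, so check which
behavior is intended before borrowing it.

    DROP VIEW IF EXISTS peak;
    CREATE VIEW peak AS
        WITH weekly AS (
            -- total hits per (dataset, release) for each week
            SELECT week, dataset, release, SUM(hits) AS total_hits
            FROM checkins
            GROUP BY week, dataset, release
        ),
        ranked AS (
            -- rank each release's weeks, busiest week first
            SELECT week, dataset, release,
                   ROW_NUMBER() OVER (
                       PARTITION BY dataset, release
                       ORDER BY total_hits DESC
                   ) AS rn
            FROM weekly
        )
        -- keep only the checkin rows from each release's single peak week
        SELECT c.week, c.dataset, c.release, c.variant, c.arch, c.age, c.hits
        FROM checkins AS c
        JOIN ranked AS r
          ON r.week = c.week
         AND r.dataset = c.dataset
         AND r.release = c.release
        WHERE r.rn = 1;

The same WITH ... SELECT (without the DROP VIEW / CREATE VIEW wrapper) could
also stand in for the hard-coded query string in brontosaurus-plotter.py,
since pd.read_sql_query() will run any statement that returns rows.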