From 02b80559a60c1080c48810e23098d74ede92643a Mon Sep 17 00:00:00 2001 From: Josseline Perdomo Date: Apr 01 2021 09:21:22 +0000 Subject: Fixed all the codestyles error and added flake8 config file --- diff --git a/.flake8 b/.flake8 new file mode 100644 index 0000000..bee7d51 --- /dev/null +++ b/.flake8 @@ -0,0 +1,3 @@ +[flake8] +max-line-length = 120 +exclude = .git, __pycache__, env, venv, .env, .venv, ENV, env.bak, venv.bak diff --git a/fedmsgconfig.py b/fedmsgconfig.py index bef1b47..136a98a 100644 --- a/fedmsgconfig.py +++ b/fedmsgconfig.py @@ -1 +1 @@ -config = {'endpoints': {}} \ No newline at end of file +config = {"endpoints": {}} diff --git a/generate-activity-charts.py b/generate-activity-charts.py index 272cdeb..31cfc12 100755 --- a/generate-activity-charts.py +++ b/generate-activity-charts.py @@ -1,305 +1,434 @@ #!/usr/bin/python3 import os -import pandas + import matplotlib as m +import matplotlib.pyplot as plt +import pandas + m.use("Agg") -import matplotlib.pyplot as plt -m.rcParams['font.size'] = 12 -m.rcParams['font.family'] = 'Overpass' -m.rcParams['legend.frameon'] = False + +m.rcParams["font.size"] = 12 +m.rcParams["font.family"] = "Overpass" +m.rcParams["legend.frameon"] = False try: - os.makedirs('./images') + os.makedirs("./images") except OSError: pass -datagit=pandas.read_csv("data/org.fedoraproject.prod.git.receive.bucketed-activity.csv",parse_dates=[0]) -datagit.set_index('weekstart',inplace=True) - -graph=datagit[['users1','users9','users40','userrest']].rename(columns={"users1": "Top 1%","users9":"Top 9%","users40":"Top 40%","userrest":"Remaining 50%"}).plot.area(figsize=(16, 9), - color=['#579d1c','#ffd320', '#ff420e', '#004586' ], - grid=True,yticks=range(0,301,25)) -#graph.legend(ncol=4) +datagit = pandas.read_csv( + "data/org.fedoraproject.prod.git.receive.bucketed-activity.csv", parse_dates=[0] +) +datagit.set_index("weekstart", inplace=True) + +graph = ( + datagit[["users1", "users9", "users40", "userrest"]] + .rename( + columns={ + "users1": "Top 1%", + "users9": "Top 9%", + "users40": "Top 40%", + "userrest": "Remaining 50%", + } + ) + .plot.area( + figsize=(16, 9), + color=["#579d1c", "#ffd320", "#ff420e", "#004586"], + grid=True, + yticks=range(0, 301, 25), + ) +) +# graph.legend(ncol=4) # totally abusing this. -plt.suptitle("Number of Contributors Making Changes to Packages Each Week",fontsize=24) -graph.set_title("Grouped by Quarterly Activity Level of Each Contributor",fontsize=16) -graph.set_xlabel('') -fig=graph.get_figure() -fig.savefig('images/git.user.count.svg',dpi=300) +plt.suptitle("Number of Contributors Making Changes to Packages Each Week", fontsize=24) +graph.set_title("Grouped by Quarterly Activity Level of Each Contributor", fontsize=16) +graph.set_xlabel("") +fig = graph.get_figure() +fig.savefig("images/git.user.count.svg", dpi=300) ############################################# -datagit['msgstotal']=datagit[['msgs1','msgs9','msgs40','msgsrest']].sum(1) -datagit['msgs1%']=100*datagit['msgs1']/datagit['msgstotal'] -datagit['msgs9%']=100*datagit['msgs9']/datagit['msgstotal'] -datagit['msgs40%']=100*datagit['msgs40']/datagit['msgstotal'] -datagit['msgsrest%']=100*datagit['msgsrest']/datagit['msgstotal'] - - - - -m.rcParams['legend.frameon'] = True -graph=datagit[['msgs1%','msgs9%','msgs40%','msgsrest%']].rename(columns={"msgs1%": "Top 1%","msgs9%":"Top 9%","msgs40%":"Top 40%","msgsrest%":"Remaining 50%"}).plot.area(figsize=(16, 9), - color=['#579d1c','#ffd320', '#ff420e', '#004586' ], - grid=True,ylim=(0,100)) -plt.suptitle("Percent of Package Changes Each Week From Each Activity Level Group",fontsize=24) -graph.set_title("",fontsize=16) -graph.set_xlabel('') - -fig=graph.get_figure() -fig.savefig('images/git.activity.share.svg',dpi=300) +datagit["msgstotal"] = datagit[["msgs1", "msgs9", "msgs40", "msgsrest"]].sum(1) +datagit["msgs1%"] = 100 * datagit["msgs1"] / datagit["msgstotal"] +datagit["msgs9%"] = 100 * datagit["msgs9"] / datagit["msgstotal"] +datagit["msgs40%"] = 100 * datagit["msgs40"] / datagit["msgstotal"] +datagit["msgsrest%"] = 100 * datagit["msgsrest"] / datagit["msgstotal"] + + +m.rcParams["legend.frameon"] = True +graph = ( + datagit[["msgs1%", "msgs9%", "msgs40%", "msgsrest%"]] + .rename( + columns={ + "msgs1%": "Top 1%", + "msgs9%": "Top 9%", + "msgs40%": "Top 40%", + "msgsrest%": "Remaining 50%", + } + ) + .plot.area( + figsize=(16, 9), + color=["#579d1c", "#ffd320", "#ff420e", "#004586"], + grid=True, + ylim=(0, 100), + ) +) +plt.suptitle( + "Percent of Package Changes Each Week From Each Activity Level Group", fontsize=24 +) +graph.set_title("", fontsize=16) +graph.set_xlabel("") + +fig = graph.get_figure() +fig.savefig("images/git.activity.share.svg", dpi=300) ############################################### -#graph=datagit[['newusercount']].rename(columns={"newusercount": "New Users"}).plot.area(figsize=(16, 9), +# graph=datagit[['newusercount']].rename(columns={"newusercount": "New Users"}).plot.area(figsize=(16, 9), # color='#579d1c', # grid=True,legend=False) -#plt.suptitle("New Packaging Contributor Count Per Week",fontsize=24) -#graph.set_title('') -#graph.set_xlabel('') -#fig=graph.get_figure() -#fig.savefig('images/git.newusers.svg',dpi=300) +# plt.suptitle("New Packaging Contributor Count Per Week",fontsize=24) +# graph.set_title('') +# graph.set_xlabel('') +# fig=graph.get_figure() +# fig.savefig('images/git.newusers.svg',dpi=300) ############################################# -#datagit['newuseractions%']=100*datagit['newuseractions']/datagit['msgstotal'] -#datagit['monthuseractions%']=100*datagit['monthuseractions']/datagit['msgstotal'] -#datagit['yearuseractions%']=100*datagit['yearuseractions']/datagit['msgstotal'] -#datagit['olderuseractions%']=100*datagit['olderuseractions']/datagit['msgstotal'] - - +# datagit['newuseractions%']=100*datagit['newuseractions']/datagit['msgstotal'] +# datagit['monthuseractions%']=100*datagit['monthuseractions']/datagit['msgstotal'] +# datagit['yearuseractions%']=100*datagit['yearuseractions']/datagit['msgstotal'] +# datagit['olderuseractions%']=100*datagit['olderuseractions']/datagit['msgstotal'] -#m.rcParams['legend.frameon'] = True -#graph=datagit[['newuseractions%','monthuseractions%','yearuseractions%','olderuseractions%']][42:].rename(columns={"newuseractions%": "New This Week","monthuseractions%":"New This Month","yearuseractions%":"New This Year","olderuseractions%":"Old School"}).plot.area(figsize=(16, 9), +# m.rcParams['legend.frameon'] = True +# graph=datagit[['newuseractions%','monthuseractions%','yearuseractions%','olderuseractions%']][42:] +# .rename(columns={"newuseractions%": "New This Week","monthuseractions%":"New This Month", +# "yearuseractions%":"New This Year","olderuseractions%":"Old School"}) +# .plot.area(figsize=(16, 9), # color=['#579d1c','#ffd320', '#ff420e', '#004586' ], # grid=True,ylim=(0,100)) -#plt.suptitle("Percent of Package Changes Each Week By Time Since Packager's First Action",fontsize=24) -#graph.set_title("",fontsize=16) -#graph.set_xlabel('') +# plt.suptitle("Percent of Package Changes Each Week By Time Since Packager's First Action",fontsize=24) +# graph.set_title("",fontsize=16) +# graph.set_xlabel('') # -#fig=graph.get_figure() -#fig.savefig('images/git.activity.length.svg',dpi=300) +# fig=graph.get_figure() +# fig.savefig('images/git.activity.length.svg',dpi=300) # ################################################################################################################ ################################################################################################################ -databodhi=pandas.read_csv("data/org.fedoraproject.prod.bodhi.update.comment.bucketed-activity.csv",parse_dates=[0]) -databodhi.set_index('weekstart',inplace=True) - -graph=databodhi[['users1','users9','users40','userrest']].rename(columns={"users1": "Top 1%","users9":"Top 9%","users40":"Top 40%","userrest":"Remaining 50%"}).plot.area(figsize=(16, 9), - color=['#579d1c','#ffd320', '#ff420e', '#004586' ], - grid=True,yticks=range(0,301,25)) -#graph.legend(ncol=4) +databodhi = pandas.read_csv( + "data/org.fedoraproject.prod.bodhi.update.comment.bucketed-activity.csv", + parse_dates=[0], +) +databodhi.set_index("weekstart", inplace=True) + +graph = ( + databodhi[["users1", "users9", "users40", "userrest"]] + .rename( + columns={ + "users1": "Top 1%", + "users9": "Top 9%", + "users40": "Top 40%", + "userrest": "Remaining 50%", + } + ) + .plot.area( + figsize=(16, 9), + color=["#579d1c", "#ffd320", "#ff420e", "#004586"], + grid=True, + yticks=range(0, 301, 25), + ) +) +# graph.legend(ncol=4) # totally abusing this. -plt.suptitle("Number of Contributors Providing Feedback on Package Updates Each Week",fontsize=24) -graph.set_title("Grouped by Quarterly Activity Level of Each Contributor",fontsize=16) -graph.set_xlabel('') -fig=graph.get_figure() -fig.savefig('images/bodhi.user.count.svg',dpi=300) +plt.suptitle( + "Number of Contributors Providing Feedback on Package Updates Each Week", + fontsize=24, +) +graph.set_title("Grouped by Quarterly Activity Level of Each Contributor", fontsize=16) +graph.set_xlabel("") +fig = graph.get_figure() +fig.savefig("images/bodhi.user.count.svg", dpi=300) ############################################# -databodhi['msgstotal']=databodhi[['msgs1','msgs9','msgs40','msgsrest']].sum(1) -databodhi['msgs1%']=100*databodhi['msgs1']/databodhi['msgstotal'] -databodhi['msgs9%']=100*databodhi['msgs9']/databodhi['msgstotal'] -databodhi['msgs40%']=100*databodhi['msgs40']/databodhi['msgstotal'] -databodhi['msgsrest%']=100*databodhi['msgsrest']/databodhi['msgstotal'] - - - - -m.rcParams['legend.frameon'] = True -graph=databodhi[['msgs1%','msgs9%','msgs40%','msgsrest%']].rename(columns={"msgs1%": "Top 1%","msgs9%":"Top 9%","msgs40%":"Top 40%","msgsrest%":"Remaining 50%"}).plot.area(figsize=(16, 9), - color=['#579d1c','#ffd320', '#ff420e', '#004586' ], - grid=True,ylim=(0,100)) -plt.suptitle("Percent of Update Feedback Each Week From Each Activity Level Group",fontsize=24) -graph.set_title("",fontsize=16) -graph.set_xlabel('') - -fig=graph.get_figure() -fig.savefig('images/bodhi.activity.share.svg',dpi=300) +databodhi["msgstotal"] = databodhi[["msgs1", "msgs9", "msgs40", "msgsrest"]].sum(1) +databodhi["msgs1%"] = 100 * databodhi["msgs1"] / databodhi["msgstotal"] +databodhi["msgs9%"] = 100 * databodhi["msgs9"] / databodhi["msgstotal"] +databodhi["msgs40%"] = 100 * databodhi["msgs40"] / databodhi["msgstotal"] +databodhi["msgsrest%"] = 100 * databodhi["msgsrest"] / databodhi["msgstotal"] + + +m.rcParams["legend.frameon"] = True +graph = ( + databodhi[["msgs1%", "msgs9%", "msgs40%", "msgsrest%"]] + .rename( + columns={ + "msgs1%": "Top 1%", + "msgs9%": "Top 9%", + "msgs40%": "Top 40%", + "msgsrest%": "Remaining 50%", + } + ) + .plot.area( + figsize=(16, 9), + color=["#579d1c", "#ffd320", "#ff420e", "#004586"], + grid=True, + ylim=(0, 100), + ) +) +plt.suptitle( + "Percent of Update Feedback Each Week From Each Activity Level Group", fontsize=24 +) +graph.set_title("", fontsize=16) +graph.set_xlabel("") + +fig = graph.get_figure() +fig.savefig("images/bodhi.activity.share.svg", dpi=300) ############################################### -#graph=databodhi[['newusercount']].rename(columns={"newusercount": "New Users"}).plot.area(figsize=(16, 9), +# graph=databodhi[['newusercount']].rename(columns={"newusercount": "New Users"}).plot.area(figsize=(16, 9), # color='#579d1c', # grid=True,legend=False) -#plt.suptitle("New Update Testing Contributor Count Per Week",fontsize=24) -#graph.set_title('') -#graph.set_xlabel('') -#fig=graph.get_figure() -#fig.savefig('images/bodhi.newusers.svg',dpi=300) +# plt.suptitle("New Update Testing Contributor Count Per Week",fontsize=24) +# graph.set_title('') +# graph.set_xlabel('') +# fig=graph.get_figure() +# fig.savefig('images/bodhi.newusers.svg',dpi=300) ############################################# -#databodhi['newuseractions%']=100*databodhi['newuseractions']/databodhi['msgstotal'] -#databodhi['monthuseractions%']=100*databodhi['monthuseractions']/databodhi['msgstotal'] -#databodhi['yearuseractions%']=100*databodhi['yearuseractions']/databodhi['msgstotal'] -#databodhi['olderuseractions%']=100*databodhi['olderuseractions']/databodhi['msgstotal'] +# databodhi['newuseractions%']=100*databodhi['newuseractions']/databodhi['msgstotal'] +# databodhi['monthuseractions%']=100*databodhi['monthuseractions']/databodhi['msgstotal'] +# databodhi['yearuseractions%']=100*databodhi['yearuseractions']/databodhi['msgstotal'] +# databodhi['olderuseractions%']=100*databodhi['olderuseractions']/databodhi['msgstotal'] - - -#m.rcParams['legend.frameon'] = True -#graph=databodhi[['newuseractions%','monthuseractions%','yearuseractions%','olderuseractions%']][42:].rename(columns={"newuseractions%": "New This Week","monthuseractions%":"New This Month","yearuseractions%":"New This Year","olderuseractions%":"Old School"}).plot.area(figsize=(16, 9), +# m.rcParams['legend.frameon'] = True +# graph=databodhi[['newuseractions%','monthuseractions%','yearuseractions%','olderuseractions%']][42:] +# .rename(columns={"newuseractions%": "New This Week","monthuseractions%":"New This Month", +# "yearuseractions%":"New This Year","olderuseractions%":"Old School"}).plot.area(figsize=(16, 9), # color=['#579d1c','#ffd320', '#ff420e', '#004586' ], # grid=True,ylim=(0,100)) -#plt.suptitle("Percent of Update Feedback Each Week By Time Since Packager's First Action",fontsize=24) -#graph.set_title("",fontsize=16) -#graph.set_xlabel('') +# plt.suptitle("Percent of Update Feedback Each Week By Time Since Packager's First Action",fontsize=24) +# graph.set_title("",fontsize=16) +# graph.set_xlabel('') # -#fig=graph.get_figure() -#fig.savefig('images/bodhi.activity.length.svg',dpi=300) +# fig=graph.get_figure() +# fig.savefig('images/bodhi.activity.length.svg',dpi=300) ################################################################################################################ ################################################################################################################ -datawiki=pandas.read_csv("data/org.fedoraproject.prod.wiki.article.edit.bucketed-activity.csv",parse_dates=[0]) -datawiki.set_index('weekstart',inplace=True) - -graph=datawiki[['users1','users9','users40','userrest']].rename(columns={"users1": "Top 1%","users9":"Top 9%","users40":"Top 40%","userrest":"Remaining 50%"}).plot.area(figsize=(16, 9), - color=['#579d1c','#ffd320', '#ff420e', '#004586' ], - grid=True,yticks=range(0,301,25)) -#graph.legend(ncol=4) +datawiki = pandas.read_csv( + "data/org.fedoraproject.prod.wiki.article.edit.bucketed-activity.csv", + parse_dates=[0], +) +datawiki.set_index("weekstart", inplace=True) + +graph = ( + datawiki[["users1", "users9", "users40", "userrest"]] + .rename( + columns={ + "users1": "Top 1%", + "users9": "Top 9%", + "users40": "Top 40%", + "userrest": "Remaining 50%", + } + ) + .plot.area( + figsize=(16, 9), + color=["#579d1c", "#ffd320", "#ff420e", "#004586"], + grid=True, + yticks=range(0, 301, 25), + ) +) +# graph.legend(ncol=4) # totally abusing this. -plt.suptitle("Number of Wiki Editors Each Week",fontsize=24) -graph.set_title("Grouped by Quarterly Activity Level of Each Contributor",fontsize=16) -graph.set_xlabel('') -fig=graph.get_figure() -fig.savefig('images/wiki.user.count.svg',dpi=300) +plt.suptitle("Number of Wiki Editors Each Week", fontsize=24) +graph.set_title("Grouped by Quarterly Activity Level of Each Contributor", fontsize=16) +graph.set_xlabel("") +fig = graph.get_figure() +fig.savefig("images/wiki.user.count.svg", dpi=300) ############################################# -datawiki['msgstotal']=datawiki[['msgs1','msgs9','msgs40','msgsrest']].sum(1) -datawiki['msgs1%']=100*datawiki['msgs1']/datawiki['msgstotal'] -datawiki['msgs9%']=100*datawiki['msgs9']/datawiki['msgstotal'] -datawiki['msgs40%']=100*datawiki['msgs40']/datawiki['msgstotal'] -datawiki['msgsrest%']=100*datawiki['msgsrest']/datawiki['msgstotal'] - - - - -m.rcParams['legend.frameon'] = True -graph=datawiki[['msgs1%','msgs9%','msgs40%','msgsrest%']].rename(columns={"msgs1%": "Top 1%","msgs9%":"Top 9%","msgs40%":"Top 40%","msgsrest%":"Remaining 50%"}).plot.area(figsize=(16, 9), - color=['#579d1c','#ffd320', '#ff420e', '#004586' ], - grid=True,ylim=(0,100)) -plt.suptitle("Percent of Wiki Edits Each Week From Each Activity Level Group",fontsize=24) -graph.set_title("",fontsize=16) -graph.set_xlabel('') - -fig=graph.get_figure() -fig.savefig('images/wiki.activity.share.svg',dpi=300) +datawiki["msgstotal"] = datawiki[["msgs1", "msgs9", "msgs40", "msgsrest"]].sum(1) +datawiki["msgs1%"] = 100 * datawiki["msgs1"] / datawiki["msgstotal"] +datawiki["msgs9%"] = 100 * datawiki["msgs9"] / datawiki["msgstotal"] +datawiki["msgs40%"] = 100 * datawiki["msgs40"] / datawiki["msgstotal"] +datawiki["msgsrest%"] = 100 * datawiki["msgsrest"] / datawiki["msgstotal"] + + +m.rcParams["legend.frameon"] = True +graph = ( + datawiki[["msgs1%", "msgs9%", "msgs40%", "msgsrest%"]] + .rename( + columns={ + "msgs1%": "Top 1%", + "msgs9%": "Top 9%", + "msgs40%": "Top 40%", + "msgsrest%": "Remaining 50%", + } + ) + .plot.area( + figsize=(16, 9), + color=["#579d1c", "#ffd320", "#ff420e", "#004586"], + grid=True, + ylim=(0, 100), + ) +) +plt.suptitle( + "Percent of Wiki Edits Each Week From Each Activity Level Group", fontsize=24 +) +graph.set_title("", fontsize=16) +graph.set_xlabel("") + +fig = graph.get_figure() +fig.savefig("images/wiki.activity.share.svg", dpi=300) ############################################### -#graph=datawiki[['newusercount']].rename(columns={"newusercount": "New Users"}).plot.area(figsize=(16, 9), +# graph=datawiki[['newusercount']].rename(columns={"newusercount": "New Users"}).plot.area(figsize=(16, 9), # color='#579d1c', # grid=True,legend=False) -#plt.suptitle("New Wiki Contributor Count Per Week",fontsize=24) -#graph.set_title('') -#graph.set_xlabel('') -#fig=graph.get_figure() -#fig.savefig('images/wiki.newusers.svg',dpi=300) +# plt.suptitle("New Wiki Contributor Count Per Week",fontsize=24) +# graph.set_title('') +# graph.set_xlabel('') +# fig=graph.get_figure() +# fig.savefig('images/wiki.newusers.svg',dpi=300) ############################################### -#graph=datawiki[['newusercount']].rename(columns={"newusercount": "New Users"}).plot.area(figsize=(16, 9), +# graph=datawiki[['newusercount']].rename(columns={"newusercount": "New Users"}).plot.area(figsize=(16, 9), # color='#579d1c', # grid=True,legend=False) -#plt.suptitle("New Wiki Contributor Count Per Week",fontsize=24) -#graph.set_title('') -#graph.set_xlabel('') -#fig=graph.get_figure() -#fig.savefig('images/wiki.newusers.svg',dpi=300) +# plt.suptitle("New Wiki Contributor Count Per Week",fontsize=24) +# graph.set_title('') +# graph.set_xlabel('') +# fig=graph.get_figure() +# fig.savefig('images/wiki.newusers.svg',dpi=300) ############################################# -#datawiki['newuseractions%']=100*datawiki['newuseractions']/datawiki['msgstotal'] -#datawiki['monthuseractions%']=100*datawiki['monthuseractions']/datawiki['msgstotal'] -#datawiki['yearuseractions%']=100*datawiki['yearuseractions']/datawiki['msgstotal'] -#datawiki['olderuseractions%']=100*datawiki['olderuseractions']/datawiki['msgstotal'] - +# datawiki['newuseractions%']=100*datawiki['newuseractions']/datawiki['msgstotal'] +# datawiki['monthuseractions%']=100*datawiki['monthuseractions']/datawiki['msgstotal'] +# datawiki['yearuseractions%']=100*datawiki['yearuseractions']/datawiki['msgstotal'] +# datawiki['olderuseractions%']=100*datawiki['olderuseractions']/datawiki['msgstotal'] - -#m.rcParams['legend.frameon'] = True -#graph=datawiki[['newuseractions%','monthuseractions%','yearuseractions%','olderuseractions%']][42:].rename(columns={"newuseractions%": "New This Week","monthuseractions%":"New This Month","yearuseractions%":"New This Year","olderuseractions%":"Old School"}).plot.area(figsize=(16, 9), +# m.rcParams['legend.frameon'] = True +# graph=datawiki[['newuseractions%','monthuseractions%','yearuseractions%','olderuseractions%']][42:] +# .rename(columns={"newuseractions%": "New This Week","monthuseractions%":"New This Month", +# "yearuseractions%":"New This Year","olderuseractions%":"Old School"}).plot.area(figsize=(16, 9), # color=['#579d1c','#ffd320', '#ff420e', '#004586' ], # grid=True,ylim=(0,100)) -#plt.suptitle("Percent of Wiki Edits Each Week By Time Since Editor's First Edit",fontsize=24) -#graph.set_title("",fontsize=16) -#graph.set_xlabel('') +# plt.suptitle("Percent of Wiki Edits Each Week By Time Since Editor's First Edit",fontsize=24) +# graph.set_title("",fontsize=16) +# graph.set_xlabel('') # -#fig=graph.get_figure() -#fig.savefig('images/wiki.activity.length.svg',dpi=300) +# fig=graph.get_figure() +# fig.savefig('images/wiki.activity.length.svg',dpi=300) ############################################### ############################################### -datapagure=pandas.read_csv("data/io.pagure.prod.pagure.git.receive.bucketed-activity.csv",parse_dates=[0]) -datapagure.set_index('weekstart',inplace=True) - -graph=datapagure[['users1','users9','users40','userrest']].rename(columns={"users1": "Top 1%","users9":"Top 9%","users40":"Top 40%","userrest":"Remaining 50%"}).plot.area(figsize=(16, 9), - color=['#579d1c','#ffd320', '#ff420e', '#004586' ], - grid=True,yticks=range(0,25,5)) -#graph.legend(ncol=4) +datapagure = pandas.read_csv( + "data/io.pagure.prod.pagure.git.receive.bucketed-activity.csv", parse_dates=[0] +) +datapagure.set_index("weekstart", inplace=True) + +graph = ( + datapagure[["users1", "users9", "users40", "userrest"]] + .rename( + columns={ + "users1": "Top 1%", + "users9": "Top 9%", + "users40": "Top 40%", + "userrest": "Remaining 50%", + } + ) + .plot.area( + figsize=(16, 9), + color=["#579d1c", "#ffd320", "#ff420e", "#004586"], + grid=True, + yticks=range(0, 25, 5), + ) +) +# graph.legend(ncol=4) # totally abusing this. -plt.suptitle("Number of Contributors Making Commits to Pagure Each Week",fontsize=24) -graph.set_title("Grouped by Quarterly Activity Level of Each Contributor",fontsize=16) -graph.set_xlabel('') -fig=graph.get_figure() -fig.savefig('images/pagure.user.count.svg',dpi=300) +plt.suptitle("Number of Contributors Making Commits to Pagure Each Week", fontsize=24) +graph.set_title("Grouped by Quarterly Activity Level of Each Contributor", fontsize=16) +graph.set_xlabel("") +fig = graph.get_figure() +fig.savefig("images/pagure.user.count.svg", dpi=300) ############################################# -datapagure['msgstotal']=datapagure[['msgs1','msgs9','msgs40','msgsrest']].sum(1) -datapagure['msgs1%']=100*datapagure['msgs1']/datapagure['msgstotal'] -datapagure['msgs9%']=100*datapagure['msgs9']/datapagure['msgstotal'] -datapagure['msgs40%']=100*datapagure['msgs40']/datapagure['msgstotal'] -datapagure['msgsrest%']=100*datapagure['msgsrest']/datapagure['msgstotal'] - - - - -m.rcParams['legend.frameon'] = True -graph=datapagure[['msgs1%','msgs9%','msgs40%','msgsrest%']].rename(columns={"msgs1%": "Top 1%","msgs9%":"Top 9%","msgs40%":"Top 40%","msgsrest%":"Remaining 50%"}).plot.area(figsize=(16, 9), - color=['#579d1c','#ffd320', '#ff420e', '#004586' ], - grid=True,ylim=(0,100)) -plt.suptitle("Percent of Pagure Commits Each Week From Each Activity Level Group",fontsize=24) -graph.set_title("",fontsize=16) -graph.set_xlabel('') - -fig=graph.get_figure() -fig.savefig('images/pagure.activity.share.svg',dpi=300) +datapagure["msgstotal"] = datapagure[["msgs1", "msgs9", "msgs40", "msgsrest"]].sum(1) +datapagure["msgs1%"] = 100 * datapagure["msgs1"] / datapagure["msgstotal"] +datapagure["msgs9%"] = 100 * datapagure["msgs9"] / datapagure["msgstotal"] +datapagure["msgs40%"] = 100 * datapagure["msgs40"] / datapagure["msgstotal"] +datapagure["msgsrest%"] = 100 * datapagure["msgsrest"] / datapagure["msgstotal"] + + +m.rcParams["legend.frameon"] = True +graph = ( + datapagure[["msgs1%", "msgs9%", "msgs40%", "msgsrest%"]] + .rename( + columns={ + "msgs1%": "Top 1%", + "msgs9%": "Top 9%", + "msgs40%": "Top 40%", + "msgsrest%": "Remaining 50%", + } + ) + .plot.area( + figsize=(16, 9), + color=["#579d1c", "#ffd320", "#ff420e", "#004586"], + grid=True, + ylim=(0, 100), + ) +) +plt.suptitle( + "Percent of Pagure Commits Each Week From Each Activity Level Group", fontsize=24 +) +graph.set_title("", fontsize=16) +graph.set_xlabel("") + +fig = graph.get_figure() +fig.savefig("images/pagure.activity.share.svg", dpi=300) ############################################### -#graph=datapagure[['newusercount']].rename(columns={"newusercount": "New Users"}).plot.area(figsize=(16, 9), +# graph=datapagure[['newusercount']].rename(columns={"newusercount": "New Users"}).plot.area(figsize=(16, 9), # color='#579d1c', # grid=True,legend=False) -#plt.suptitle("New Pagure Contributor Count Per Week",fontsize=24) -#graph.set_title('') -#graph.set_xlabel('') -#fig=graph.get_figure() -#fig.savefig('images/pagure.newusers.svg',dpi=300) +# plt.suptitle("New Pagure Contributor Count Per Week",fontsize=24) +# graph.set_title('') +# graph.set_xlabel('') +# fig=graph.get_figure() +# fig.savefig('images/pagure.newusers.svg',dpi=300) ############################################# -#datapagure['newuseractions%']=100*datapagure['newuseractions']/datapagure['msgstotal'] -#datapagure['monthuseractions%']=100*datapagure['monthuseractions']/datapagure['msgstotal'] -#datapagure['yearuseractions%']=100*datapagure['yearuseractions']/datapagure['msgstotal'] -#datapagure['olderuseractions%']=100*datapagure['olderuseractions']/datapagure['msgstotal'] - - +# datapagure['newuseractions%']=100*datapagure['newuseractions']/datapagure['msgstotal'] +# datapagure['monthuseractions%']=100*datapagure['monthuseractions']/datapagure['msgstotal'] +# datapagure['yearuseractions%']=100*datapagure['yearuseractions']/datapagure['msgstotal'] +# datapagure['olderuseractions%']=100*datapagure['olderuseractions']/datapagure['msgstotal'] -#m.rcParams['legend.frameon'] = True -#graph=datapagure[['newuseractions%','monthuseractions%','yearuseractions%','olderuseractions%']][42:].rename(columns={"newuseractions%": "New This Week","monthuseractions%":"New This Month","yearuseractions%":"New This Year","olderuseractions%":"Old School"}).plot.area(figsize=(16, 9), +# m.rcParams['legend.frameon'] = True +# graph=datapagure[['newuseractions%','monthuseractions%','yearuseractions%','olderuseractions%']][42:] +# .rename(columns={"newuseractions%": "New This Week","monthuseractions%":"New This Month", +# "yearuseractions%":"New This Year","olderuseractions%":"Old School"}).plot.area(figsize=(16, 9), # color=['#579d1c','#ffd320', '#ff420e', '#004586' ], # grid=True,ylim=(0,100)) -#plt.suptitle("Percent of Pagure Commits Each Week By Time Since Packager's First Action",fontsize=24) -#graph.set_title("",fontsize=16) -#graph.set_xlabel('') +# plt.suptitle("Percent of Pagure Commits Each Week By Time Since Packager's First Action",fontsize=24) +# graph.set_title("",fontsize=16) +# graph.set_xlabel('') # -#fig=graph.get_figure() -#fig.savefig('images/pagure.activity.length.svg',dpi=300) +# fig=graph.get_figure() +# fig.savefig('images/pagure.activity.length.svg',dpi=300) diff --git a/generate-contributor-charts.py b/generate-contributor-charts.py index 16930dc..cc2ca2c 100755 --- a/generate-contributor-charts.py +++ b/generate-contributor-charts.py @@ -1,51 +1,90 @@ #!/usr/bin/python3 import os -import pandas + import matplotlib as m +import matplotlib.pyplot as plt +import pandas + m.use("Agg") -import matplotlib.pyplot as plt -m.rcParams['font.size'] = 12 -m.rcParams['font.family'] = 'Overpass' -m.rcParams['legend.frameon'] = True + +m.rcParams["font.size"] = 12 +m.rcParams["font.family"] = "Overpass" +m.rcParams["legend.frameon"] = True try: - os.makedirs('./images') + os.makedirs("./images") except OSError: pass -data=pandas.read_csv("data/contributor-count.csv",parse_dates=[0]) -data.set_index('weekstart',inplace=True) - +data = pandas.read_csv("data/contributor-count.csv", parse_dates=[0]) +data.set_index("weekstart", inplace=True) -graph=data[['oldactive','midactive','newactive']].rename(columns={"oldactive": "Old School","midactive":"Intermediate","newactive":"New Contributors"}).plot.area(figsize=(16, 9), - color=[ '#ff420e','#ffd320', '#579d1c' ], # '#004586' - grid=True,stacked=True ,yticks=range(0,451,25)) -data[['rawcount']].rename(columns={"rawcount": "All Contributors\nincluding less active"}).plot(figsize=(16, 9), - ax=graph ,yticks=range(0,426,25)) - -graph.xaxis.grid(True, which='minor', linestyle='-', linewidth=0.25) -graph.yaxis.grid(True, which='major', linestyle='-', linewidth=0.25) -plt.suptitle("Fedora Contributors by Week",fontsize=24) -graph.set_title("Stacked graph of contributors with measured activity each week — and at least four weeks total in the last year.\n“Old school” contributors have been active for longer than two years; new contributors, less than one.\nBlue line shows all contributors active this week regardless of amount of other activity.",fontsize=12) -graph.set_xlabel('') +graph = ( + data[["oldactive", "midactive", "newactive"]] + .rename( + columns={ + "oldactive": "Old School", + "midactive": "Intermediate", + "newactive": "New Contributors", + } + ) + .plot.area( + figsize=(16, 9), + color=["#ff420e", "#ffd320", "#579d1c"], # '#004586' + grid=True, + stacked=True, + yticks=range(0, 451, 25), + ) +) +data[["rawcount"]].rename( + columns={"rawcount": "All Contributors\nincluding less active"} +).plot(figsize=(16, 9), ax=graph, yticks=range(0, 426, 25)) -fig=graph.get_figure() -fig.savefig('images/active-contributors-by-week.svg',dpi=300) +graph.xaxis.grid(True, which="minor", linestyle="-", linewidth=0.25) +graph.yaxis.grid(True, which="major", linestyle="-", linewidth=0.25) +plt.suptitle("Fedora Contributors by Week", fontsize=24) +graph.set_title( + "Stacked graph of contributors with measured activity each week — and at least four weeks total in the last year.\n" + "“Old school” contributors have been active for longer than two years; new contributors, less than one.\n" + "Blue line shows all contributors active this week regardless of amount of other activity.", + fontsize=12, +) +graph.set_xlabel("") +fig = graph.get_figure() +fig.savefig("images/active-contributors-by-week.svg", dpi=300) -graph=data[['oldcore','midcore','newcore']].rename(columns={"oldcore": "Old School","midcore":"Intermediate","newcore":"New Contributors"}).plot.area(figsize=(16, 9), - color=[ '#ff420e', '#ffd320', '#579d1c' ], # '#004586' - grid=True,stacked=True ,yticks=range(0,101,25)) - -graph.xaxis.grid(True, which='minor', linestyle='-', linewidth=0.25) -graph.yaxis.grid(True, which='major', linestyle='-', linewidth=0.25) +graph = ( + data[["oldcore", "midcore", "newcore"]] + .rename( + columns={ + "oldcore": "Old School", + "midcore": "Intermediate", + "newcore": "New Contributors", + } + ) + .plot.area( + figsize=(16, 9), + color=["#ff420e", "#ffd320", "#579d1c"], # '#004586' + grid=True, + stacked=True, + yticks=range(0, 101, 25), + ) +) -plt.suptitle("Core Fedora Contributors by Week",fontsize=24) -graph.set_title("Stacked graph of contributors with measured activity this week — and at least four weeks total in the last year.\nOld school contributors have been active for longer than two years; new contributors, less than one.\n“Core” means part of the set doing about ⅔s of all actions over the past year.",fontsize=12) -graph.set_xlabel('') -fig=graph.get_figure() -fig.savefig('images/active-core-contributors-by-week.svg',dpi=300) +graph.xaxis.grid(True, which="minor", linestyle="-", linewidth=0.25) +graph.yaxis.grid(True, which="major", linestyle="-", linewidth=0.25) +plt.suptitle("Core Fedora Contributors by Week", fontsize=24) +graph.set_title( + "Stacked graph of contributors with measured activity this week — and at least four weeks total in the last year.\n" + "Old school contributors have been active for longer than two years; new contributors, less than one.\n" + "“Core” means part of the set doing about ⅔s of all actions over the past year.", + fontsize=12, +) +graph.set_xlabel("") +fig = graph.get_figure() +fig.savefig("images/active-core-contributors-by-week.svg", dpi=300) diff --git a/new-and-old-users-report.py b/new-and-old-users-report.py index 3461e59..c322e1d 100755 --- a/new-and-old-users-report.py +++ b/new-and-old-users-report.py @@ -10,7 +10,7 @@ import sys # BUT if --csv (or --csvh, for csv with header) is given, it gives the number for _that week only_ firstseen = collections.OrderedDict() -lastseen = collections.OrderedDict() +lastseen = collections.OrderedDict() actioncount = collections.defaultdict(int) weeksactive = collections.defaultdict(int) oldschoolornew = {} @@ -18,157 +18,183 @@ totalactions = 0 n = len(sys.argv[1:]) -csvoutput=False +csvoutput = False if n == 0: - reportweek = int((datetime.datetime.now()-datetime.datetime.strptime("2012-01-01", "%Y-%m-%d")).days/7)-1 + reportweek = ( + int( + ( + datetime.datetime.now() + - datetime.datetime.strptime("2012-01-01", "%Y-%m-%d") + ).days + / 7 + ) + - 1 + ) elif n == 1: - reportweek=int(sys.argv[1]) + reportweek = int(sys.argv[1]) elif sys.argv[1] == "--csv": - reportweek=int(sys.argv[2]) - csvoutput=True - csvheader=False + reportweek = int(sys.argv[2]) + csvoutput = True + csvheader = False elif sys.argv[1] == "--csvh": - reportweek=int(sys.argv[2]) - csvoutput=True - csvheader=True + reportweek = int(sys.argv[2]) + csvoutput = True + csvheader = True else: - sys.exit(1) + sys.exit(1) - -reporttime = datetime.datetime.strptime("2012-01-01", "%Y-%m-%d") + datetime.timedelta(days=reportweek*7+6) +reporttime = datetime.datetime.strptime("2012-01-01", "%Y-%m-%d") + datetime.timedelta( + days=reportweek * 7 + 6 +) # 52 weeks is precise enough for metrics gathering :) lastyear = reporttime - datetime.timedelta(364) twoyears = reporttime - datetime.timedelta(728) - -weeks = range(reportweek-51,reportweek+1) -datasources = ( "org.fedoraproject.prod.bodhi.update.comment", - "org.fedoraproject.prod.git.receive", - "org.fedoraproject.prod.irc.karma", - "org.fedoraproject.prod.wiki.article.edit", - "org.fedoraproject.prod.infragit.receive" - ) +weeks = range(reportweek - 51, reportweek + 1) + +datasources = ( + "org.fedoraproject.prod.bodhi.update.comment", + "org.fedoraproject.prod.git.receive", + "org.fedoraproject.prod.irc.karma", + "org.fedoraproject.prod.wiki.article.edit", + "org.fedoraproject.prod.infragit.receive", +) for datasource in datasources: - for week in weeks: - try: - datafragment=pandas.read_csv("data/weekly/{}.userdata.{:05}.csv".format(datasource,week),parse_dates=[2,3]) - except FileNotFoundError: - # ignore missing data.... probably should errror on _everything_ missing (FIXME) - continue - - for index, row in datafragment.iterrows(): - user=row['user'] - - totalactions += row['actions'] - actioncount[user]+=row['actions'] - - if not user in weeksactive: - weeksactive[user]=set() - weeksactive[user].add(week) - - if not user in firstseen: - firstseen[user]=row['firstseen'] - else: - if row['firstseen'] < firstseen[user]: - firstseen[user]=row['firstseen'] - - if not user in lastseen: - lastseen[user]=row['lastseen'] - else: - if row['lastseen'] < lastseen[user]: - lastseen[user]=row['lastseen'] - - if firstseen[user] < twoyears: - oldschoolornew[user]="old-school" - elif firstseen[user] >= lastyear: - oldschoolornew[user]="new contributor" - else: - oldschoolornew[user]="" - -rawcount=0 -oldcount=0 -midcount=0 -newcount=0 -allactive=0 - -accumulator=0 -topusers=[] + for week in weeks: + try: + datafragment = pandas.read_csv( + "data/weekly/{}.userdata.{:05}.csv".format(datasource, week), + parse_dates=[2, 3], + ) + except FileNotFoundError: + # ignore missing data.... probably should errror on _everything_ missing (FIXME) + continue + + for index, row in datafragment.iterrows(): + user = row["user"] + + totalactions += row["actions"] + actioncount[user] += row["actions"] + + if user not in weeksactive: + weeksactive[user] = set() + weeksactive[user].add(week) + + if user not in firstseen: + firstseen[user] = row["firstseen"] + else: + if row["firstseen"] < firstseen[user]: + firstseen[user] = row["firstseen"] + + if user not in lastseen: + lastseen[user] = row["lastseen"] + else: + if row["lastseen"] < lastseen[user]: + lastseen[user] = row["lastseen"] + + if firstseen[user] < twoyears: + oldschoolornew[user] = "old-school" + elif firstseen[user] >= lastyear: + oldschoolornew[user] = "new contributor" + else: + oldschoolornew[user] = "" + +rawcount = 0 +oldcount = 0 +midcount = 0 +newcount = 0 +allactive = 0 + +accumulator = 0 +topusers = [] for user in sorted(actioncount, key=actioncount.get, reverse=True): - accumulator+=actioncount[user] - topusers.append(user) - #print("{:20} {}".format(user,oldschoolornew[user])) - if accumulator>totalactions*2.0/3: - break - -newcore=0 -midcore=0 -oldcore=0 + accumulator += actioncount[user] + topusers.append(user) + # print("{:20} {}".format(user,oldschoolornew[user])) + if accumulator > totalactions * 2.0 / 3: + break + +newcore = 0 +midcore = 0 +oldcore = 0 for user in oldschoolornew: - # in csv mode, only report on activity *this* week - if csvoutput and not reportweek in weeksactive[user]: - continue - - rawcount+=1 - - # only count users who are active - # at least 4 distinct weeks in the past year - if len(weeksactive[user]) < 4: - continue - - - allactive+=1 - if oldschoolornew[user] == "old-school": - oldcount+=1 - if user in topusers: - oldcore+=1 - elif oldschoolornew[user] == "new contributor": - newcount+=1 - if user in topusers: - newcore+=1 - else: - midcount+=1 - if user in topusers: - midcore+=1 - + # in csv mode, only report on activity *this* week + if csvoutput and reportweek not in weeksactive[user]: + continue + + rawcount += 1 + + # only count users who are active + # at least 4 distinct weeks in the past year + if len(weeksactive[user]) < 4: + continue + + allactive += 1 + if oldschoolornew[user] == "old-school": + oldcount += 1 + if user in topusers: + oldcore += 1 + elif oldschoolornew[user] == "new contributor": + newcount += 1 + if user in topusers: + newcore += 1 + else: + midcount += 1 + if user in topusers: + midcore += 1 + if csvoutput: - if csvheader: - print("weekstart,rawcount,oldactive,midactive,newactive,oldcore,midcore,newcore") - print("{0:%Y-%m-%d}".format(reporttime),rawcount, - oldcount,midcount,newcount, - oldcore,midcore,newcore, - sep=",") - sys.exit(0) - - -print ("Report for year ending {:%Y-%m-%d} (through week #{}):".format(reporttime,reportweek)) -print ("") -print ("Raw total contributors: {:>5}".format(rawcount)) -print ("Total active contributors: {:>5}".format(allactive)) -print ("Core contributors (⅔ actions) {:>5}".format(len(topusers))) -print ("") -print ("Old-school contributors: {:>5}".format(oldcount)) -print ("Intermediate contributors: {:>5}".format(midcount)) -print ("New contributors this year: {:>5}".format(newcount)) -print ("") -print ("Old core contributors: {:>5}".format(oldcore)) -print ("Intermediate core contributors:{:>5}".format(midcore)) -print ("New core contributors: {:>5}".format(newcore)) -print ("\n") -print ("This report is an aggregate of dist-git, bodhi karma, wiki edits,") -print ("infra git, and irc cookies. It doesn't measure all Fedora activity.") -print ("") -print ("Active means at least four separate weeks of activity.") -print ("Core means part of the set doing about ⅔s of all actions.") -print ("Old-school contributors started at least two years (104 weeks) ago.") -print ("New contributors are new in the past 52 weeks.") -print ("Note that by this metric, \"mattdm\" is not a core contributor.") - -print ("\n-------------------------------------------\n") + if csvheader: + print( + "weekstart,rawcount,oldactive,midactive,newactive,oldcore,midcore,newcore" + ) + print( + "{0:%Y-%m-%d}".format(reporttime), + rawcount, + oldcount, + midcount, + newcount, + oldcore, + midcore, + newcore, + sep=",", + ) + sys.exit(0) + + +print( + "Report for year ending {:%Y-%m-%d} (through week #{}):".format( + reporttime, reportweek + ) +) +print("") +print("Raw total contributors: {:>5}".format(rawcount)) +print("Total active contributors: {:>5}".format(allactive)) +print("Core contributors (⅔ actions) {:>5}".format(len(topusers))) +print("") +print("Old-school contributors: {:>5}".format(oldcount)) +print("Intermediate contributors: {:>5}".format(midcount)) +print("New contributors this year: {:>5}".format(newcount)) +print("") +print("Old core contributors: {:>5}".format(oldcore)) +print("Intermediate core contributors:{:>5}".format(midcore)) +print("New core contributors: {:>5}".format(newcore)) +print("\n") +print("This report is an aggregate of dist-git, bodhi karma, wiki edits,") +print("infra git, and irc cookies. It doesn't measure all Fedora activity.") +print("") +print("Active means at least four separate weeks of activity.") +print("Core means part of the set doing about ⅔s of all actions.") +print("Old-school contributors started at least two years (104 weeks) ago.") +print("New contributors are new in the past 52 weeks.") +print('Note that by this metric, "mattdm" is not a core contributor.') + +print("\n-------------------------------------------\n") for user in topusers: - print("{:20} {}".format(user,oldschoolornew[user])) + print("{:20} {}".format(user, oldschoolornew[user])) diff --git a/utils.py b/utils.py index 924d63c..b3aa96c 100644 --- a/utils.py +++ b/utils.py @@ -1,6 +1,7 @@ import requests -url = 'https://apps.fedoraproject.org/datagrepper/raw' +url = "https://apps.fedoraproject.org/datagrepper/raw" + def grep(tries=0, **kwargs): response = requests.get(url, params=kwargs) @@ -11,15 +12,15 @@ def grep(tries=0, **kwargs): yield item data = response.json() - pages = data['pages'] + pages = data["pages"] - for message in data['raw_messages']: + for message in data["raw_messages"]: yield message for page in range(1, pages): for attempt in range(20): try: - kwargs['page'] = page + kwargs["page"] = page response = requests.get(url, params=kwargs) try: data = response.json() @@ -35,7 +36,5 @@ def grep(tries=0, **kwargs): break else: raise ValueError("Ran out of retries") - for message in data.get('raw_messages', []): + for message in data.get("raw_messages", []): yield message - - diff --git a/weekly-user-activity.py b/weekly-user-activity.py index 3ce36a8..07bb8a3 100755 --- a/weekly-user-activity.py +++ b/weekly-user-activity.py @@ -4,9 +4,10 @@ # # output: a CSV file with fields: # -# date, msgs1, msgs9, msgs40, msgsrest, users1, users9, users40, userrest, newusers, actionsnew, actionsmonth, actionsyear, actionsolder, newspammers, spamactions, botactions, relengactions +# date, msgs1, msgs9, msgs40, msgsrest, users1, users9, users40, userrest, newusers, actionsnew, actionsmonth, +# actionsyear, actionsolder, newspammers, spamactions, botactions, relengactions # -# where and 1, 9, 40, rest correspond to activity from the cohort of +# where and 1, 9, 40, rest correspond to activity from the cohort of # users in the top 1%, next 9%, next 40% or rest in that quarter (where # quarter is a sliding 13-week window) and users is the count of users in # that cohort that week while msgs is overall work. display the user count @@ -18,202 +19,214 @@ # # todo: create those graphs here in addition to CSV -import utils - -import fedmsg.meta -import fedmsg.config -config = fedmsg.config.load_config(filenames=['fedmsgconfig.py']) -fedmsg.meta.make_processors(**config) - - -import time +import collections import datetime import logging import os +import pickle +import re import sys +import time -import string -import re +import fedmsg.meta -import collections -import pprint +import utils -import pickle +config = fedmsg.config.load_config(filenames=["fedmsgconfig.py"]) +fedmsg.meta.make_processors(**config) -#logging.basicConfig(level=logging.DEBUG) +# logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.ERROR) class TimeoutError(Exception): """too much timeout""" + pass + class InvalidDiscriminantError(Exception): """invalid discriminant""" + pass -spammers = [line.rstrip('\n') for line in open('badpeople.list')] -bots = [line.rstrip('\n') for line in open('bots.list')] +spammers = [line.rstrip("\n") for line in open("badpeople.list")] +bots = [line.rstrip("\n") for line in open("bots.list")] epoch = datetime.datetime.utcfromtimestamp(0) -ipaddrre = re.compile("^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$") +ipaddrre = re.compile(r"^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}$") discriminant = sys.argv[-1] -if __file__.split('/')[-1] in discriminant: +if __file__.split("/")[-1] in discriminant: print("usage: '$ ./weekly-user-activity.py TOPIC'") sys.exit(1) - -if not re.match("^[a-z\.]*$", discriminant): + +if not re.match(r"^[a-z\.]*$", discriminant): print("bad discriminant") sys.exit(2) - print("operating with discriminant", discriminant) verboten = [ - 'org.fedoraproject.prod.buildsys.rpm.sign', - 'org.fedoraproject.prod.buildsys.repo.init', - 'org.fedoraproject.prod.buildsys.tag', - 'org.fedoraproject.prod.buildsys.untag', + "org.fedoraproject.prod.buildsys.rpm.sign", + "org.fedoraproject.prod.buildsys.repo.init", + "org.fedoraproject.prod.buildsys.tag", + "org.fedoraproject.prod.buildsys.untag", ] try: - os.makedirs('./data') + os.makedirs("./data") except OSError: pass try: - os.makedirs('./data/weekly') + os.makedirs("./data/weekly") except OSError: pass try: - os.makedirs('./cache') + os.makedirs("./cache") except OSError: pass - - -weeknum=0 + +weeknum = 0 # the year in which fedmesg starts. starttime = datetime.datetime.strptime("2012-01-01", "%Y-%m-%d") +WeekActions = collections.namedtuple( + "WeekActions", ["week", "useractions", "newusers", "actionsbyage", "nonhuman"] +) - -WeekActions = collections.namedtuple('WeekActions',['week','useractions','newusers','actionsbyage','nonhuman']) - -firstseen={} -lastseen={} +firstseen = {} +lastseen = {} # 13 weeks = 1 quarter (rolling) -ring = collections.deque(maxlen=13) +ring = collections.deque(maxlen=13) -with open('data/%s.bucketed-activity.csv' % (discriminant), 'w') as bucketcsv: - bucketcsv.write("weekstart,msgs1,msgs9,msgs40,msgsrest,users1,users9,users40,userrest,newusercount,newuseractions,monthuseractions,yearuseractions,olderuseractions,newspammers,spamactions,botactions,relengactions\n") +with open("data/%s.bucketed-activity.csv" % (discriminant), "w") as bucketcsv: + bucketcsv.write( + "weekstart,msgs1,msgs9,msgs40,msgsrest,users1,users9,users40,userrest,newusercount,newuseractions," + "monthuseractions,yearuseractions,olderuseractions,newspammers,spamactions,botactions,relengactions\n" + ) bucketcsv.flush() - - while starttime < datetime.datetime.now() + datetime.timedelta(42): # weeks in the future because see below - endtime = starttime + datetime.timedelta(7) - weekinfo = WeekActions(starttime, collections.Counter(), collections.Counter(), collections.Counter(), collections.Counter()) - weekbreakdown=collections.Counter() - print("Working on %s / %s" % (discriminant, starttime.strftime("%Y-%m-%d")),) + while starttime < datetime.datetime.now() + datetime.timedelta( + 42 + ): # weeks in the future because see below + endtime = starttime + datetime.timedelta(7) + weekinfo = WeekActions( + starttime, + collections.Counter(), + collections.Counter(), + collections.Counter(), + collections.Counter(), + ) + weekbreakdown = collections.Counter() + + print( + "Working on %s / %s" % (discriminant, starttime.strftime("%Y-%m-%d")), + ) + + msgcachefile = ( + "cache/" + discriminant + "." + starttime.strftime("%Y-%m-%d") + ".pickle" + ) - msgcachefile = "cache/" + discriminant + "." + starttime.strftime("%Y-%m-%d") + ".pickle" - if os.path.exists(msgcachefile): - with open(msgcachefile,"r") as msgcache: - [firstseen,lastseen,weekinfo,weekbreakdown]=pickle.load(msgcache) - print("(cached)") + with open(msgcachefile, "r") as msgcache: + [firstseen, lastseen, weekinfo, weekbreakdown] = pickle.load(msgcache) + print("(cached)") else: - - for attempt in range(10): - try: - messages = utils.grep( - rows_per_page=100, - meta='usernames', - start=int((starttime-epoch).total_seconds()), - end=int((endtime - epoch).total_seconds()), - order='asc', # Start at the beginning, end at now. - topic=discriminant, - # Cut this stuff out, because its just so spammy. - not_user=['anonymous','koschei'], - not_topic=verboten, - ) - except IOError: - print("Retrying.") - time.sleep(5) - else: - break - else: - raise TimeoutError("too much timeout") - - for i, msg in enumerate(messages): - # sanity check - if msg['topic'] in verboten: - raise InvalidDiscriminantError("hell") - - for user in msg['meta']['usernames']: - if user == 'releng': - weekinfo.nonhuman['relengactions'] +=1 - continue - if user in bots: - weekinfo.nonhuman['botactions'] +=1 - continue - if user in spammers: - weekinfo.nonhuman['spamactions'] +=1 - if not user in firstseen: - firstseen[user]=datetime.datetime.fromtimestamp(msg['timestamp']) - weekinfo.nonhuman['newspammers'] +=1 - continue - if '@' in user: - # some msgs put email for anon users - continue - if ipaddrre.match(user): - # some msgs (wiki) put ip addr for anon users - continue - - weekinfo.useractions[user] += 1 - weekbreakdown[user] += 1 - - if not user in firstseen: - firstseen[user]=datetime.datetime.fromtimestamp(msg['timestamp']) - - if (starttime - firstseen[user]).days < 7: - weekinfo.actionsbyage['new'] += 1 - elif (starttime - firstseen[user]).days < 31: - weekinfo.actionsbyage['month'] += 1 - elif (starttime - firstseen[user]).days < 365: - weekinfo.actionsbyage['year'] += 1 - else: - weekinfo.actionsbyage['older'] += 1 - - lastseen[user]=datetime.datetime.fromtimestamp(msg['timestamp']) - - - if i % 50 == 0: - sys.stdout.write(".") - sys.stdout.flush() - - print() - #pprint.pprint(dict(weekinfo.useractions)) - - # don't cache the current week (may not be comlete), and definitely - # don't cache the future weeks (certainly not complete) - if endtime < (datetime.datetime.now() - datetime.timedelta(1)) : - sys.stdout.write("Saving... ") - sys.stdout.flush() - with open(msgcachefile+".temp","wb") as msgcache: - pickle.dump((firstseen,lastseen,weekinfo,weekbreakdown),msgcache) - os.rename(msgcachefile+".temp",msgcachefile) - print("saved.") + for attempt in range(10): + try: + messages = utils.grep( + rows_per_page=100, + meta="usernames", + start=int((starttime - epoch).total_seconds()), + end=int((endtime - epoch).total_seconds()), + order="asc", # Start at the beginning, end at now. + topic=discriminant, + # Cut this stuff out, because its just so spammy. + not_user=["anonymous", "koschei"], + not_topic=verboten, + ) + except IOError: + print("Retrying.") + time.sleep(5) + else: + break + else: + raise TimeoutError("too much timeout") + + for i, msg in enumerate(messages): + # sanity check + if msg["topic"] in verboten: + raise InvalidDiscriminantError("hell") + + for user in msg["meta"]["usernames"]: + if user == "releng": + weekinfo.nonhuman["relengactions"] += 1 + continue + if user in bots: + weekinfo.nonhuman["botactions"] += 1 + continue + if user in spammers: + weekinfo.nonhuman["spamactions"] += 1 + if user not in firstseen: + firstseen[user] = datetime.datetime.fromtimestamp( + msg["timestamp"] + ) + weekinfo.nonhuman["newspammers"] += 1 + continue + if "@" in user: + # some msgs put email for anon users + continue + if ipaddrre.match(user): + # some msgs (wiki) put ip addr for anon users + continue + + weekinfo.useractions[user] += 1 + weekbreakdown[user] += 1 + + if user not in firstseen: + firstseen[user] = datetime.datetime.fromtimestamp( + msg["timestamp"] + ) + + if (starttime - firstseen[user]).days < 7: + weekinfo.actionsbyage["new"] += 1 + elif (starttime - firstseen[user]).days < 31: + weekinfo.actionsbyage["month"] += 1 + elif (starttime - firstseen[user]).days < 365: + weekinfo.actionsbyage["year"] += 1 + else: + weekinfo.actionsbyage["older"] += 1 + + lastseen[user] = datetime.datetime.fromtimestamp(msg["timestamp"]) + + if i % 50 == 0: + sys.stdout.write(".") + sys.stdout.flush() + + print() + # pprint.pprint(dict(weekinfo.useractions)) + + # don't cache the current week (may not be comlete), and definitely + # don't cache the future weeks (certainly not complete) + if endtime < (datetime.datetime.now() - datetime.timedelta(1)): + sys.stdout.write("Saving... ") + sys.stdout.flush() + with open(msgcachefile + ".temp", "wb") as msgcache: + pickle.dump( + (firstseen, lastseen, weekinfo, weekbreakdown), msgcache + ) + os.rename(msgcachefile + ".temp", msgcachefile) + print("saved.") ring.append(weekinfo) - - # okay, so, bear with me here. Comments are for explaining confusing # conceptual things in code, right? okay, hold on to your seats. @@ -223,62 +236,117 @@ with open('data/%s.bucketed-activity.csv' % (discriminant), 'w') as bucketcsv: # gonna write lines from 6 weeks earlier, because finally we have the # needed info. so, we jump back 6 weeks (42 days) from starttime. # this is the same as jumping back 7 elements in the deque (if it's that deep) - - if len(ring)>6: + + if len(ring) > 6: # first, we're bucketing all the users by percent of activity - usertotals=collections.Counter() + usertotals = collections.Counter() for week in ring: usertotals += week.useractions userrank = {} userbucket = {} - i=len(usertotals)+1 - for name in sorted(usertotals,key=usertotals.get): - userrank[name]=i - i-=1 - if i