From 726bf038a3aeb3a0f12ef5aaa83eb5c7f9b69a30 Mon Sep 17 00:00:00 2001 From: LOGON\212767881 Date: Feb 02 2024 14:56:14 +0000 Subject: compte health stats and generate a graph with it --- diff --git a/statistics_processing/generate_graph.py b/statistics_processing/generate_graph.py index 32f4e66..9b05474 100644 --- a/statistics_processing/generate_graph.py +++ b/statistics_processing/generate_graph.py @@ -2,13 +2,11 @@ import inspect import logging import os import shutil -from datetime import datetime import numpy as np import pandas as pd import matplotlib.pyplot as plt import matplotlib.dates as mdates -from matplotlib.ticker import FuncFormatter # Create a temporary folder to store the generated graphs temp_folder = "generated_graph" @@ -59,6 +57,7 @@ def compute_stats(df): def generate_graphs_for_each_language_and_fedora_version(language_fedora_version_files): + local_stats = {} for language, file_paths in language_fedora_version_files.items(): custom_logger(f" {language}") num_packages_list = [] @@ -71,73 +70,93 @@ def generate_graphs_for_each_language_and_fedora_version(language_fedora_version stats = compute_stats(df) num_packages_list.append(stats['num_packages']) total_translated_words_list.append(stats['total_translated_words']) - progress_percentage_list.append(stats['progress_percentage']) fedora_version = os.path.basename(os.path.dirname(file_path)) + fedora_version_date = pd.to_datetime(fedora_versions[fedora_version]) + progress_percentage_list.append((fedora_version_date, stats['progress_percentage'])) progress_percentage_list_d.append( - stats['total_translated_words'] / fedora_total_words[fedora_version] * 100) - fedora_version_dates.append(pd.to_datetime(fedora_versions[fedora_version])) - - # Convert lists to numpy arrays - num_packages_array = np.array(num_packages_list) - total_translated_words_array = np.array(total_translated_words_list) - progress_percentage_array = np.array(progress_percentage_list) - progress_percentage_d_array = np.array(progress_percentage_list_d) - fedora_version_dates_array = np.array(fedora_version_dates) + (fedora_version_date, stats['total_translated_words'] / fedora_total_words[fedora_version] * 100)) + fedora_version_dates.append(fedora_version_date) - # Sort the data by date - sorted_indices = np.argsort(fedora_version_dates_array) - fedora_version_dates_array = fedora_version_dates_array[sorted_indices] - num_packages_array = num_packages_array[sorted_indices] - total_translated_words_array = total_translated_words_array[sorted_indices] - progress_percentage_array = progress_percentage_array[sorted_indices] - progress_percentage_d_array = progress_percentage_d_array[sorted_indices] - - fig, ax = plt.subplots(1, 3, figsize=(18, 6)) - plt.title(f"Fedora Linux translation progress for {language}\n{datetime.today().strftime('%Y-%m-%d')}") - - ax[0].plot(fedora_version_dates_array, num_packages_array, label='Number of packages', marker='o') - ax[0].xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d')) - ax[0].yaxis.set_major_formatter(FuncFormatter(lambda x, pos: f'{int(x)}')) - ax[0].set_xlabel('End-of-life date') - ax[0].set_ylabel('Number of packages') - ax[0].legend() - ax[0].set_ylim(bottom=0) - - ax[1].plot(fedora_version_dates_array, total_translated_words_array, label='Total number of translated words', - marker='o') - ax[1].xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d')) - ax[1].yaxis.set_major_formatter(FuncFormatter(lambda x, pos: f'{x / 1000}K')) - ax[1].set_xlabel('End-of-life date') - ax[1].set_ylabel('Total number of translated words') - ax[1].legend() - ax[1].set_ylim(bottom=0) - - color = 'tab:blue' - line1, = ax[2].plot(fedora_version_dates_array, progress_percentage_array, color=color, marker='o') - ax[2].xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d')) - ax[2].yaxis.set_major_formatter(FuncFormatter(lambda x, pos: f'{int(x)}%')) - ax[2].set_xlabel('End-of-life date') - ax[2].set_ylabel('Progress percentage (%)') - ax[2].set_ylim(bottom=0) - - ax2 = ax[2].twinx() # instantiate a second axes that shares the same x-axis - - color = 'tab:red' - line2, = ax2.plot(fedora_version_dates_array, progress_percentage_d_array, color=color, marker='o') - ax2.set_ylabel('Progress percentage (distribution)', color=color) # we already handled the x-label with ax1 - ax2.yaxis.set_major_formatter(FuncFormatter(lambda x, pos: f'{int(x)}%')) - ax2.tick_params(axis='y', labelcolor=color) - - # Create a legend for the two lines - ax[2].legend((line1, line2), ('Progress percentage', 'Progress percentage (distribution)'), loc='upper left') - - # Rotate the x-axis labels for better readability - for a in ax: - plt.setp(a.xaxis.get_majorticklabels(), rotation=45) + dates, values = zip(*progress_percentage_list[-10:]) + dates = np.array([np.datetime64(date) for date in dates]) + dates = (dates - dates.min()) / np.timedelta64(1, 'D') + try: + local_health, _ = np.polyfit(dates, values, 1) # Compute linear regression + except Exception: + local_health = 0 + + _, values = zip(*progress_percentage_list_d[-min(len(progress_percentage_list_d), 20):]) + try: + distribution_health, _ = np.polyfit(dates, values, 1) # Compute linear regression + except Exception: + distribution_health = 0 + + if language not in local_stats.keys(): + local_stats[language] = {} + local_stats[language]["num_packages"] = int(num_packages_list[-1]) + local_stats[language]['total_translated_words'] = int(total_translated_words_list[-1]) + local_stats[language]['progress_percentage'] = round(progress_percentage_list[-1][1], 2) + local_stats[language]['progress_percentage_d'] = round(progress_percentage_list_d[-1][1], 2) + local_stats[language]['local_health'] = round(local_health * 100, 2) + local_stats[language]['distribution_health'] = round(distribution_health * 100, 4) + + # Sorting the dates + sorted_indices = sorted(range(len(fedora_version_dates)), key=lambda k: fedora_version_dates[k]) + fedora_version_dates = [fedora_version_dates[i] for i in sorted_indices] + num_packages_list = [num_packages_list[i] for i in sorted_indices] + total_translated_words_list = [total_translated_words_list[i] for i in sorted_indices] + progress_percentage_list = [(fedora_version_dates[i], progress_percentage_list[i][1]) for i in sorted_indices] + progress_percentage_list_d = [(fedora_version_dates[i], progress_percentage_list_d[i][1]) for i in + sorted_indices] + + fig, ax = plt.subplots(1, 3, figsize=(15, 5)) + + # Graph 1: Number of packages progress + ax[0].plot(fedora_version_dates, num_packages_list, marker='o') + ax[0].set_title(f'{language} - Number of Packages Progress') + ax[0].set_xlabel('Date') + ax[0].set_ylabel('Number of Packages') + ax[0].xaxis.set_major_formatter(mdates.DateFormatter('%Y')) + ax[0].xaxis.set_major_locator(mdates.YearLocator()) + ax[0].tick_params(axis='x', rotation=45) # Add this line + ax[0].set_ylim(bottom=0) # Start y-axis at 0 + ax[0].yaxis.get_major_formatter().set_useOffset(False) # Always display y-axis values as integers + + # Graph 2: Total translated words progress + ax[1].plot(fedora_version_dates, total_translated_words_list, marker='o', color='g') + ax[1].set_title(f'{language} - Total Translated Words Progress') + ax[1].set_xlabel('Date') + ax[1].set_ylabel('Total Translated Words') + ax[1].xaxis.set_major_formatter(mdates.DateFormatter('%Y')) + ax[1].xaxis.set_major_locator(mdates.YearLocator()) + ax[1].tick_params(axis='x', rotation=45) # Add this line + ax[1].set_ylim(bottom=0) # Start y-axis at 0 + ax[1].yaxis.set_major_formatter( + plt.FuncFormatter(lambda x, _: '{:.0f}k'.format(x * 1e-3))) # Display y-axis values in thousands (k) + + # Graph 3: Global progress + ax2 = ax[2].twinx() + ax[2].plot(fedora_version_dates, [i[1] for i in progress_percentage_list], marker='o', color='b') + ax2.plot(fedora_version_dates, [i[1] for i in progress_percentage_list_d], marker='o', color='r') + ax[2].set_title(f'{language} - Global Progress') + ax[2].set_xlabel('Date') + ax[2].set_ylabel('Progress Percentage', color='b') + ax2.set_ylabel('Progress Percentage (D)', color='r') + ax[2].xaxis.set_major_formatter(mdates.DateFormatter('%Y')) + ax[2].xaxis.set_major_locator(mdates.YearLocator()) + ax[2].tick_params(axis='x', rotation=45) # Add this line + ax[2].set_ylim(bottom=0) # Start y-axis at 0 + ax[2].yaxis.set_major_formatter( + plt.FuncFormatter('{:.1f}%'.format)) # Display y-axis values as percentage with one number precision + ax2.set_ylim(bottom=0) # Start y-axis at 0 + ax2.yaxis.set_major_formatter( + plt.FuncFormatter('{:.1f}%'.format)) # Display y-axis values as percentage with one number precision plt.tight_layout() plt.savefig(os.path.join(temp_folder, f'{language}_trend.png')) - plt.close(fig) # Close the figure after it's saved + plt.close(fig) + + pd.DataFrame(local_stats).transpose().to_csv("generated_stats.csv") def generate_combined_graphs_for_all_languages(language_fedora_version_files): all_stats = { @@ -171,21 +190,21 @@ def generate_combined_graphs_for_all_languages(language_fedora_version_files): all_stats['total_translated_words'][language] = total_translated_words_list all_stats['progress_percentage'][language] = progress_percentage_list - for metric in all_stats.keys(): - custom_logger(f" {metric}") + for language in all_stats.keys(): + custom_logger(f" Generate plots for {language}") fig, ax = plt.subplots(figsize=(10, 6)) - for language in all_stats[metric].keys(): - dates, values = zip(*all_stats[metric][language]) + for language in all_stats[language].keys(): + dates, values = zip(*all_stats[language][language]) ax.plot(dates, values, label=language, marker='o') ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m-%d')) ax.set_xlabel('End-of-life date') - ax.set_ylabel(metric) + ax.set_ylabel(language) ax.legend() ax.set_ylim(bottom=0) plt.xticks(rotation=45) plt.tight_layout() - plt.savefig(os.path.join(temp_folder, f'all_languages_{metric}.png')) - plt.close(fig) # Close the figure after it's saved + plt.savefig(os.path.join(temp_folder, f'all_languages_{language}.png')) + plt.close(fig) def calculate_trend_per_day(versions, words): @@ -213,8 +232,8 @@ def custom_logger(message): logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S') custom_logger("compute linear regression") -coeffs = calculate_trend_per_day(fedora_versions, fedora_total_words) -custom_logger(f"The effort rendline is: y = {round(coeffs[0])} words per day + {round(coeffs[1])} words") +fedora_effort_per_day, fedora_effort_backlog = calculate_trend_per_day(fedora_versions, fedora_total_words) +custom_logger(f"The effort rendline is: y = {fedora_effort_per_day} words per day + {fedora_effort_backlog} words") custom_logger("start extracting stats") language_fedora_version_files = list_languages_and_fedora_versions('csv') diff --git a/statistics_processing/generate_plot.py b/statistics_processing/generate_plot.py new file mode 100644 index 0000000..c0b2fc2 --- /dev/null +++ b/statistics_processing/generate_plot.py @@ -0,0 +1,48 @@ +# Convert the CSV data to a pandas DataFrame +import os + +import pandas as pd +from matplotlib import pyplot as plt + +df = pd.read_csv("generated_stats.csv") + +# Exclude languages with less than 20 packages +excluded_languages = df[df['num_packages'] < 20]['language'] +print("Excluded languages:") +print(excluded_languages) + +# Filter the DataFrame to include only languages with 20 or more packages +df = df[df['num_packages'] >= 20] + +# Reset the index of the DataFrame +df.reset_index(drop=True, inplace=True) + +# Define the colors and markers based on local_health +colors = df['local_health'].apply(lambda x: 'green' if x > 0 else ('gray' if x == 0 else 'red')) +markers = df['local_health'].apply(lambda x: 's' if x > 0 else ('o' if x == 0 else '^')) + +# Determine the global minimum and maximum for x and y axes +x_min = df['progress_percentage'].min() +x_max = df['progress_percentage'].max() +y_min = df['num_packages'].min() +y_max = df['num_packages'].max() + +# Create a figure with three subplots +fig, axs = plt.subplots(1, 3, figsize=(15, 5)) + +# Plot the data in each subplot +local_health_conditions = [(df['local_health'] > 0), (df['local_health'] == 0), (df['local_health'] < 0)] +for i, ax in enumerate(axs): + df_subset = df[local_health_conditions[i]] + if not df_subset.empty: + for j in df_subset.index: + ax.scatter(df_subset['progress_percentage'].loc[j], df_subset['num_packages'].loc[j], color=colors.loc[j], marker=markers.loc[j]) + ax.text(df_subset['progress_percentage'].loc[j], df_subset['num_packages'].loc[j], df_subset['language'].loc[j]) + ax.set_xlabel('Translation Progress') + ax.set_ylabel('Number of Packages') + ax.set_title(f'Local_health = {"positive" if i==0 else ("zero" if i==1 else "negative")})') + ax.set_xlim([x_min, x_max]) + ax.set_ylim([y_min, y_max]) + +plt.tight_layout() +plt.savefig(os.path.join(f'all_languages_plot.png')) \ No newline at end of file