#6844 Add a script that will get the packages that have all the specified branches but are retired
Merged 6 years ago by ralph. Opened 6 years ago by mprahl.

@@ -0,0 +1,109 @@

+ """ get-retired-packages.py - Gets all the packages that have all of the

+ specified branches marked as inactive

+ """

+ from __future__ import print_function

+ try:

+     from urllib import urlencode

+ except ImportError:

+     from urllib.parse import urlencode

+ import multiprocessing.pool

+ from multiprocessing import cpu_count

+ import argparse

+ from math import ceil

+ from functools import partial

+ import sys

+ import traceback

+ import requests

+ 

+ PDC_URL = 'https://pdc.fedoraproject.org'

+ # These are set to 4 so that there is a performance gain but it should be low

+ # enough to not overwhelm PDC

+ MAX_NUM_PROCESSES = 4

+ NUM_THREADS_PER_PROCESS = 4

+ 

+ 

def handle_errors(f):
    """ Decorate `f` so that any exception it raises has its traceback
    printed before the exception propagates.

    multiprocessing pools re-raise a worker's exception in the parent without
    the worker-side traceback, so it is printed here, where the failure
    actually happened.

    :param f: the callable to wrap
    :return: a wrapper that takes the same arguments as `f` and carries its
        metadata
    """
    # Imported locally so the decorator does not depend on the module's
    # top-level import list
    from functools import wraps

    # wraps() copies __name__ and __doc__ and, on Python 3, __qualname__.
    # Python 3 pickles functions by __qualname__, so without it a decorated
    # module-level function cannot be handed to a multiprocessing.Pool.
    @wraps(f)
    def _wrapper(*args, **kwargs):
        try:
            return f(*args, **kwargs)
        except Exception:
            traceback.print_exc()
            raise
    return _wrapper

+ 

+ 

def get_component_branch_page(branch_name, page, page_size=100):
    """ Fetch one page of inactive RPM component branches from PDC.

    :param branch_name: the branch name to filter on (sent as `name`)
    :param page: the page number to request
    :param page_size: the number of results to request per page
    :return: the decoded JSON body of the response
    :raises RuntimeError: if PDC cannot be reached, answers with an error
        status, or returns a body that is not JSON
    """
    query_args = {'type': 'rpm', 'name': branch_name, 'active': False,
                  'page_size': page_size, 'page': page}
    pdc_api_query_url = '{0}/rest_api/v1/component-branches/?{1}'.format(
        PDC_URL.rstrip('/'), urlencode(query_args))
    # Failures are raised rather than passed to sys.exit(): this function runs
    # inside pool workers, where SystemExit is not forwarded to the caller and
    # the pending map() would wait forever for the lost result.
    try:
        rv = requests.get(pdc_api_query_url, timeout=30)
    except (requests.ConnectionError, requests.Timeout):
        # requests.Timeout also covers read timeouts, which are not
        # ConnectionErrors
        raise RuntimeError('The connection to PDC failed')

    if not rv.ok:
        raise RuntimeError('PDC returned the HTTP status code {0}'.format(
            rv.status_code))

    try:
        return rv.json()
    except ValueError:
        raise RuntimeError('The data returned from PDC was not JSON')

+ 

+ 

def get_pkgs_from_page(branch_name, page):
    """ Return the set of package names found on one page of PDC results.

    :param branch_name: the branch name to query
    :param page: the page number to fetch
    :return: a set with the `global_component` of every result, as str
    """
    page_json = get_component_branch_page(branch_name, page)
    # Each result describes one component branch; only the package name is
    # of interest here
    return {str(entry['global_component']) for entry in page_json['results']}

+ 

+ 

@handle_errors
def get_pkg_branch_status(branch_name):
    """ Return the set of packages whose `branch_name` branch is inactive.

    The result pages are fetched concurrently, NUM_THREADS_PER_PROCESS at a
    time.

    :param branch_name: the branch name to query
    :return: a set of package names; empty if nothing matched
    """
    # Must equal the default page_size of get_component_branch_page, which is
    # what get_pkgs_from_page ends up requesting
    page_size = 100
    # A single-entry page is enough to learn the total number of matches
    component_branch_page_one = \
        get_component_branch_page(branch_name, page=1, page_size=1)
    total = component_branch_page_one['count']
    # Integer ceiling division. ceil(total / 100) floors first on Python 2,
    # which silently drops the last, partially filled page.
    num_pages = (total + page_size - 1) // page_size
    # Since we are going to multi-thread, we need to make a partial function
    # call so that all the function needs is an iterable to run
    partial_get_pkgs_from_page = partial(get_pkgs_from_page, branch_name)
    pool = multiprocessing.pool.ThreadPool(NUM_THREADS_PER_PROCESS)
    try:
        pkg_sets = pool.map(
            partial_get_pkgs_from_page, range(1, num_pages + 1))
    finally:
        # Release the worker threads even if one of the pages failed
        pool.close()
        pool.join()
    # Union of all pages; this is simply an empty set when there were none
    return set().union(*pkg_sets)

+ 

+ 

if __name__ == '__main__':
    # Command line: one or more branch names
    parser = argparse.ArgumentParser()
    parser.add_argument(
        'branches', nargs='+',
        help='the branches that the returned packages will have retired')
    args = parser.parse_args()

    # Process up to num_processes branches at a time in separate processes,
    # never using more than MAX_NUM_PROCESSES
    num_processes = min(cpu_count(), MAX_NUM_PROCESSES)
    pool = multiprocessing.Pool(processes=num_processes)
    try:
        pkg_sets = pool.map(get_pkg_branch_status, args.branches)
    finally:
        # Shut the workers down even if a branch query failed
        pool.close()
        pool.join()

    # Return only the packages that have all the specified branches and are
    # retired
    pkgs = set.intersection(*pkg_sets)
    if pkgs:
        for pkg in sorted(pkgs):
            print(pkg)
    else:
        # file= belongs to print(), not to str.format(), so that the message
        # really goes to stderr
        print('No retired packages were returned from the branches: {0}'
              .format(', '.join(args.branches)), file=sys.stderr)

Add a script that will get the packages that have all the specified branches but are retired (inactive in PDC).

Please note that this script will only function after the following is deployed in Fedora's PDC:
https://github.com/product-definition-center/product-definition-center/pull/433/commits/ad0de383ed19e6ea699f6d64896bb540c75d280b

rebased

6 years ago

Should specify in the docstring:

  • have any of the specified branches marked as...

or

  • have all of the specified branches marked as inactive...

I got the following error when trying to run this:

scripts/pdc❯ python get-retired-packages.py master
Traceback (most recent call last):
  File "get-retired-packages.py", line 70, in <module>
    pkg_sets = pool.map(get_pkg_branch_status, args.branches)
  File "/usr/lib64/python2.7/multiprocessing/pool.py", line 251, in map
    return self.map_async(func, iterable, chunksize).get()
  File "/usr/lib64/python2.7/multiprocessing/pool.py", line 567, in get
    raise self._value
TypeError: descriptor 'union' of 'set' object needs an argument

Can we get some better error reporting?

To explain more: I think multiprocessing.pool is masking the real traceback inside one of the subprocesses. Can you catch the error in get_pkg_branch_status and log the details before crashing?

This would do it:

diff --git a/scripts/pdc/get-retired-packages.py b/scripts/pdc/get-retired-packages.py
index 0d160a9..5e45355 100644
--- a/scripts/pdc/get-retired-packages.py
+++ b/scripts/pdc/get-retired-packages.py
@@ -7,6 +7,7 @@ except ImportError:
     from urllib.parse import urlencode
 import multiprocessing.pool
 import argparse
+import traceback
 from math import ceil
 from functools import partial
 import requests
@@ -47,6 +48,20 @@ def get_pkgs_from_page(branch_name, page):
     return pkgs_set


+def handle_errors(f):
+    def _wrapper(*args, **kwargs):
+        """ A decorator for `f` that prints tracebacks. """
+        try:
+            return f(*args, **kwargs)
+        except:
+            traceback.print_exc()
+            raise
+    _wrapper.__name__ = f.__name__
+    _wrapper.__doc__ = f.__doc__
+    return _wrapper
+
+
+@handle_errors
 def get_pkg_branch_status(branch_name):
     # Get total number of branches that fit the query
     component_branch_page_one = \

It might also be nice to print a warning to stderr if there are no retired packages found, like this:

diff --git a/scripts/pdc/get-retired-packages.py b/scripts/pdc/get-retired-packages.py
index 0d160a9..05f702c 100644
--- a/scripts/pdc/get-retired-packages.py
+++ b/scripts/pdc/get-retired-packages.py
@@ -1,12 +1,16 @@
 """ get-retired-packages.py - Gets all the packages that have the specified
 branches marked as inactive in PDC
 """
+from __future__ import print_function
+
 try:
     from urllib import urlencode
 except ImportError:
     from urllib.parse import urlencode
+
 import multiprocessing.pool
 import argparse
+import sys
 from math import ceil
 from functools import partial
 import requests
@@ -28,13 +32,13 @@ def get_component_branch_page(branch_name, page, page_size=100):
         rv = requests.get(pdc_api_query_url, timeout=30)
     except (requests.ConnectionError, requests.ConnectTimeout):
         print('The connection to PDC failed')
-        exit(1)
+        sys.exit(1)

     try:
         return rv.json()
     except ValueError:
         print('The data returned from PDC was not JSON')
-        exit(1)
+        sys.exit(1)


 def get_pkgs_from_page(branch_name, page):
@@ -61,7 +65,10 @@ def get_pkg_branch_status(branch_name):
     pkg_sets = pool.map(partial_get_pkgs_from_page, range(1, num_pages + 1))
     pool.close()
     # Return a set of all the packages from the pages queried
-    return set.union(*pkg_sets)
+    if pkg_sets:
+        return set.union(*pkg_sets)
+    else:
+        return set()


 if __name__ == '__main__':
@@ -72,5 +79,9 @@ if __name__ == '__main__':

     # Return only the packages that have all the specified branches and are
     # retired
-    for pkg in sorted(list(set.intersection(*pkg_sets))):
-        print(pkg)
+    pkgs = list(set.intersection(*pkg_sets))
+    if pkgs:
+        for pkg in sorted(pkgs):
+            print(pkg)
+    else:
+        print("No retired packages on %r" % args.branches, file=sys.stderr)

1 new commit added

  • Address feedback
6 years ago

2 new commits added

  • Address feedback
  • Add the get-retired-packages.py script. This script will only function after the following is deployed in Fedora's PDC:
6 years ago

Thanks for the review. I addressed the feedback in the latest commit.

IMHO this should be printed to stderr because it is an error message.

IMHO this should be printed to stderr because it is an error message.

IMHO it would be good to explain why 8 is a good choice here. It is not obvious to me, I must confess.

Is it safe to assume that pages will always contain 100 entries? It does not seem to be set in the actual queries.

The lines above this comment, after assigning the help variable, should IMHO be moved into the main conditional below.

Why exactly four branches? Is it because of el6, epel7, branched and rawhide are the usual suspects?

Yes, I'm using a page_size of 100 in get_component_branch_page by default.

It is just an arbitrary amount to improve performance without overloading PDC. I can make this a configuration option.

I'll set a configuration option called MAX_NUM_PROCESSES in the file which will dictate the maximum number of processes to create. If the number of CPUs is lower than that value, the number of CPUs will be used for the number of processes to create.

rebased

6 years ago

@till thanks for the review. I addressed your comments.

Pull-Request has been merged by ralph

6 years ago
Metadata