#4 Change the way we retrieve the package's summary and a couple of quick fixes
Merged 4 years ago by nphilipp. Opened 4 years ago by pingou.
fedora-infra / pingou/distgit-bugzilla-sync: master into master

file added: package_summary.py
+210
@@ -0,0 +1,210 @@

+ #!/usr/bin/python3

+ 

+ """

+ This module provides the functionality to download the latest primary.xml

+ database from koji on the rawhide repo.

+ Decompress that XML file (which is downloaded compressed).

+ Read its content and build a dictionary with the package names as keys

+ and their summaries as values.

+ 

+ This code can then be used to create an in-memory cache of this information

+ which can then later be re-used in other places.

+ This avoids relying on remote services such as mdapi (from which much of

+ this code originates) when the summaries of many packages are needed.

+ 

+ """

+ import contextlib

+ import hashlib

+ import logging

+ import os

+ import time

+ import xml.etree.ElementTree as ET

+ import xml.sax

+ 

+ import defusedxml.sax

+ import requests

+ 

+ KOJI_REPO = 'https://kojipkgs.fedoraproject.org/repos/'

+ 
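+ # XML namespaces used in repomd.xml; ElementTree needs these prefixes to

+ # resolve namespaced tags (e.g. repo:location) in the find() calls below.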

+ repomd_xml_namespace = {

+     'repo': 'http://linux.duke.edu/metadata/repo',

+     'rpm': 'http://linux.duke.edu/metadata/rpm',

+ }

+ 

+ log = logging.getLogger(__name__)

+ 

+ 

+ def download_db(name, repomd_url, archive):
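+     ''' Download the file at repomd_url and save it as archive. '''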

+     log.info('%s Downloading file: %s to %s' % (

+         name.ljust(12), repomd_url, archive))

+     response = requests.get(repomd_url, verify=True)

+     with open(archive, 'wb') as stream:

+         stream.write(response.content)

+ 

+ 

+ def decompress_db(name, archive, location):

+     ''' Decompress the given archive at the specified location. '''

+     log.info('%s Extracting %s to %s' % (name.ljust(12), archive, location))

+     if archive.endswith('.xz'):

+         import lzma

+         with contextlib.closing(lzma.LZMAFile(archive)) as stream_xz:

+             data = stream_xz.read()

+         with open(location, 'wb') as stream:

+             stream.write(data)

+     elif archive.endswith('.tar.gz'):

+         import tarfile

+         with tarfile.open(archive) as tar:

+             tar.extractall(path=location)

+     elif archive.endswith('.gz'):

+         import gzip

+         with open(location, 'wb') as out:

+             with gzip.open(archive, 'rb') as inp:

+                 out.write(inp.read())

+     elif archive.endswith('.bz2'):

+         import bz2

+         with open(location, 'wb') as out:

+             bzar = bz2.BZ2File(archive)

+             out.write(bzar.read())

+             bzar.close()

+     else:

+         raise NotImplementedError(archive)

+ 

+ 

+ def needs_update(local_file, remote_sha, sha_type):

+     ''' Compare sha of a local and remote file.

+     Return True if our local file needs to be updated.

+     '''

+ 

+     if not os.path.isfile(local_file):

+         # If we have never downloaded this before, then obviously it has

+         # "changed"

+         return True

+ 

+     # Old old epel5 doesn't even know which sha it is using...

+     if sha_type == 'sha':

+         sha_type = 'sha1'

+ 

+     hasher = getattr(hashlib, sha_type)()

+     with open(local_file, 'rb') as f:

+         hasher.update(f.read())

+ 

+     local_sha = hasher.hexdigest()

+     if local_sha != remote_sha:

+         return True

+ 

+     return False

+ 

+ 

+ class PackageHandler(xml.sax.ContentHandler):
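+     ''' SAX handler that builds a {package name: summary} dictionary

+     while streaming through a primary.xml file. '''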

+     def __init__(self):

+         self.current_data = ""

+         self.name = ""

+         self.summary = ""

+         self.output = {}

+         self.pkg = {}

+ 

+     # Called when an element starts

+     def startElement(self, tag, attributes):

+         self.current_data = tag

+         if tag == "package":

+             if self.pkg:

+                 self.output[self.pkg["name"]] = self.pkg["summary"]

+             self.type = attributes["type"]

+             self.pkg = {}

+ 

+     # Called when character data is read

+     def characters(self, content):

+         if self.current_data == "summary":

+             self.summary = content

+         elif self.current_data == "name":

+             self.name = content

+ 

+     # Called when an element ends

+     def endElement(self, tag):

+         if self.current_data == "summary":

+             self.pkg["summary"] = self.summary

+         elif self.current_data == "name":

+             self.pkg["name"] = self.name

+ 

+         self.current_data = ""

+     # Called once at the end of the document: flush the final package,

+     # which would otherwise be lost because packages are only flushed

+     # when the next <package> element starts.

+     def endDocument(self):

+         if self.pkg:

+             self.output[self.pkg["name"]] = self.pkg["summary"]

+ 

+ 

+ def get_primary_xml(destfolder, url, name):

+     ''' Retrieve the repo metadata at the given url and store it using

+     the provided name.

+     '''

+     repomd_url = url + '/repomd.xml'

+     response = requests.get(repomd_url, verify=True)

+     if not response.ok:

+         log.warning('%s !! Failed to get %r %r' % (

+             name.ljust(12), repomd_url, response))

+         return

+ 

+     # Parse the xml doc and get a list of locations and their shasum.

+     files = ((

+         node.find('repo:location', repomd_xml_namespace),

+         node.find('repo:open-checksum', repomd_xml_namespace),

+     ) for node in ET.fromstring(response.text))

+ 

+     # Extract out the attributes that we're really interested in.

+     files = (

+         (f.attrib['href'].replace('repodata/', ''), s.text, s.attrib['type'])

+         for f, s in files if f is not None and s is not None

+     )

+ 

+     # Filter down to only the primary.xml files

+     files = list((f, s, t) for f, s, t in files if 'primary.xml' in f)

+ 

+     if not files:

+         log.debug('No primary.xml could be found in %s' % url)

+         return

+     elif len(files) > 1:

+         log.debug("More than one primary.xml could be found in %s" % url)

+         return

+ 

+     filename, shasum, shatype = files[0]

+     repomd_url = url + '/' + filename

+ 

+     # First, determine if the file has changed by comparing hash

+     db = "distgit-bugzilla-sync-primary.xml"

+ 

+     # Have we downloaded this before?  Did it change?

+     destfile = os.path.join(destfolder, db)

+     if not needs_update(destfile, shasum, shatype):

+         log.debug('%s No change of %s' % (name.ljust(12), repomd_url))

+     else:

+         # If it has changed, then download it and move it into place.

+         archive = os.path.join(destfolder, filename)

+ 

+         download_db(name, repomd_url, archive)

+         decompress_db(name, archive, destfile)

+ 

+     return destfile

+ 

+ 

+ def get_package_summary():
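+     ''' Return a {package name: summary} dictionary built from the

+     latest Rawhide primary.xml. '''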

+     start = time.time()

+ 

+     primary_xml = get_primary_xml(

+         "/var/tmp",

+         KOJI_REPO + 'rawhide/latest/x86_64/repodata',

+         "koji",

+     )

+ 

+     if primary_xml is None:

+         # No usable primary.xml was found; return an empty cache.

+         return {}

+ 

+     handler = PackageHandler()

+     defusedxml.sax.parse(primary_xml, handler)

+ 

+     delta = time.time() - start

+     log.info(f"Parsed in {delta} seconds -- ie: {delta/60} minutes")

+ 

+     return handler.output

+ 

+ 

+ if __name__ == "__main__":

+     logging.basicConfig(level=logging.DEBUG)

+     db = get_package_summary()

+     print(f"guake: {db.get('guake')}")

+     print(f"geany: {db.get('geany')}")

+     print(f"kernel: {db.get('kernel')}")

file modified
+18 -31
@@ -53,6 +53,8 @@ 

  from urllib3.util import Retry

  import yaml

  

+ import package_summary

+ 

  

  env = 'staging'

  
@@ -531,36 +533,11 @@ 

          print('Querying {0}'.format(pagure_override_url))

      override_rv = session.get(pagure_override_url, timeout=30)

      if override_rv.status_code == 200:

-         override_yaml = yaml.load(override_rv.text)

+         override_yaml = yaml.safe_load(override_rv.text)

          return override_yaml.get('bugzilla_contact', {})

      return {}

  

  

- @cache.cache_on_arguments()

- def _get_package_summary_from_mdapi(namespace, repo, session=None):

-     summary = None

-     if namespace != 'rpms':

-         return summary

- 

-     if session is None:

-         session = retry_session()

- 

-     url = '{0}/rawhide/srcpkg/{1}'.format(MDAPIURL.rstrip('/'), repo)

-     if VERBOSE:

-         print('Querying {0}'.format(url))

- 

-     rv = session.get(url, timeout=60)

-     if rv.ok:

-         rv_json = rv.json()

-         summary = rv_json['summary']

-     elif not rv.ok and rv.status_code != 404:

-         error_msg = ('The connection to "{0}" failed with the status code {1} '

-                      'and output "{2}"').format(url, rv.status_code, rv.text)

-         raise RuntimeError(error_msg)

- 

-     return summary

- 

- 

  def _get_pdc_branches(session, repo):

      """

      Gets the branches on a project. This function is used for mapping.
@@ -604,7 +581,7 @@ 

          return True

  

  

- def _to_legacy_schema(product_and_project, session=None):

+ def _to_legacy_schema(product_and_project_and_summary, session=None):

      """

      This function translates the JSON of a Pagure project to what PkgDB used to

      output in the Bugzilla API. This function is used for mapping.
@@ -615,7 +592,7 @@ 

      :return: a dictionary of the content that the PkgDB Bugzilla API would

      return

      """

-     product, project = product_and_project

+     product, project, rpm_summary = product_and_project_and_summary

  

      if session is None:

          session = retry_session()
@@ -623,8 +600,9 @@ 

      owner = project['poc']

      watchers = project['watchers']

  

-     summary = _get_package_summary_from_mdapi(

-         project['namespace'], project['name'], session)

+     summary = None

+     if project["namespace"] == "rpms":

+         summary = rpm_summary.get(project["name"])

  

      # Check if the project is retired in PDC, and if so set assignee to orphan.

      if _is_retired(product, project):
@@ -658,6 +636,7 @@ 

  def main():

      """The entrypoint to the script."""

      global VERBOSE, DRYRUN, projects_dict

+     start = time.time()

  

      parser = argparse.ArgumentParser(

          description='Script syncing information between Pagure and bugzilla'
@@ -708,6 +687,10 @@ 

          print("Querying %r for initial cc list." % cc_url)

      pagure_namespace_to_cc = session.get(cc_url, timeout=120).json()

  

+     if VERBOSE:

+         print("Building a cache of the rpm packages' summary")

+     rpm_summary = package_summary.get_package_summary()

+ 

      # Combine and collapse those two into a single list:

      pagure_projects = []

      for namespace, entries in pagure_namespace_to_poc.items():
@@ -758,7 +741,7 @@ 

      # would have returned

      p_to_legacy_schema = resilient_partial(_to_legacy_schema, session=session)

      items = [

-         (product, project)

+         (product, project, rpm_summary)

          for project in pagure_projects

          for product in project['products']

      ]
@@ -819,6 +802,10 @@ 

          with open(DATA_CACHE, 'w') as stream:

              json.dump({}, stream)

  

+     if VERBOSE:

+         delta = time.time() - start

+         print("Ran for %s seconds -- ie: %.2f minutes" % (delta, delta/60.0))

+ 

      sys.exit(0)
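
To summarize the net effect of the changes in this file: one mdapi HTTP
round-trip per package (cached via @cache.cache_on_arguments()) is replaced
by a dictionary lookup against the summary cache built once in main(). Side
by side, with the lines taken from the hunks above:

    # Before: query mdapi for every package:
    summary = _get_package_summary_from_mdapi(
        project['namespace'], project['name'], session)

    # After: a plain lookup in the cache built at startup:
    summary = None
    if project["namespace"] == "rpms":
        summary = rpm_summary.get(project["name"])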

  

  

no initial comment

"... which most often will be compressed)."?

BTW, if it's only "most often", should we cater for the case when it isn't below? Right now it looks as if decompress_db() would bail out with a NotImplementedError in this case.
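
A hypothetical fallback for that case (not part of this PR) could copy the
file through unchanged instead of raising, e.g. via a helper called from the
final else branch of decompress_db():

    import shutil

    def copy_uncompressed(archive, location):
        # Hypothetical helper: the archive has no recognized compression
        # suffix, so assume it is plain XML and copy it into place unchanged.
        shutil.copyfile(archive, location)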

"...with the package names as keys and their summaries as values."

"...an in-memory cache of this information..."

perhaps get rid of "XZ", since we support different compression formats here

The sys module isn't used:

./package_summary.py:20:1: F401 'sys' imported but unused
import sys
^
./package_summary.py:51:14: F821 undefined name 'contextlib'
        with contextlib.closing(lzma.LZMAFile(archive)) as stream_xz:
             ^

remove the spaces after/before brackets:

./package_summary.py:99:22: E201 whitespace after '('
class PackageHandler( xml.sax.ContentHandler ):
                     ^
./package_summary.py:99:45: E202 whitespace before ')'
class PackageHandler( xml.sax.ContentHandler ):
                                            ^

Here and below, let's indent by multiples of 4 spaces:

./package_summary.py:100:4: E111 indentation is not a multiple of four
   def __init__(self):
   ^
./package_summary.py:101:7: E111 indentation is not a multiple of four
      self.current_data = ""
      ^
./package_summary.py:102:7: E111 indentation is not a multiple of four
      self.name = ""
      ^
...
./package_summary.py:143:24: F821 undefined name 'padding'
            name.ljust(padding), repomd_url, response))
                       ^
./package_summary.py:163:5: F841 local variable 'cache1' is assigned to but never used
    cache1, cache2 = {}, {}
    ^
./package_summary.py:163:13: F841 local variable 'cache2' is assigned to but never used
    cache1, cache2 = {}, {}
            ^
./package_summary.py:201:5: F841 local variable 'parser' is assigned to but never used
    parser = defusedxml.sax.parse(primary_xml, handler)
    ^
./package_summary.py:210:1: E303 too many blank lines (3)
if ( __name__ == "__main__"):
^
./package_summary.py:210:5: E201 whitespace after '('
if ( __name__ == "__main__"):
    ^

The brackets are superfluous here, let's just remove them.
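
That is, the line flake8 flagged above becomes:

    if __name__ == "__main__":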

rephrasing: which are downloaded compressed.

used it at one point, but no longer indeed :)

Added the missing import

rebased onto c04c9383e1f533160cb4ad7fa3e1f58d89b9c9f5

4 years ago

rebased onto 4a9d30a9dd63df263085ec28949db4d258c950f5

4 years ago

As discussed:

  • appeased flake8 (except for one issue where it can't tell a third-party module apart from our own project-internal one; this will be fixed when we move the code into its own package)
  • changed _log to log because it's more prevalent

Pull-Request has been merged by nphilipp

4 years ago