| |
@@ -0,0 +1,210 @@
|
| |
+ #!/usr/bin/python3 -tt
|
| |
+
|
| |
+ """
|
| |
+ This module provides the functionality to download the latest primary.xml
|
| |
+ database from koji on the rawhide repo.
|
| |
+ Decompress that xml file (which is downloaded compressed).
|
| |
+ Read its content and build a dictionary with the package names as keys
|
| |
+ and their summaries as values.
|
| |
+
|
| |
+ This code can then be used to create an in-memory cache of this information
|
| |
+ which can then later be re-used in other places.
|
| |
+ This prevents relying on remote services such as mdapi (of which a lot of
|
| |
+ code here is coming from) when needing to access the summary of a lot of
|
| |
+ packages.
|
| |
+
|
| |
+ """
|
| |
+ import contextlib
|
| |
+ import hashlib
|
| |
+ import logging
|
| |
+ import os
|
| |
+ import time
|
| |
+ import xml.etree.ElementTree as ET
|
| |
+ import xml.sax
|
| |
+
|
| |
+ import defusedxml.sax
|
| |
+ import requests
|
| |
+
|
| |
+ KOJI_REPO = 'https://kojipkgs.fedoraproject.org/repos/'
|
| |
+
|
| |
+ repomd_xml_namespace = {
|
| |
+ 'repo': 'http://linux.duke.edu/metadata/repo',
|
| |
+ 'rpm': 'http://linux.duke.edu/metadata/rpm',
|
| |
+ }
|
| |
+
|
| |
+ log = logging.getLogger(__name__)
|
| |
+
|
| |
+
|
| |
def download_db(name, repomd_url, archive):
    ''' Download the file at the given url and save it at the given location.

    :arg name: name of the repo, used only as a prefix in log messages
    :arg repomd_url: url of the file to download
    :arg archive: local path the downloaded content is written to
    :raises requests.HTTPError: if the server answers with an error status
    '''
    log.info('%s Downloading file: %s to %s' % (
        name.ljust(12), repomd_url, archive))
    # A timeout prevents hanging forever on an unresponsive mirror.
    response = requests.get(repomd_url, verify=True, timeout=120)
    # Fail loudly instead of silently saving an HTML error page as the
    # database archive (the original wrote response.content unconditionally).
    response.raise_for_status()
    with open(archive, 'wb') as stream:
        stream.write(response.content)
|
| |
+
|
| |
+
|
| |
def decompress_db(name, archive, location):
    ''' Decompress the given archive at the specified location.

    :arg name: name of the repo, used only as a prefix in log messages
    :arg archive: path of the compressed file; the compression scheme is
        picked from its extension (.xz, .tar.gz, .gz, .bz2)
    :arg location: destination path -- a file for single-file archives,
        a directory for .tar.gz archives
    :raises NotImplementedError: if the extension is not recognised
    '''
    log.info('%s Extracting %s to %s' % (name.ljust(12), archive, location))
    if archive.endswith('.xz'):
        import lzma
        # lzma.open returns a proper context manager, so the old
        # contextlib.closing(LZMAFile(...)) dance is unnecessary.
        with lzma.open(archive, 'rb') as inp:
            with open(location, 'wb') as out:
                out.write(inp.read())
    elif archive.endswith('.tar.gz'):
        import tarfile
        # NOTE(review): extractall() on an untrusted tarball is vulnerable
        # to path traversal (members named "../.."); Python 3.12's
        # filter="data" would mitigate this -- confirm the target runtime.
        with tarfile.open(archive) as tar:
            tar.extractall(path=location)
    elif archive.endswith('.gz'):
        import gzip
        with open(location, 'wb') as out:
            with gzip.open(archive, 'rb') as inp:
                out.write(inp.read())
    elif archive.endswith('.bz2'):
        import bz2
        # Use a context manager so the archive handle is closed even if
        # reading fails (the original leaked the BZ2File on exception).
        with open(location, 'wb') as out:
            with bz2.open(archive, 'rb') as inp:
                out.write(inp.read())
    else:
        raise NotImplementedError(archive)
|
| |
+
|
| |
+
|
| |
def needs_update(local_file, remote_sha, sha_type):
    ''' Compare the checksum of a local and a remote file.

    :arg local_file: path of the previously downloaded file, if any
    :arg remote_sha: hex digest advertised by the remote repository
    :arg sha_type: hash algorithm name (e.g. 'sha256'); the legacy value
        'sha' is treated as 'sha1'
    :returns: True if our local file is missing or stale and needs to be
        (re-)downloaded, False otherwise
    '''
    if not os.path.isfile(local_file):
        # If we have never downloaded this before, then obviously it has
        # "changed"
        return True

    # Old old epel5 doesn't even know which sha it is using..
    if sha_type == 'sha':
        sha_type = 'sha1'

    # hashlib.new() resolves the algorithm by name; avoid shadowing the
    # `hash` builtin, and hash in chunks so a large primary.xml database
    # is not slurped into memory at once.
    digest = hashlib.new(sha_type)
    with open(local_file, 'rb') as stream:
        for chunk in iter(lambda: stream.read(1024 * 1024), b''):
            digest.update(chunk)

    return digest.hexdigest() != remote_sha
|
| |
+
|
| |
+
|
| |
class PackageHandler(xml.sax.ContentHandler):
    """SAX content handler that builds a ``{package name: summary}`` dict
    from a repodata primary.xml stream.

    The result is available in :attr:`output` once parsing finishes.
    """

    def __init__(self):
        super().__init__()
        # Tag of the element currently being read.
        self.current_data = ""
        # Text accumulators for the <name> and <summary> elements.
        self.name = ""
        self.summary = ""
        # Final mapping of package name -> summary.
        self.output = {}
        # Data collected for the <package> element being parsed.
        self.pkg = {}

    # Called when an element starts
    def startElement(self, tag, attributes):
        self.current_data = tag
        if tag == "package":
            # Flush the previously parsed package before starting a new one.
            if self.pkg:
                self.output[self.pkg["name"]] = self.pkg["summary"]
            self.type = attributes["type"]
            self.pkg = {}
        elif tag == "name":
            # Reset the accumulator: characters() may deliver the text
            # of one element in several chunks.
            self.name = ""
        elif tag == "summary":
            self.summary = ""

    # Called for text content; may fire several times per text node,
    # so accumulate instead of assigning (assignment could truncate
    # a name/summary that the parser split across buffer boundaries).
    def characters(self, content):
        if self.current_data == "summary":
            self.summary += content
        elif self.current_data == "name":
            self.name += content

    # Called when an element ends
    def endElement(self, tag):
        if self.current_data == "summary":
            self.pkg["summary"] = self.summary
        elif self.current_data == "name":
            self.pkg["name"] = self.name

        self.current_data = ""

    # Called once at the end of the document.
    def endDocument(self):
        # BUG FIX: flush the final package -- it was previously dropped
        # because flushing only happened on the *next* <package> start.
        if self.pkg and "name" in self.pkg:
            self.output[self.pkg["name"]] = self.pkg.get("summary", "")
|
| |
+
|
| |
+
|
| |
def get_primary_xml(destfolder, url, name):
    ''' Retrieve the repo metadata at the given url and store them using
    the provided name.

    :arg destfolder: directory where the downloaded files are stored
    :arg url: base url of the repodata folder (without trailing slash)
    :arg name: name of the repo, used only as a prefix in log messages
    :returns: path to the decompressed primary.xml file, or None when the
        metadata could not be retrieved or no single primary.xml was found
    '''
    repomd_url = url + '/repomd.xml'
    response = requests.get(repomd_url, verify=True)
    if not bool(response):
        print('%s !! Failed to get %r %r' % (
            name.ljust(12), repomd_url, response))
        return

    # Parse the xml doc and get a list of locations and their shasum.
    files = ((
        node.find('repo:location', repomd_xml_namespace),
        node.find('repo:open-checksum', repomd_xml_namespace),
    ) for node in ET.fromstring(response.text))

    # Extract out the attributes that we're really interested in.
    files = (
        (f.attrib['href'].replace('repodata/', ''), s.text, s.attrib['type'])
        for f, s in files if f is not None and s is not None
    )

    # Filter down to only the primary.xml files
    files = [(f, s, t) for f, s, t in files if 'primary.xml' in f]

    if not files:
        log.debug('No primary.xml could be found in %s' % url)
        # BUG FIX: without this return, execution fell through to
        # files[0] below and raised IndexError on an empty list.
        return
    elif len(files) > 1:
        log.debug("More than one primary.xml could be found in %s" % url)
        return

    filename, shasum, shatype = files[0]
    repomd_url = url + '/' + filename

    # First, determine if the file has changed by comparing hash
    db = "distgit-bugzilla-sync-primary.xml"

    # Have we downloaded this before? Did it change?
    destfile = os.path.join(destfolder, db)
    if not needs_update(destfile, shasum, shatype):
        log.debug('%s No change of %s' % (name.ljust(12), repomd_url))
    else:
        # If it has changed, then download it and move it into place.
        archive = os.path.join(destfolder, filename)

        download_db(name, repomd_url, archive)
        decompress_db(name, archive, destfile)

    return destfile
|
| |
+
|
| |
+
|
| |
def get_package_summary():
    ''' Return a dictionary mapping package names to their summaries,
    built from the latest rawhide primary.xml database on koji.

    :returns: dict of ``{package name: summary}``; empty when the repo
        metadata could not be retrieved
    '''
    start = time.time()

    primary_xml = get_primary_xml(
        "/var/tmp",
        KOJI_REPO + 'rawhide/latest/x86_64/repodata',
        "koji",
    )
    if primary_xml is None:
        # get_primary_xml already logged the reason; fail soft with an
        # empty cache instead of crashing in defusedxml.sax.parse(None).
        log.warning("Could not retrieve the rawhide primary.xml database")
        return {}

    handler = PackageHandler()
    # defusedxml guards against entity-expansion / external-entity attacks.
    defusedxml.sax.parse(primary_xml, handler)

    delta = time.time() - start
    log.info(f"Parsed in {delta} seconds -- ie: {delta/60} minutes")

    return handler.output
|
| |
+
|
| |
+
|
| |
if __name__ == "__main__":
    # Manual smoke test: build the cache and show a few known packages.
    logging.basicConfig(level=logging.DEBUG)
    summaries = get_package_summary()
    for pkg in ("guake", "geany", "kernel"):
        print(f"{pkg}: {summaries.get(pkg)}")
|
| |