#18 modlint: Use asynchronous requests using python-tornado to query Fedora dist-git when checking commit ids.
Closed 7 years ago by nphilipp. Opened 7 years ago by jkaluza.
jkaluza/modulemd master  into  master

file modified
+153 -56
@@ -1,20 +1,66 @@ 

  #!/usr/bin/python3

  

- 

  import argparse

  import sys

- 

  import requests

+ import collections

+ import asyncio

+ from concurrent.futures import ThreadPoolExecutor

+ 

  from modulemd import ModuleMetadata

  

+ class BacklogHTTPClient(object):

+     """

+     HTTP Client class sending HEAD requests asynchronously.

+     """

+     MAX_CONCURRENT_REQUESTS = 20

+ 

+     def __init__(self):

+         self.backlog = collections.deque()

+         self.concurrent_requests = 0

+         self.executor = ThreadPoolExecutor(

+             max_workers=self.MAX_CONCURRENT_REQUESTS)

+ 

+     def _schedule_requests(self):

+         loop = asyncio.get_event_loop()

+         while (self.backlog and

+                self.concurrent_requests < self.MAX_CONCURRENT_REQUESTS):

+             request, callback = self.backlog.popleft()

+             future = loop.run_in_executor(self.executor, requests.head,

+                                           request)

+             future.add_done_callback(callback)

+             self.concurrent_requests += 1

+         if len(self.backlog) == 0 and self.concurrent_requests == 0:

+             loop.stop()

+ 

+     def _get_callback(self, function, callback_data):

+         def wrapped(*args, **kwargs):

+             self.concurrent_requests -= 1

+             self._schedule_requests()

+             return function(data=callback_data, *args, **kwargs)

+         return wrapped

+ 

+     def head(self, url, callback=None, callback_data=None):

+         """

+         Schedules new HTTP HEAD request. Once the response is received,

+         callback function is called with response and callback data passed

+         to it.

+ 

+         :param string url: URL.

+         :param function callback: Callback function.

+         :param anytype callback_data: Data passed to callback function.

+         """

+         wrapped = self._get_callback(callback, callback_data)

+         self.backlog.append((url, wrapped))

+         self._schedule_requests()

  

  class RpmContentError(ValueError):

      """Metadata contains invalid/nonexistent srpm or commit."""

  

      #: Mapping of HTTP status codes to actual failure reasons.

      REASONS = {

-         400: 'Bad hash',

-         404: 'Nonexistent rpm',

+         400: "Bad hash",

+         404: "Nonexistent rpm",

      }

  

      def __init__(self, package, commit, status_code):
@@ -25,77 +71,122 @@ 

          self.status_code = status_code

  

      def __repr__(self):

-         return 'RpmContentError({pkg!r}, {commit!r}, {status!r})'.format(

+         return "RpmContentError({pkg!r}, {commit!r}, {status!r})".format(

              pkg=self.package,

              commit=self.commit,

              status=self.status_code

          )

  

      def __str__(self):

-         return '{pkg} [{commit}]: {reason}'.format(

+         return "{pkg} [{commit}]: {reason}".format(

              pkg=self.package,

              commit=self.commit,

-             reason=self.REASONS.get(self.status_code, 'Unknown error')

+             reason=self.REASONS.get(self.status_code, "Unknown error")

          )

  

+ class VerificationError(ValueError):

+     """

+     ValueError subclass meaning that RPM verification has failed.

+     """

+     def __init__(self, desc):

+         super().__init__(self, desc)

+         self.desc = desc

  

- def verify_rpm_content(package, metadata, *, requests=requests):

-     """Verify the existence of dist-git repo for described package.

+     def __str__(self):

+         return self.desc

  

-     Keyword arguments:

-         package -- Name of the package to verify (key in

-             modulemd packages)

-         metadata -- Metadata associated with the package

-             (value in modulemd packages)

+ class RPMVerifier(object):

+     def __init__(self):

+         self.http_client = BacklogHTTPClient()

+         self.packages_checked = 0

+         self.packages_to_check = 0

+         self.error = False

  

-         requests -- Dependency injection of networking library.

+     def _handle_git_response(self, response, data = None):

+         response = response.result()

+         package = data["package"]

+         commit = data["commit"]

  

-     Returns:

-         None if the associated repo exists.

+         self.packages_checked += 1

+         print("\r", end="")

  

-     Raises:

-         RpmContentError -- when the package (or commit) does not exist

-             in dist-git.

-     """

+         if response.status_code != 200:

+             raise RpmContentError(package, commit, response.status_code)

  

-     # TODO: Get URL from the metadata

-     CGIT_URL_TEMPLATE = 'http://pkgs.fedoraproject.org/cgit/rpms/{name}.git/commit'

+         print("Verifying RPM content %d%% (%d/%d)" % (

+             self.packages_checked / self.packages_to_check * 100,

+             self.packages_checked, self.packages_to_check), end = "",

+             flush = True)

  

-     commit = metadata.get('commit', 'HEAD')

+     def verify_rpm(self, package, metadata):

+         """

+         Verifies the existence of dist-git repo for described package.

  

-     response = requests.head(

-             CGIT_URL_TEMPLATE.format(name=package),

-             params={'id': commit}

-             )

+         :param string package: Name of the package to verify.

+         :param ModuleMetadata metadata: Metadata associated with the package.

+         """

  

-     if response.status_code != requests.codes.ok:

-         raise RpmContentError(package, commit, response.status_code)

+         # TODO: Get URL from the metadata

+         CGIT_URL_TEMPLATE = "http://pkgs.fedoraproject.org/cgit/rpms/{name}.git/commit"

  

+         commit = metadata.get("commit", "HEAD")

+         url = CGIT_URL_TEMPLATE.format(name=package) + "?id=" + commit

  

- def rpm_content(mmd):

-     """Generate (package, metadata) pairs from module.

+         data = {}

+         data["package"] = package

+         data["commit"] = commit

+         self.http_client.head(url, self._handle_git_response, data)

  

-     Keyword arguments:

-         mmd -- ModuleMetadata to be inspected.

+     def rpm_content(self, mmd):

+         """Generate (package, metadata) pairs from module.

  

-     Yields:

-         (package, metadata) from mmd, if any.

-     """

+         Positional arguments:

+             mmd -- ModuleMetadata to be inspected.

  

-     try:

-         packages = mmd.components.rpms.packages

-     except AttributeError:

-         # Module has no packages -- end generation with no output

-         return None

+         Yields:

+             (package, metadata) from mmd, if any.

+         """

  

-     yield from packages.items()

+         try:

+             packages = mmd.components.rpms.packages

+         except AttributeError:

+             # Module has no packages -- end generation with no output

+             return None

  

+         yield from packages.items()

  

- if __name__ == '__main__':

-     parser = argparse.ArgumentParser(description='Validate module metadata.')

+     def _exception_handler(self, loop, context):

+         self.error = True

+         print(str(context["exception"]))

+ 

+     def verify_rpms(self, metadata):

+         """

+         Verifies the existence of dist-git repo for described package.

+ 

+         :param string package: Name of the package to verify.

+         :param ModuleMetadata metadata: Metadata associated with the package.

+         :raises VerificationError: If there is an error during verification.

+         """

+         loop = asyncio.get_event_loop()

+         loop.set_exception_handler(self._exception_handler)

+ 

+         self.packages_to_check = len(metadata.components.rpms.packages)

+         self.packages_checked = 0

+         for package in self.rpm_content(metadata):

+             self.verify_rpm(*package)

+ 

+         loop.run_forever()

+ 

+         print("\r", end="")

+ 

+         if self.error:

+             raise VerificationError("Some RPMs failed the verification")

+ 

+ if __name__ == "__main__":

+     parser = argparse.ArgumentParser(description="Validate module metadata.")

  

      # Positional arguments

-     parser.add_argument('file', help='Input metadata file')

+     parser.add_argument("file", help="Input metadata file")

  

      args = parser.parse_args()

  
@@ -103,28 +194,34 @@ 

      metadata_errors = TypeError, ValueError

      all_went_well = True

  

+     # Load the ModuleMetadata file

      try:

          metadata.load(args.file)

      except metadata_errors as invalid_input_metadata:

-         message = 'ERROR: Invalid input: {!s}'.format(invalid_input_metadata)

+         message = "ERROR: Invalid input: {!s}".format(invalid_input_metadata)

          sys.exit(message)

  

+     # Try to validate it.

      try:

          metadata.validate()

      except metadata_errors as invalid_metadata_structure:

          all_went_well = False

-         print('ERROR: Invalid structure:', str(invalid_metadata_structure),

+         print("ERROR: Invalid structure:", str(invalid_metadata_structure),

                file=sys.stderr)

  

-     for package in rpm_content(metadata):

-         try:

-             verify_rpm_content(*package)

-         except RpmContentError as invalid_rpm:

-             all_went_well = False

-             print('RPM CONTENT ERROR:', str(invalid_rpm),

-                   file=sys.stderr)

+     # Try to verify RPM section of ModuleMetadata file.

+     rpm_verifier = RPMVerifier()

+     try:

+         rpm_verifier.verify_rpms(metadata)

+     except VerificationError as error:

+         all_went_well = False

+         print("ERROR:", str(error), file=sys.stderr)

+     except KeyboardInterrupt:

+         all_went_well = False

  

+     # Exit

      if all_went_well:

-         print('Everything OK')

+         print("")

+         print("Everything OK")

      else:

          sys.exit(1)

It uses python-tornado - I'm not sure if that's the right module to use. I'm OK to rewrite that to something different, but we should discuss what we should use for async http requests.

rebased

7 years ago

What's the reason for the double leading underscore in this method? Normally, this should only used to avoid name clashes in subclasses. I assume that _schedule_requests() has only one because name-mangling bit you in the nested wrapped() function, right? :wink:

Is there a problem with simply using print() here?

https://fedoraproject.org/wiki/Modularity/Development/Coding_Style#Comments_and_Docstrings :stuck_out_tongue:

NB: these are "positional" arguments (or just "arguments"). Only arguments that have a default value are called "keyword arguments" (or colloquially if you refer to positional arguments "by name" when calling the function/method).

Error status as the return value? That's too C-ish for me ;). And catching KeyboardInterrupt should normally go to the main function or block.

...and here the error status isn't looked at :wink: Wrapping the collected errors above in a new exception and catching it here looks more natural to me. :smiley:

rebased

7 years ago
  • I commented above on the doc comments you used, is it intentional that you don't use Sphinx-format docstrings?
  • I didn't mean to get rid of KeyboardInterrupt, just move it into the main block
  • Some of the lines are too long (PEP8), can you please fix that? Also, the commit log subject line is too long, too, should be shorter than ~50 characters (that's a guideline, some things can't be expressed well in such little space, the "hard limit" here would be 80)

rebased

7 years ago

Fixed the remaining long lines (one in each code and commit log), and merged in commit 98550b6.

Pull-Request has been closed by nphilipp

7 years ago