#10 Expanding modlint
Closed 7 years ago by nphilipp. Opened 7 years ago by jstanek.

file modified
+123 -16
@@ -1,23 +1,130 @@ 

  #!/usr/bin/python3

- import modulemd

+ 

+ 

+ import argparse

  import sys

  

- def usage():

-     print("Usage: modlint <file>")

-     sys.exit()

+ import requests

+ from modulemd import ModuleMetadata

+ 

+ 

+ class RpmContentError(ValueError):

+     """Metadata contains invalid/nonexistent srpm or commit."""

+ 

+     #: Mapping of HTTP status codes to actual failure reasons.

+     REASONS = {

+         400: 'Bad hash',

+         404: 'Nonexistent rpm',

+     }

+ 

+     def __init__(self, package, commit, status_code):

+         super().__init__(self, package, commit, status_code)

+ 

+         self.package = package

+         self.commit = commit

+         self.status_code = status_code

+ 

+     def __repr__(self):

+         return 'RpmContentError({pkg!r}, {commit!r}, {status!r})'.format(

+             pkg=self.package,

+             commit=self.commit,

+             status=self.status_code

+         )

+ 

+     def __str__(self):

+         return '{pkg} [{commit}]: {reason}'.format(

+             pkg=self.package,

+             commit=self.commit,

+             reason=self.REASONS.get(self.status_code, 'Unknown error')

+         )

+ 

+ 

+ def verify_rpm_content(package, metadata, *, requests=requests):

+     """Verify the existence of dist-git repo for described package.

+ 

+     Keyword arguments:

+         package -- Name of the package to verify (key in

+             modulemd packages)

+         metadata -- Metadata associated with the package

+             (value in modulemd packages)

+ 

+         requests -- Dependency injection of networking library.

+ 

+     Returns:

+         None if the associated repo exists.

+ 

+     Raises:

+         RpmContentError -- when the package (or commit) does not exist

+             in dist-git.

+     """

+ 

+     # TODO: Get URL from the metadata

+     CGIT_URL_TEMPLATE = 'http://pkgs.fedoraproject.org/cgit/rpms/{name}.git/commit'

+ 

+     commit = metadata.get('commit', 'HEAD')

+ 

+     response = requests.head(

+             CGIT_URL_TEMPLATE.format(name=package),

+             params={'id': commit}

+             )

+ 

+     if response.status_code != requests.codes.ok:

+         raise RpmContentError(package, commit, response.status_code)

+ 

+ 

+ def rpm_content(mmd):

+     """Generate (package, metadata) pairs from module.

+ 

+     Keyword arguments:

+         mmd -- ModuleMetadata to be inspected.

+ 

+     Yields:

+         (package, metadata) from mmd, if any.

+     """

+ 

+     try:

+         packages = mmd.components.rpms.packages

+     except AttributeError:

+         # Module has no packages -- end generation with no output

+         return None

+ 

+     yield from packages.items()

+ 

+ 

+ if __name__ == '__main__':

+     parser = argparse.ArgumentParser(description='Validate module metadata.')

+ 

+     # Positional arguments

+     parser.add_argument('file', help='Input metadata file')

+ 

+     args = parser.parse_args()

+ 

+     metadata = ModuleMetadata()

+     metadata_errors = TypeError, ValueError

+     all_went_well = True

  

- if (len(sys.argv) != 2):

-     usage()

+     try:

+         metadata.load(args.file)

+     except metadata_errors as invalid_input_metadata:

+         message = 'ERROR: Invalid input: {!s}'.format(invalid_input_metadata)

+         sys.exit(message)

  

- filename = sys.argv[1]

- modulemd = modulemd.ModuleMetadata()

+     try:

+         metadata.validate()

+     except metadata_errors as invalid_metadata_structure:

+         all_went_well = False

+         print('ERROR: Invalid structure:', str(invalid_metadata_structure),

+               file=sys.stderr)

  

- try:

-     modulemd.load(filename)

-     modulemd.validate()

- except Exception as e:

-     # raise

-     print("ERROR: ", str(e))

-     sys.exit()

+     for package in rpm_content(metadata):

+         try:

+             verify_rpm_content(*package)

+         except RpmContentError as invalid_rpm:

+             all_went_well = False

+             print('RPM CONTENT ERROR:', str(invalid_rpm),

+                   file=sys.stderr)

  

- print("Everything is OK!")

+     if all_went_well:

+         print('Everything OK')

+     else:

+         sys.exit(1)

This is more request for comments/review than PR.

I have expanded the modlint script with dist-git sanity check. For each package in the content, the Fedora CGIT is checked for existence of that package (and if present, its commit).

RFC:

  • New 3rd party dependency: requests toolkit. Is it OK to expand the dependecy list?
  • Code uses typing module from python 3.5, and it is thus only compatible with python 3.5+ (Fedora 24+). Is it OK, or should it be more backward compatible?

The format string is nested badly:

>>> '{pkg [{hash}]: {msg}}'.format(pkg="foo", hash="hash", msg="BOOH")
---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-1-2b656e3c1a82> in <module>()

Did you mean '{pkg}[{hash}]: {msg}'.format(...) instead?

You could use the dict.get() method instead:

hash=details.get('commit', 'HEAD')

Hmm. I don't particularly like returning a list of errors - we have exceptions for that. The name of the function could also be a little more descriptive:

class RpmContentError(Exception):
    ...

def verify_rpm_content(mmd: ModuleMetadata) -> None:
    """Check that rpm content points to an existing repository (commit).

    For each package in the module, this function queries dist-git
    and uses the status code to determine if the package, and
    optionally the commit, exists.

    Arguments:
        mmd -- The module metadata object to be checked

    Returns: Nothing

    Raises: An exception if packages, or commits can't be found.
    ...
    errors = []
    for package, details in ...:
        ...
        if response.status_code != requests.codes.ok:
            errors.append("{pkg}[{hash}]: {msg}".format(...))
    if errors:
        raise RpmContentError(errors)

Why do you catch all exceptions and continue here? metadata probably won't be usable below.

The "by-the-book" way would be to import sys at the top, then sys.exit(1) ;).

Ah, interesting, the first time I've seen type annotations in the wild :). Do you have tools that use this information?

As to your RFCs:

  • I think it's totally okay to add the requests package as a new dependency for a developer tool like modlint
  • What benefit does using the typing module bring us? We may want that people on older releases can write modules, too, so IMO its benefit would have to greatly outweigh the cost of excluding people on older releases of not so much Fedora, but RHEL or CentOS.

Not really, I use it mainly as code documentation which could be potentially checked by some static analysis.

True, but my intention here was to report the find to the caller as soon as possible, and continue the search for others when asked. Among other things, it allows for printing errors as they are found, instead of waiting for whole check (which could take a while) and then dumping it all at once.

Well, yes, but since sys.exit() ultimately raises SystemExit() and I do not need sys for anything else, I figured that I will raise it directly, saving an import that is otherwise unneeded.

What benefit does using the typing module bring us? We may want that people on older releases can write modules, too, so IMO its benefit would have to greatly outweigh the cost of excluding people on older releases of not so much Fedora, but RHEL or CentOS.

Good point. Since I use it mainly as code documentation, I will drop it in the name of older releases.

2 new commits added

  • Split metadata loading and validation to separate steps
  • Get default hash using builtin method
7 years ago

1 new commit added

  • Report rpm content errors with exceptions
7 years ago

Thanks for explaining your reasons, and for addressing my objections ;).

Let me expand on my comments a bit:

  • The idea of using a generator the way you did is very nifty, but also pretty unconventional which makes it harder for others to understand what's happening, especially to people not so familiar with Python. Additionally to "doing it the boring way", you could add a "don't break off at first error" option/flag which would then continue processing the file, gathering errors and raising an exception only when finished (and put the error reporting inside the function).
  • I guess the use of sys.exit() vs. raise SystemExit() is a matter of taste, the only real advantage of the function is that you can do things like sys.exit("Error: An error happened") which will print the message to stderr and exit with exitcode 1.

Thank you for review :)

  • Ad the generator/exceptions: I rethought the process and split the function–
    see commit 4fb5a70. Now the validation is scoped to single
    package, and the iteration/catching/reporting is done in the main section.
    Does it look better to you?
  • Ad the sys.exit vs. SystemExit – AFAIK they acts exactly the same, you
    can pass the string message to the exception too ;). But since I noticed
    I am in fact using the sys module already, I changed it to the sys.exit
    function – perhaps it will reduce confusion :)

Not sure why you use a docstring here and not a comment before the REASON dict – I really had to think twice what this line is referring to.

You could use {pkg!r} etc. in the format string, so strings get quoted.

For my understanding, is accessing .rpms.packages this way and catching AttributeError the canonical way for dealing with modules, and does it avoid hiding real errors?

Hah, wasn't aware that you could return from generators to cut them short :). I think using return would better convey the idea (adding a comment, too :wink: ), I was actually looking for what happened to None in the caller (i.e. nothing, unless you catch the StopIteration object and inspect it).

Maybe I'm too pedantic :wink:, but how about wrapping everything from the line where all_went_well is initialized to about here in a try: ... except: ... block instead? That seems more idiomatic to me.

Since in-file comments get lost after new commit, I will respond in normal one,
with references to the file as links. Hopefully it helps with the mental
juggling.

  • Docstring after assignment

    REASONS = {
        400: 'Bad hash',
        404: 'Nonexistent rpm',
    }
    """Mapping of HTTP status codes to actual failure reasons."""
    

    Not sure why you use a docstring here and not a comment before the
    REASON dict – I really had to think twice what this line is referring
    to.

    I was under impression that this kind of docstring is extractable by
    automated tools in standard library. However, after re-reading the
    PEP 257, it seems that this is
    more of proposal than usable fact. Changing to comment.

  • Nested attribute access, returning from generators

    try:
        packages = mmd.components.rpms.packages
    except AttributeError:
        return None
    

    For my understanding, is accessing .rpms.packages this way and catching
    AttributeError the canonical way for dealing with modules, and does it
    avoid hiding real errors?

    AttributeError indicates that the code is accessing nonexistent
    attribute/property. I found it convenient for avoiding "nested" ifs like
    if mmd.components is not None and mmd.components.rpms is not None and ...
    (since each step might be None). I'm not aware about any "canonical" way
    to safely access the package contents.

    Only "real" errors I'm aware of that could be hidden would occur if the
    modulemd inner structure change – the function would then always return
    without iterating.

    Maybe the modulemd author could provide insight on this matter :) Any
    comments here, @psabata?

    Hah, wasn't aware that you could return from generators to cut them
    short :). I think using return would better convey the idea (adding
    a comment, too 😉), I was actually looking for what happened to None
    in the caller (i.e. nothing, unless you catch the StopIteration object
    and inspect it).

    return None is the same as return
    (doc).
    And since <q cite='import this'>explicit is better than implicit</q>,
    I think return None is better ("give him nothing").

    I will add at least comment explanation to prevent further confusion. I see
    I might be too generator-happy :)

  • Error reporting

    all_went_well = True
    ...
    except ...:
        all_went_well = False
        ...
    ...
    if all_went_well:
        print('Everything OK')
    else:
        sys.exit(1)
    

    Maybe I'm too pedantic :wink:, but how about wrapping everything from
    the line where all_went_well is initialized to about here in a try: ... except: ... block instead? That seems more idiomatic to me.

    My issue with that is that try: ... except: ... block will short-circuit
    the script, ending after first exception. I want to run all the checks, note
    if any errors occurred, and then exit accordingly. I would love to do it in
    more pythonic way, but I'm afraid that wrapping it all in one
    exception-catching block will not suffice. Suggestions are welcome 😉.

Hmm, need to think about the issues you raise. Perhaps time for a third opinion on the above. @psabata?

  • Docstring after assignment

If you want the doc string to appear at least in the modulemd readthedocs.org documentation, you can use "#: comment" before that variable.

  • Nested attribute access, returning from generators

That's interesting question. I would vote for using try/except blocks, but use tests to ensure that internal structure modulemd changes are handled - in that case, test would fail.

Nils, can we agree here that this is OK for this sprint, and in the next sprint, I will write unit-tests for the modlint to ensure this is tested automatically?

I think we plan to release modlint together with modulemd, so it should always use the right version of modulemd library.

  • Error reporting

I agree with Jan here.

@jkaluza Sure, let's merge this as is, for you to improve upon later.

@jstanek Do you want to change the doc string as @jkaluza described above before I merge, or should I?

There are a couple of things I want to fix regarding commit structure before merging, e.g.:

  • don't introduce something (like the typing module) only to remove it later
  • split up large commits into topical ones (e.g. the last commit)

Okay if I do that?

I'm not sure if jstanek is reading that or will do the job.

Nils, can you do the changes? I'm OK with them. or I can fork modulemd and do the changes myself and send another pull request.

I'm still reading this and aim to finish it :wink:. I will implement the agreed changes.

Ad commit structure: I'm not sure how to change commits after pushing them, so if you know a way, fill free to do it, Nils.

3 new commits added

  • Add explanation for returning from generator early
  • Add proper quoting to repr string
  • Change attribute docstring to documentation comment
7 years ago

Pull-Request has been closed by nphilipp

7 years ago
Metadata