#2101 dist-git: don't remove pr directory if the build hasn't finished yet
Merged 2 years ago by praiskup. Opened 2 years ago by schlupov.
copr/ schlupov/copr fix_dist_git_prune  into  main

file modified
+77 -30
@@ -11,30 +11,75 @@ 

  import pwd

  import argparse

  import time

+ import datetime

  import copr

  

  from copr.v3.client import Client as CoprClient

  

  parser = argparse.ArgumentParser(description="Prune DistGit repositories and lookaside cache. "

-                                  "Requires to be run as copr-dist-git user. You will be asked before anything is deleted. "

-                                  "If you no longer want to be asked, answer 'Y', if you want to exit the script, answer 'N'.")

+                                              "Requires to be run as copr-dist-git user. "

+                                              "You will be asked before anything is deleted. "

+                                              "If you no longer want to be asked, answer 'Y', "

+                                              "if you want to exit the script, answer 'N'.")

  parser.add_argument('--repos', action='store', help='local path to a DistGit repository root', required=True)

- parser.add_argument('--lookasidepkgs', action='store', help='local path to a DistGit lookaside cache pkgs root', required=True)

+ parser.add_argument('--lookasidepkgs', action='store', help='local path to a DistGit lookaside cache pkgs root',

+                     required=True)

  parser.add_argument('--copr-config', action='store', help='path to copr config with API credentials', required=True)

  parser.add_argument('--always-yes', action='store_true', help="Assume answer 'yes' for each deletion.")

  

  args = parser.parse_args()

  

- if __name__ == "__main__":

+ 

+ def get_build(username, copr_pr_dirname, client):

+     """Get the last build in the given pr directory"""

+     data = client.monitor_proxy.monitor(

+         ownername=username, projectname=copr_pr_dirname.split(":")[0], project_dirname=copr_pr_dirname

+     )

+     build_id = None

+     for pckg in data.packages:

+         pckg_build_id = [b["build_id"] for b in pckg["chroots"].values()]

+         if build_id is None:

+             build_id = max(pckg_build_id)

+             continue

+         if max(pckg_build_id) > build_id:

+             build_id = max(pckg_build_id)

+     if build_id is None:

+         return None

+     return client.build_proxy.get(build_id)

+ 

+ 

+ def check_user():

+     """Check under which user the program was run"""

      if pwd.getpwuid(os.getuid())[0] != "copr-dist-git":

          print("This script should be executed under the `copr-dist-git` user")

          sys.exit(1)

  

+ 

+ def process_dirname(pkgs_project_path, project_dirname, repos_project_path, username):

+     """Directory doesn't exist so delete it"""

+     answer = None

+     if args.always_yes:

+         answer = 'y'

+     while not answer:

+         a = input('Project {0}/{1} does not exist.\nDelete paths {2} and {3} [y/n]? '

+                   .format(username, project_dirname, repos_project_path, pkgs_project_path))

+ 

+         if a in ['n', 'no']:

+             answer = 'n'

+         if a in ['y', 'yes']:

+             answer = 'y'

+     if answer == 'y':

+         print("Deleting {0}".format(repos_project_path))

+         shutil.rmtree(repos_project_path, ignore_errors=True)

+         print("Deleting {0}".format(pkgs_project_path))

+         shutil.rmtree(pkgs_project_path, ignore_errors=True)

+ 

+ 

+ def main():

+     """The main function that takes care of the whole logic"""

+     check_user()

      client = CoprClient.create_from_config_file(args.copr_config)

      os.chdir(args.repos)

- 

-     always_yes = False

- 

      if not os.path.isdir(args.repos):

          print("{0} is not a directory.".format(args.repos), file=sys.stderr)

  
@@ -44,11 +89,29 @@ 

          if not os.path.isdir(repos_user_path):

              continue

  

-         for projectname in os.listdir(repos_user_path):

-             repos_project_path = os.path.join(repos_user_path, projectname)

+         for project_dirname in os.listdir(repos_user_path):

+             repos_project_path = os.path.join(repos_user_path, project_dirname)

+ 

+             # this is only an optimization, if the modified time of the package

+             # directory has not changed in the last 90 days, then we perform an API check

+             modified_time = 0

+             for package in os.listdir(repos_project_path):

+                 mt = os.path.getmtime(os.path.join(repos_project_path, package))

+                 modified_time = max(modified_time, mt)

+ 

+             if (datetime.datetime.today() - datetime.datetime.fromtimestamp(modified_time)).days < 90:

+                 continue

  

              try:

-                 client.project_proxy.get(username, projectname)

+                 if ":pr:" in project_dirname:

+                     build = get_build(username, project_dirname, client)

+                     if build and (

+                             datetime.datetime.today() - datetime.datetime.fromtimestamp(build.ended_on)).days < 90:

+                         continue

+                 else:

+                     project_name = project_dirname.split(":", 1)[0]

+                     client.project_proxy.get(username, project_name)

+                     continue

              except copr.v3.exceptions.CoprNoResultException:

                  pass

              except (copr.v3.exceptions.CoprRequestException, copr.v3.exceptions.CoprTimeoutException):
@@ -57,26 +120,10 @@ 

                  # this is run daily, no problem if we miss one

                  # wait a few secs till frontend is available and continue

                  continue

-             else:

-                 continue

  

-             pkgs_project_path = os.path.join(args.lookasidepkgs, username, projectname)

+             pkgs_project_path = os.path.join(args.lookasidepkgs, username, project_dirname)

+             process_dirname(pkgs_project_path, project_dirname, repos_project_path, username)

  

-             answer = None

-             if args.always_yes:

-                 answer = 'y'

  

-             while not answer:

-                 a = input('Project {0}/{1} does not exist.\nDelete paths {2} and {3} [y/n]? '

-                           .format(username, projectname, repos_project_path, pkgs_project_path))

- 

-                 if a in ['n', 'no']:

-                     answer = 'n'

-                 if a in ['y', 'yes']:

-                     answer = 'y'

- 

-             if answer == 'y':

-                 print("Deleting {0}".format(repos_project_path))

-                 shutil.rmtree(repos_project_path, ignore_errors=True)

-                 print("Deleting {0}".format(pkgs_project_path))

-                 shutil.rmtree(pkgs_project_path, ignore_errors=True)

+ if __name__ == "__main__":

+     main()

Build succeeded.

I would prefer giving those directories like 90 days of "protection" period (since those files were last time touched).
I don't like adding additional API queries, and especially into an exception handling logic.

This condition doesn't do anything :-)

rebased onto 4f7e0f73877b4e045a9bad254823e90aebd7a7a7

2 years ago

Build succeeded.

Per mtg. discussion, we can skip all directories that are "younger than N days".

rebased onto aa2010d6737fad43868fcd1abb7b9b192fc3061f

2 years ago

Build succeeded.

This seems fine, thank you. Can we "present" this as an optimization effort by in-line coment?

And perhaps mention that "this also protects the pull-request CoprDirs from early removals".

Well, no ... this call will always fail for the pull-request directories.

The problem: The mtime of the directory is not bumped if the user keeps building the very same package over again in the same pull-request. What is modified is the mtime in underlying directories.

rebased onto 9b91b54dbcef7816b99bc6c31c6a93c5fec7f1d9

2 years ago

Build succeeded.

This is a bit more I/O intensive optimization, but this still isn't the solution. We are still asking frontend for project names that may never exist....

rebased onto 24db12a3b2e8cfe43032efdcad11cb35370a6706

2 years ago

Build succeeded.

Please don't remove this. The point is to guard against running the script as a root user.

projectname should be passed into this call?

Can you move this continue to the try branch, after get() call?

This FE call can traceback as well (how #2102 is relevant!), exception should be handled.

rebased onto 809162a1ecd8a161c73892baa3f29dfea838c94e

2 years ago

Build succeeded.

I refactored the code a bit and fixed everything you mentioned.

Connection error may happen as well...?

this is actually s/projectname/project_dirname/

Can you please document that this is just an optimization?

and to be more precise, it should be named copr_pr_dirname, as we are strictly expecting this to contain the colon-PR-colon string.

I did a mistake in designing this ... the projectname in this API call is actually redundant, I think. :-(

Otherwise looks nice, thank you for the progress!

rebased onto 91ff302eca1681bede86abf98f44d8e599e0662f

2 years ago

Build succeeded.

Consider a connection error, do we really want to return None? That will trigger the directory removal? I may be wrong, please document the tactics ... :-)

Project name at this point still contains the colon-pr-colon string... For the sake of readability, we should use project_dirname.

This is news to me, but we can have arbitrarily named dir names: This is not documented, so it is not frequent for now:

coprdb=# select * from copr_dir where name like '%:%' and name not like '%:pr:%';
┌───────┬───────────────────┬──────┬─────────┬───────────┐
│  id   │       name        │ main │ copr_id │ ownername │
├───────┼───────────────────┼──────┼─────────┼───────────┤
│ 36289 │ xentest:chaintest │ f    │    3302 │ myoung    │
└───────┴───────────────────┴──────┴─────────┴───────────┘

We should first split the "project_dirname" (currently named projectname) on the first colon, and ask for that projectname.

This API call will certainly fail as well, potentially breaking everything.... I believe we should handle the cli exceptions in one place.

rebased onto d8d8930d88c83de8f683a3c3f91bf95e8144d744

2 years ago

This is news to me, but we can have arbitrarily named dir names: This is not documented, so it is not frequent for now:
coprdb=# select * from copr_dir where name like '%:%' and name not like '%:pr:%'; ┌───────┬───────────────────┬──────┬─────────┬───────────┐ │ id │ name │ main │ copr_id │ ownername │ ├───────┼───────────────────┼──────┼─────────┼───────────┤ │ 36289 │ xentest:chaintest │ f │ 3302 │ myoung │ └───────┴───────────────────┴──────┴─────────┴───────────┘

We should first split the "project_dirname" (currently named projectname) on the first colon, and ask for that projectname.

Oh, I didn't know that 😯 I fixed the code because of it.

Build succeeded.

Those are two different use-cases. We want to check for "build" only for the pull-request directories. The other directories should be handled with projectname = project_dirname.split(":", 1)[0] ... and that project should be checked by client.project_proxy.get(username, projectname)
(not dirname).

We are catching those connection exceptions in two places. Could you move the handling one level up in the if-else tree and do it only once?

rebased onto ce7a0ab5174b590ee5e591ec481d8abf1dab83bc

2 years ago

Build succeeded.

IMO this is less readable than the previous proposal:

We are catching those connection exceptions in two places. Could you move the handling one level up in the if-else tree and do it only once?

s/project_dirname/project_name/

s/process_project/process_dirname/ ?

This regressed back; returning None for this "connection error" will lead to directory removal, right? Please document your tactics so we don't get lost in this really complicated code :-)

rebased onto eb279333963e3fa0687ae1f34816ba4401258e73

2 years ago

Build succeeded.

I would feel safer with:

if build_id is None:
    return None

IOW it doesn't make sense to ask the server for build_id = 0?

rebased onto ca2a0243ce782bc911437e5d13d6dd419661260f

2 years ago

Build succeeded.

Build succeeded.

Please wait a sec ... I'd like to fix the PyLint problems here.

Build failed. More information on how to proceed and troubleshoot errors available at https://fedoraproject.org/wiki/Zuul-based-ci

================
 Added warnings 
================

Error: PYLINT_WARNING:
run/prune-dist-git.py:33: C0116[missing-function-docstring]: get_build: Missing function or method docstring

Error: PYLINT_WARNING:
run/prune-dist-git.py:50: C0116[missing-function-docstring]: check_user: Missing function or method docstring

Error: PYLINT_WARNING:
run/prune-dist-git.py:56: C0116[missing-function-docstring]: process_dirname: Missing function or method docstring

Error: PYLINT_WARNING:
run/prune-dist-git.py:75: C0116[missing-function-docstring]: main: Missing function or method docstring

Error: PYLINT_WARNING:
run/prune-dist-git.py:96:16: R1731[consider-using-max-builtin]: main: Consider using 'modified_time = max(modified_time, mt)' instead of unnecessary if block

rebased onto 7d4daf9165c09d7cea9bfd13c0cda008aea46ce9

2 years ago

Build succeeded.

Pylint problems fixed.

rebased onto a7f2cbc

2 years ago

Pull-Request has been merged by praiskup

2 years ago
Metadata