#7366 add script for pruning atomic ostree repos
Closed 4 years ago by dustymabe. Opened 6 years ago by dustymabe.
dustymabe/releng dusty-prune-ostree  into  master

@@ -0,0 +1,171 @@ 

+ #!/bin/python3

+ # -*- coding: utf-8 -*-

+ #

+ # prune-atomic-ostree-repos.py - A utility to prune our atomic ostree repos.

+ #

+ # Copyright (C) 2015 Red Hat, Inc.

+ # SPDX-License-Identifier:      GPL-2.0+

+ #

+ # Authors:

+ #     Dusty Mabe <dusty@dustymabe.com>

+ 

+ import argparse

+ import logging

+ import os

+ import subprocess

+ import sys

+ 

+ 

+ logging.basicConfig(level=logging.INFO)

+ logger = logging.getLogger('prune-atomic-ostree-repos')

+ 

+ # The location of our two unified ostree repos

+ ATOMICCOMPOSEREPO = '/mnt/koji/compose/atomic/repo'

+ ATOMICPRODREPO = '/mnt/koji/atomic/repo'

+ 

+ # The # of commits to retain in the compose repo for each branch

+ COMPOSE_REPO_POLICY_DEPTH = '5'

+ 

+ # The policy for each ref in the prod repo. None means "keep all"

+ PROD_REF_POLICIES = {

+     # rawhide atomic host

+     'fedora/rawhide/x86_64/atomic-host':  "30 days ago",

+     'fedora/rawhide/aarch64/atomic-host': "30 days ago",

+     'fedora/rawhide/ppc64le/atomic-host': "30 days ago",

+ 

+     # rawhide atomic workstation

+     'fedora/rawhide/x86_64/workstation':  "30 days ago",

+ 

+     # f28 atomic host

+     'fedora/28/x86_64/atomic-host':  None,

+     'fedora/28/aarch64/atomic-host': None,

+     'fedora/28/ppc64le/atomic-host': None,

+     # f28 atomic host (updates)

+     'fedora/28/x86_64/updates/atomic-host':  None,

+     'fedora/28/aarch64/updates/atomic-host': None,

+     'fedora/28/ppc64le/updates/atomic-host': None,

+     # f28 atomic host (testing)

+     'fedora/28/x86_64/testing/atomic-host':  "30 days ago",

+     'fedora/28/aarch64/testing/atomic-host': "30 days ago",

+     'fedora/28/ppc64le/testing/atomic-host': "30 days ago",

+ 

+     # f28 atomic workstation

+     'fedora/28/x86_64/workstation': None,

+     # f28 atomic workstation (updates)

+     'fedora/28/x86_64/updates/workstation': None,

+     # f28 atomic workstation (testing)

+     'fedora/28/x86_64/testing/workstation': "30 days ago",

+ }

+ 

+ # Beneath this is code, no config needed here

+ def run_command(cmd):

+     logger.info('Running %s', cmd)

+     try:

+         output = subprocess.check_output(cmd,

+                                          stderr=subprocess.STDOUT,

+                                          shell=False)

+     except subprocess.CalledProcessError as e:

+         logger.error(e.stdout.decode('utf-8'))

+         raise # re-raise CalledProcessError

+ 

+     return output

+ 

+ 

+ def prune_compose_repo(test=False):

+ 

+     # prune the compose repo

+     logger.info('Pruning the compose repo %s to %s depth' %

+                 (ATOMICCOMPOSEREPO, COMPOSE_REPO_POLICY_DEPTH))

+     cmd = ['ostree', 'prune', '--repo', ATOMICCOMPOSEREPO,

+            '--depth', COMPOSE_REPO_POLICY_DEPTH, '--refs-only']

+ 

+     if test:

+         cmd.append('--no-prune')

+ 

+     output = run_command(cmd)

+     logger.info('####################################')

+     logger.info('\n' + output.decode('utf-8'))

+     logger.info('####################################')

+ 

+ 

+ def prune_prod_repo(test=False):

+ 

+     # Error out if any refs exist that aren't defined in

+     # the policy

+     cmd = ['ostree', 'refs', '--repo', ATOMICPRODREPO]

+     output = run_command(cmd)

+     prodrefs = output.decode('utf-8').splitlines()

This could have stayed prod_ref - PEP standard for names. I know it's a nit, but, the other change made the difference clear already for us.

+     for ref in prodrefs:

+         if ref not in PROD_REF_POLICIES:

+             msg = 'ref %s in repo %s but no policy defined' % (ref, ATOMICPRODREPO)

+             logger.error(msg)

+             raise Exception(msg)

+ 

+     # prune each branch in the policy with specified value

+     for ref,policy in PROD_REF_POLICIES.items():

+         if ref not in prodrefs:

+             logger.warn('policy defined for ref %s but ref not in repo %s' % 

+                          (ref, ATOMICPRODREPO))

+             continue

+ 

+         if policy is None:

+             logger.info('Skipping ref %s in repo %s. Policy is to keep all commits' %

+                         (ref, ATOMICPRODREPO))

+             continue

+ 

+         logger.info('Pruning the %s ref in repo %s to %s' %

+                     (ref, ATOMICPRODREPO, policy))

+         cmd = ['ostree', 'prune', '--repo', ATOMICPRODREPO,

+                '--only-branch', ref, '--refs-only',

+                '--keep-younger-than', policy]

+         if test:

+             cmd.append('--no-prune')

+         output = run_command(cmd)

+         logger.info('####################################')

+         logger.info('\n' + output.decode('utf-8'))

+         logger.info('####################################')

+ 

+ # XXX this function should not be used yet as there are a few

+ # bugs to work out: 

+ # https://github.com/ostreedev/ostree/issues/1479 

+ # https://github.com/ostreedev/ostree/issues/1481

+ def prune_prod_repo_deltas(test=False):

+ 

+     logger.info("prune_prod_repo_deltas: this function is disabled until the"

+                 "following bugs are fixed:")

+     logger.info("https://github.com/ostreedev/ostree/issues/1479")

+     logger.info("https://github.com/ostreedev/ostree/issues/1481")

+     return

+ 

+     cmd = ['ostree', 'prune', '--repo', ATOMICPRODREPO,

+            '--refs-only', '--static-deltas-only']

+     if test:

+         cmd.append('--no-prune')

+     output = run_command(cmd)

+     logger.info('####################################')

+     logger.info('\n' + output.decode('utf-8'))

+     logger.info('####################################')

+ 

+ def main():

+ 

+     parser = argparse.ArgumentParser()

+     parser.add_argument("--test",

+                         help="Don't actually prune", action='store_true')

I went and re-read the argparse manual and realized you can't change the default of this from the command line.

So what I really want, and I apologize for being unclear due to that misunderstanding on my part, is for the script to ONLY run for real if something is passed.

Can we change this to --dry-run and that passes the test case instead?

This negates the need to set a variable later in the script; we just change the job to remove the passing of --dry-run.

I would also say then that "test" should be renamed to "dry_run".

In my mind there is really no difference between passing --test and passing --dry-run. It sounds like what you want is a --notest option. I guess that would be ok but seems a bit odd.

What I don't like is the manual setting of args.test = True in the body.

Ideally, what I'd prefer here is that the script when invoked always goes dry-run and we have some option that actually performs the action.

I just can't remember the right word for the standard on this...

EG: --dry-run for scripts whose behavior defaults to run in production, and something else for not.

The idea of having something we have to delete from code bothers me; we should make sure we set it --dry-run in the cron job or, since this is a destructive script, opt to make the default behavior not do anything but show the changes and then a single option --really-do-it (but that's a terrible name) that does the changes for real.

+     args = parser.parse_args()

+ 

+     # set args.test = True for now. Once we have proved

+     # everything out in prod and the logs look good we'll

+     # delete this code.

+     args.test = True

+ 

+     prune_compose_repo(args.test)

+     prune_prod_repo(args.test)

+ 

+     # XXX this function should not be used yet as there are a few

+     # bugs to work out: 

+     # https://github.com/ostreedev/ostree/issues/1479 

+     # https://github.com/ostreedev/ostree/issues/1481

+     #prune_prod_repo_deltas(args.test)

+ 

+ if __name__ == '__main__':

+     main()

Why provide the --test argument at all if it sets test to true every time?

good question. The plan is to have this start off as test only and called from a cron job. I'll then update this script to pass args.test instead of True. My thoughts were it's easier to update this script than it is to update the cron job in infra git and beg someone to re-run the playbook for me.

Where is this exception handled?

As far as I know, exceptions in scripts need handling unlike those in APIs where the person calling the API is expected to handle exceptions appropriately for their program.

It isn't handled. The desire is that the script stop executing and throw an error.

Rather than comment this out here, I would have it call appropriately and do a logger.info that states this feature is disabled due to ISSUE X, ISSUE Y.

Otherwise, I'd just remove all the code for this until the other bugs are ironed out.

Throwing an error to the invoking shell can be handled with an error handler and the appropriate sys.exit(EXIT_CODE) call.

right, i'm just not sure why I need to do that here vs just raising an exception, the same goal is achieved either way??

added a log statement in the function and uncommented this so that it will call the function

rebased onto 5e7e107630ba8c801abc2b498c41ac76368b8354

6 years ago

This seems fine for now but I'm curious if you considered using the ostree pygobject bindings? There's various examples in https://github.com/ostreedev/ostree-releng-scripts

Related to this, I wonder if it'd make sense to "parse" the refs (basically prefix/${arch}/${suffix}) rather than having a separate list?

But in the end rojig will give us this arch-independent pruning naturally.

This seems fine for now but I'm curious if you considered using the ostree pygobject bindings? There's various examples in https://github.com/ostreedev/ostree-releng-scripts

I did consider it, but am much more familiar with the command line so stuck to that for now.

Related to this, I wonder if it'd make sense to "parse" the refs (basically prefix/${arch}/${suffix}) rather than having a separate list?

so this would make PROD_REFS shorter right? I thought about that as well, just didn't take the time to figure it out.

But in the end rojig will give us this arch-independent pruning naturally.

yeah, we'll have to plan and make that a priority soon.

Shouldn't ref here be 'fedora/28/x86_64/testing/atomic-host instead' of 'fedora/28/x86_64/updates/atomic-host' ? Similarly, for other arches.

I think you meant to say testing instead of updates.

rebased onto be6f7b38c515803bc3aaab619dc5ecda02785b92

6 years ago

rebased on latest master.. I'd like +1 from @kellin @mohanboddu and @puiterwijk if possible please.

rebased onto b902ab3

5 years ago

Better use some other variable name for PROD_REFS which conflicts with prod_refs, it works but better use something else to eliminate confusion

I would prefer if this set default=True here and then passed that to all the other functions that way. This way when we shift it in production we simply set it to --test=False on the command line rather than have to update the function. It also ensures that, for anyone, unless they explicitly ever tell it to prune it will always do a dry run instead.

I feel that would be safer behavior for the long term.

Adding on here to what @mohanboddu said - I think if this were named PROD_REF_POLICIES it would be clearer that it's a dictionary and have less confusion with prod_refs.

I'm picking nits, but I think if we need to exit here when the policy is none, this should be a positive statement.

EG:

if policy is None:
logger....
continue

it eliminates the extra else line and has more clarity.

This is a question about the longer term and not necessary something that needs solved to deploy this script today.

Do we care about what gets pruned over time and is it kept anywhere else? I almost feel this should be a consumable format like JSON that lets us see which refs got pruned and when if we care.

I'm not a fan of logging to text and then having it sit somewhere unconsumed.

I think it's mostly log statements for now. If we moved to using the python library then we might be able to get more info out.

I changed it a bit. Once I do a new upload see if I've addressed this comment

This comment is assuming the script will be changed to always run as test unless --test=False to run in production.

I would change the test to true here at the start of the method call so that even if we run it in production, it's still a test.

I think that gives us the effect you're going for of having the stub but never allowing it to actually run until it's fixed.

I think I've addressed this in main(). please see my next upload when I do it and let me know what you think

also see the comments and the pre-mature return directly below this code

1 new commit added

  • update with comments from code review
5 years ago

I went and re-read the argparse manual and realized you can't change the default of this from the command line.

So what I really want, and I apologize for being unclear due to that misunderstanding on my part, is for the script to ONLY run for real if something is passed.

Can we change this to --dry-run and that passes the test case instead?

This negates the need to set a variable later in the script; we just change the job to remove the passing of --dry-run.

I would also say then that "test" should be renamed to "dry_run".

This could have stayed prod_ref - PEP standard for names. I know it's a nit, but, the other change made the difference clear already for us.

In my mind there is really no difference between passing --test and passing --dry-run. It sounds like what you want is a --notest option. I guess that would be ok but seems a bit odd.

What I don't like is the manual setting of args.test = True in the body.

Ideally, what I'd prefer here is that the script when invoked always goes dry-run and we have some option that actually performs the action.

I just can't remember the right word for the standard on this...

EG: --dry-run for scripts whose behavior defaults to run in production, and something else for not.

The idea of having something we have to delete from code bothers me; we should make sure we set it --dry-run in the cron job or, since this is a destructive script, opt to make the default behavior not do anything but show the changes and then a single option --really-do-it (but that's a terrible name) that does the changes for real.

@dustymabe Please sign the commits or else we cannot merge them.

@mohanboddu - I'm going to squash all commits before I merge. I also want patrick to review before I merge as well.

@dustymabe , were you able to get @puiterwijk to review this yet?

Metadata Update from @syeghiay:
- Request assigned

5 years ago

This needs rebasing again... @dustymabe can you do that when you get back?

yep - unless @sinnykumari wants to pick it up before I get back

yep - unless @sinnykumari wants to pick it up before I get back

sure, I will see if I can pick this up before you are back

Meanwhile, is there any related issue where ostree repo pruning has been discussed? I don't recall much, will be useful to have some context.

i cant rememer any other than this ticket. it's mostly something i started to look at when we started making statuc deltas and realizing the repo was going to get large fast if we don't prune.

I revived this PR and now am planning to host it over in the fedora-coreos-releng-automation repo on GitHub. The code will run in Fedora's Openshift instances. Here is the PR for adding the code (known as fedora-ostree-pruner) to that repo: https://github.com/coreos/fedora-coreos-releng-automation/pull/79

Pull-Request has been closed by dustymabe

4 years ago
Metadata