#3842 Don't spawn createrepo if not needed
Merged 7 months ago by tkopecek. Opened 11 months ago by tkopecek.
tkopecek/koji issue3808a into master

file modified
+120 -5
@@ -24,6 +24,7 @@
  from __future__ import absolute_import, division

  import copy
+ import filecmp
  import glob
  import grp
  import io
@@ -80,7 +81,15 @@
      ServerExit,
      ServerRestart
  )
- from koji.util import dslice, dslice_ex, isSuccess, parseStatus, to_list, format_shell_cmd
+ from koji.util import (
+     dslice,
+     dslice_ex,
+     format_shell_cmd,
+     isSuccess,
+     joinpath,
+     parseStatus,
+     to_list,
+ )

  try:
      import requests_gssapi as reqgssapi
@@ -5557,6 +5566,86 @@
      Methods = ['newRepo']
      _taskWeight = 0.1

+     def copy_arch_repo(self, src_repo_id, src_repo_path, repo_id, arch):
+         """Copy repodata, return False if it fails"""
+         dst_repodata = joinpath(self.workdir, arch, 'repodata')
+         src_repodata = joinpath(src_repo_path, arch, 'repodata')
+         try:
+             # copy repodata
+             self.logger.debug('Copying repodata %s to %s' % (src_repodata, dst_repodata))
+             if os.path.exists(src_repodata):
+                 # symlinks=True is not needed as symlinks are not part of the arch repodir
+                 shutil.copytree(src_repodata, dst_repodata)
+             uploadpath = self.getUploadDir()
+             files = []
+             for f in os.listdir(dst_repodata):
+                 files.append(f)
+                 self.session.uploadWrapper('%s/%s' % (dst_repodata, f), uploadpath, f)
+             return [uploadpath, files]
+         except Exception as ex:
+             self.logger.warning("Copying repo %i to %i failed. %r" % (src_repo_id, repo_id, ex))
+             # Try to remove potential leftovers and fail if there is a problem
+             koji.util.rmtree(dst_repodata, self.logger)
+             return False
+
+     def check_repo(self, src_repo_path, dst_repo_path, src_repo, dst_repo, opts):
+         """Check if oldrepo is reusable as-is and can be directly copied"""
+         # checked: with_src, debuginfo, pkglist, blocklist, grouplist
+         # We're ignoring maven support here. It is handled in repo_init, which is
+         # always called, so it doesn't affect the efficiency of pre-cloning rpm repos.
+         if not src_repo_path:
+             self.logger.debug("Source repo wasn't found")
+             return False
+         if not os.path.isdir(src_repo_path):
+             self.logger.debug("Source repo doesn't exist %s" % src_repo_path)
+             return False
+         try:
+             repo_json = koji.load_json(joinpath(src_repo_path, 'repo.json'))
+             for key in ('with_debuginfo', 'with_src', 'with_separate_src'):
+                 if repo_json.get(key, False) != opts.get(key, False):
+                     return False
+         except IOError:
+             self.logger.debug("Can't open repo.json in %s" % src_repo_path)
+             return False
+
+         # compare comps if they exist
+         src_comps_path = joinpath(src_repo_path, 'groups/comps.xml')
+         dst_comps_path = joinpath(dst_repo_path, 'groups/comps.xml')
+         src_exists = os.path.exists(src_comps_path)
+         if src_exists != os.path.exists(dst_comps_path):
+             self.logger.debug("Comps file exists in only one repo")
+             return False
+         if src_exists and not filecmp.cmp(src_comps_path, dst_comps_path, shallow=False):
+             self.logger.debug("Comps file differs")
+             return False
+
+         # if there is any external repo, don't trust the repodata
+         if self.session.getExternalRepoList(src_repo['tag_id'], event=src_repo['create_event']):
+             self.logger.debug("Source repo uses external repos")
+             return False
+         if self.session.getExternalRepoList(dst_repo['tag_id'], event=dst_repo['create_event']):
+             self.logger.debug("Destination repo uses external repos")
+             return False
+
+         self.logger.debug('Repo test passed')
+         return True
+
+     def check_arch_repo(self, src_repo_path, dst_repo_path, arch):
+         """More checks based on architecture content"""
+         for fname in ('blocklist', 'pkglist'):
+             src_file = joinpath(src_repo_path, arch, fname)
+             dst_file = joinpath(dst_repo_path, arch, fname)
+             # both files must exist
+             if not os.path.exists(src_file) or not os.path.exists(dst_file):
+                 self.logger.debug("%s doesn't exist in one of the repos" % fname)
+                 return False
+             # content must be the same
+             if not filecmp.cmp(src_file, dst_file, shallow=False):
+                 self.logger.debug('%s differs' % fname)
+                 return False
+         self.logger.debug('Arch repo test passed %s' % arch)
+         return True
+
      def handler(self, tag, event=None, src=False, debuginfo=False, separate_src=False):
          tinfo = self.session.getTag(tag, strict=True, event=event)
          kwargs = {}
@@ -5585,6 +5674,10 @@
          else:
              oldrepo_state = koji.REPO_READY
          oldrepo = self.session.getRepo(tinfo['id'], state=oldrepo_state)
+         oldrepo_path = None
+         if oldrepo:
+             oldrepo_path = koji.pathinfo.repo(oldrepo['id'], tinfo['name'])
+             oldrepo['tag_id'] = tinfo['id']
          # If there is no old repo, try to find first usable repo in
          # inheritance chain and use it as a source. oldrepo is not used if
          # createrepo_update is not set, so don't waste call in such case.
@@ -5595,28 +5688,49 @@
              for tag in sorted(tags, key=lambda x: x['currdepth']):
                  oldrepo = self.session.getRepo(tag['parent_id'], state=oldrepo_state)
                  if oldrepo:
+                     parenttag = self.session.getTag(tag['parent_id'])
+                     oldrepo_path = koji.pathinfo.repo(oldrepo['id'], parenttag['name'])
+                     oldrepo['tag_id'] = parenttag['id']
                      break
+         newrepo_path = koji.pathinfo.repo(repo_id, tinfo['name'])
+         newrepo = {'tag_id': tinfo['id'], 'create_event': event_id}
+         if self.options.copy_old_repodata:
+             possibly_clonable = self.check_repo(oldrepo_path, newrepo_path,
+                                                 oldrepo, newrepo, kwargs)
+         else:
+             possibly_clonable = False
          subtasks = {}
+         data = {}
+         cloned_archs = []
          for arch in arches:
+             if possibly_clonable and self.check_arch_repo(oldrepo_path, newrepo_path, arch):
+                 result = self.copy_arch_repo(oldrepo['id'], oldrepo_path, repo_id, arch)
+                 if result:
+                     data[arch] = result
+                     cloned_archs.append(arch)
+                     continue
+             # if we can't copy old repo directly, trigger normal createrepo
              arglist = [repo_id, arch, oldrepo]
              subtasks[arch] = self.session.host.subtask(method='createrepo',
                                                         arglist=arglist,
                                                         label=arch,
                                                         parent=self.id,
                                                         arch='noarch')
-
          # gather subtask results
-         data = {}
          if subtasks:
              results = self.wait(to_list(subtasks.values()), all=True, failany=True)
              for (arch, task_id) in six.iteritems(subtasks):
                  data[arch] = results[task_id]
-                 self.logger.debug("DEBUG: %r : %r " % (arch, data[arch],))

          # finalize
          kwargs = {}
          if event is not None:
              kwargs['expire'] = True
+         if cloned_archs:
+             kwargs['repo_json_updates'] = {
+                 'cloned_from_repo_id': oldrepo['id'],
+                 'cloned_archs': cloned_archs,
+             }
          self.session.host.repoDone(repo_id, data, **kwargs)
          return repo_id, event_id

@@ -6477,6 +6591,7 @@
                  'createrepo_skip_stat': True,
                  'createrepo_update': True,
                  'distrepo_skip_stat': False,
+                 'copy_old_repodata': False,
                  'mock_bootstrap_image': False,
                  'pkgurl': None,
                  'allowed_scms': '',
@@ -6513,7 +6628,7 @@
                            'build_arch_can_fail', 'no_ssl_verify', 'log_timestamps',
                            'allow_noverifyssl', 'allowed_scms_use_config',
                            'allowed_scms_use_policy', 'allow_password_in_scm_url',
-                           'distrepo_skip_stat']:
+                           'distrepo_skip_stat', 'copy_old_repodata']:
                  defaults[name] = config.getboolean('kojid', name)
              elif name in ['plugin', 'plugins']:
                  defaults['plugin'] = value.split()

file modified
+3
@@ -73,6 +73,9 @@
  ; be always run in same way. Not recommended
  ; distrepo_skip_stat=False

+ ; copy old repodata if there is no apparent change
+ ; copy_old_repodata = False
+
  ; A space-separated list of tuples from which kojid is allowed to checkout.
  ; The format of those tuples is:
  ;

file modified
+7
@@ -135,6 +135,13 @@
     createrepo_update=True
        Recycle old repodata (if they exist) in createrepo.

+    copy_old_repodata=False
+       The ``newRepo`` task can copy old repodata if it exists and there is no
+       apparent change in the content. It should generally be safe to turn on
+       and will lower the number of ``createrepo`` tasks in a typical environment.
+       Note that some cases (especially tags with external repos) render this
+       a no-op, as we can't be sure that the content hasn't changed meanwhile.
+
    failed_buildroot_lifetime=14400
       Failed tasks leave buildroot content on disk for debugging purposes.
       They are removed after 4 hours by default. This value is specified

file modified
+7 -1
@@ -15272,13 +15272,15 @@
          return repo_init(tag, task_id=task_id, with_src=with_src, with_debuginfo=with_debuginfo,
                           event=event, with_separate_src=with_separate_src)

-     def repoDone(self, repo_id, data, expire=False):
+     def repoDone(self, repo_id, data, expire=False, repo_json_updates=None):
          """Finalize a repo

          repo_id: the id of the repo
          data: a dictionary of repo files in the form:
                { arch: [uploadpath, [file1, file2, ...]], ...}
          expire: if set to true, mark the repo expired immediately [*]
+         repo_json_updates: dict - if provided, it is shallowly merged
+                                   into the repo.json file

          Actions:
          * Move uploaded repo files into place
@@ -15300,6 +15302,10 @@
              raise koji.GenericError("Repo %(id)s not in INIT state (got %(state)s)" % rinfo)
          repodir = koji.pathinfo.repo(repo_id, rinfo['tag_name'])
          workdir = koji.pathinfo.work()
+         if repo_json_updates:
+             repo_json = koji.load_json(f'{repodir}/repo.json')
+             repo_json.update(repo_json_updates)
+             koji.dump_json(f'{repodir}/repo.json', repo_json, indent=2)
          if not rinfo['dist']:
              for arch, (uploadpath, files) in data.items():
                  archdir = "%s/%s" % (repodir, koji.canonArch(arch))

Overall, the structure of this looks good.

In check_repo()

if not os.path.exists(src_repo_path)

maybe os.path.isdir?
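
i.e. something along these lines (this is what the merged version ended up with):

if not os.path.isdir(src_repo_path):
    self.logger.debug("Source repo doesn't exist %s" % src_repo_path)
    return False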

json.load(open(f'{src_repo_path}/repo.json'))

Elsewhere we load json files a little differently. Should we use koji.load_json() here, or otherwise involve _open_text_file?
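
For instance, a sketch mirroring what the merged check_repo does (koji.load_json with IOError handled, rather than bare json.load):

try:
    repo_json = koji.load_json(joinpath(src_repo_path, 'repo.json'))
except IOError:
    self.logger.debug("Can't open repo.json in %s" % src_repo_path)
    return False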

In check_arch_repo()

if src_exists != os.path.exists(dst_file): 

We should probably decline to copy (and perhaps warn) if pkglist/blocklist is missing for either, as repo_init should always write them. Or am I missing some edge case?
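
A stricter variant, declining whenever either file is missing (as the merged check_arch_repo does):

if not os.path.exists(src_file) or not os.path.exists(dst_file):
    self.logger.debug("%s doesn't exist in one of the repos" % fname)
    return False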

in copy_repo()

Perhaps name it copy_arch_repo() for precision and consistency?

#dst_repo_path = koji.pathinfo.repo(repo_id, taginfo['name'])
...
#dst_repodata = f'{dst_repo_path}/{arch}/repodata'

debugging lines? still relevant?

shutil.copytree(src_repodata, dst_repodata)

This had me checking whether we need symlinks=True (we don't, because the toplink symlink is one dir up), but perhaps worth a comment?
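
Something as simple as:

# symlinks=True is not needed as symlinks are not part of the arch repodir
shutil.copytree(src_repodata, dst_repodata)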

with open(f'{dst_repodata}/repo.json', 'wt') as fp:
    json.dump({'cloned_from_repo_id': src_repo_id}, fp, indent=2)

This seems to overwrite the json with just that single bit of data

self.logger.warning(f"Copying repo {src_repo_id} to {repo_id} failed. {ex}")
return False

Returning False here will trigger a createrepo run, but the failure may have left partial repodata copied. We should probably clean it up (and actually fail if we cannot).
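
A sketch of such cleanup (assuming koji.util.rmtree raises when removal fails, so a half-copied tree still fails the task rather than being silently left behind):

except Exception as ex:
    self.logger.warning("Copying repo %i to %i failed. %r" % (src_repo_id, repo_id, ex))
    # try to remove potential leftovers; let rmtree errors propagate
    koji.util.rmtree(dst_repodata, self.logger)
    return False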

in handler()

self.logger.debug("DEBUG: %r : %r " % (arch, data[arch],))

We're outside of the arch loop here, so this needs adjustment.

We're not checking for maven support here. That is something I listed in #3841, but perhaps it is ok in this context? I guess the resulting createrepo job isn't going to look at the maven repo and doesn't care about it. Regardless, worth a comment to explain our thinking.

rebased onto ab7fce8bec657c9656c059724e0bd98415931887

10 months ago
repo_json = koji.load_json(f'{src_repodata}/repo.json')
repo_json['cloned_from_repo_id'] = src_repo_id
koji.dump_json(f'{dst_repodata}/repo.json', repo_json, indent=2)

This code from copy_arch_repo is using the wrong location for repo.json. This file lives in the main repo directory, and is not stored per arch. So this bit doesn't fit in this function.

This exposes a larger issue -- we don't have a mechanism for the builder to update repo.json, so we'll need to add one. Perhaps this can be added to the repoDone call.

Hmm, perhaps I was mistaken in my previous review. Did you intend to add a new per-arch repo.json file that only includes the cloned repo id? I do like the idea of tracking the clone links, but I'm not sure about the location. Having multiple repo.json files could be confusing. Maybe I'm being pedantic?

rebased onto 348189b252dff026614c79cf0a5c8b798d6d93ba

9 months ago

1 new commit added

  • fix path
9 months ago

Yes, the idea was to have these arch-specific files. Anyway, you're right that it means too many files with the same name.
So, maybe add a separate record to the repo-level repo.json?

{
  "id": 562,
  "tag": "a",
  "tag_id": 19,
  "task_id": 1673,
  "event_id": 2843,
  "with_src": false,
  "with_separate_src": false,
  "with_debuginfo": false,
  "cloned_from_repo_id": {
      "x86_64": 123,
      "aarch64": 123,
  }
} 

Arches that are not present in the mapping were not cloned?

second option:

{
   ...
  "cloned_from_repo_id": 123,
  "cloned_archs": ["x86_64", "aarch64"]
}

The current path updates only affect how things are written out to the workdir on the builder. copy_arch_repo only returns the list of files in dst_repodata, and this result ends up getting passed through to the host.repoDone call. Nothing in the current code does anything with the updated repo.json file.

As I mentioned above, if we want to update repo.json (which previously was only written on the hub by repoInit), we'll need to add a mechanism for that. Perhaps a new option to repoDone. I guess we could also cram it into the data arg, but that could be obtuse and confusing.
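
For example, repoDone could take an optional dict and merge it on the hub side (essentially what the final diff above does):

def repoDone(self, repo_id, data, expire=False, repo_json_updates=None):
    ...
    if repo_json_updates:
        repo_json = koji.load_json('%s/repo.json' % repodir)
        repo_json.update(repo_json_updates)
        koji.dump_json('%s/repo.json' % repodir, repo_json, indent=2)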

Second option is probably better. We should always have a single repo id we're cloning from. I don't think we want to enable copying different repos per arch.

1 new commit added

  • update top-level repo.json
9 months ago

Metadata Update from @tkopecek:
- Pull-request tagged with: testing-ready

8 months ago

rebased onto 1151286

8 months ago

Metadata Update from @relias-redhat:
- Pull-request tagged with: testing-done

8 months ago

Metadata Update from @relias-redhat:
- Pull-request untagged with: testing-done

8 months ago

Metadata Update from @relias-redhat:
- Pull-request untagged with: testing-ready

8 months ago

1 new commit added

  • replace f-strings with older syntax
7 months ago

Metadata Update from @relias-redhat:
- Pull-request tagged with: testing-done, testing-ready

7 months ago

Commit c3c0a6a fixes this pull-request

Pull-Request has been merged by tkopecek

7 months ago