#516 kojira monitors external repos changes
Opened 2 years ago by tkopecek. Modified 4 months ago
tkopecek/koji issue512  into  master

file modified
+112 -14

@@ -22,22 +22,26 @@ 

  

  from __future__ import absolute_import

  from __future__ import division

- import sys

- import os

- import koji

- from koji.util import rmtree, parseStatus, to_list

- from optparse import OptionParser

- from six.moves.configparser import ConfigParser

+ 

  import errno

  import logging

  import logging.handlers

+ import os

  import pprint

  import signal

- import time

+ import sys

  import threading

+ import time

  import traceback

+ from optparse import OptionParser

+ from xml.etree import ElementTree

+ 

+ import requests

  import six

+ from six.moves.configparser import ConfigParser

  

+ import koji

+ from koji.util import rmtree, parseStatus, to_list

  

  

  tag_cache = {}

@@ -200,6 +204,7 @@ 

          self._local = threading.local()

          self._local.session = session

          self.repos = {}

+         self.external_repos = {}

          self.tasks = {}

          self.recent_tasks = {}

          self.other_tasks = {}

@@ -313,8 +318,45 @@ 

                      self.logger.info('Dropping entry for inactive repo: %s', repo_id)

                      del self.repos[repo_id]

  

-     def checkCurrentRepos(self):

-         """Determine which repos are current"""

+     def checkExternalRepo(self, ts, repodata, tag):

+         """Determine which external repos are current"""

+         url = repodata['url']

+         if url not in self.external_repos:

+             self.external_repos[url] = 0

+             arches = [None] # placeholder for repos without $arch bit

+             try:

+                 arches = self.session.getTag(tag)['arches'].split()

+             except AttributeError:

+                 pass

+             for arch in arches:

+                 if arch is None and '$arch' not in url:

+                     arch_url = url

+                 elif arch and '$arch' in url:

+                     arch_url = url.replace('$arch', arch)

+                 else:

+                     # None placeholder, but $arch is part of url, it makes no

+                     # sense to check this one

+                     self.logger.warning('Weird external repo without defined archs: %(tag_name)s - %(url)s' % repodata)

+                     continue

+                 arch_url = os.path.join(arch_url, 'repodata/repomd.xml')

+                 self.logger.debug('Checking external url: %s' % arch_url)

+                 try:

+                     r = requests.get(arch_url, timeout=5)

+                     root = ElementTree.fromstring(r.text)

+                     for child in root.iter('{http://linux.duke.edu/metadata/repo}timestamp'):

+                         remote_ts = int(child.text)

+                         if remote_ts > self.external_repos[url]:

+                             self.external_repos[url] = remote_ts

+                         if remote_ts > ts:

+                             # return early, no need to check others

+                             return False

+                 except Exception:

+                     # inaccessible or without timestamps

+                     # treat repo as unchanged (ts = 0)

+                     pass

+         return ts < self.external_repos[url]

+ 

+     def reposToCheck(self):

          to_check = []

          repo_ids = to_list(self.repos.keys())

          for repo_id in repo_ids:

@@ -335,11 +377,37 @@ 

          if self.logger.isEnabledFor(logging.DEBUG):

              skipped = set(repo_ids).difference([r.repo_id for r in to_check])

              self.logger.debug("Skipped check for repos: %r", skipped)

-         if not to_check:

-             return

-         for repo in to_check:

-             changed = self.session.tagChangedSinceEvent(repo.event_id, repo.taglist)

+         return to_check

+ 

+     def checkExternalRepos(self):

+         """Determine which external repos changed"""

+         for repo in self.reposToCheck():

+             changed = False

+             for tag in repo.taglist:

+                 try:

+                     external_repos = self.session.getExternalRepoList(tag)

+                 except koji.GenericError:

+                     # in case tag was deleted, checkCurrentRepos is

+                     # responsible for cleanup, ignore it here

+                     external_repos = []

+                 for external_repo in external_repos:

+                     changed = self.checkExternalRepo(repo.event_ts, external_repo, tag)

+                     self.logger.debug("Check external repo %s [%s] for tag %s: %s" % (

+                         external_repo['external_repo_id'], external_repo['url'],

+                         tag, changed))

+                     if changed:

+                         break

+                 if changed:

+                     break

              if changed:

+                 self.logger.info("Repo %i no longer current due to external repo change", repo.repo_id)

+                 repo.current = False

+                 repo.expire_ts = time.time()

+ 

+     def checkCurrentRepos(self):

+         """Determine which repos are current"""

+         for repo in self.reposToCheck():

+             if self.session.tagChangedSinceEvent(repo.event_id, repo.taglist):

                  self.logger.info("Repo %i no longer current", repo.repo_id)

                  repo.current = False

                  repo.expire_ts = time.time()

@@ -359,6 +427,21 @@ 

          finally:

              session.logout()

  

+     def currencyExternalChecker(self, session):

+         """Continually checks repos for external repo currency. Runs as a separate thread"""

+         self.session = session

+         self.logger = logging.getLogger("koji.repo.currency_external")

+         self.logger.info('currencyExternalChecker starting')

+         try:

+             while True:

+                 self.checkExternalRepos()

+                 time.sleep(self.options.sleeptime)

+         except:

+             self.logger.exception('Error in external currency checker thread')

+             raise

+         finally:

+             session.logout()

+ 

      def regenLoop(self, session):

          """Triggers regens as needed/possible. Runs in a separate thread"""

          self.session = session

@@ -734,6 +817,15 @@ 

      return thread

  

  

+ def start_external_currency_checker(session, repomgr):

+     subsession = session.subsession()

+     thread = threading.Thread(name='currencyExternalChecker',

+                         target=repomgr.currencyExternalChecker, args=(subsession,))

+     thread.setDaemon(True)

+     thread.start()

+     return thread

+ 

+ 

  def start_regen_loop(session, repomgr):

      subsession = session.subsession()

      thread = threading.Thread(name='regenLoop',

@@ -750,6 +842,8 @@ 

          raise SystemExit

      signal.signal(signal.SIGTERM,shutdown)

      curr_chk_thread = start_currency_checker(session, repomgr)

+     if options.check_external_repos:

+         curr_ext_chk_thread = start_external_currency_checker(session, repomgr)

      regen_thread = start_regen_loop(session, repomgr)

      # TODO also move rmtree jobs to threads

      logger.info("Entering main loop")

@@ -765,6 +859,9 @@ 

              if not curr_chk_thread.isAlive():

                  logger.error("Currency checker thread died. Restarting it.")

                  curr_chk_thread = start_currency_checker(session, repomgr)

+             if options.check_external_repos and not curr_ext_chk_thread.isAlive():

+                 logger.error("External currency checker thread died. Restarting it.")

+                 curr_ext_chk_thread = start_external_currency_checker(session, repomgr)

              if not regen_thread.isAlive():

                  logger.error("Regeneration thread died. Restarting it.")

                  regen_thread = start_regen_loop(session, repomgr)

@@ -862,6 +959,7 @@ 

                  'deleted_repo_lifetime': 7*24*3600,

                  #XXX should really be called expired_repo_lifetime

                  'dist_repo_lifetime': 7*24*3600,

+                 'check_external_repos': True,

                  'recent_tasks_lifetime': 600,

                  'sleeptime' : 15,

                  'cert': None,

@@ -878,7 +976,7 @@ 

                      'cert', 'ca', 'serverca', 'debuginfo_tags',

                      'source_tags', 'ignore_tags')  # FIXME: remove ca here

          bool_opts = ('with_src','verbose','debug','ignore_stray_repos', 'offline_retry',

-                      'krb_rdns', 'krb_canon_host', 'no_ssl_verify')

+                      'krb_rdns', 'krb_canon_host', 'no_ssl_verify', 'check_external_repos')

          for name in config.options(section):

              if name in int_opts:

                  defaults[name] = config.getint(section, name)

I'm now wondering if someone would benefit if this behaviour is configurable - e.g. enabled by default but possible to disable completely and avoid long timeouts when network/repos are unreachable.

Also, shouldn't this mirror be configurable?

@jflorian Could there be something else? yum/createrepo uses only this one and I've not found any other DTD for repomd.

@tkopecek I'm sorry, I didn't look close enough at that. Short version: disregard my comment.

Long version: My thought was tainted by how I achieved this objective in my gojira tool where I checked the metadata timestamps of my local repo mirrors which upon a change would trigger a regen-repo command. So when I saw the "metadata" bit of that URL I mistook that just another repo. I don't work with DTDs directly, but I get the concept so I'm rather useless here unfortunately. Seems like a glaring central point of failure unless DNS provides some failover means. Outta my element here.

I would use this feature for my instance.

A few thoughts.
There may be a need to make this feature explicitly opt-in, because some "external repo" might be under the control of the "koji hub admins". Thinking about the Fedora EPEL case here. The regen-repo task might be run each time the base repos are updated; there is no detection needed here. That said, it should be harmless, as regen-repo tasks will quickly be newer than the last koji external repo update.

Right now, I have the following error in kojira.log when the patch is applied to koji 1.13 on el7.
sept. 08 13:40:59 koji01.online.rpmfusion.net kojira[6605]: for arch in self.tag_arches.get(tag, self.session.getTag(tag)['arches'].split()):
sept. 08 13:40:59 koji01.online.rpmfusion.net kojira[6605]: AttributeError: 'NoneType' object has no attribute 'split'

Thanks for your work on this.

I've changed the self.session.getTag(tag)['arches'].split()): to hardcode to 'x86_64' since this arch is built for every target. It seems to work for some cases, but it fails on others.

I need to investigate more deeply. (it could be caused by the fedora/fedora-secondary split).

This patch applied on our infra seems to work as expected.

But I think there is a race condition. This might be caused by the following events.
(not sure how to verify this).
- A new repo is created in fedora, but not yet published.
- A build job is created, and an override set for the target, so the regen-repo task is triggered
- Then the new fedora repo is published.
- A later buildroot creation will fail because of missing packages from the (old) fedora repos.

Here kojira might still consider the external repository as fresh because there is an event that has triggered a regen-repo task. But it doesn't mean this last regen-repo task was run against the latest repodata. Specially if the repodata was not yet published.

So I think it's a good path to rely on the XML timestamp value of the repomd.xml. But this probably needs to be registered as a property of the given external URL (the latest one across all arches of a given external_url seems enough).
Then this value can be compared to the previous one, no matter which event triggered a rebuild if the previous timestamp is older than the current one (and if a regen-repo task isn't already running).

@kwizart I've changed logic of url retrieval, so error shouldn't appear now.

Now checking race condition issue.

rebased onto 3198f98f06abb41158c09269ebc8725101c9b393

2 years ago

I've re-applied this patch against 1.14.0 and hit the following issue:
arch_url = url.replace('$arch', arch)
TypeError: expected a character buffer object

Changed to:
arch_url = url.replace('$arch', str(arch))
fixes the problem

Any thoughts about the race condition?

rebased onto 128a7ecc80c97d22c657b5ed3cd8d6bbd992ec42

2 years ago

I've added a fix for that. Reason is bad if condition. Your fix will create 'path/to/repo/None/os' url, which is wrong and will result in 404.

On the race condition - I don't think it will happen, but maybe I'm wrong. Even if the external repo is created after kojira's one, its timestamp will be higher than the last regen-repo, so regeneration will be triggered correctly, as external repos are checked every time even if the tag's content has not changed.

What I'm still worried about is amount of potential network traffic - but maybe it could be solved by admin by using proper proxy and it's not kojira's thing to care about. I can add option to config if checking of external repos should happen at all for some network-critical deployments.

@mikem ideas?

Thx for fixing the arch issue.

About the race condition:
This is the reverse: I expect the race occurs only when the external repo is {{created}} "before" the kojira-generated one, but {{published}} "after". When such a repo is published, kojira expects it is already new enough. But actually, it was run against a previous version of the external repo.

Fedora is particularly good at producing such external repository because with bodhi-backend there is a strict separation between the repository production tasks (mash, pungi, dist-repos, whatever) and the publication tasks (rsync, etc).

Hmm, I understand it now. Timestamp in this case wouldn't be a property of external repo, but of repo-external_repo relation as any repo can be regenerated in different times and in that time it will get fetched. Only place, where we have this relation in db is tag_external_repos table, which could be extended for this.

rebased onto ab8d1cf904c1fe4d0d89a1aef47e876ef1fdd1fd

a year ago

rebased onto 79dfa576e56eb02b5ba3cba474a17136efb3015a

a year ago

rebased onto ab8d1cf904c1fe4d0d89a1aef47e876ef1fdd1fd

10 months ago

Hi there, I've experienced two issue with the current rebase on top of 16.1:
line 328 kojira
- if arch is None and '$arch' not in url:
+ if arch is None or '$arch' not in url:
But I'm not sure about why to iterate on a None Arch at all.

line 353 kojira
- for external_repo in session.getExternalRepoList(tag):
+ for external_repo in self.session.getExternalRepoList(tag):

After talking about this feature with Mike at CentOS Dojo Brussel 2019 this month.

There is a need to track the timestamp for each arch where an external repo is used, because if one arch is published late, it will not be seen by the script and only that arch may fail.

Another point is that if the timestamp (by repo, by arch) is updated during the regen-repo task, there is a need to discard the result and schedule a new regen-repo task.
So there is a need to compare the timestamp for each arch of each external repo for changes right before and right after the regen-repo task.

What I recall from the conversation was that we were worried about races where we could see inconsistent data in the merge.

The relatively simple check we came up with was to check the external repo before and after. If it changed during the merge, then we throw out the results and re-run (or maybe just fail and let kojira re-run the jobs).

This check could either be done on a per-arch basis (weaker), or on an overall basis (stronger). I.e. the stronger case would mean that if any of the individual external arch repos changed during the entire run (over all arches) of the createrepo tasks, then we start over.

The weaker case is likely much easier to implement, and would be a good start.

@mikem Does it make sense to plan it for 1.18? I would rewrite it (kojira changed a bit meanwhile, so even merge is impossible now).

rebased onto a6cc2899261cb0a868be83e841ceba4434cead42

4 months ago

I've rebased the original PR due to the reappearing problem in #1443 and have the following proposals:

  1. Use this PR for kojira
    • start with this
    • add configuration option to disable checking of external repos
    • move external repo checking to separate thread, as it can be quite slow
  2. Create new RFE/bug for newRepo task. This one should handle proposed problem with consistency.
    • check timestamps of external repos before and after merge, fail in case when any of these is different
    • store external repo timestamps in tag.extra and use it in kojira's code instead of event_ts in case it exists.

Comments?

rebased onto a84071c

4 months ago

3 new commits added

  • make external repo checking configurable
  • move external repo checking to separate thread
  • handle deleted tags
4 months ago

I've gone ahead and modified current PR according to previous comment.

3 new commits added

  • make external repo checking configurable
  • move external repo checking to separate thread
  • kojira monitors external repos changes
4 months ago