#516 kojira monitors external repos changes
Merged 3 years ago by tkopecek. Opened 6 years ago by tkopecek.
tkopecek/koji issue512  into  master

file modified
+98 -7
@@ -35,7 +35,9 @@ 

  import time

  import traceback

  from optparse import OptionParser

+ from xml.etree import ElementTree

  

+ import requests

  import six

  

  import koji
@@ -250,6 +252,7 @@ 

          self._local = threading.local()

          self._local.session = session

          self.repos = {}

+         self.external_repos = {}

          self.tasks = {}

          self.recent_tasks = {}

          self.other_tasks = {}
@@ -370,8 +373,37 @@ 

                      self.logger.info('Dropping entry for inactive repo: %s', repo_id)

                      del self.repos[repo_id]

  

-     def checkCurrentRepos(self):

-         """Determine which repos are current"""

+     def checkExternalRepo(self, ts, repodata, tag):

+         """Determine which external repos are current, return True if remote repo is newer"""

+         url = repodata['url']

+         if url not in self.external_repos:

+             self.external_repos[url] = 0

+             arches = []  # placeholder for repos without $arch bit

+             try:

+                 arches = self.session.getTag(tag)['arches'].split()

+             except AttributeError:

+                 pass

+             for arch in arches:

+                 if '$arch' in url:

+                     arch_url = url.replace('$arch', arch)

+                 else:

+                     arch_url = url

+                 arch_url = os.path.join(arch_url, 'repodata/repomd.xml')

+                 self.logger.debug('Checking external url: %s' % arch_url)

+                 try:

+                     r = requests.get(arch_url, timeout=5)

+                     root = ElementTree.fromstring(r.text)

+                     for child in root.iter('{http://linux.duke.edu/metadata/repo}timestamp'):

+                         remote_ts = int(child.text)

+                         if remote_ts > self.external_repos[url]:

+                             self.external_repos[url] = remote_ts

+                 except Exception:

+                     # inaccessible or without timestamps

+                     # treat repo as unchanged (ts = 0)

+                     pass

+         return ts < self.external_repos[url]

+ 

+     def reposToCheck(self):

          to_check = []

          repo_ids = to_list(self.repos.keys())

          for repo_id in repo_ids:
@@ -392,11 +424,40 @@ 

          if self.logger.isEnabledFor(logging.DEBUG):

              skipped = set(repo_ids).difference([r.repo_id for r in to_check])

              self.logger.debug("Skipped check for repos: %r", skipped)

-         if not to_check:

-             return

-         for repo in to_check:

-             changed = self.session.tagChangedSinceEvent(repo.event_id, repo.taglist)

+         return to_check

+ 

+     def checkExternalRepos(self):

+         """Determine which external repos changed"""

+         # clean external repo cache

+         self.external_repos = {}

+         for repo in self.reposToCheck():

+             changed = False

+             for tag in repo.taglist:

+                 try:

+                     external_repos = self.session.getExternalRepoList(tag)

+                 except koji.GenericError:

+                     # in case tag was deleted, checkCurrentRepos is

+                     # responsible for cleanup, ignore it here

+                     external_repos = []

+                 for external_repo in external_repos:

+                     changed = self.checkExternalRepo(repo.event_ts, external_repo, tag)

+                     self.logger.debug("Check external repo %s [%s] for tag %s: %s" % (

+                         external_repo['external_repo_id'], external_repo['url'],

+                         tag, changed))

+                     if changed:

+                         break

+                 if changed:

+                     break

              if changed:

+                 self.logger.info("Repo %i no longer current due to external repo change" %

+                                  repo.repo_id)

+                 repo.current = False

+                 repo.expire_ts = time.time()

+ 

+     def checkCurrentRepos(self):

+         """Determine which repos are current"""

+         for repo in self.reposToCheck():

+             if self.session.tagChangedSinceEvent(repo.event_id, repo.taglist):

                  self.logger.info("Repo %i no longer current", repo.repo_id)

                  repo.current = False

                  repo.expire_ts = time.time()
@@ -416,6 +477,21 @@ 

          finally:

              session.logout()

  

+     def currencyExternalChecker(self, session):

+         """Continually checks repos for external repo currency. Runs as a separate thread"""

+         self.session = session

+         self.logger = logging.getLogger("koji.repo.currency_external")

+         self.logger.info('currencyExternalChecker starting')

+         try:

+             while True:

+                 self.checkExternalRepos()

+                 time.sleep(self.options.sleeptime)

+         except Exception:

+             self.logger.exception('Error in external currency checker thread')

+             raise

+         finally:

+             session.logout()

+ 

      def regenLoop(self, session):

          """Triggers regens as needed/possible. Runs in a separate thread"""

          self.session = session
@@ -815,6 +891,15 @@ 

      return thread

  

  

+ def start_external_currency_checker(session, repomgr):

+     subsession = session.subsession()

+     thread = threading.Thread(name='currencyExternalChecker',

+                               target=repomgr.currencyExternalChecker, args=(subsession,))

+     thread.setDaemon(True)

+     thread.start()

+     return thread

+ 

+ 

  def start_regen_loop(session, repomgr):

      subsession = session.subsession()

      thread = threading.Thread(name='regenLoop',
@@ -832,6 +917,8 @@ 

          raise SystemExit

      signal.signal(signal.SIGTERM, shutdown)

      curr_chk_thread = start_currency_checker(session, repomgr)

+     if options.check_external_repos:

+         curr_ext_chk_thread = start_external_currency_checker(session, repomgr)

      regen_thread = start_regen_loop(session, repomgr)

      # TODO also move rmtree jobs to threads

      logger.info("Entering main loop")
@@ -844,6 +931,9 @@ 

              if not curr_chk_thread.isAlive():

                  logger.error("Currency checker thread died. Restarting it.")

                  curr_chk_thread = start_currency_checker(session, repomgr)

+             if options.check_external_repos and not curr_ext_chk_thread.isAlive():

+                 logger.error("External currency checker thread died. Restarting it.")

+                 curr_ext_chk_thread = start_external_currency_checker(session, repomgr)

              if not regen_thread.isAlive():

                  logger.error("Regeneration thread died. Restarting it.")

                  regen_thread = start_regen_loop(session, repomgr)
@@ -940,6 +1030,7 @@ 

                  'deleted_repo_lifetime': 7 * 24 * 3600,

                  # XXX should really be called expired_repo_lifetime

                  'dist_repo_lifetime': 7 * 24 * 3600,

+                 'check_external_repos': True,

                  'recent_tasks_lifetime': 600,

                  'sleeptime': 15,

                  'cert': None,
@@ -956,7 +1047,7 @@ 

                      'krbservice', 'cert', 'ca', 'serverca', 'debuginfo_tags',

                      'source_tags', 'separate_source_tags', 'ignore_tags')  # FIXME: remove ca here

          bool_opts = ('verbose', 'debug', 'ignore_stray_repos', 'offline_retry',

-                      'krb_rdns', 'krb_canon_host', 'no_ssl_verify')

+                      'krb_rdns', 'krb_canon_host', 'no_ssl_verify', 'check_external_repos')

          legacy_opts = ('with_src')

          for name in config.options(section):

              if name in int_opts:

I'm now wondering if someone would benefit if this behaviour is configurable - e.g. enabled by default but possible to disable completely and avoid long timeouts when network/repos are unreachable.

Also, shouldn't this mirror be configurable?

@jflorian Could there be something else? yum/createrepo uses only this one and I've not found any other DTD for repomd.

@tkopecek I'm sorry, I didn't look close enough at that. Short version: disregard my comment.

Long version: My thought was tainted by how I achieved this objective in my gojira tool where I checked the metadata timestamps of my local repo mirrors which upon a change would trigger a regen-repo command. So when I saw the "metadata" bit of that URL I mistook that just another repo. I don't work with DTDs directly, but I get the concept so I'm rather useless here unfortunately. Seems like a glaring central point of failure unless DNS provides some failover means. Outta my element here.

I would use this feature for my instance.

Few toughs.
There is maybe a need to explicitly opt-in this feature because some "external repo" might be under the control of the "koji hub admins". Thinking about Fedora EPEL case here. The regen-repo task might be run each time the base repos are updated, there is no detection needed here. Once that said it should be harmless. As regen repo tasks will quickly be newer than the last koji external repo update.

Right now, I have the following error in kojira.log when the patch is applied to koji 1.13 on el7.
sept. 08 13:40:59 koji01.online.rpmfusion.net kojira[6605]: for arch in self.tag_arches.get(tag, self.session.getTag(tag)['arches'].split()):
sept. 08 13:40:59 koji01.online.rpmfusion.net kojira[6605]: AttributeError: 'NoneType' object has no attribute 'split'

Thanks for your work on this.

I've changed the self.session.getTag(tag)['arches'].split()): to hardcode to 'x86_64' since this arch is built for every target. It seems to work for some cases, but it fails on others.

I need to investigate more deeply. (it could be caused by the fedora/fedora-secondary split).

This patch applied on our infra seems to work as expected.

But I think there is a race condition. This might be caused by the following events.
(not sure how to verify this).
- A new repo is created in fedora, but not yet published.
- A build job is created, and an override set for the target, so the regen-repo task is triggered
- Then the new fedora repo is published.
- A later buildroot creation will fail because of missing packages from the (old) fedora repos.

Here kojira might still consider the external repository as fresh because there is an event that has triggered a regen-repo task. But it doesn't mean this last regen-repo task was run against the latest repodata. Specially if the repodata was not yet published.

So I think it's a good path to rely on the xml timestamp value of the repomd.xml . But this probably need to be register as a property of the given external url (the later one for all arches of a given external_url seems enough).
Then this value can be compared to the previous one, no matter which event triggered a rebuild if the previous timestamp is older than the current one (and if a regen-repo task isn't already running).

@kwizart I've changed logic of url retrieval, so error shouldn't appear now.

Now checking race condition issue.

rebased onto 3198f98f06abb41158c09269ebc8725101c9b393

6 years ago

I've re-applied this patch against 1.14.0 and hit the following issue:
arch_url = url.replace('$arch', arch)
TypeError: expected a character buffer object

Changed to:
arch_url = url.replace('$arch', str(arch))
fixes the problem

Any tough about the race condition ?

rebased onto 128a7ecc80c97d22c657b5ed3cd8d6bbd992ec42

6 years ago

I've added a fix for that. Reason is bad if condition. Your fix will create 'path/to/repo/None/os' url, which is wrong and will result in 404.

To race condition - I don't think it will happen, but maybe I'm wrong. Even if external repo will be created after kojira's one, its timestamp will be higher than last regen-repo, so regeneration will be triggered correctly as external repos are checked every time if tag's content has not changed.

What I'm still worried about is amount of potential network traffic - but maybe it could be solved by admin by using proper proxy and it's not kojira's thing to care about. I can add option to config if checking of external repos should happen at all for some network-critical deployments.

@mikem ideas?

Thx for fixing the arch issue.

About the race condition:
This is the reverse, I expect the race occurs only when the external repo is {{created}} "before" the kojira generated one, but {{published}} "after". When such repo is published, kojira expects it' is already new enough. But actually, it was run against a previous version of the external repo.

Fedora is particularly good at producing such external repository because with bodhi-backend there is a strict separation between the repository production tasks (mash, pungi, dist-repos, whatever) and the publication tasks (rsync, etc).

Hmm, I understand it now. Timestamp in this case wouldn't be a property of external repo, but of repo-external_repo relation as any repo can be regenerated in different times and in that time it will get fetched. Only place, where we have this relation in db is tag_external_repos table, which could be extended for this.

rebased onto ab8d1cf904c1fe4d0d89a1aef47e876ef1fdd1fd

5 years ago

rebased onto 79dfa576e56eb02b5ba3cba474a17136efb3015a

5 years ago

rebased onto ab8d1cf904c1fe4d0d89a1aef47e876ef1fdd1fd

5 years ago

Hi there, I've experienced two issue with the current rebase on top of 16.1:
line 328 kojira
- if arch is None and '$arch' not in url:
+ if arch is None or '$arch' not in url:
But I'm not sure about why to iterate on a None Arch at all.

line 353 kojira
- for external_repo in session.getExternalRepoList(tag):
+ for external_repo in self.session.getExternalRepoList(tag):

After talking about this feature with Mike at CentOS Dojo Brussel 2019 this month.

This is a need to track the timestamp for each arch where an external repo is used, because if one arch is published late, it will not be seen by the script and only this arch may fail.

Another point is that if the timestamp ( by repo - by arch ) is updated during the regen-repo task, there is a need to discard the result and schedule a new regen-repo task.
So there is a need to compare the timestamp for earch arch of earch external repos for changes right before and right after the regen-repo task.

What I recall from the conversation was that we were worried about races where we could see inconsistent data in the merge.

The relatively simple check we came up with was to check the external repo before and after. If it changed during the merge, then we through out the results and re-run (or maybe just fail and let kojira re-run the jobs).

This check could either be done on a per-arch basis (weaker), or on a overall basis (stronger). I.e. the stronger case would mean that if any of the individual external arch repos changed during the entire run (over all arches) of the createpo tasks, then we start over.

The weaker case is likely much easier to implement, and would be a good start.

@mikem Does it make sense to plan it for 1.18? I would rewrite it (kojira changed a bit meanwhile, so even merge is impossible now).

rebased onto a6cc2899261cb0a868be83e841ceba4434cead42

4 years ago

I've rebased original PR due reappearing problem in #1443 and have following proposals:

  1. Use this PR for kojira
    • start with this
    • add configuration option to disable checking of external repos
    • move external repo checking to separate thread, as it can be quite slow
  2. Create new RFE/bug for newRepo task. This one should handle proposed problem with consistency.
    • check timestamps of external repos before and after merge, fail in case when any of these is different
    • store external repo timestamps in tag.extra and use it in kojira's code instead of event_ts in case it exists.

Comments?

rebased onto a84071c3f58e6b1139ac5ad2c0fa832a9be6a030

4 years ago

3 new commits added

  • make external repo checking configurable
  • move external repo checking to separate thread
  • handle deleted tags
4 years ago

I've gone ahead and modified current PR according to previous comment.

3 new commits added

  • make external repo checking configurable
  • move external repo checking to separate thread
  • kojira monitors external repos changes
4 years ago

This seems like could break this out into a smaller "normalize_arch_url" method to centralize this "$arch substitution" logic.

As an admin, I don't understand what action I would perform to correct this warning message

rebased onto 147eeeb4a36447bb16f9c3c3b168e204769c0941

4 years ago

Metadata Update from @tkopecek:
- Pull-request tagged with: testing-ready

4 years ago

rebased onto c7a00274afb95957dd548645fa6f2dc2f75b53a5

4 years ago

rebased onto 309041c19e236fe8f58aa0ef0cc6622cf37ae011

4 years ago

@tkopecek what's missing here? Can we target it for 1.22?

It is already scheduled for 1.21. Needs some reviews and testing from other folks (@mikem, @julian8628) - otherwise it is complete.

I still don't understand much of the arch= [None] case...

I'm using that instead:

+    def checkExternalRepo(self, ts, repodata, tag):
+        """Determine which external repos are current"""
+        url = repodata['url']
+        if url not in self.external_repos:
+            newest_ts = 0
+            **arches = []** # placeholder for repos without $arch bit
+            try:
+                arches += self.session.getTag(tag)['arches'].split()
+            except AttributeError:
+                pass
+            for arch in arches:
+                **if '$arch' not in url:**
+                    arch_url = url
+                else:
+                    arch_url = url.replace('$arch', arch)

In others words , I don't get in which situation self.session.getTag(tag)['arches'].split() could output None or even noarch. So there are only two situations to handle IMO:

1/ repo has architecture harcoded (or implicit) because that koji instance only build for x86_64:
url=http://dl.fp.org/pub/linux/fedora/updates/x86_64
in this case, there is no need to use url.replace. whatever the arch is .

2/ repo use the $arch variable in url, then there is a need to use url.replace with an existing and valid architecture. Even in the case of building for noarch.
In such case, None (or noarch) will never ever be a valid architecture to use url.replace. (at least not in the fedora default URL context).

At the very least if a noarch case ever matter, then there is a need to use a "sane default" arch, but one cannot predict which sane default arch will be relevant for a particular koji instance.

I've not tested the current code, but older revision was producing 404 errors on the local mirror because of the use of
url=http://dl.fp.org/pub/linux/fedora/updates/None URLs.

rebased onto 72ec33bc2f0ddb0474d7b8797c659b7ce900cb69

4 years ago

Ok, I don't remember why I felt it is important. Your solution looks ok to me - added.

@tkopecek
I don't see the change implemented in the PR is it still in your local tree ?

Okay, good for merge for me.

rebased onto 751862f

3 years ago

Commit 9639f78 fixes this pull-request

Pull-Request has been merged by tkopecek

3 years ago

Metadata Update from @mfilip:
- Pull-request tagged with: testing-done

3 years ago

It took me a while to see that this feature was controlled by a new option (check_external_repos) that was disabled by default. Is this documented somewhere?

Yes, see https://docs.pagure.org/koji/utils/ - there is a potential problem with server timezone, so we've disabled it by default.