#2140 move checkTasks near its usage
Merged 3 years ago by tkopecek. Opened 4 years ago by tkopecek.
tkopecek/koji issue2119b  into  master

file modified
+18 -21
@@ -277,7 +277,6 @@ 

          self.repos = {}

          self.external_repo_ts = {}

          self.tasks = {}

-         self.recent_tasks = {}

          self.other_tasks = {}

          self.needed_tags = {}

          self.tag_use_stats = {}
@@ -712,11 +711,6 @@ 

              self.setTagScore(entry)

  

      def updateRepos(self):

-         self.checkTasks()

-         self.logger.debug("Current tasks: %r" % self.tasks)

-         if self.other_tasks:

-             self.logger.debug("Found %i untracked newRepo tasks",

-                               len(self.other_tasks))

          self.logger.debug("Updating repos")

  

          self.readCurrentRepos()
@@ -748,12 +742,6 @@ 

          - check for other newRepo tasks (not generated by us)

          """

  

-         # prune recent tasks

-         now = time.time()

-         for task_id in list(self.recent_tasks):

-             if now - self.recent_tasks[task_id] > self.options.recent_tasks_lifetime:

-                 del self.recent_tasks[task_id]

- 

          # check on current tasks

          task_ids = list(self.tasks)

          self.session.multicall = True
@@ -764,12 +752,15 @@ 

              tag_id = self.tasks[task_id]['tag_id']

              if tstate == 'CLOSED':

                  self.logger.info("Finished: newRepo task %s for tag %s", task_id, tag_id)

-                 self.recent_tasks[task_id] = time.time()

                  del self.tasks[task_id]

+                 try:

+                     del self.needed_tags[tag_id]

+                 except KeyError:

+                     # it could be already removed by checkNeeded

+                     pass

              elif tstate in ('CANCELED', 'FAILED'):

                  self.logger.info(

                      "Problem: newRepo task %s for tag %s is %s", task_id, tag_id, tstate)

-                 self.recent_tasks[task_id] = time.time()

                  del self.tasks[task_id]

              else:

                  self.tasks[task_id]['taskinfo'] = tinfo
@@ -821,7 +812,11 @@ 

                      # no longer needed

                      self.logger.info("Tag %(name)s has a current or in "

                                       "progress repo", entry['taginfo'])

-                     del self.needed_tags[tag_id]

+                     try:

+                         del self.needed_tags[tag_id]

+                     except KeyError:

+                         # it could be already deleted by checkTasks

+                         pass

                  # if not covered, we already know

                  continue

              if covered:
@@ -875,6 +870,12 @@ 

      def regenRepos(self):

          """Trigger newRepo tasks for needed tags"""

  

+         self.checkTasks()

+         self.logger.debug("Current tasks: %r" % self.tasks)

+         if self.other_tasks:

+             self.logger.debug("Found %i untracked newRepo tasks",

+                               len(self.other_tasks))

+ 

          # first note currently running tasks

          running_tasks = 0

          running_tasks_maven = 0
@@ -909,9 +910,6 @@ 

                  if task_id in self.tasks:

                      # we already have a task

                      continue

-                 elif task_id in self.recent_tasks:

-                     # avoiding a race, see https://pagure.io/koji/issue/942

-                     continue

                  else:

                      # should not happen

                      logger.warning('Needed tag refers to unknown task. '
@@ -1134,7 +1132,6 @@ 

                  # XXX should really be called expired_repo_lifetime

                  'dist_repo_lifetime': 7 * 24 * 3600,

                  'check_external_repos': False,

-                 'recent_tasks_lifetime': 600,

                  'sleeptime': 15,

                  'cert': None,

                  'serverca': None,
@@ -1144,13 +1141,13 @@ 

          int_opts = ('deleted_repo_lifetime', 'max_repo_tasks', 'repo_tasks_limit',

                      'retry_interval', 'max_retries', 'offline_retry_interval',

                      'max_delete_processes', 'max_repo_tasks_maven', 'dist_repo_lifetime',

-                     'sleeptime', 'recent_tasks_lifetime')

+                     'sleeptime')

          str_opts = ('topdir', 'server', 'user', 'password', 'logfile', 'principal', 'keytab',

                      'cert', 'serverca', 'debuginfo_tags', 'queue_file',

                      'source_tags', 'separate_source_tags', 'ignore_tags')

          bool_opts = ('verbose', 'debug', 'ignore_stray_repos', 'offline_retry',

                       'no_ssl_verify', 'check_external_repos')

-         legacy_opts = ('with_src', 'delete_batch_size')

+         legacy_opts = ('with_src', 'delete_batch_size', 'recent_tasks_lifetime')

          for name in config.options(section):

              if name in int_opts:

                  defaults[name] = config.getint(section, name)

rebased onto 4e943eae7362598603ba4c6f106b0b1f99445a58

3 years ago

Ok, it appears that regenRepos is the only consumer of the data gathered by checkTasks (apart from printState, which is purely informative). So, in that sense, it seems reasonable to move this here.

On the other hand, regenRepos was was isolated to try to remove as much slowdown from the regen cycle as possible. I'm a little concerned that putting checkTasks here could have the opposite effect that intended in some cases. The getTaskInfo multicall shouldn't take very long, but the listTasks could in some instances be very slow.

Using this reraises older issue which was thought to be fixed #942. It show up more or less regularly with this patch. Not sure what is the code causing it. Even with recent_tasks_lifetime: long enough it still shows up. It will get cleared few cycles ahead when tags are updated. But meanwhile it spawns additional tasks (I've put simple continue in that part to not create secondary task as it is evidently done)

$ grep 33027713 /var/log/kojira.log
2020-11-11 13:11:59,314 [INFO] koji.repo.regen: Created newRepo task 33027713 for tag 65604 (rhel-7.9-z-nss-stack-build), expired for 160.3 sec
2020-11-11 13:17:12,659 [INFO] koji.repo.regen: Finished: newRepo task 33027713 for tag 65604
2020-11-11 14:17:18,584 [WARNING] koji: Needed tag refers to unknown task. rhel-7.9-z-nss-stack-build -> 33027713

1 new commit added

  • remove recent tasks behaviour
3 years ago

It looks that fix could be to remove tag from needed_tags if task successfully finished. In such case it should be ok (in worst case it will be checked in next cycle, but it looks less harmful than triggerring duplicate tasks).

1 new commit added

  • allow other thread to remove tag
3 years ago

rebased onto 934ca61381d69f6cd36c216eff5eca7b3de4fa27

3 years ago

1 new commit added

  • remove unused variable
3 years ago

Metadata Update from @tkopecek:
- Pull-request tagged with: testing-ready

3 years ago

rebased onto fcf84dd

3 years ago

Metadata Update from @tkopecek:
- Pull-request untagged with: testing-ready
- Pull-request tagged with: no_qe

3 years ago

I'm not 100% sure what problem we are trying to solve here.

I see that this is a followup to #2119. In that ticket, Kevin writes that the fix worked, but that Kojira was still "not functioning well". However, I don't see any details there or in that issue or in #2581.

I am concerned about dismantling this race condition fix.

Is there a problem statement somewhere that I am missing?

@kevin Are you still running this PR? Do you've hit some problems with it?

Commit 1fed5e4 fixes this pull-request

Pull-Request has been merged by tkopecek

3 years ago

So we ran this for a while, but then went back to without it.

I think our issues were not related to this change...