#952 backend: minimize redis traffic for looping over pending-jobs
Merged 3 months ago by praiskup. Opened 3 months ago by praiskup.
copr/praiskup/copr: optimize-backend into master

@@ -145,9 +145,13 @@
         self.log.info("Build dispatching started.")
         self.update_process_title()
 
+        first_backend_loop = True
+
         while True:
            self.clean_finished_workers()
 
+            skip_jobs_cache = {}
+
            for job in self.load_jobs():
                # first check if we do not have
                # worker already running for the job
@@ -156,16 +160,28 @@
                                     job.task_id)
                    continue
 
-                # now search db builder records for the job and
-                # if we found it, spawn a worker to reattach
-                vm = self.vm_manager.get_vm_by_task_id(job.task_id)
-                if vm and vm.state == 'in_use':
-                    self.log.info("Reattaching to VM: "+str(vm))
-                    worker = self.start_worker(vm, job, reattach=True)
-                    worker.mark_running(job)
-                    vm.store_field(self.vm_manager.rc, "used_by_worker", worker.worker_id)
-                    self.log.info("Reattached new worker %s for job %s",
-                                  worker.worker_id, worker.job.task_id)
+                if first_backend_loop:
+                    # Server was restarted.  Some builds might be running on
+                    # background on builders;  so search db builder records for
+                    # the job and if we found it, spawn a worker to reattach.
+                    vm = self.vm_manager.get_vm_by_task_id(job.task_id)
+                    if vm and vm.state == 'in_use':
+                        self.log.info("Reattaching to VM: "+str(vm))
+                        worker = self.start_worker(vm, job, reattach=True)
+                        worker.mark_running(job)
+                        vm.store_field(self.vm_manager.rc, "used_by_worker", worker.worker_id)
+                        self.log.info("Reattached new worker %s for job %s",
+                                      worker.worker_id, worker.job.task_id)
+                        continue
+
+                cache_entry = '{owner}-{arch}-{sandbox}'.format(
+                    owner=job.project_owner,
+                    arch=job.arch or "noarch",
+                    sandbox=job.sandbox,
+                )
+
+                if cache_entry in skip_jobs_cache:
+                    self.log.info("Skipped job %s, cached", job)
                    continue
 
                # ... and if the task is new to us,
@@ -178,6 +194,7 @@
                        self.next_worker_id, job.task_id, job.build_id,
                        job.chroot)
                except NoVmAvailable as error:
+                    skip_jobs_cache[cache_entry] = True
                    self.log.info("No available resources for task %s (Reason: %s). Deferring job.",
                                  job.task_id, error)
                    continue
@@ -192,4 +209,5 @@
                self.log.info("Started new worker %s for job %s",
                              worker.worker_id, worker.job.task_id)
 
+            first_backend_loop = False
            time.sleep(self.opts.sleeptime)

Previously, each item in the /backend/pending-jobs/ dict caused several
queries to the Redis database. This caused an enormous slowdown when we had
several thousand builds in the build queue.

The result of an unsuccessful VM acquisition is now cached per
"<owner>-<arch>-<sandbox>" key, and within the same loop (i.e., while
processing one fetched pending-jobs dict) we don't even try to acquire a VM
again for the same key.
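
For illustration, here is a minimal, self-contained sketch of that per-pass
cache; the names pending_jobs, try_acquire_vm and log are hypothetical
stand-ins, not the actual backend API:

    class NoVmAvailable(Exception):
        """Raised when no builder VM can be acquired for a job."""


    def dispatch_pass(pending_jobs, try_acquire_vm, log):
        # One pass over the fetched pending-jobs dict.  The cache lives only
        # for this pass; a key is added as soon as a VM acquisition fails.
        skip_jobs_cache = {}
        for job in pending_jobs:
            cache_entry = "{0}-{1}-{2}".format(
                job["project_owner"], job.get("arch") or "noarch",
                job["sandbox"])
            if cache_entry in skip_jobs_cache:
                # No VM-manager (Redis) traffic at all for this job.
                log("Skipped job %s, cached" % job["task_id"])
                continue
            try:
                vm = try_acquire_vm(job)
            except NoVmAvailable as error:
                skip_jobs_cache[cache_entry] = True
                log("No VM for %s (%s), deferring" % (job["task_id"], error))
                continue
            log("Starting worker on %s for job %s" % (vm, job["task_id"]))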

We also used to check, for every pending job, whether we could re-attach a
worker to an existing builder machine. That check only ever makes sense right
after a fresh backend restart, so it is now done in the first dispatcher loop
only.
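
Roughly, the flag lifecycle looks like this (again only an illustrative
sketch; load_jobs, reattach_lost_builds, dispatch_pass and sleeptime are
hypothetical stand-ins for the real dispatcher pieces):

    import time


    def run_dispatcher(load_jobs, reattach_lost_builds, dispatch_pass,
                       sleeptime=10):
        # Builds that kept running on builder VMs across a backend restart
        # are only looked for during the very first dispatcher iteration.
        first_backend_loop = True
        while True:
            jobs = list(load_jobs())
            if first_backend_loop:
                reattached = reattach_lost_builds(jobs)  # returns handled jobs
                jobs = [job for job in jobs if job not in reattached]
            dispatch_pass(jobs)
            first_backend_loop = False
            time.sleep(sleeptime)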

This complements the frontend counterpart in commit 548a9a0.

Fixes: #902

rebased onto 0ae147dde03aedfde02f8e24c59645918f7c8308 (3 months ago)

Metadata Update from @praiskup (3 months ago):
- Pull-request tagged with: release-blocker

rebased onto 1e0dbcc110de48d0131f40b2fabe1d45abb62896 (3 months ago)

rebased onto 5584f89 (3 months ago)

Pull-Request has been merged by praiskup (3 months ago)

With a queue of about 25,000 pending tasks (on dev), I got a builder for the SRPM after about 30 seconds, and builders for the other chroots in about 1 minute. That sounds good enough for now.