From ec8d5617de396cc4a89ce5dc5e642e8e7a98429b Mon Sep 17 00:00:00 2001
From: Pavel Raiskup <praiskup@redhat.com>
Date: Apr 11 2022 10:36:02 +0000
Subject: [PATCH 1/3] backend: lower the frequency of the workers cleanup


---

diff --git a/backend/copr_backend/worker_manager.py b/backend/copr_backend/worker_manager.py
index beb3a34..a4d73fc 100644
--- a/backend/copr_backend/worker_manager.py
+++ b/backend/copr_backend/worker_manager.py
@@ -244,10 +244,17 @@ class WorkerManager():
             is_worker_alive() method - whether the task is really still doing
             something on background or not (== unexpected failure cleanup).
             Fill float value in seconds.
+    :cvar worker_cleanup_period: How often should WorkerManager try to cleanup
+            workers? (value is a period in seconds)
     """
+
+    # pylint: disable=too-many-instance-attributes
+
     worker_prefix = 'worker' # make sure this is unique in each class
     worker_timeout_start = 30
     worker_timeout_deadcheck = 60
+    worker_cleanup_period = 3.0
+
 
     def __init__(self, redis_connection=None, max_workers=8, log=None,
                  frontend_client=None, limits=None):
@@ -263,6 +270,7 @@ class WorkerManager():
         # to survive server restarts (we adopt the old background workers).
         self._tracked_workers = set(self.worker_ids())
         self._limits = limits or []
+        self._last_worker_cleanup = None
 
     def start_task(self, worker_id, task):
         """
@@ -407,6 +415,12 @@ class WorkerManager():
         start_time = time.time()
         self.log.debug("Worker.run() start at time %s", start_time)
 
+        # Make sure _cleanup_workers() has some effect during the run() call.
+        # This is here mostly for the test-suite, because in the real use-cases
+        # the worker_cleanup_period is much shorter period than the timeout and
+        # the cleanup is done _several_ times during the run() call.
+        self._last_worker_cleanup = 0.0
+
         while True:
             now = start_time if now is None else time.time()
 
@@ -472,6 +486,22 @@ class WorkerManager():
         self._tracked_workers.discard(worker_id)
 
     def _cleanup_workers(self, now):
+        """
+        Go through all the tracked workers and check if they already finished,
+        failed to start or died in the background.
+        """
+
+        # This method is called very frequently (several hundreds per second,
+        # for each of the attempts to start a worker in the self.run() method).
+        # Because the likelihood that some of the background workers changed
+        # state is pretty low, we control the frequency of the cleanup here.
+        now = time.time()
+        if now - self._last_worker_cleanup < self.worker_cleanup_period:
+            return
+
+        self.log.debug("Trying to clean old workers")
+        self._last_worker_cleanup = time.time()
+
         for worker_id in self.worker_ids():
             info = self.redis.hgetall(worker_id)
 

From 624ea94a290130eb3dca06a36d11abcae0dc467f Mon Sep 17 00:00:00 2001
From: Pavel Raiskup <praiskup@redhat.com>
Date: Apr 11 2022 10:36:02 +0000
Subject: [PATCH 2/3] backend: more debug-logging for WorkerManager


---

diff --git a/backend/copr_backend/worker_manager.py b/backend/copr_backend/worker_manager.py
index a4d73fc..14b98a3 100644
--- a/backend/copr_backend/worker_manager.py
+++ b/backend/copr_backend/worker_manager.py
@@ -431,6 +431,7 @@ class WorkerManager():
 
             worker_count = len(self._tracked_workers)
             if worker_count >= self.max_workers:
+                self.log.debug("Worker count on a limit %s", worker_count)
                 time.sleep(1)
                 continue
 
@@ -441,6 +442,7 @@ class WorkerManager():
                 # Empty queue!
                 if worker_count:
                     # It still makes sense to cycle to finish the workers.
+                    self.log.debug("No more tasks, waiting for workers")
                     time.sleep(1)
                     continue
                 # Optimization part, nobody is working now, and there's nothing

From 562928a5d5d94216eb414c730073b783f44b201d Mon Sep 17 00:00:00 2001
From: Pavel Raiskup <praiskup@redhat.com>
Date: Apr 11 2022 10:36:02 +0000
Subject: [PATCH 3/3] backend: check for the worker liveness less-frequently


We tend to have up to 200 concurrently running workers nowadays in
Fedora Copr, and doing liveness check (sending kill) is not for free.
Let's decrease the frequency of the check from one minute to three
minutes (most of the builds take more than that).

---

diff --git a/backend/copr_backend/worker_manager.py b/backend/copr_backend/worker_manager.py
index 14b98a3..ce0a171 100644
--- a/backend/copr_backend/worker_manager.py
+++ b/backend/copr_backend/worker_manager.py
@@ -252,7 +252,7 @@ class WorkerManager():
 
     worker_prefix = 'worker' # make sure this is unique in each class
     worker_timeout_start = 30
-    worker_timeout_deadcheck = 60
+    worker_timeout_deadcheck = 3*60
     worker_cleanup_period = 3.0