#1066 backend: delete leftover action redis entries
Merged 4 years ago by msuchy. Opened 4 years ago by praiskup.
Unknown source leftover-redis-entry into master

@@ -65,7 +65,7 @@
              Fill float value in seconds.
      """
      worker_prefix = 'worker' # make sure this is unique in each class
-     worker_timeout_start = 10
+     worker_timeout_start = 30
      worker_timeout_deadcheck = 60

      def __init__(self, redis_connection=None, max_workers=8, log=None):
@@ -200,7 +200,17 @@

          for worker_id in self.worker_ids():
              info = self.redis.hgetall(worker_id)
-             allocated = float(info.get('allocated'))
+
+             allocated = info.get('allocated', None)
+             if not allocated:
+                 # In worker manager, we _always_ add the 'allocated' tag when
+                 # we start a worker.  So this may only happen when the worker
+                 # is orphaned for some reason (we gave up on it), and it still
+                 # touches the database in the background.
+                 self.redis.delete(worker_id)
+                 continue
+
+             allocated = float(allocated)

              if self.has_worker_ended(worker_id, info):
                  # finished worker
@@ -40,6 +40,22 @@
      return parser


+ def identify(worker_id, redis, log):
+     redis.hset(worker_id, 'started', 1)
+     redis.hset(worker_id, 'PID', os.getpid())
+
+     data = redis.hgetall(worker_id)
+     if 'allocated' not in data:
+         log.error("too slow box, manager thinks we are dead")
+         redis.delete(worker_id)
+         return False
+
+     # There's still a small race on a very slow box (TOCTOU in the manager:
+     # the db entry can be deleted after our check above ^^).  But in such a
+     # case we risk nothing worse than a concurrent run of multiple workers.
+     return True
+
+
  def handle_task(opts, args, log):
      "Handle the task, executed on background in DaemonContext"

@@ -50,10 +66,9 @@

      log.info("Handling action %s", task_id)

-     if args.worker_id:
-         # Identify ourselves.
-         redis.hset(args.worker_id, 'started', 1)
-         redis.hset(args.worker_id, 'PID', os.getpid())
+     if args.worker_id and not identify(args.worker_id, redis, log):
+         log.error("can not identify myself")
+         sys.exit(1)

      resp = frontend_client.get('action/{}'.format(task_id))
      if resp.status_code != 200:

Traceback (most recent call last):
  File "/usr/bin/copr_run_action_dispatcher.py", line 12, in <module>
    main()
  File "/usr/bin/copr_run_action_dispatcher.py", line 9, in main
    action_dispatcher.run()
  File "/usr/share/copr/backend/daemons/action_dispatcher.py", line 73, in run
    worker_manager.run(timeout=timeout)
  File "/usr/share/copr/backend/worker_manager.py", line 166, in run
    self._cleanup_workers()
  File "/usr/share/copr/backend/worker_manager.py", line 203, in _cleanup_workers
    allocated = float(info.get('allocated'))
TypeError: float() argument must be a string or a number, not 'NoneType'

The bug caused a copr-backend-action service failure last night when
logrotate started rotating our large backend's logs. We have
Restart=on-failure, but the service kept failing repeatedly. My guess is
that the backend became so overloaded that the related action worker did
not manage to start fast enough (within 'worker_timeout_start'). So the
worker manager deleted the worker entry from redis (believing that the
worker had died), but the worker eventually finished ... and updated the
database anyway.
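To illustrate the timing involved, here is a minimal sketch of the kind of
startup-timeout check the manager performs; the helper name is made up and
the real WorkerManager logic in copr differs in detail, only 'allocated',
'started' and worker_timeout_start come from the code above:

    import time

    def startup_timed_out(info, worker_timeout_start=30):
        """Return True when a worker was allocated but did not mark
        itself 'started' within worker_timeout_start seconds."""
        # 'allocated' is the timestamp stored when the worker slot is
        # created; 'started' is set by the worker process itself.
        if 'started' in info:
            return False
        return time.time() - float(info['allocated']) > worker_timeout_start

With the previous 10-second limit, a worker that needed almost 12 seconds to
daemonize would trip such a check even though it later completed its action
and wrote to the database.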

So first, make worker_manager resistant to redis entries missing the
'allocated' tag: simply delete them.

Also, try to detect the slow-box scenario (an absent 'allocated' tag) in
copr-backend-process-action. This is not a 100% guard, but I don't think
it is worth adding a lock mechanism; in the worst case we'll process the
same action twice.
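For comparison, the lock mechanism that was judged not worth it could look
roughly like this; a sketch only, the key name, TTL and helper are invented
here and nothing like it is part of the PR:

    import os
    import redis

    def try_lock_action(conn, task_id, ttl=3600):
        """Best-effort per-action lock: only the first worker that manages
        to create the key proceeds; the TTL guards against stale locks."""
        # SET ... NX EX is an atomic "set only if the key does not exist".
        return bool(conn.set('action-lock:{}'.format(task_id),
                             os.getpid(), nx=True, ex=ttl))

The PR deliberately skips anything like this and accepts a duplicate run of
the same action as the worst case.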

Also, prolong the worker startup time period. We had 10 seconds before,
and during last night's failure the worker took almost 12 seconds to
start. So even though this is no longer fatal, a larger timeout lowers
the chance that the worker will be restarted while the backend is
overloaded.

Pull-Request has been merged by msuchy 4 years ago