| |
@@ -40,6 +40,22 @@
|
| |
return parser
|
| |
|
| |
|
| |
+ def identify(worker_id, redis, log):
|
| |
+ redis.hset(worker_id, 'started', 1)
|
| |
+ redis.hset(worker_id, 'PID', os.getpid())
|
| |
+
|
| |
+ data = redis.hgetall(worker_id)
|
| |
+ if not 'allocated' in data:
|
| |
+ log.error("too slow box, manager thinks we are dead")
|
| |
+ redis.delete(worker_id)
|
| |
+ return False
|
| |
+
|
| |
+ # There's still small race on a very slow box (TOCTOU in manager, the db
|
| |
+ # entry can be deleted after our check above ^^). But we don't risk
|
| |
+ # anything else than concurrent run of multiple workers in such case.
|
| |
+ return True
|
| |
+
|
| |
+
|
| |
def handle_task(opts, args, log):
|
| |
"Handle the task, executed on background in DaemonContext"
|
| |
|
| |
@@ -50,10 +66,9 @@
|
| |
|
| |
log.info("Handling action %s", task_id)
|
| |
|
| |
- if args.worker_id:
|
| |
- # Identify ourselves.
|
| |
- redis.hset(args.worker_id, 'started', 1)
|
| |
- redis.hset(args.worker_id, 'PID', os.getpid())
|
| |
+ if args.worker_id and not identify(args.worker_id, redis, log):
|
| |
+ log.error("can not identify myself")
|
| |
+ sys.exit(1)
|
| |
|
| |
resp = frontend_client.get('action/{}'.format(task_id))
|
| |
if resp.status_code != 200:
|
| |
Traceback (most recent call last):
File "/usr/bin/copr_run_action_dispatcher.py", line 12, in <module>
main()
File "/usr/bin/copr_run_action_dispatcher.py", line 9, in main
action_dispatcher.run()
File "/usr/share/copr/backend/daemons/action_dispatcher.py", line 73, in run
worker_manager.run(timeout=timeout)
File "/usr/share/copr/backend/worker_manager.py", line 166, in run
self._cleanup_workers()
File "/usr/share/copr/backend/worker_manager.py", line 203, in _cleanup_workers
allocated = float(info.get('allocated'))
TypeError: float() argument must be a string or a number, not 'NoneType'
The bug caused a copr-backend-action service failure last night when
logrotate started rotating our large backend's logs. We have
Restart=on-failure, but the service failed repeatedly. My guess is that
the backend became so overloaded that the related action worker did not
manage to start quickly enough (within 'worker_timeout_start'). So the
worker-manager deleted the worker from redis (believing that the worker
had died), but the worker eventually finished ... and updated the database.
So first, make worker_manager resistant to redis entries without the
'allocated' tag. Simply delete them.
Also, try to detect the slow-box scenario (absent 'allocated' tag) in
copr-backend-process-action. This is not a 100% guard, but I don't think
it is worth adding a locking mechanism. In the worst case we'll process
the same action twice.
Also, prolong the worker startup time period. We had 10s before, and
during last night's failure the worker took almost 12s to start. So
even though this is not fatal anymore, a larger timeout lowers the
chance that the worker will be restarted when the backend is overloaded.