#4032 implicit task refusals
Merged 2 months ago by tkopecek. Opened 2 months ago by mikem.

file modified
+10 -6
@@ -191,7 +191,6 @@ 

          self.active_tasks = []

          self.free_tasks = []

  

-         # TODO these things need proper config

          self.maxjobs = context.opts['MaxJobs']

          self.capacity_overcommit = context.opts['CapacityOvercommit']

          self.ready_timeout = context.opts['ReadyTimeout']
@@ -371,12 +370,14 @@ 

                  # TODO fix

  

              if task['state'] == koji.TASK_STATES['ASSIGNED']:

-                 # TODO check time since assigned

-                 # if not taken within a timeout

-                 #  - if host not checking in, then make sure host marked unavail and free

-                 #  - if host *is* checking in, then treat as refusal and free

-                 age = time.time() - min([r['create_ts'] for r in taskruns])

+                 assign_ts = min([r['create_ts'] for r in taskruns])

+                 age = time.time() - assign_ts

                  if age > self.assign_timeout:

+                     # has the host checked in since we assigned?

+                     if host['update_ts'] and host['update_ts'] > assign_ts:

+                         # treat this as an implicit refusal

+                         # possibly an older koji version on builder

+                         set_refusal(host['id'], task['task_id'], msg='assignment timeout')

                      log_both('Task assignment timeout', task_id=task['task_id'],

                               host_id=host['id'])

                      kojihub.Task(task['task_id']).free()
@@ -422,6 +423,9 @@ 

                  values={'host_ids': [h['id'] for h in hosts_to_mark]},

              )

              update.execute()

+         # also update our data

+         for host in hosts_to_mark:

+             host['ready'] = False

  

      def get_active_runs(self):

          runs = get_task_runs([["active", True]])

@@ -139,6 +139,7 @@ 

          mock.patch('kojihub.Task.assign', new=my_assign).start()

          self.log_db = mock.MagicMock()

          mock.patch('kojihub.scheduler.log_db', new=self.log_db).start()

+         self.set_refusal = mock.patch('kojihub.scheduler.set_refusal').start()

  

      def test_check_no_active(self):

          self.assertEqual(self.sched.active_tasks, [])  # set by init
@@ -184,14 +185,40 @@ 

  

      def test_check_assign_timeout(self):

          # 'Task assignment timeout' case

-         create_ts = 0

+         create_ts = 1000

+         update_ts = 999  # host ts BEFORE assignment

          now = 1000000

          self.sched.active_tasks = [{'task_id': 99, 'host_id': 23, 'state': koji.TASK_STATES['ASSIGNED']}]

-         self.sched.hosts = {23: {'id': 23, 'name': 'test host 23'}}

+         self.sched.hosts = {23: {'id': 23, 'name': 'test host 23', 'update_ts': update_ts}}

          self.sched.get_active_runs.return_value = {99: [{'create_ts': create_ts}]}

+ 

          with mock.patch('time.time', return_value=now):

              self.sched.check_active_tasks()

+ 

+         self.get_active_runs.assert_called_once()

+         self.set_refusal.assert_not_called()

+         self.log_db.assert_called_once_with('Task assignment timeout', 99, 23)

+         # we should free such tasks

+         self.assertEqual(self.frees, [99])

+         self.assertEqual(self.assigns, [])

+         self.assertEqual(len(self.updates), 1)

+         update = self.updates[0]

+         self.assertEqual(update.table, 'scheduler_task_runs')

+ 

+     def test_check_implicit_refusal(self):

+         # 'Task assignment timeout' case

+         create_ts = 1000

+         update_ts = 1001  # host ts AFTER assignment

+         now = 1000000

+         self.sched.active_tasks = [{'task_id': 99, 'host_id': 23, 'state': koji.TASK_STATES['ASSIGNED']}]

+         self.sched.hosts = {23: {'id': 23, 'name': 'test host 23', 'update_ts': update_ts}}

+         self.sched.get_active_runs.return_value = {99: [{'create_ts': create_ts}]}

+ 

+         with mock.patch('time.time', return_value=now):

+             self.sched.check_active_tasks()

+ 

          self.get_active_runs.assert_called_once()

+         self.set_refusal.assert_called_once_with(23, 99, msg='assignment timeout')

          self.log_db.assert_called_once_with('Task assignment timeout', 99, 23)

          # we should free such tasks

          self.assertEqual(self.frees, [99])

If a task assignment times out and the host has checked in since the task was assigned, treat this as an implicit (soft) refusal.

Metadata Update from @tkopecek:
- Pull-request tagged with: testing-ready

2 months ago

Commit 7d126ab fixes this pull-request

Pull-Request has been merged by tkopecek

2 months ago

Metadata Update from @relias-redhat:
- Pull-request tagged with: testing-done

2 months ago