#1519 frontend: don't set ended_on for canceled builds
Merged 3 years ago by praiskup. Opened 3 years ago by schlupov.
copr/ schlupov/copr fix_timestamp  into  master

@@ -85,5 +85,6 @@ 

                                 self.opts.frontend_base_url, error)

              return []

  

-     def report_canceled_task_id(self, task_id):

-         self.frontend_client.post('build-tasks/canceled/{}'.format(task_id), None)

+     def report_canceled_task_id(self, task_id, was_running):

+         self.frontend_client.post('build-tasks/canceled/{}'.format(task_id),

+                                   data=None if not was_running else was_running)

@@ -70,7 +70,7 @@ 

          _subclass_can_use = (self)

          return []

  

-     def report_canceled_task_id(self, task_id):

+     def report_canceled_task_id(self, task_id, was_running):

          """

          Report back to Frontend that this task was canceled.  By default this

          isn't called, so it is NO-OP by default.
@@ -115,8 +115,8 @@ 

  

              self._update_process_title("getting cancel requests")

              for task_id in self.get_cancel_requests_ids():

-                 worker_manager.cancel_task_id(task_id)

-                 self.report_canceled_task_id(task_id)

+                 was_running = worker_manager.cancel_task_id(task_id)

+                 self.report_canceled_task_id(task_id, was_running)

  

              # process the tasks

              self._update_process_title("processing tasks")

@@ -379,15 +379,18 @@ 

          """

          Using task_id, cancel corresponding task, and request worker

          shut-down (when already started)

+ 

+         :return: True if worker is running on background, False otherwise

          """

          self._drop_task_id_safe(task_id)

          worker_id = self.get_worker_id(task_id)

          if worker_id not in self.worker_ids():

              self.log.info("Cancel request, worker %s is not running", worker_id)

-             return

+             return False

          self.log.info("Cancel request, worker %s requested to cancel",

                        worker_id)

          self.redis.hset(worker_id, 'cancel_request', 1)

+         return True

  

      def worker_ids(self):

          """

@@ -964,10 +964,6 @@ 

          build.canceled = True

          cls.process_update_callback(build)

  

-         for chroot in build.build_chroots:

-             chroot.status = 2  # canceled

-             if chroot.ended_on is not None:

-                 chroot.ended_on = time.time()

  

      @classmethod

      def check_build_to_delete(cls, user, build):

@@ -197,6 +197,14 @@ 

  def build_task_canceled(task_id):

      """ Report back to frontend that the task was canceled on backend """

      models.CancelRequest.query.filter_by(what=task_id).delete()

+     was_running = flask.request.json

+     if not was_running:

+         if '-' in task_id:

+             build_chroot = BuildsLogic.get_build_task(task_id)

+             build_chroot.status = StatusEnum("canceled")

+         else:

+             build = models.Build.query.filter_by(id=task_id).first()

+             build.source_status = StatusEnum("canceled")

      db.session.commit()

      return flask.jsonify("success")

  

@@ -241,6 +241,42 @@ 

          assert ended.result_dir == "00000002"

          assert ended.chroots_ended_on == {'fedora-18-x86_64': 1390866440}

  

+     def test_build_task_canceled_waiting_build(

+             self, f_users, f_coprs, f_mock_chroots, f_builds, f_db):

+ 

+         self.db.session.add(self.b3)

+         self.db.session.commit()

+ 

+         r = self.tc.post("/backend/build-tasks/canceled/{}/".format(self.b3.id),

+                          content_type="application/json",

+                          headers=self.auth_header,

+                          data=json.dumps(False))

+         assert r.status_code == 200

+         assert json.loads(r.data.decode("utf-8")) == "success"

+ 

+         build = self.models.Build.query.filter(self.models.Build.id == 3).one()

+         assert build.source_status == StatusEnum("canceled")

+ 

+     def test_build_task_canceled_running_build(

+             self, f_users, f_coprs, f_mock_chroots, f_builds, f_db):

+ 

+         self.b4.build_chroots.pop()

+         self.b4.build_chroots[0].status = StatusEnum("running")

+         self.db.session.add(self.b4)

+         self.db.session.commit()

+ 

+         r = self.tc.post("/backend/build-tasks/canceled/{}/".format(self.b4.id),

+                          content_type="application/json",

+                          headers=self.auth_header,

+                          data=json.dumps(True))

+ 

+         assert r.status_code == 200

+         assert json.loads(r.data.decode("utf-8")) == "success"

+ 

+         build = self.models.Build.query.filter(self.models.Build.id == 4).one()

+         assert build.canceled == False

+         assert build.build_chroots[0].status == StatusEnum("running")

+ 

  

  class TestWaitingActions(CoprsTestCase):

  

From what I remember, we actually don't (yet) call /update/ route with "canceled" status. If the build is already running, we asynchronously kill the mock process on builder ... which results in a StatusEnum failure actually. Which brings me to the code:
https://pagure.io/copr/copr/blob/dbfd123514c83a41c25185ed60810a28be92b161/f/frontend/coprs_frontend/coprs/logic/builds_logic.py#_956-959
That part needs to be guilty in many cases; we seem to set the state to "canceled", even though it is already failed/succeeded (e.g.). And reset the
ended_on stamp.

IMO, the build cancelation should be fully managed by the backend, because at that point in time frontend has no idea if backend has or has not started working on the build. So IMO, we should set the canceled state and ended_on in the build-tasks/canceled/{build_id} route:
https://pagure.io/copr/copr/blob/dbfd123514c83a41c25185ed60810a28be92b161/f/backend/copr_backend/daemons/build_dispatcher.py#_88-89

And we should only do that if the build not yet finished. If it is running, and it is going to be canceled on background - we should keep the state unchanged and let it "fail".

rebased onto 30a95ecfdc0bcefbf8f55d3f42da1d7024e995b5

3 years ago

Both those methods have changed interface, please document that. Also was-running would be slightly better wording

Please do post(..., was_running) here.. and generate the timestamp on FE. Otherwise it suffers from the time sync problems

Looks good, but we need to remove part of the frontend code which was previously setting ended_on flag on cancel

To be more precise... I actually think that we shouldnt play
with the ended_on here. If running task was cancelled, do nothing.
If non running task was cancelled set state=canceled. Wdyt?

rebased onto 47dafddb4aa08cd14c9ab5d82715f212b76e05de

3 years ago

This can be one-lier like post('...', data=None if not was_running else was_running).

- not in task_id means source build, not build chroot. We want to set especially BuildChroot.state here, I think?

Tests look like they need some fix, too.

Tests look like they need some fix, too.

rebased onto 359d699a21f7f882bc33314c2a13d6621ce111b0

3 years ago

Please ping me on irc once you are online; I'm sure we should brainstorm a bit about this.

rebased onto 921a26aca6bc5039b6860327fe0111edd6484022

3 years ago

I'm curious about the fail_type. What happens if we don't re-set it to None?

nit: s/Fixes: #342/Relates: #342/ ... we don't want to close that bug yet

CI failed, trying a rebuild [copr-build]

CI failed, trying a rebuild

... ok, this is irrelevant to this PR, see #1536

I'm curious about the fail_type. What happens if we don't re-set it to None?

If the fail_type variable is set, the Build State block appears, saying that the build failed, which isn't true, it was just canceled.
See: https://pagure.io/copr/copr/blob/master/f/frontend/coprs_frontend/coprs/templates/coprs/detail/_describe_failure.html

rebased onto 8c84115e3c0a5301ec5e9a39eeaea4552bf0565b

3 years ago

If the fail_type variable is set, the Build State block appears, saying that the build failed, which isn't true, it was just canceled.

Weird, when the build is just canceled - why do we set it to anything else than None in the first place?

rebased onto db7203ea415162f64129c39a85e397e7a14d5d46

3 years ago

You're right, I wasn't able to reproduce it. I deleted the line.

One last thing, I think :-) I don't think we eventually need this change. In case we fixed backend in future to report "canceled" instead of "failed" when the running job is canceled (this is something we should do, but low prio) , it wouldn't have the ended_on set....
And ATM, nobody calls the "/update/" route with state == canceled.

Otherwise looks good (there's weird working with DB in the test cases, but if that works, np).

rebased onto b788dc2cc76089864e3d9c231ca86d40ed90c1eb

3 years ago

One last thing, I think :-) I don't think we eventually need this change. In case we fixed backend in future to report "canceled" instead of "failed" when the running job is canceled (this is something we should do, but low prio) , it wouldn't have the ended_on set....
And ATM, nobody calls the "/update/" route with state == canceled.

Fixed.

rebased onto 65b3435

3 years ago

Pull-Request has been merged by praiskup

3 years ago