#1323 Do not start newRepo task if one in progress
Merged 4 years ago by lucarval. Opened 4 years ago by lucarval.
lucarval/fm-orchestrator newrepo-dedupe  into  master

@@ -89,10 +89,16 @@ 

              if c.state == koji.BUILD_STATES["BUILDING"] or not c.state

          ]

          if unbuilt_components:

-             repo_tag = builder.module_build_tag["name"]

-             log.info("All components in batch tagged, regenerating repo for tag %s", repo_tag)

-             task_id = builder.koji_session.newRepo(repo_tag)

-             module_build.new_repo_task_id = task_id

+             if not _is_new_repo_generating(module_build, builder.koji_session):

+                 repo_tag = builder.module_build_tag["name"]

+                 log.info("All components in batch tagged, regenerating repo for tag %s", repo_tag)

+                 task_id = builder.koji_session.newRepo(repo_tag)

+                 module_build.new_repo_task_id = task_id

+             else:

+                 log.info(

+                     "newRepo task %s for %r already in progress, not starting another one",

+                     str(module_build.new_repo_task_id), module_build,

+                 )

          else:

              # In case this is the last batch, we do not need to regenerate the

              # buildroot, because we will not build anything else in it. It
@@ -106,3 +112,18 @@ 

          session.commit()

  

      return further_work

+ 

+ 

+ def _is_new_repo_generating(module_build, koji_session):

+     """ Return whether or not a new repo is already being generated. """

+     if not module_build.new_repo_task_id:

+         return False

+ 

+     log.debug(

+         'Checking status of newRepo task "%d" for %s', module_build.new_repo_task_id, module_build)

+     task_info = koji_session.getTaskInfo(module_build.new_repo_task_id)

+ 

+     active_koji_states = [

+         koji.TASK_STATES["FREE"], koji.TASK_STATES["OPEN"], koji.TASK_STATES["ASSIGNED"]]

+ 

+     return task_info["state"] in active_koji_states

@@ -50,7 +50,7 @@ 

                  self.process_open_component_builds(session)

                  self.fail_lost_builds(session)

                  self.process_paused_module_builds(conf, session)

-                 self.trigger_new_repo_when_stalled(conf, session)

+                 self.retrigger_new_repo_on_failure(conf, session)

                  self.delete_old_koji_targets(conf, session)

                  self.cleanup_stale_failed_builds(conf, session)

                  self.sync_koji_build_tags(conf, session)
@@ -275,29 +275,28 @@ 

              # then no possible event will start off new component builds.

              # But do not try to start new builds when we are waiting for the

              # repo-regen.

-             if (

-                 not module_build.current_batch(koji.BUILD_STATES["BUILDING"])

-                 and not module_build.new_repo_task_id

-             ):

-                 log.info("  Processing the paused module build %r", module_build)

+             if not module_build.current_batch(koji.BUILD_STATES["BUILDING"]):

                  # Initialize the builder...

                  builder = GenericBuilder.create_from_module(session, module_build, config)

- 

-                 further_work = module_build_service.utils.start_next_batch_build(

-                     config, module_build, session, builder)

-                 for event in further_work:

-                     log.info("  Scheduling faked event %r" % event)

-                     module_build_service.scheduler.consumer.work_queue_put(event)

+                 if _has_missed_new_repo_message(module_build, builder.koji_session):

+                     log.info("  Processing the paused module build %r", module_build)

+                     further_work = module_build_service.utils.start_next_batch_build(

+                         config, module_build, session, builder)

+                     for event in further_work:

+                         log.info("  Scheduling faked event %r" % event)

+                         module_build_service.scheduler.consumer.work_queue_put(event)

  

              # Check if we have met the threshold.

              if module_build_service.utils.at_concurrent_component_threshold(config, session):

                  break

  

-     def trigger_new_repo_when_stalled(self, config, session):

+     def retrigger_new_repo_on_failure(self, config, session):

          """

-         Sometimes the Koji repo regeneration stays in "init" state without

-         doing anything and our module build stucks. In case the module build

-         gets stuck on that, we trigger newRepo again to rebuild it.

+         Retrigger failed new repo tasks for module builds in the build state.

+ 

+         The newRepo task may fail for various reasons outside the scope of MBS.

+         This method will detect this scenario and retrigger the newRepo task

+         if needed to avoid the module build from being stuck in the "build" state.

          """

          if config.system != "koji":

              return
@@ -319,8 +318,6 @@ 

                  )

                  taginfo = koji_session.getTag(module_build.koji_tag + "-build")

                  module_build.new_repo_task_id = koji_session.newRepo(taginfo["name"])

-             else:

-                 module_build.new_repo_task_id = 0

  

          session.commit()

  
@@ -496,3 +493,18 @@ 

                      build.state_reason += " (Error occured while querying Greenwave)"

                  build.time_modified = datetime.utcnow()

              session.commit()

+ 

+ 

+ def _has_missed_new_repo_message(module_build, koji_session):

+     """

+     Returns whether or not a new repo message has probably been missed.

+     """

+     if not module_build.new_repo_task_id:

+         # A newRepo task has incorrectly not been requested. Treat it as a missed

+         # message so module build can recover.

+         return True

+     log.debug(

+         'Checking status of newRepo task "%d" for %s', module_build.new_repo_task_id, module_build)

+     task_info = koji_session.getTaskInfo(module_build.new_repo_task_id)

+     # Other final states, FAILED and CANCELED, are handled by retrigger_new_repo_on_failure

+     return task_info["state"] == koji.TASK_STATES["CLOSED"]

@@ -116,6 +116,7 @@ 

      on_buildroot_add_artifacts_cb = None

      on_tag_artifacts_cb = None

      on_buildroot_add_repos_cb = None

+     on_get_task_info_cb = None

  

      @module_build_service.utils.validate_koji_tag("tag_name")

      def __init__(self, owner, module, config, tag_name, components):
@@ -237,6 +238,8 @@ 

              return 123

  

          session.newRepo = _newRepo

+         if self.on_get_task_info_cb:

+             session.getTaskInfo = self.on_get_task_info_cb

          return session

  

      @property
@@ -459,6 +462,10 @@ 

          def on_tag_artifacts_cb(cls, artifacts, dest_tag=True):

              assert tag_groups.pop(0) == set(artifacts)

  

+         def on_get_task_info_cb(cls, task_id):

+             return {"state": koji.TASK_STATES["CLOSED"]}

+ 

+         FakeModuleBuilder.on_get_task_info_cb = on_get_task_info_cb

          FakeModuleBuilder.on_finalize_cb = on_finalize_cb

          FakeModuleBuilder.on_tag_artifacts_cb = on_tag_artifacts_cb

  

@@ -87,7 +87,6 @@ 

          poller.poll()

  

          # Refresh our module_build object.

-         module_build = models.ModuleBuild.query.filter_by(id=3).one()

          db.session.refresh(module_build)

  

          # If fresh is set, we expect the poller to not touch the module build since it's been less
@@ -105,9 +104,62 @@ 

  

          assert len(start_build_component.mock_calls) == expected_build_calls

  

+     @pytest.mark.parametrize('task_state, expect_start_build_component', (

+         (None, True),  # Indicates a newRepo task has not been triggered yet.

+         (koji.TASK_STATES["CLOSED"], True),

+         (koji.TASK_STATES["OPEN"], False),

+     ))

+     @patch("module_build_service.utils.batches.start_build_component")

+     def test_process_paused_module_builds_with_new_repo_task(

+         self, start_build_component, create_builder, global_consumer, dbg, task_state,

+         expect_start_build_component

+     ):

+         """

+         Tests general use-case of process_paused_module_builds.

+         """

+         consumer = mock.MagicMock()

+         consumer.incoming = queue.Queue()

+         global_consumer.return_value = consumer

+ 

+         builder = mock.MagicMock()

+         create_builder.return_value = builder

+ 

+         # Change the batch to 2, so the module build is in state where

+         # it is not building anything, but the state is "build".

+         module_build = models.ModuleBuild.query.get(3)

+         module_build.batch = 2

+         module_build.time_modified = datetime.utcnow() - timedelta(days=5)

+         if task_state:

+             koji_session = mock.MagicMock()

+             koji_session.getTaskInfo.return_value = {"state": task_state}

+             builder.koji_session = koji_session

+             module_build.new_repo_task_id = 123

+         db.session.commit()

+ 

+         # Poll :)

+         hub = mock.MagicMock()

+         poller = MBSProducer(hub)

+         poller.poll()

+ 

+         # Refresh our module_build object.

+         db.session.refresh(module_build)

+ 

+         if expect_start_build_component:

+             expected_state = koji.BUILD_STATES["BUILDING"]

+             expected_build_calls = 2

+         else:

+             expected_state = None

+             expected_build_calls = 0

+ 

+         components = module_build.current_batch()

+         for component in components:

+             assert component.state == expected_state

+ 

+         assert len(start_build_component.mock_calls) == expected_build_calls

+ 

      @patch.dict("sys.modules", krbV=mock.MagicMock())

      @patch("module_build_service.builder.KojiModuleBuilder.KojiClientSession")

-     def test_trigger_new_repo_when_failed(

+     def test_retrigger_new_repo_on_failure(

          self, ClientSession, create_builder, global_consumer, dbg

      ):

          """
@@ -174,11 +226,10 @@ 

          poller.poll()

  

          # Refresh our module_build object.

-         module_build = models.ModuleBuild.query.filter_by(id=3).one()

          db.session.refresh(module_build)

  

          assert not koji_session.newRepo.called

-         assert module_build.new_repo_task_id == 0

+         assert module_build.new_repo_task_id == 123456

  

      def test_process_paused_module_builds_waiting_for_repo(

          self, create_builder, global_consumer, dbg
@@ -207,7 +258,6 @@ 

          poller.poll()

  

          # Refresh our module_build object.

-         module_build = models.ModuleBuild.query.filter_by(id=3).one()

          db.session.refresh(module_build)

  

          # Components should not be in building state

@@ -31,6 +31,7 @@ 

  from tests import conf, db

  

  import koji

+ import pytest

  

  

  class TestTagTagged:
@@ -504,3 +505,114 @@ 

          # newRepo task_id should be stored in database, so we can check its

          # status later in poller.

          assert module_build.new_repo_task_id == 123456

+ 

+     @pytest.mark.parametrize('task_state, expect_new_repo', (

+         (None, True),  # Indicates a newRepo task has not been triggered yet.

+         (koji.TASK_STATES["CLOSED"], True),

+         (koji.TASK_STATES["CANCELED"], True),

+         (koji.TASK_STATES["FAILED"], True),

+         (koji.TASK_STATES["FREE"], False),

+         (koji.TASK_STATES["OPEN"], False),

+         (koji.TASK_STATES["ASSIGNED"], False),

+     ))

+     @patch(

+         "module_build_service.builder.GenericBuilder.default_buildroot_groups",

+         return_value={"build": [], "srpm-build": []},

+     )

+     @patch("module_build_service.builder.KojiModuleBuilder.KojiModuleBuilder.get_session")

+     @patch("module_build_service.builder.GenericBuilder.create_from_module")

+     def test_newrepo_not_duplicated(

+         self, create_builder, koji_get_session, dbg, task_state, expect_new_repo

+     ):

+         """

+         Test that newRepo is not called if a task is already in progress.

+         """

+         koji_session = mock.MagicMock()

+         koji_session.getTag = lambda tag_name: {"name": tag_name}

+         koji_session.getTaskInfo.return_value = {"state": task_state}

+         koji_session.newRepo.return_value = 123456

+         koji_get_session.return_value = koji_session

+ 

+         builder = mock.MagicMock()

+         builder.koji_session = koji_session

+         builder.buildroot_ready.return_value = False

+         builder.module_build_tag = {

+             "name": "module-testmodule-master-20170219191323-c40c156c-build"

+         }

+         create_builder.return_value = builder

+ 

+         module_build = module_build_service.models.ModuleBuild.query.get(3)

+         assert module_build

+ 

+         # Set previous components as COMPLETE and tagged.

+         module_build.batch = 1

+         for c in module_build.up_to_current_batch():

+             c.state = koji.BUILD_STATES["COMPLETE"]

+             c.tagged = True

+             c.tagged_in_final = True

+ 

+         module_build.batch = 2

+         for c in module_build.current_batch():

+             if c.package == "perl-Tangerine":

+                 c.nvr = "perl-Tangerine-0.23-1.module+0+d027b723"

+             elif c.package == "perl-List-Compare":

+                 c.nvr = "perl-List-Compare-0.53-5.module+0+d027b723"

+             c.state = koji.BUILD_STATES["COMPLETE"]

+ 

+         if task_state is not None:

+             module_build.new_repo_task_id = 123456

+ 

+         db.session.commit()

+ 

+         # Tag the first component to the buildroot.

+         msg = module_build_service.messaging.KojiTagChange(

+             "id",

+             "module-testmodule-master-20170219191323-c40c156c-build",

+             "perl-Tangerine",

+             "perl-Tangerine-0.23-1.module+0+d027b723",

+         )

+         module_build_service.scheduler.handlers.tags.tagged(

+             config=conf, session=db.session, msg=msg)

+         # Tag the first component to the final tag.

+         msg = module_build_service.messaging.KojiTagChange(

+             "id",

+             "module-testmodule-master-20170219191323-c40c156c",

+             "perl-Tangerine",

+             "perl-Tangerine-0.23-1.module+0+d027b723",

+         )

+         module_build_service.scheduler.handlers.tags.tagged(

+             config=conf, session=db.session, msg=msg)

+         # Tag the second component to the buildroot.

+         msg = module_build_service.messaging.KojiTagChange(

+             "id",

+             "module-testmodule-master-20170219191323-c40c156c-build",

+             "perl-List-Compare",

+             "perl-List-Compare-0.53-5.module+0+d027b723",

+         )

+         module_build_service.scheduler.handlers.tags.tagged(

+             config=conf, session=db.session, msg=msg)

+         # Tag the second component to the final tag.

+         msg = module_build_service.messaging.KojiTagChange(

+             "id",

+             "module-testmodule-master-20170219191323-c40c156c",

+             "perl-List-Compare",

+             "perl-List-Compare-0.53-5.module+0+d027b723",

+         )

+         module_build_service.scheduler.handlers.tags.tagged(

+             config=conf, session=db.session, msg=msg)

+ 

+         # All components are tagged, newRepo should be called if there are no active tasks.

+         if expect_new_repo:

+             koji_session.newRepo.assert_called_once_with(

+                 "module-testmodule-master-20170219191323-c40c156c-build")

+         else:

+             assert not koji_session.newRepo.called

+ 

+         # Refresh our module_build object.

+         db.session.expunge(module_build)

+         module_build = module_build_service.models.ModuleBuild.query.filter_by(id=3).one()

+ 

+         # newRepo task_id should be stored in database, so we can check its

+         # status later in poller.

+         assert module_build.new_repo_task_id == 123456

+         koji_session.newRepo.reset_mock()

Build #212 failed (commit: e1df17d9c6ca125170eb7059fc502ef27306f8fe).
Rebase or make new commits to rebuild.

rebased onto 812e28cffea536adae736ffea15d7e1cd54dec85

4 years ago

pretty please pagure-ci rebuild

4 years ago

Optional: It'd be nice if we renamed trigger_new_repo_when_stalled to trigger_new_repo_when_failed and fixed the docstring. I understand it's a different change than this PR, but it's related and it'd be nice to do.

Optional:
I think a more conventional approach to this is:

module_build = module_build_service.models.ModuleBuild.query.get(3)
assert module_build

Either approach works for me though.

Once all the components are tagged, it'd be nice to trigger the handler again with a duplicate message to make sure another newRepo task isn't created. This would simulate the bug that this PR fixes.

@lucarval unfortunately, since you created this PR from your fork, CentOS CI (Jenkins) will intermittently fail to pull down your change. I can trigger it to run again.

This looks good to me.

@lucarval it looks you need to rebase on the latest master for the tests to pass.

Once all the components are tagged, it'd be nice to trigger the handler again with a duplicate message to make sure another newRepo task isn't created. This would simulate the bug that this PR fixes.

Hmm after giving some thought on this, I don't think this is necessary. The handler is idempotent so running it multiple times will give you the same result. What changes the behavior is whether or not module_build.new_repo_task_id is set, and if so the state of the task in koji. This is thoroughly tested by the different values used in parametrize.

I'll add a little loop to ensure that it is idempotent, but depending on the value of task_state, it may or may not trigger the newRepo task.

Let's say that the handler ran and the task is FAILED. The handler will kick off a newRepo task as expected. But let's say the handler is triggered again. We want to check the task status again.

rebased onto b5b2657c3ab169f19097c991e47f76cd39c2a0c4

4 years ago

pretty please pagure-ci rebuild

4 years ago

rebased onto 86015c5

4 years ago

@mprahl ptal. I think I addressed all your comments.

btw, I've no idea how to make the centos ci job pass.

I think it would be a more natural way to call log.info as a else branch of if not _is_new_repo_generating(module_build, builder.koji_session):. Thus, _is_new_repo_generating is just a simple function to return True or False without any other side effect, even if to just log something.

@lucarval Regarding the failure test, I suggest you to modify tox.ini and add another v to py.test manually to get a full diff

pretty please pagure-ci rebuild

4 years ago

Optional: It might be good to make sure task_info isn't None before accessing a key.

Good point @cqi. The intention of the calling code is assumed in this log message, which maybe okay since it's a private function. I'm not sure what the best practice is in this case.

@lucarval, I think adding a debug log here indicating that the new repo task is active or not is good though.

Optional: Adding a debug log message indicating that you're checking to see if the new repo task is active would be nice.

Optional: I noticed that this method queries for all modules in the build state. It might be nice to limit this to modules that haven't been updated in 10+ minutes.

Optional: It's good practice to have the first line of the doc string be in the imperative mood like a commit message. An example for this would be Retrigger failed new repo tasks for builds in the build state..

I just realized this will affect the behavior of process_paused_module_builds. I'm not sure what the best approach is here though.

It seems that the modules.failed handler will cancel any new repo tasks. Will this fail if the new repo task is already complete? Should there be a check of _is_new_repo_generating before trying to cancel it?

pretty please pagure-ci rebuild

4 years ago

I just realized this will affect the behavior of process_paused_module_builds. I'm not sure what the best approach is here though.

We could check the state of the task, if one is set. If it's CLOSED assume a message was missed and we're not "waiting for the repo-regen"?

It seems that the modules.failed handler will cancel any new repo tasks. Will this fail if the new repo task is already complete? Should there be a check of _is_new_repo_generating before trying to cancel it?

Koji seems to handle this gracefully. No error is raised in such case:

def cancel(self, recurse=True):                                            
    """Cancel this task.                                                   

    A task can only be canceled if it is not already in the 'CLOSED' state.
    If it is, no action will be taken.  Return True if the task is         
    successfully canceled, or if it was already canceled, False if it is   
    closed."""                                                             

I just realized this will affect the behavior of process_paused_module_builds. I'm not sure what the best approach is here though.

We could check the state of the task, if one is set. If it's CLOSED assume a message was missed and we're not "waiting for the repo-regen"?

Sounds good to me.

Optional: Adding a debug log message indicating that you're checking to see if the new repo task is active would be nice.

This conflicts with @cqi's suggestion to make _is_new_repo_generating not produce any side effect.

rebased onto ea08cb206891690d49c8f604c15bd4bb7b1b4e0f

4 years ago

pretty please pagure-ci rebuild

4 years ago

Build #233 failed (commit: a258270d27544063308d441913cf440de8716a3a).
Rebase or make new commits to rebuild.

Optional: Adding a debug log message indicating that you're checking to see if the new repo task is active would be nice.

This conflicts with @cqi's suggestion to make _is_new_repo_generating not produce any side effect.

I just reread @cqi's comment and had misunderstood the intention of his comment. I thought it was more about the log message assuming the context of what was calling it, when we actually don't know the context in the function since it could be called for different purposes in the code-base.

I prefer having debug logs to know what MBS is doing, especially calls to external systems, but that's just my opinion.

rebased onto 53c50dcf7cae0ef3fdb62d00d7653b4ef78336dc

4 years ago

I think I addressed all the comments, including:

  1. Always debug log when fetching task info
  2. Check if task is already completed in process_paused_module_builds

Is this necessary if you're refreshing it on the next line?

By needing to reset the mock, it seems that your test is not actually testing if the handler is idempotent. I imagine because the koji_session.getTaskInfo mock is not using side_effect to return a separate state after you've called it the first time.

I also think the handler itself is not idempotent since it doesn't check if the component is already tagged when receiving a tagged message. This is outside of the scope of this PR, so I think the for loop can just be removed. I know I requested this part of the test, so I'm sorry asking you to remove it.

@lucarval this looks great. I left a few minor comments. Once addressed, feel free to merge it.

Is this necessary if you're refreshing it on the next line?

either-or is ok I think

From the FACTORY-4509 point of view, that is what I've been working on, let's use db_session fixture in test and call models.ModuleBuild.get_by_id(db_session, 3), so that we don't mix the use of database sessions in tests. On the other hand, let's reuse existing model methods as much as possible instead of writing query in ORM repeatedly.

This is following the same pattern as the other tests. I'll add a commit to use the db.session.refresh. We can change to the new pattern depending on which PR gets merged first.

rebased onto 9e50e9e

4 years ago

Pull-Request has been merged by lucarval

4 years ago
Metadata