#999 ISSUE-991: Added a method to the producer that will move stuck builds into 'failed' state
Merged 8 months ago by mprahl. Opened 8 months ago by mcurlej.
mcurlej/fm-orchestrator cancel_modules  into  master

@@ -439,6 +439,18 @@ 

              'default': 180,

              'desc': ('Time in days when to cleanup failed module builds and transition them to '

                       'the "garbage" state.')},

+         'cleanup_stuck_builds_time': {

+             'type': int,

+             'default': 7,

+             'desc': ('Time in days when to cleanup stuck module builds and transition them to '

+                      'the "failed" state. The module has to be in a state defined by the '

+                      '"cleanup_stuck_builds_states" option.')},

+         'cleanup_stuck_builds_states': {

+             'type': list,

+             'default': ["init", "build"],

+             'desc': ('States of builds which will be considered to move to failed state when a'

+                      ' build is in one of those states longer than the value configured in the '

+                      '"cleanup_stuck_builds_time"')},

          'resolver': {

              'type': str,

              'default': 'db',

@@ -328,3 +328,38 @@ 

              if delta.total_seconds() > config.koji_target_delete_time:

                  log.info("Removing target of module %r", module)

                  koji_session.deleteBuildTarget(target['id'])

+ 

+     def cancel_stuck_module_builds(self, config, session):

+         """

+         Method transitions builds which are stuck in one state too long to the "failed" state.

+         The states are defined with the "cleanup_stuck_builds_states" config option and the

+         time is defined by the "cleanup_stuck_builds_time" config option.

+         """

+         log.info(('Looking for module builds stuck in the states "{states}" '

+                   'more than {days} days').format(

+             states=' and '.join(config.cleanup_stuck_builds_states),

+             days=config.cleanup_stuck_builds_time

+         ))

+ 

+         delta = timedelta(days=config.cleanup_stuck_builds_time)

+         now = datetime.utcnow()

+         threshold = now - delta

+         states = [module_build_service.models.BUILD_STATES[state]

+                   for state in config.cleanup_stuck_builds_states]

+ 

+         module_builds = session.query(models.ModuleBuild).filter(

+             models.ModuleBuild.state.in_(states),

+             models.ModuleBuild.time_modified < threshold).all()

+ 

+         log.info(' {0!r} module builds are stuck...'.format(len(module_builds)))

+ 

+         for build in module_builds:

+             nsvc = ":".join([build.name, build.stream, build.version, build.context])

+             log.info('Transitioning build "{nsvc}" to "Failed" state.'.format(nsvc=nsvc))

+ 

+             state_reason = "The module was in {state} for more than {days} days".format(

+                 state=build.state,

+                 days=config.cleanup_stuck_builds_time

+             )

+             build.transition(config, state=models.BUILD_STATES["failed"], state_reason=state_reason)

+             session.commit()

@@ -442,3 +442,39 @@ 

          assert module_build_one.state == models.BUILD_STATES['failed']

          # Make sure that the builder was never instantiated

          create_builder.assert_not_called()

+ 

+     @pytest.mark.parametrize('test_state', [models.BUILD_STATES[state]

+                                             for state in conf.cleanup_stuck_builds_states])

+     def test_cancel_stuck_module_builds(self, create_builder, koji_get_session, global_consumer,

+                                         dbg, test_state):

+ 

+         module_build1 = models.ModuleBuild.query.get(1)

+         module_build1.state = test_state

+         under_thresh = conf.cleanup_stuck_builds_time - 1

+         module_build1.time_modified = datetime.utcnow() - timedelta(

+             days=under_thresh, hours=23, minutes=59)

+ 

+         module_build2 = models.ModuleBuild.query.get(2)

+         module_build2.state = test_state

+         module_build2.time_modified = datetime.utcnow() - timedelta(

+             days=conf.cleanup_stuck_builds_time)

+ 

+         module_build2 = models.ModuleBuild.query.get(3)

+         module_build2.state = test_state

+         module_build2.time_modified = datetime.utcnow()

+ 

+         db.session.commit()

+ 

+         consumer = mock.MagicMock()

+         consumer.incoming = queue.Queue()

+         global_consumer.return_value = consumer

+         hub = mock.MagicMock()

+         poller = MBSProducer(hub)

+ 

+         assert consumer.incoming.qsize() == 0

+ 

+         poller.cancel_stuck_module_builds(conf, db.session)

+ 

+         module = models.ModuleBuild.query.filter_by(state=4).all()

+         assert len(module) == 1

+         assert module[0].id == 2

Hi All,

as the subject says i added a new method to the producer which will move builds to failed state, when it is stuck in a certain state (default init, build) for specific period of time (default 7 days). Can you take a look?

@mprahl @jkaluza

Signed-off-by: Martin Curlej mcurlej@redhat.com

Use build.transition(config, state=models.BUILD_STATES['failed'], state_reason=state_reason), otherwise the state trace won't be logged in DB.

Also define some state_reason why the module build failed. Something like "The module was in {state} for more than {N} days."

Finished my review. The tests failed because of infra failure, I will see if they finish after the next PR update.

This is fine to have this as a config option but I don't think it's needed.

Optional: It'd be nice to have this as a join such as:

states=' and '.join(config.cleanup_stuck_builds_states),

+1 to @jkaluza's comment. It also sends a message on the message bus announcing the transition.

Optional: This could be simplified to models.ModuleBuild.query.get(1)

"under_tresh" => "under_thresh" or "under_threshold"

@mcurlej the code looks good! I just left a few comments. Once my comments and @jkaluza's comments are addressed, let us know and we can merge it.

rebased onto 1db187b95f7fb93dd191d61ea0232fa936ef1d9c

8 months ago

@mcurlej is this ready for review again?

@mprahl yes i mentioned that on the standup.

@mprahl yes i mentioned that on the standup.

My comment was asked before standup :smile:

I'll go review this now.

Could you remove this line? The state change is handled in the build.transition call and this will cause MBS to not publish a message that the build has transition to failed because it thinks the state didn't change.

@mcurlej just one line you need to delete and then this is ready to be merged.

rebased onto 5a3d3d9

8 months ago

Commit 1077f11 fixes this pull-request

Pull-Request has been merged by mprahl

8 months ago

Pull-Request has been merged by mprahl

8 months ago