#1188 Builds counters
Merged 5 years ago by vmaljulin. Opened 5 years ago by vmaljulin.
vmaljulin/fm-orchestrator FACTORY-3839 into master

@@ -320,7 +320,7 @@ 

          except ValueError:

              reason = "Failed to gather buildroot groups from SCM."

              log.exception(reason)

-             module.transition(conf, state="failed", state_reason=reason)

+             module.transition(conf, state="failed", state_reason=reason, failure_type='user')

              session.commit()

              raise

          return groups

@@ -562,7 +562,7 @@ 

              )

          return module

  

-     def transition(self, conf, state, state_reason=None):

+     def transition(self, conf, state, state_reason=None, failure_type='unspec'):

          """Record that a build has transitioned state.

  

          The history of state transitions are recorded in model
@@ -575,14 +575,21 @@ 

          :type conf: :class:`Config`

          :param int state: the state value to transition to. Refer to ``BUILD_STATES``.

          :param str state_reason: optional reason of why to transform to ``state``.

+         :param str failure_type: optional failure type, one of 'unspec', 'user', 'infra'

          """

          now = datetime.utcnow()

          old_state = self.state

          self.state = state

          self.time_modified = now

  

+         from module_build_service.monitor import builder_success_counter, builder_failed_counter

+ 

          if INVERSE_BUILD_STATES[self.state] in ['done', 'failed']:

              self.time_completed = now

+             if INVERSE_BUILD_STATES[self.state] == 'done':

+                 builder_success_counter.inc()

+             else:

+                 builder_failed_counter.labels(reason=failure_type).inc()

  

          if state_reason:

              self.state_reason = state_reason

@@ -27,8 +27,8 @@ 

  

  from flask import Blueprint, Response

  from prometheus_client import (  # noqa: F401

-     ProcessCollector, CollectorRegistry, Counter, multiprocess,

-     Histogram, generate_latest, start_http_server, CONTENT_TYPE_LATEST)

+     ProcessCollector, CollectorRegistry, Counter, multiprocess, Histogram, generate_latest,

+     start_http_server, CONTENT_TYPE_LATEST)

  from sqlalchemy import event

  

  # Service-specific imports
@@ -72,6 +72,16 @@ 

      'Number of messages, for which the sender failed',

      registry=registry)

  

+ builder_success_counter = Counter(

+     'builds_success',

+     'Number of successful builds',

+     registry=registry)

+ builder_failed_counter = Counter(

+     'builds_failed_total',

+     'Number of failed builds',

+     labelnames=['reason'],      # reason could be: 'user', 'infra', 'unspec'

+     registry=registry)

+ 

  db_dbapi_error_counter = Counter(

      'db_dbapi_error',

      'Number of DBAPI errors',

@@ -264,7 +264,7 @@ 

                  if build:

                      session.refresh(build)

                      build.transition(conf, state=models.BUILD_STATES['failed'],

-                                      state_reason=str(e))

+                                      state_reason=str(e), failure_type='infra')

                      session.commit()

  

              log.debug("Done with %s" % idx)

@@ -70,7 +70,7 @@ 

      if (component_build.package == 'module-build-macros' and

         state != koji.BUILD_STATES['COMPLETE']):

          parent.transition(config, state=models.BUILD_STATES['failed'],

-                           state_reason=state_reason)

+                           state_reason=state_reason, failure_type='user')

          session.commit()

          return

  
@@ -104,7 +104,7 @@ 

                  ', '.join(c.package for c in failed_components_in_batch))

              parent.transition(config,

                                state=models.BUILD_STATES['failed'],

-                               state_reason=state_reason)

+                               state_reason=state_reason, failure_type='user')

              session.commit()

              return []

          elif not built_components_in_batch:

@@ -98,13 +98,13 @@ 

          if not build.state_reason:

              reason = "Missing koji tag. Assuming previously failed module lookup."

              log.error(reason)

-             build.transition(config, state="failed", state_reason=reason)

+             build.transition(config, state="failed", state_reason=reason, failure_type='infra')

              session.commit()

              return

  

      # Don't transition it again if it's already been transitioned

      if build.state != models.BUILD_STATES["failed"]:

-         build.transition(config, state="failed")

+         build.transition(config, state="failed", failure_type='user')

  

      session.commit()

  
@@ -149,6 +149,7 @@ 

          time.sleep(1)

  

      error_msg = ''

+     failure_reason = 'unspec'

      try:

          mmd = build.mmd()

          record_component_builds(mmd, build, session=session)
@@ -162,12 +163,15 @@ 

      except (UnprocessableEntity, Forbidden, ValidationError, RuntimeError) as e:

          log.exception(str(e))

          error_msg = str(e)

+         failure_reason = 'user'

      except (xmlrpclib.ProtocolError, koji.GenericError) as e:

          log.exception(str(e))

          error_msg = 'Koji communication error: "{0}"'.format(str(e))

+         failure_reason = 'infra'

      except Exception as e:

          log.exception(str(e))

          error_msg = "An unknown error occurred while validating the modulemd"

+         failure_reason = 'user'

      else:

          session.add(build)

          session.commit()
@@ -175,7 +179,8 @@ 

          if error_msg:

              # Rollback changes underway

              session.rollback()

-             build.transition(conf, models.BUILD_STATES["failed"], state_reason=error_msg)

+             build.transition(conf, models.BUILD_STATES["failed"], state_reason=error_msg,

+                              failure_type=failure_reason)

  

  

  def generate_module_build_koji_tag(build):
@@ -288,7 +293,7 @@ 

      except ValueError:

          reason = "Failed to get module info from MBS. Max retries reached."

          log.exception(reason)

-         build.transition(config, state="failed", state_reason=reason)

+         build.transition(config, state="failed", state_reason=reason, failure_type='infra')

          session.commit()

          raise

  

@@ -99,7 +99,8 @@ 

      if module_build.component_builds and not good:

          state_reason = 'Component(s) {} failed to build.'.format(

              ', '.join(c.package for c in current_batch if c.state in failed_states))

-         module_build.transition(config, models.BUILD_STATES['failed'], state_reason)

+         module_build.transition(config, models.BUILD_STATES['failed'], state_reason,

+                                 failure_type='infra')

          session.commit()

          log.warning("Odd!  All components in batch failed for %r." % module_build)

          return
@@ -147,7 +148,8 @@ 

              )

              module_build.transition(config,

                                      state=models.BUILD_STATES['failed'],

-                                     state_reason=state_reason)

+                                     state_reason=state_reason,

+                                     failure_type='user')

          else:

              # Tell the external buildsystem to wrap up (CG import, createrepo, etc.)

              module_build.time_completed = datetime.utcnow()

@@ -169,7 +169,8 @@ 

                  state_reason = ('The module was garbage collected since it has failed over {0}'

                                  ' day(s) ago'.format(conf.cleanup_failed_builds_time))

                  module.transition(

-                     conf, models.BUILD_STATES['garbage'], state_reason=state_reason)

+                     conf, models.BUILD_STATES['garbage'], state_reason=state_reason,

+                     failure_type='user')

                  session.add(module)

                  session.commit()

  
@@ -372,7 +373,8 @@ 

                  state=build.state,

                  days=config.cleanup_stuck_builds_time

              )

-             build.transition(config, state=models.BUILD_STATES["failed"], state_reason=state_reason)

+             build.transition(config, state=models.BUILD_STATES["failed"],

+                              state_reason=state_reason, failure_type='user')

              session.commit()

  

      def sync_koji_build_tags(self, config, session):

@@ -78,14 +78,14 @@ 

          c.state = koji.BUILD_STATES['FAILED']

          c.state_reason = "Failed to build artifact %s: %s" % (c.package, str(e))

          log.exception(e)

-         c.module_build.transition(conf, models.BUILD_STATES['failed'])

+         c.module_build.transition(conf, models.BUILD_STATES['failed'], failure_type='infra')

          return

  

      if not c.task_id and c.state == koji.BUILD_STATES['BUILDING']:

          c.state = koji.BUILD_STATES['FAILED']

          c.state_reason = ("Failed to build artifact %s: "

                            "Builder did not return task ID" % (c.package))

-         c.module_build.transition(conf, models.BUILD_STATES['failed'])

+         c.module_build.transition(conf, models.BUILD_STATES['failed'], failure_type='infra')

          return

  

  
@@ -239,7 +239,7 @@ 

              ', '.join([str(t['id']) for t in active_tasks])

          )

          module.transition(config, state=models.BUILD_STATES['failed'],

-                           state_reason=state_reason)

+                           state_reason=state_reason, failure_type='infra')

          session.commit()

          return []

  

file modified (+31 -2)
@@ -23,12 +23,16 @@ 

  import os

  import pytest

  import requests

+ import mock

+ import module_build_service.config as mbs_config

  import module_build_service.monitor

+ from module_build_service import models

+ from conf.config import TestConfiguration

  

  from six.moves import reload_module

- from tests import app, init_data

+ from tests import app, init_data, make_module

  

- num_of_metrics = 16

+ num_of_metrics = 18

  

  

  class TestViews:
@@ -56,3 +60,28 @@ 

  

      assert len([l for l in r.text.splitlines()

                  if (l.startswith('# TYPE') and '_created ' not in l)]) == num_of_metrics

+ 

+ 

+ @mock.patch('module_build_service.monitor.builder_failed_counter.labels')

+ @mock.patch('module_build_service.monitor.builder_success_counter.inc')

+ def test_monitor_state_changing_success(succ_cnt, failed_cnt):

+     conf = mbs_config.Config(TestConfiguration)

+     b = make_module('pkg:0.1:1:c1', requires_list={'platform': 'el8'})

+     b.transition(conf, models.BUILD_STATES['wait'])

+     b.transition(conf, models.BUILD_STATES['build'])

+     b.transition(conf, models.BUILD_STATES['done'])

+     succ_cnt.assert_called_once()

+     failed_cnt.assert_not_called()

+ 

+ 

+ @mock.patch('module_build_service.monitor.builder_failed_counter.labels')

+ @mock.patch('module_build_service.monitor.builder_success_counter.inc')

+ def test_monitor_state_changing_failure(succ_cnt, failed_cnt):

+     failure_type = 'user'

+     conf = mbs_config.Config(TestConfiguration)

+     b = make_module('pkg:0.1:1:c1', requires_list={'platform': 'el8'})

+     b.transition(conf, models.BUILD_STATES['wait'])

+     b.transition(conf, models.BUILD_STATES['build'])

+     b.transition(conf, models.BUILD_STATES['failed'], failure_type=failure_type)

+     succ_cnt.assert_not_called()

+     failed_cnt.assert_called_once_with(reason=failure_type)

So far this looks good to me; let's move on to the timers.

rebased onto 34581ab5aa3b8d2349d2f6b39eb37701d9900db2

5 years ago

Timers were separated out into another issue.

pretty please pagure-ci rebuild

5 years ago

I was thinking that maybe, instead of repeating these lines every time we move the module to the "failed" state, we could change models.ModuleBuild.transition to accept a new failure_context variable, which could be set to "user" or "infrastructure", and increment the counters in the transition method itself.

This would remove the need to import and increment these counters everywhere. It would also make it harder to forget to increment them if we add a new transition to "failed" somewhere in the code.

What do you think?
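A minimal sketch of that idea (a simplified stand-in for models.ModuleBuild: the real method also takes conf, integer state constants, session handling and timestamps; the per-context counters here are only illustrative):

```python
from prometheus_client import CollectorRegistry, Counter

registry = CollectorRegistry()
builds_done = Counter('builds_done', 'Successful builds', registry=registry)
builds_failed = {
    ctx: Counter('builds_failed_%s' % ctx, 'Builds failed (%s)' % ctx,
                 registry=registry)
    for ctx in ('user', 'infrastructure', 'unspec')}


class ModuleBuild(object):
    """Stripped-down stand-in for models.ModuleBuild."""

    def transition(self, state, state_reason=None, failure_context='unspec'):
        # All counting lives in this one method, so no call site can
        # forget to increment the counters.
        self.state = state
        self.state_reason = state_reason
        if state == 'done':
            builds_done.inc()
        elif state == 'failed':
            builds_failed[failure_context].inc()
```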

> What do you think?

OK for me, though there's one question: which status should be considered the default, infrastructure or user? Or neither of those, if there's no context?

+1 on Jan's suggested improvement.

@vmaljulin I'd prefer to use labels for this, see https://github.com/prometheus/client_python#labels. We can have different label values according to the proposed failure_context, which could default to something like none (as you suggested), unspecified, unknown, or whatever you think is most descriptive/exact.
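A short sketch of the labelled alternative from the linked client_python docs: one Counter, with the failure context as a label value (the names mirror what the PR ended up with):

```python
from prometheus_client import CollectorRegistry, Counter

registry = CollectorRegistry()
builder_failed_counter = Counter(
    'builds_failed_total',
    'Number of failed builds',
    labelnames=['reason'],  # e.g. 'user', 'infra', 'unspec'
    registry=registry)

# .labels() returns a per-value child counter; the keyword form keeps
# the call unambiguous if more labels are ever added.
builder_failed_counter.labels(reason='user').inc()
```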

rebased onto e7b719e3cd6370f4933834be1bc096095eba5d71

5 years ago

rebased onto aaf2e9462b83bfd0bd29bebf3d248b59ec35d07f

5 years ago

> What do you think?

Done

rebased onto f56f4223cacd49535f9a8c33dba8fbaccef24241

5 years ago

It'd be safer to use failure_type=failure_type in case any other labels are added to the Counter in the future.

I am not sure whether it's good to have it as an int and do the mapping between the failure type name and the int.

Just to clarify the background: if the metrics carry meaningful labels (words), you can work with them in Prometheus directly. If we had only numbers, we'd have to check MBS for the meaning of each numerical value (and keep that mapping in sync), then do some relabeling in Prometheus back to label names. That would make things more complicated.
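To illustrate with a standalone sketch (abridged output; the PromQL in the comments is only an example query): each label value becomes its own time series that Prometheus can aggregate directly.

```python
from prometheus_client import CollectorRegistry, Counter, generate_latest

registry = CollectorRegistry()
failed = Counter('builds_failed_total', 'Number of failed builds',
                 labelnames=['reason'], registry=registry)
failed.labels(reason='user').inc()
failed.labels(reason='infra').inc()
failed.labels(reason='infra').inc()

# The exposition text scraped by Prometheus includes samples such as:
#   builds_failed_total{reason="user"} 1.0
#   builds_failed_total{reason="infra"} 2.0
print(generate_latest(registry).decode())

# In Prometheus the reasons can then be used as-is, e.g.:
#   sum(rate(builds_failed_total[5m])) by (reason)
# With integer codes we would first have to relabel them back to names.
```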

rebased onto 0d9e68c15df326a8e5f3918498682cd23ca05f5a

5 years ago

rebased onto c31c041c052ce2fe8829a77f47c7435ccdb73005

5 years ago

Looks good to me now; I will wait for @fivaldi to review the Prometheus bits.

The int is probably from a previous revision; it should be 'str'.

@jkaluza The RuntimeError is more of an infra cause, isn't it?

Let's set it to unspec here.

It can be both. It can happen, for example, when the user sets a wrong git commit or a wrong component name in the modulemd file. In both cases the SCM (which is behind that record_component_builds) will fail because it cannot find the commit ref in the repo, but who knows whether that's an infra issue or a user issue.

I would keep it marked as a "user" issue, because in most cases it is a "user" issue. There might be cases where it isn't, and we would need extra code to decide that, but that's not worth doing for a corner case where the reason for the failure is not so clear.

I have also quickly checked where we raise other RuntimeError exceptions, and most of the time they are caused by some user failure leading to a situation where MBS cannot process the request further.

rebased onto e564edc

5 years ago

@vmaljulin Great! :thumbsup: Feel free to merge!

Pull-Request has been merged by vmaljulin

5 years ago

Cool! I'm excited to see a demo of this.