From e564edc808b0d533c9e52402772e0c0998614693 Mon Sep 17 00:00:00 2001 From: Valerij Maljulin Date: Apr 09 2019 13:44:45 +0000 Subject: Build counters Signed-off-by: Valerij Maljulin --- diff --git a/module_build_service/builder/base.py b/module_build_service/builder/base.py index b54e45a..ccb3633 100644 --- a/module_build_service/builder/base.py +++ b/module_build_service/builder/base.py @@ -320,7 +320,7 @@ class GenericBuilder(six.with_metaclass(ABCMeta)): except ValueError: reason = "Failed to gather buildroot groups from SCM." log.exception(reason) - module.transition(conf, state="failed", state_reason=reason) + module.transition(conf, state="failed", state_reason=reason, failure_type='user') session.commit() raise return groups diff --git a/module_build_service/models.py b/module_build_service/models.py index a206b16..22e2f79 100644 --- a/module_build_service/models.py +++ b/module_build_service/models.py @@ -562,7 +562,7 @@ class ModuleBuild(MBSBase): ) return module - def transition(self, conf, state, state_reason=None): + def transition(self, conf, state, state_reason=None, failure_type='unspec'): """Record that a build has transitioned state. The history of state transitions are recorded in model @@ -575,14 +575,21 @@ class ModuleBuild(MBSBase): :type conf: :class:`Config` :param int state: the state value to transition to. Refer to ``BUILD_STATES``. :param str state_reason: optional reason of why to transform to ``state``. 
+ :param str failure_type: optional failure type: 'unspec', 'user', 'infra' """ now = datetime.utcnow() old_state = self.state self.state = state self.time_modified = now + from module_build_service.monitor import builder_success_counter, builder_failed_counter + if INVERSE_BUILD_STATES[self.state] in ['done', 'failed']: self.time_completed = now + if INVERSE_BUILD_STATES[self.state] == 'done': + builder_success_counter.inc() + else: + builder_failed_counter.labels(reason=failure_type).inc() if state_reason: self.state_reason = state_reason diff --git a/module_build_service/monitor.py b/module_build_service/monitor.py index 70d6afc..d6e455a 100644 --- a/module_build_service/monitor.py +++ b/module_build_service/monitor.py @@ -27,8 +27,8 @@ import tempfile from flask import Blueprint, Response from prometheus_client import ( # noqa: F401 - ProcessCollector, CollectorRegistry, Counter, multiprocess, - Histogram, generate_latest, start_http_server, CONTENT_TYPE_LATEST) + ProcessCollector, CollectorRegistry, Counter, multiprocess, Histogram, generate_latest, + start_http_server, CONTENT_TYPE_LATEST) from sqlalchemy import event # Service-specific imports @@ -72,6 +72,16 @@ messaging_tx_failed_counter = Counter( 'Number of messages, for which the sender failed', registry=registry) +builder_success_counter = Counter( + 'builds_success', + 'Number of successful builds', + registry=registry) +builder_failed_counter = Counter( + 'builds_failed_total', + 'Number of failed builds', + labelnames=['reason'], # reason could be: 'user', 'infra', 'unspec' + registry=registry) + db_dbapi_error_counter = Counter( 'db_dbapi_error', 'Number of DBAPI errors', diff --git a/module_build_service/scheduler/consumer.py b/module_build_service/scheduler/consumer.py index 4e1b4f4..d43fa8e 100644 --- a/module_build_service/scheduler/consumer.py +++ b/module_build_service/scheduler/consumer.py @@ -264,7 +264,7 @@ class MBSConsumer(fedmsg.consumers.FedmsgConsumer): if build: 
session.refresh(build) build.transition(conf, state=models.BUILD_STATES['failed'], - state_reason=str(e)) + state_reason=str(e), failure_type='infra') session.commit() log.debug("Done with %s" % idx) diff --git a/module_build_service/scheduler/handlers/components.py b/module_build_service/scheduler/handlers/components.py index 4f7c8a7..38ac504 100644 --- a/module_build_service/scheduler/handlers/components.py +++ b/module_build_service/scheduler/handlers/components.py @@ -70,7 +70,7 @@ def _finalize(config, session, msg, state): if (component_build.package == 'module-build-macros' and state != koji.BUILD_STATES['COMPLETE']): parent.transition(config, state=models.BUILD_STATES['failed'], - state_reason=state_reason) + state_reason=state_reason, failure_type='user') session.commit() return @@ -104,7 +104,7 @@ def _finalize(config, session, msg, state): ', '.join(c.package for c in failed_components_in_batch)) parent.transition(config, state=models.BUILD_STATES['failed'], - state_reason=state_reason) + state_reason=state_reason, failure_type='user') session.commit() return [] elif not built_components_in_batch: diff --git a/module_build_service/scheduler/handlers/modules.py b/module_build_service/scheduler/handlers/modules.py index 2d1ea2f..a617bcd 100644 --- a/module_build_service/scheduler/handlers/modules.py +++ b/module_build_service/scheduler/handlers/modules.py @@ -98,13 +98,13 @@ def failed(config, session, msg): if not build.state_reason: reason = "Missing koji tag. Assuming previously failed module lookup." 
log.error(reason) - build.transition(config, state="failed", state_reason=reason) + build.transition(config, state="failed", state_reason=reason, failure_type='infra') session.commit() return # Don't transition it again if it's already been transitioned if build.state != models.BUILD_STATES["failed"]: - build.transition(config, state="failed") + build.transition(config, state="failed", failure_type='user') session.commit() @@ -149,6 +149,7 @@ def init(config, session, msg): time.sleep(1) error_msg = '' + failure_reason = 'unspec' try: mmd = build.mmd() record_component_builds(mmd, build, session=session) @@ -162,12 +163,15 @@ def init(config, session, msg): except (UnprocessableEntity, Forbidden, ValidationError, RuntimeError) as e: log.exception(str(e)) error_msg = str(e) + failure_reason = 'user' except (xmlrpclib.ProtocolError, koji.GenericError) as e: log.exception(str(e)) error_msg = 'Koji communication error: "{0}"'.format(str(e)) + failure_reason = 'infra' except Exception as e: log.exception(str(e)) error_msg = "An unknown error occurred while validating the modulemd" + failure_reason = 'user' else: session.add(build) session.commit() @@ -175,7 +179,8 @@ def init(config, session, msg): if error_msg: # Rollback changes underway session.rollback() - build.transition(conf, models.BUILD_STATES["failed"], state_reason=error_msg) + build.transition(conf, models.BUILD_STATES["failed"], state_reason=error_msg, + failure_type=failure_reason) def generate_module_build_koji_tag(build): @@ -288,7 +293,7 @@ def wait(config, session, msg): except ValueError: reason = "Failed to get module info from MBS. Max retries reached." 
log.exception(reason) - build.transition(config, state="failed", state_reason=reason) + build.transition(config, state="failed", state_reason=reason, failure_type='infra') session.commit() raise diff --git a/module_build_service/scheduler/handlers/repos.py b/module_build_service/scheduler/handlers/repos.py index 6358624..be822a5 100644 --- a/module_build_service/scheduler/handlers/repos.py +++ b/module_build_service/scheduler/handlers/repos.py @@ -99,7 +99,8 @@ def done(config, session, msg): if module_build.component_builds and not good: state_reason = 'Component(s) {} failed to build.'.format( ', '.join(c.package for c in current_batch if c.state in failed_states)) - module_build.transition(config, models.BUILD_STATES['failed'], state_reason) + module_build.transition(config, models.BUILD_STATES['failed'], state_reason, + failure_type='infra') session.commit() log.warning("Odd! All components in batch failed for %r." % module_build) return @@ -147,7 +148,8 @@ def done(config, session, msg): ) module_build.transition(config, state=models.BUILD_STATES['failed'], - state_reason=state_reason) + state_reason=state_reason, + failure_type='user') else: # Tell the external buildsystem to wrap up (CG import, createrepo, etc.) 
module_build.time_completed = datetime.utcnow() diff --git a/module_build_service/scheduler/producer.py b/module_build_service/scheduler/producer.py index 339ea93..be85c9a 100644 --- a/module_build_service/scheduler/producer.py +++ b/module_build_service/scheduler/producer.py @@ -169,7 +169,8 @@ class MBSProducer(PollingProducer): state_reason = ('The module was garbage collected since it has failed over {0}' ' day(s) ago'.format(conf.cleanup_failed_builds_time)) module.transition( - conf, models.BUILD_STATES['garbage'], state_reason=state_reason) + conf, models.BUILD_STATES['garbage'], state_reason=state_reason, + failure_type='user') session.add(module) session.commit() @@ -372,7 +373,8 @@ class MBSProducer(PollingProducer): state=build.state, days=config.cleanup_stuck_builds_time ) - build.transition(config, state=models.BUILD_STATES["failed"], state_reason=state_reason) + build.transition(config, state=models.BUILD_STATES["failed"], + state_reason=state_reason, failure_type='user') session.commit() def sync_koji_build_tags(self, config, session): diff --git a/module_build_service/utils/batches.py b/module_build_service/utils/batches.py index 018856b..0c591db 100644 --- a/module_build_service/utils/batches.py +++ b/module_build_service/utils/batches.py @@ -78,14 +78,14 @@ def start_build_component(builder, c): c.state = koji.BUILD_STATES['FAILED'] c.state_reason = "Failed to build artifact %s: %s" % (c.package, str(e)) log.exception(e) - c.module_build.transition(conf, models.BUILD_STATES['failed']) + c.module_build.transition(conf, models.BUILD_STATES['failed'], failure_type='infra') return if not c.task_id and c.state == koji.BUILD_STATES['BUILDING']: c.state = koji.BUILD_STATES['FAILED'] c.state_reason = ("Failed to build artifact %s: " "Builder did not return task ID" % (c.package)) - c.module_build.transition(conf, models.BUILD_STATES['failed']) + c.module_build.transition(conf, models.BUILD_STATES['failed'], failure_type='infra') return @@ -239,7 +239,7 @@ 
def start_next_batch_build(config, module, session, builder, components=None): ', '.join([str(t['id']) for t in active_tasks]) ) module.transition(config, state=models.BUILD_STATES['failed'], - state_reason=state_reason) + state_reason=state_reason, failure_type='infra') session.commit() return [] diff --git a/tests/test_monitor.py b/tests/test_monitor.py index 6d89ef0..0d48b26 100644 --- a/tests/test_monitor.py +++ b/tests/test_monitor.py @@ -23,12 +23,16 @@ import os import pytest import requests +import mock +import module_build_service.config as mbs_config import module_build_service.monitor +from module_build_service import models +from conf.config import TestConfiguration from six.moves import reload_module -from tests import app, init_data +from tests import app, init_data, make_module -num_of_metrics = 16 +num_of_metrics = 18 class TestViews: @@ -56,3 +60,28 @@ def test_standalone_metrics_server(): assert len([l for l in r.text.splitlines() if (l.startswith('# TYPE') and '_created ' not in l)]) == num_of_metrics + + +@mock.patch('module_build_service.monitor.builder_failed_counter.labels') +@mock.patch('module_build_service.monitor.builder_success_counter.inc') +def test_monitor_state_changing_success(succ_cnt, failed_cnt): + conf = mbs_config.Config(TestConfiguration) + b = make_module('pkg:0.1:1:c1', requires_list={'platform': 'el8'}) + b.transition(conf, models.BUILD_STATES['wait']) + b.transition(conf, models.BUILD_STATES['build']) + b.transition(conf, models.BUILD_STATES['done']) + succ_cnt.assert_called_once() + failed_cnt.assert_not_called() + + +@mock.patch('module_build_service.monitor.builder_failed_counter.labels') +@mock.patch('module_build_service.monitor.builder_success_counter.inc') +def test_monitor_state_changing_failure(succ_cnt, failed_cnt): + failure_type = 'user' + conf = mbs_config.Config(TestConfiguration) + b = make_module('pkg:0.1:1:c1', requires_list={'platform': 'el8'}) + b.transition(conf, models.BUILD_STATES['wait']) + 
b.transition(conf, models.BUILD_STATES['build']) + b.transition(conf, models.BUILD_STATES['failed'], failure_type=failure_type) + succ_cnt.assert_not_called() + failed_cnt.assert_called_once_with(reason=failure_type)