#1129 Basic monitoring
Merged 2 months ago by mprahl. Opened 3 months ago by fivaldi.
fivaldi/fm-orchestrator fivaldi_basic_monitoring  into  master

file modified
+1

@@ -32,6 +32,7 @@ 

      python-mock \

      python-munch \

      python-pip \

+     python-prometheus_client \

      python-requests \

      python-six \

      python-solv \

@@ -21,6 +21,7 @@ 

      python3-ldap3 \

      python3-munch \

      python3-pip \

+     python3-prometheus_client \

      python3-requests \

      python3-six \

      python3-solv \

@@ -260,7 +260,18 @@ 

      except KeyError:

          raise KeyError("No messaging backend found for %r in %r" % (

              conf.messaging, _messaging_backends.keys()))

-     return handler(topic, msg, conf, service)

+ 

+     from module_build_service.monitor import (

+         messaging_tx_to_send_counter, messaging_tx_sent_ok_counter,

+         messaging_tx_failed_counter)

+     messaging_tx_to_send_counter.inc()

+     try:

+         rv = handler(topic, msg, conf, service)

+         messaging_tx_sent_ok_counter.inc()

+         return rv

+     except Exception:

+         messaging_tx_failed_counter.inc()

+         raise

  

  

  def _fedmsg_publish(topic, msg, conf, service):

@@ -118,6 +118,10 @@ 

          sqlalchemy.event.listen(session, 'before_commit',

                                  session_before_commit_handlers)

  

+     # initialize DB event listeners from the monitor module

+     from module_build_service.monitor import db_hook_event_listeners

+     db_hook_event_listeners(session.bind.engine)

+ 

  

  @contextlib.contextmanager

  def make_session(conf):

@@ -0,0 +1,130 @@ 

+ # -*- coding: utf-8 -*-

+ # Copyright (c) 2019  Red Hat, Inc.

+ #

+ # Permission is hereby granted, free of charge, to any person obtaining a copy

+ # of this software and associated documentation files (the "Software"), to deal

+ # in the Software without restriction, including without limitation the rights

+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell

+ # copies of the Software, and to permit persons to whom the Software is

+ # furnished to do so, subject to the following conditions:

+ #

+ # The above copyright notice and this permission notice shall be included in all

+ # copies or substantial portions of the Software.

+ #

+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

+ # SOFTWARE.

+ 

+ # For an up-to-date version of this module, see:

+ #   https://pagure.io/monitor-flask-sqlalchemy

+ 

+ import os

+ import tempfile

+ 

+ from flask import Blueprint, Response

+ from prometheus_client import (  # noqa: F401

+     ProcessCollector, CollectorRegistry, Counter, multiprocess,

+     Histogram, generate_latest, start_http_server, CONTENT_TYPE_LATEST)

+ from sqlalchemy import event

+ 

+ # Service-specific imports

+ from module_build_service.utils import cors_header, validate_api_version

+ 

+ 

+ # prometheus_client needs 'prometheus_multiproc_dir' to aggregate metrics
+ # across worker processes; fall back to a throwaway tempdir when the
+ # deployment does not configure one.
+ if not os.environ.get('prometheus_multiproc_dir'):

+     os.environ.setdefault('prometheus_multiproc_dir', tempfile.mkdtemp())

+ # Dedicated registry (instead of the default global one) so multiprocess
+ # collection controls exactly what is exported.
+ registry = CollectorRegistry()

+ ProcessCollector(registry=registry)

+ multiprocess.MultiProcessCollector(registry)

+ # Optionally expose metrics on a standalone HTTP port.
+ # NOTE(review): presumably for the backend scheduler, which has no Flask
+ # frontend to mount the blueprint on — confirm.
+ if os.getenv('MONITOR_STANDALONE_METRICS_SERVER_ENABLE', 'false') == 'true':

+     port = os.getenv('MONITOR_STANDALONE_METRICS_SERVER_PORT', '10040')

+     start_http_server(int(port), registry=registry)

+ 

+ 

+ # Generic metrics

+ messaging_rx_counter = Counter(

+     'messaging_rx',

+     'Total number of messages received',

+     registry=registry)

+ messaging_rx_processed_ok_counter = Counter(

+     'messaging_rx_processed_ok',

+     'Number of received messages, which were processed successfully',

+     registry=registry)

+ messaging_rx_failed_counter = Counter(

+     'messaging_rx_failed',

+     'Number of received messages, which failed during processing',

+     registry=registry)

+ 

+ messaging_tx_to_send_counter = Counter(

+     'messaging_tx_to_send',

+     'Total number of messages to send',

+     registry=registry)

+ messaging_tx_sent_ok_counter = Counter(

+     'messaging_tx_sent_ok',

+     'Number of messages, which were sent successfully',

+     registry=registry)

+ messaging_tx_failed_counter = Counter(

+     'messaging_tx_failed',

+     'Number of messages, for which the sender failed',

+     registry=registry)

+ 

+ db_dbapi_error_counter = Counter(

+     'db_dbapi_error',

+     'Number of DBAPI errors',

+     registry=registry)

+ db_engine_connect_counter = Counter(

+     'db_engine_connect',

+     'Number of \'engine_connect\' events',

+     registry=registry)

+ db_handle_error_counter = Counter(

+     'db_handle_error',

+     'Number of exceptions during connection',

+     registry=registry)

+ db_transaction_rollback_counter = Counter(

+     'db_transaction_rollback',

+     'Number of transactions, which were rolled back',

+     registry=registry)

+ 

+ # Service-specific metrics

+ # XXX: TODO

+ 

+ 

+ def db_hook_event_listeners(target=None):

+     """Attach Prometheus counters to SQLAlchemy engine events.

+ 

+     :param target: engine to listen on; defaults to the service-wide

+         ``module_build_service.db`` engine when not given.

+     """

+     # Service-specific import of db

+     from module_build_service import db

+ 

+     if not target:

+         target = db.engine

+ 

+     # NOTE(review): 'dbapi_error' is the legacy hook superseded by

+     # 'handle_error' below; both counters may increment for one failure.

+     @event.listens_for(target, 'dbapi_error', named=True)

+     def receive_dbapi_error(**kw):

+         db_dbapi_error_counter.inc()

+ 

+     @event.listens_for(target, 'engine_connect')

+     def receive_engine_connect(conn, branch):

+         db_engine_connect_counter.inc()

+ 

+     @event.listens_for(target, 'handle_error')

+     def receive_handle_error(exception_context):

+         db_handle_error_counter.inc()

+ 

+     @event.listens_for(target, 'rollback')

+     def receive_rollback(conn):

+         db_transaction_rollback_counter.inc()

+ 

+ 

+ monitor_api = Blueprint(

+     'monitor', __name__,

+     url_prefix='/module-build-service/<int:api_version>/monitor')

+ 

+ 

+ @cors_header()

+ @validate_api_version()

+ @monitor_api.route('/metrics')

+ def metrics(api_version):

+     return Response(generate_latest(registry),

+                     content_type=CONTENT_TYPE_LATEST)

@@ -47,6 +47,7 @@ 

  import module_build_service.scheduler.handlers.components

  import module_build_service.scheduler.handlers.modules

  import module_build_service.scheduler.handlers.tags

+ import module_build_service.monitor as monitor

  from module_build_service import models, log, conf

  

  

@@ -147,6 +148,8 @@ 

              super(MBSConsumer, self).validate(message)

  

      def consume(self, message):

+         monitor.messaging_rx_counter.inc()

+ 

          # Sometimes, the messages put into our queue are artificially put there

          # by other parts of our own codebase.  If they are already abstracted

          # messages, then just use them as-is.  If they are not already

@@ -161,7 +164,9 @@ 

          try:

              with models.make_session(conf) as session:

                  self.process_message(session, msg)

+             monitor.messaging_rx_processed_ok_counter.inc()

          except sqlalchemy.exc.OperationalError as error:

+             monitor.messaging_rx_failed_counter.inc()

              if 'could not translate host name' in str(error):

                  log.exception(

                      "SQLAlchemy can't resolve DNS records. Scheduling fedmsg-hub to shutdown.")

@@ -169,6 +174,7 @@ 

              else:

                  raise

          except Exception:

+             monitor.messaging_rx_failed_counter.inc()

              log.exception('Failed while handling {0!r}'.format(msg))

  

          if self.stop_condition and self.stop_condition(message):

@@ -41,6 +41,7 @@ 

  from module_build_service.errors import (

      ValidationError, Forbidden, NotFound, ProgrammingError)

  from module_build_service.backports import jsonify

+ from module_build_service.monitor import monitor_api

  

  

  api_routes = {

@@ -461,5 +462,7 @@ 

          else:

              raise NotImplementedError("Unhandled api key.")

  

+     app.register_blueprint(monitor_api)

+ 

  

  register_api()

file modified
+1

@@ -13,6 +13,7 @@ 

  ldap3

  moksha.hub

  munch

+ prometheus_client

  pyOpenSSL

  pygobject

  requests

@@ -0,0 +1,58 @@ 

+ # Copyright (c) 2019  Red Hat, Inc.

+ #

+ # Permission is hereby granted, free of charge, to any person obtaining a copy

+ # of this software and associated documentation files (the "Software"), to deal

+ # in the Software without restriction, including without limitation the rights

+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell

+ # copies of the Software, and to permit persons to whom the Software is

+ # furnished to do so, subject to the following conditions:

+ #

+ # The above copyright notice and this permission notice shall be included in all

+ # copies or substantial portions of the Software.

+ #

+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

+ # SOFTWARE.

+ #

+ # Written by Filip Valder <fvalder@redhat.com>

+ 

+ import os

+ import pytest

+ import requests

+ import module_build_service.monitor

+ 

+ from six.moves import reload_module

+ from tests import app, init_data

+ 

+ num_of_metrics = 16

+ 

+ 

+ class TestViews:

+     """Tests for the /monitor/metrics endpoint served by the frontend."""

+ 

+     def setup_method(self, test_method):

+         self.client = app.test_client()

+         init_data(2)

+ 

+     def test_metrics(self):

+         rv = self.client.get('/module-build-service/1/monitor/metrics')

+ 

+         # Count only '# TYPE' header lines; '_created ' series are emitted

+         # automatically by newer prometheus_client versions and are

+         # excluded so the expected total stays stable across versions.

+         assert len([l for l in rv.get_data(as_text=True).splitlines()

+                     if (l.startswith('# TYPE') and '_created ' not in l)]) == num_of_metrics

+ 

+ 

+ def test_standalone_metrics_server_disabled_by_default():

+     # NOTE(review): order-dependent — this must run before

+     # test_standalone_metrics_server, which starts the server for good.

+     with pytest.raises(requests.exceptions.ConnectionError):

+         requests.get('http://127.0.0.1:10040/metrics')

+ 

+ 

+ def test_standalone_metrics_server():

+     # Reloading the module re-runs its import-time setup, which now starts

+     # the standalone server because the env var is set.

+     # NOTE(review): neither the env var nor the started server is cleaned

+     # up afterwards, so state leaks into later tests — confirm intended.

+     os.environ['MONITOR_STANDALONE_METRICS_SERVER_ENABLE'] = 'true'

+     reload_module(module_build_service.monitor)

+ 

+     r = requests.get('http://127.0.0.1:10040/metrics')

+ 

+     # Same '# TYPE' counting convention as TestViews.test_metrics.

+     assert len([l for l in r.text.splitlines()

+                 if (l.startswith('# TYPE') and '_created ' not in l)]) == num_of_metrics

This adds a monitor module providing Prometheus export for the common set of metrics (for now).

@jkaluza Please share your hints on where to add service-specific counters... What comes to my mind is module transitions. Could that work?

It'd be nice if your imports were ordered by stdlib, third-party, and then local

prometheus_client isn't available in EPEL, so this needs to be an optional dependency.

How would these metrics get exposed since the back-end is on a separate server from the front-end?

Do we know if we actually need to create a separate registry? I think we might only use multi-threading and not multi-processing, which should be okay with the default registry. @jkaluza can you comment here?

This doesn't seem very useful.

@jkaluza Please share your hints on where to add service-specific counters... What comes to my mind is module transitions. Could that work?

Here are the metrics I'd find interesting:

  • Number of failed module builds
  • Number of failed module builds due to infrastructure error (such as Koji being unavailable)
  • Number of failed module builds due to user error (invalid modulemd). This can probably be deduced from the previous two though.
  • Number of successful builds
  • Module build duration
  • Duration of component builds being stuck due to the concurrent component threshold being met (this will enable us to know if we need to increase the threshold)
  • Time taken for a module build submission

Freshmaker uses this counter, because there is some logic which finally filters out some messages from processing. I'd keep this projects-wide, because 1) we'd have the common set of basic metrics everywhere, 2) it can give relevant information in case of malformed messages -> this counter starts to rise.

I suggest another way: exporting the metrics to a text file, which can then be exported via the textfile exporter or served via "stupid" netcat or whatever. Adding text file output support to the monitor module would be trivial.

What's the alternative to have this in operation on EPEL? Since this seems to be a significant issue for the internal deployment.

rebased onto 1891737b06c79e9e2f6ef2c548660f57f7269d12

3 months ago

rebased onto ff4e260faf0c9bbb0fb2fdfb3c3cbc77cbcaf53b

3 months ago

What's the alternative to have this in operation on EPEL? Since this seems to be a significant issue for the internal deployment.

I'll reach out to the package maintainer to see if they'd be willing to package this for EPEL7.

I suggest another way: exporting the metrics to a text file, which can then be exported via the textfile exporter or served via "stupid" netcat or whatever. Adding text file output support to the monitor module would be trivial.

I'd be fine with the textfile exporter, but we could probably just add a small API endpoint on the backend using start_http_server from the Prometheus Python client. That may just be simpler. It doesn't need to run behind Apache or nginx because only Prometheus will be using it.

@mprahl, note that I (we) also wanted to provide some REST API in the frontend to get the build.log of a failed module build (basically what we store in /tmp/build-$module_id-log.txt). Using some simple http server on the backend would allow us to implement this too :).

(That probably has more issues, because build-log can be on any backend, but we could maybe store the hostname of backend which built the module in DB...)

(Or we could proactively store the build-log in the frontend once the module is built)

rebased onto b313fceede83cecae0b646a08b71ee630343089b

2 months ago

@mprahl and @jkaluza Please see the most recent changes containing the standalone http server.

Anything else to do...?

1 new commit added

  • Fix tests
2 months ago

rebased onto 67ee78a7c1b637481168e19446c5e0e7e0c90994

2 months ago

I mirrored the changes from the waiverdb#274 comments to this PR. Please give it a final review here.

rebased onto f5c5fbd66609ed89d88aede5f3d07c7b00b6e205

2 months ago

I think I'm OK with this change. But before merging that, we need to know what to do with prometheus package in RHEL/EPEL. Once we merge this, we basically cannot release and deploy new version in RHEL without prometheus.

Let's not use the path variable in the route. Let Flask do the route matching; we shouldn't be doing this ourselves.

The idea was to hang the /monitor{/,...} route in Flask and then let the monitor module do its own "subrouting" here, e.g. /monitor/db-ping etc. Otherwise, from service to service, we would have to do it in different ways, depending on how the routing is implemented. Or we'd have to use app in the monitor module, which I want to avoid completely. It only brings problems with the app context.
I found the current solution simple/understandable. Is there any other way to let the monitor module be standalone and do its routing in a "mainstream"/clean way?

Could you use a Flask blueprint instead if you don't want to use app? That's what they're written for.

rebased onto f9f4aeb53cd0f4478b22a6ce1929f37e63d803c2

2 months ago

2 new commits added

  • fixup this
  • Basic monitoring w/-o MBS-specific metrics
2 months ago

@fivaldi could you please add python-prometheus_client to docker/Dockerfile-tests?

It should be available soon:
https://bodhi.fedoraproject.org/updates/FEDORA-EPEL-2019-0e4f158a14

1 new commit added

  • Add python-prometheus_client as it is estimated in epel7 soon
2 months ago

rebased onto 0d6a8c4238421dd20ff6d6d833b8f695a9fd635a

2 months ago

I verified the tests manually in a local docker, using the candidate python-prometheus_client-0.5.0-1.el7 (https://bodhi.fedoraproject.org/updates/FEDORA-EPEL-2019-0e4f158a14), the output follows:

=================================================================================== 460 passed, 32 warnings in 100.07 seconds ====================================================================================
____________________________________________________________________________________________________ summary _____________________________________________________________________________________________________
  flake8: commands succeeded
  py27: commands succeeded
  congratulations :)

pretty please pagure-ci rebuild

2 months ago

rebased onto 5ba2e91185ae55eaeb1f4ab68658312aa7e5f671

2 months ago

rebased onto 947d41534e04c8aba9ff00e63f521986ff03d933

2 months ago

pretty please pagure-ci rebuild

2 months ago

Pull-Request has been merged by mprahl

2 months ago