#274 Basic monitoring
Merged 10 months ago by gnaponie. Opened 11 months ago by fivaldi.
fivaldi/waiverdb fivaldi_basic_monitoring  into  master

file modified
+4

@@ -13,6 +13,7 @@ 

  mock

  stomp.py

  Flask-Migrate

+ six

  

  # Documentation requirements

  sphinx

@@ -26,3 +27,6 @@ 

  

  # Database

  psycopg2-binary

+ 

+ # Monitoring

+ prometheus_client

file modified
+2

@@ -5,6 +5,7 @@ 

  import pytest

  from sqlalchemy import create_engine

  from waiverdb.app import create_app

+ from waiverdb.monitor import db_hook_event_listeners

  

  

  @pytest.fixture(scope='session')

@@ -36,6 +37,7 @@ 

          connection.execute('DROP DATABASE IF EXISTS {}'.format(dbname))

          connection.execute('CREATE DATABASE {}'.format(dbname))

      db.create_all()

+     db_hook_event_listeners()

      return db

  

  

@@ -0,0 +1,39 @@ 

+ # SPDX-License-Identifier: GPL-2.0+

+ 

+ import os

+ import pytest

+ import requests

+ import waiverdb.monitor

+ 

+ from six.moves import reload_module

+ 

+ 

+ def test_metrics(client):

+     r = client.get('/api/v1.0/metrics')

+ 

+     assert r.status_code == 200

+     assert len([l for l in r.get_data(as_text=True).splitlines()

+                 if l.startswith('# TYPE messaging_') and

+                 l.endswith(' counter')]) == 4

+     assert len([l for l in r.get_data(as_text=True).splitlines()

+                 if l.startswith('# TYPE db_') and

+                 l.endswith(' counter')]) == 4

+ 

+ 

+ def test_standalone_metrics_server_disabled_by_default():

+     with pytest.raises(requests.exceptions.ConnectionError):

+         requests.get('http://127.0.0.1:10040/metrics')

+ 

+ 

+ def test_standalone_metrics_server():

+     os.environ['MONITOR_STANDALONE_METRICS_SERVER_ENABLE'] = 'true'

+     reload_module(waiverdb.monitor)

+ 

+     r = requests.get('http://127.0.0.1:10040/metrics')

+ 

+     assert len([l for l in r.text.splitlines()

+                 if l.startswith('# TYPE messaging_') and

+                 l.endswith(' counter')]) == 4

+     assert len([l for l in r.text.splitlines()

+                 if l.startswith('# TYPE db_') and

+                 l.endswith(' counter')]) == 4

file modified
+4 -1

@@ -15,7 +15,10 @@ 

      find

  commands =

      find -name *.pyc -delete

-     py.test tests/

+     py.test {posargs}

+ 

+ [pytest]

+ testpaths = tests/

  

  [testenv:docs]

  changedir = docs

file modified
+7

@@ -607,9 +607,16 @@ 

          return {'version': __version__, 'auth_method': current_app.config['AUTH_METHOD']}

  

  

+ class MonitorResource(Resource):

+     def get(self):

+         from waiverdb.monitor import MonitorAPI

+         return MonitorAPI().get()

+ 

+ 

  # set up the Api resource routing here

  api.add_resource(WaiversResource, '/waivers/')

  api.add_resource(WaiverResource, '/waivers/<int:waiver_id>')

  api.add_resource(FilteredWaiversResource, '/waivers/+filtered')

  api.add_resource(GetWaiversBySubjectsAndTestcases, '/waivers/+by-subjects-and-testcases')

  api.add_resource(AboutResource, '/about')

+ api.add_resource(MonitorResource, '/metrics')

file modified
+4

@@ -20,6 +20,7 @@ 

  from waiverdb.utils import json_error

  from flask_oidc import OpenIDConnect

  from werkzeug.exceptions import default_exceptions

+ from waiverdb.monitor import db_hook_event_listeners

  

  

  def load_config(app):

@@ -108,6 +109,9 @@ 

  

      app.after_request(insert_headers)

  

+     # initialize DB event listeners from the monitor module

+     app.before_first_request(db_hook_event_listeners)

+ 

      return app

  

  

file modified
+33 -9

@@ -15,10 +15,12 @@ 

  import fedmsg

  import stomp

  import json

+ import waiverdb.monitor as monitor

+ 

+ from flask import current_app

  from waiverdb.fields import waiver_fields

  from waiverdb.models import Waiver

  from waiverdb.utils import stomp_connection

- from flask import current_app

  

  _log = logging.getLogger(__name__)

  

@@ -61,23 +63,45 @@ 

      """

      _log.debug('The publish_new_waiver SQLAlchemy event has been activated (%r)',

                 current_app.config['MESSAGE_PUBLISHER'])

+ 

      if current_app.config['MESSAGE_PUBLISHER'] == 'stomp':

          with stomp_connection() as conn:

              stomp_configs = current_app.config.get('STOMP_CONFIGS')

              for row in session.identity_map.values():

-                 if isinstance(row, Waiver):

-                     _log.debug('Publishing a message for %r', row)

-                     msg = json.dumps(marshal(row, waiver_fields))

-                     kwargs = dict(body=msg, headers={}, destination=stomp_configs['destination'])

-                     if stomp.__version__[0] < 4:

-                         kwargs['message'] = kwargs.pop('body')  # On EL7, different sig.

+                 monitor.messaging_tx_to_send_counter.inc()

+                 if not isinstance(row, Waiver):

+                     continue

+                 _log.debug('Publishing a message for %r', row)

+                 msg = json.dumps(marshal(row, waiver_fields))

+                 kwargs = dict(body=msg, headers={}, destination=stomp_configs['destination'])

+                 if stomp.__version__[0] < 4:

+                     kwargs['message'] = kwargs.pop('body')  # On EL7, different sig.

+                 try:

                      conn.send(**kwargs)

+                     monitor.messaging_tx_sent_ok_counter.inc()

+                 except Exception:

+                     _log.exception('Couldn\'t publish message via stomp')

+                     monitor.messaging_tx_failed_counter.inc()

+                     raise

+ 

      elif current_app.config['MESSAGE_PUBLISHER'] == 'fedmsg':

          for row in session.identity_map.values():

-             if isinstance(row, Waiver):

-                 _log.debug('Publishing a message for %r', row)

+             monitor.messaging_tx_to_send_counter.inc()

+             if not isinstance(row, Waiver):

+                 continue

+             _log.debug('Publishing a message for %r', row)

+             try:

                  fedmsg.publish(topic='waiver.new', msg=marshal(row, waiver_fields))

+                 monitor.messaging_tx_sent_ok_counter.inc()

+             except Exception:

+                 _log.exception('Couldn\'t publish message via fedmsg')

+                 monitor.messaging_tx_failed_counter.inc()

+                 raise

+ 

      elif current_app.config['MESSAGE_PUBLISHER'] is None:

          _log.info('No message published.  MESSAGE_PUBLISHER disabled.')

+         monitor.messaging_tx_stopped_counter.inc()

+ 

      else:

          _log.warning('Unhandled MESSAGE_PUBLISHER %r', current_app.config['MESSAGE_PUBLISHER'])

+         monitor.messaging_tx_failed_counter.inc()

file added
+97

@@ -0,0 +1,97 @@ 

+ # SPDX-License-Identifier: GPL-2.0+

+ 

+ # For an up-to-date version of this module, see:

+ #   https://pagure.io/monitor-flask-sqlalchemy

+ 

+ # pylint: disable=W,unexpected-keyword-arg,no-value-for-parameter

+ 

+ import os

+ import tempfile

+ 

+ from flask import Response

+ from flask.views import MethodView

+ from prometheus_client import (  # noqa: F401

+     ProcessCollector, CollectorRegistry, Counter, multiprocess,

+     Histogram, generate_latest, start_http_server, CONTENT_TYPE_LATEST)

+ from sqlalchemy import event

+ 

+ # Service-specific imports

+ 

+ 

+ if not os.environ.get('prometheus_multiproc_dir'):

+     os.environ.setdefault('prometheus_multiproc_dir', tempfile.mkdtemp())

+ registry = CollectorRegistry()

+ ProcessCollector(registry=registry)

+ multiprocess.MultiProcessCollector(registry)

+ if os.getenv('MONITOR_STANDALONE_METRICS_SERVER_ENABLE', 'false') == 'true':

+     port = os.getenv('MONITOR_STANDALONE_METRICS_SERVER_PORT', '10040')

+     start_http_server(int(port), registry=registry)

+ 

+ 

+ # Generic metrics

+ messaging_tx_to_send_counter = Counter(

+     'messaging_tx_to_send',

+     'Total number of messages to send',

+     registry=registry)

+ messaging_tx_stopped_counter = Counter(

+     'messaging_tx_stopped',

+     'Number of messages, which were eventually stopped before sending',

+     registry=registry)

+ messaging_tx_sent_ok_counter = Counter(

+     'messaging_tx_sent_ok',

+     'Number of messages, which were sent successfully',

+     registry=registry)

+ messaging_tx_failed_counter = Counter(

+     'messaging_tx_failed',

+     'Number of messages, for which the sender failed',

+     registry=registry)

+ 

+ db_dbapi_error_counter = Counter(

+     'db_dbapi_error',

+     'Number of DBAPI errors',

+     registry=registry)

+ db_engine_connect_counter = Counter(

+     'db_engine_connect',

+     'Number of \'engine_connect\' events',

+     registry=registry)

+ db_handle_error_counter = Counter(

+     'db_handle_error',

+     'Number of exceptions during connection',

+     registry=registry)

+ db_transaction_rollback_counter = Counter(

+     'db_transaction_rollback',

+     'Number of transactions, which were rolled back',

+     registry=registry)

+ 

+ # Service-specific metrics

+ # XXX: TODO

+ 

+ 

+ def db_hook_event_listeners(target=None):

+     # Service-specific import of db

+     from waiverdb.models import db

+ 

+     if not target:

+         target = db.engine

+ 

+     @event.listens_for(target, 'dbapi_error', named=True)

+     def receive_dbapi_error(**kw):

+         db_dbapi_error_counter.inc()

+ 

+     @event.listens_for(target, 'engine_connect')

+     def receive_engine_connect(conn, branch):

+         db_engine_connect_counter.inc()

+ 

+     @event.listens_for(target, 'handle_error')

+     def receive_handle_error(exception_context):

+         db_handle_error_counter.inc()

+ 

+     @event.listens_for(target, 'rollback')

+     def receive_rollback(conn):

+         db_transaction_rollback_counter.inc()

+ 

+ 

+ class MonitorAPI(MethodView):

+     def get(self):

+         return Response(generate_latest(registry),

+                         content_type=CONTENT_TYPE_LATEST)

This adds monitor module providing Prometheus export for the common set of metrics (as of now).

rebased onto 76bbe41a72ed4d0d8684ab9561a5fd7213122a40

11 months ago

rebased onto 56871d8edb2419dc0759de9610a3662ed9d6b45f

11 months ago

can we make this consistent with Greenwave? Usually WaiverDB and Greenwave have the same paths since are "twins" :) Greenwave exposes the metrics at /metrics

Why is this import here and not at the begging of the file?

What "messaging_sent_failed_counter" stands for? If it is the number of messages not sent I wouldn't call it "messaging sent", since it wasn't actually sent.

Why do you have this counter if you already have the "messaging_sent_passed_counter"?

You are increasing "messaging_sent_counter" before this, if it will hit the "continue" it won't publish a message and it wouldn't make sense to increase "messaging_sent_counter"

why changing this logic?

I'm not sure we want to actually raise an exception if an error occurs, if we don't raise an error it will just continue with the next item of the loop. It would be ok to log it.

rebased onto 16f9e456e0d5f4b421a4a09452fd658f81714904

11 months ago

I can do it. I was thinking forward on the /monitor/ endpoint as a "home" for other things like healthchecks etc., as proposed in "Factory 2.0 Monitoring docs". If we do it the other way round, I'd prepare a PR for changing it in Greenwave.

What "messaging_sent_failed_counter" stands for? If it is the number of messages not sent I wouldn't call it "messaging sent", since it wasn't actually sent.

Why do you have this counter if you already have the "messaging_sent_passed_counter"?

You are increasing "messaging_sent_counter" before this, if it will hit the "continue" it won't publish a message and it wouldn't make sense to increase "messaging_sent_counter"

Sorry, the naming was already fixed in the monitor module, but not here.. -> Fixed now.

where is this used?

This is a generic set of metrics, not used on waiverdb.

why changing this logic?

There shouldn't be changes in logic. Just changes to de-indent:
if...else -> if not

As I added one more level, it started to be unreadable.

I'm not sure we want to actually raise an exception if an error occurs, if we don't raise an error it will just continue with the next item of the loop. It would be ok to log it.

There shouldn't be any difference. If the sender(s) raised an exception before, this just re-raises it... The only difference is that it gets caught and counters are incremented...

It is fine if we use this approach, but yeah, I would like it to be the same for both WaiverDB and Greenwave. We just have to make sure no one is already using /metrics in Greenwave (I don't think so).

where is this used?

This is a generic set of metrics, not used on waiverdb.

So is it useful to keep it if we don't use it?

where is this used?
This is a generic set of metrics, not used on waiverdb.

So is it useful to keep it if we don't use it?

It'd more difficult to maintain various versions of the monitor module for each service if the generic part would be different. Most of other services publish/consume messages. But WaiverDB just publishes (from what I know). I am open to ideas of keeping it slim but also maintainable.

It is fine if we use this approach, but yeah, I would like it to be the same for both WaiverDB and Greenwave. We just have to make sure no one is already using /metrics in Greenwave (I don't think so).

I think that internal deployment would need to make slight modifications to annotations in the templates, and that should be all.

rebased onto e0054e8008f31b6c34c53354478d6fa98b70eceb

11 months ago

rebased onto 1f6c2aea3220950c000ef3893a712c52cd2156d0

11 months ago

rebased onto 8c534bb85808701287f8230632db6ea75444e259

11 months ago

It'd more difficult to maintain various versions of the monitor module for each service if the generic part would be different. Most of other services publish/consume messages. But WaiverDB just publishes (from what I know). I am open to ideas of keeping it slim but also maintainable.

This is causing also failings on our CIs... https://jenkins-waiverdb-test.cloud.paas.upshift.redhat.com/job/waiverdb-test/job/waiverdb-test-waiverdb-dev/162/console
I understand your point, but wiaverdb is still a independent service and I'm not really happy about having stuff that we don't need in the code.

Optional: You could just use raise instead of needing the e variable

Why not just use the path /monitor/metrics that points directly to this view? I don't see any benefit from making path a variable to parse. It just adds complexity in my opinion.

rebased onto bc5a25cc1c0ac1de04a66fb46858dd5bfc22cafc

10 months ago

@mprahl @gnaponie I shrank the code to fit just the waiverdb needs. The etalon can be found in a separate repo, see monitor.py's header.

rebased onto 96aa03b62d5ff6d2bfb1f075ea4521fed7d33b3d

10 months ago

The CI is still failing... these errors:
08:43:09 [Invoke Pylint] E: 11, 0: Unable to import 'prometheus_client' (import-error)
08:43:09 [Invoke Pylint] W: 76, 0: Unused argument 'kw' (unused-argument)
08:43:09 [Invoke Pylint] W: 80,31: Unused argument 'conn' (unused-argument)
08:43:09 [Invoke Pylint] W: 80,37: Unused argument 'branch' (unused-argument)
08:43:09 [Invoke Pylint] W: 84,29: Unused argument 'exception_context' (unused-argument)
08:43:09 [Invoke Pylint] W: 88,25: Unused argument 'conn' (unused-argument)
08:43:09 [Invoke Pylint] W: 76, 4: Unused variable 'receive_dbapi_error' (unused-variable)
08:43:09 [Invoke Pylint] W: 80, 4: Unused variable 'receive_engine_connect' (unused-variable)
08:43:09 [Invoke Pylint] W: 84, 4: Unused variable 'receive_handle_error' (unused-variable)
08:43:09 [Invoke Pylint] W: 88, 4: Unused variable 'receive_rollback' (unused-variable)
08:43:09 [Invoke Pylint] W: 11, 0: Unused Histogram imported from prometheus_client (unused-import)

See PR#275 which block this PR on the CI. Then we can continue here...

You should add the keyword argument content_type=prometheus_client.CONTENT_TYPE_LATEST since this will set the content type of plain text with the metrics output version that python-prometheus_client is using.

You should add the keyword argument content_type=prometheus_client.CONTENT_TYPE_LATEST since this will set the content type of plain text with the metrics output version that python-prometheus_client is using.

Thanks for pointing this out. I have to do it F2.0-projects-wide. :)

rebased onto 391b5f0ea0ab052175cca632300e093047eb7ba1

10 months ago

rebased onto dbec3f98f5f38f15acc40421b25292f9e5493d77

10 months ago

1 new commit added

  • [WIP] Silence pylint-3
10 months ago

1 new commit added

  • Fix tests
10 months ago

1 new commit added

  • Debug failing tests in CI
10 months ago

1 new commit added

  • Just check our (F2.0) counters in the tests
10 months ago

The CI failed again, it looks like an error in Jenkins itself. Could you re-trigger the job?

rebased onto 4a731ea1b3e966b7b46b496a1aab8310bc1eb9ab

10 months ago

rebased onto 8e51a0534e317ca1896d3fd257cadba5a40822d5

10 months ago

@gnaponie Please for final review here. All comments have been addressed.

Why this is not imported at the beginning of the life?

From PEP8 doc:
Imports should be grouped in the following order:

Standard library imports.
Related third party imports.
Local application/library specific imports.
You should put a blank line between each group of imports.

Why is that before "from flask import current_app"?
Is there some issue with circular imports?

Just to avoid cyclic imports. But the noqa seems to be some relic from previous revisions.

rebased onto 85cc21669c138fb09590f8a89b1f5eff60f44acb

10 months ago

Can we add the log of the error here?

Logging is not related to this PR, but I'll add some _log.exception there.

Beside the minor comments it looks good +1

1 new commit added

  • fixup this
10 months ago

rebased onto 62e7388

10 months ago

@gnaponie Build 181 Finished: SUCCESS for 62e7388.

Commit 5265247 fixes this pull-request

Pull-Request has been merged by gnaponie

10 months ago

Pull-Request has been merged by gnaponie

10 months ago