PR#274: Basic monitoring - waiverdb

waiverdb

#274 Basic monitoring

Merged 5 years ago by gnaponie. Opened 5 years ago by fivaldi.

fivaldi/waiverdb fivaldi_basic_monitoring into master

Filip Valder • 5 years ago

62e7388

requirements.txt

file modified

		`@@ -13,6 +13,7 @@`
		`mock`
		`stomp.py`
		`Flask-Migrate`
		`+ six`

		`# Documentation requirements`
		`sphinx`
		`@@ -26,3 +27,6 @@`

		`# Database`
		`psycopg2-binary`
		`+`
		`+ # Monitoring`
		`+ prometheus_client`

tests/conftest.py

file modified

		`@@ -5,6 +5,7 @@`
		`import pytest`
		`from sqlalchemy import create_engine`
		`from waiverdb.app import create_app`
		`+ from waiverdb.monitor import db_hook_event_listeners`


		`@pytest.fixture(scope='session')`
		`@@ -36,6 +37,7 @@`
		`connection.execute('DROP DATABASE IF EXISTS {}'.format(dbname))`
		`connection.execute('CREATE DATABASE {}'.format(dbname))`
		`db.create_all()`
		`+ db_hook_event_listeners()`
		`return db`

tests/test_monitor.py

file added

+39

		`@@ -0,0 +1,39 @@`
		`+ # SPDX-License-Identifier: GPL-2.0+`
		`+`
		`+ import os`
		`+ import pytest`
		`+ import requests`
		`+ import waiverdb.monitor`
		`+`
		`+ from six.moves import reload_module`
		`+`
		`+`
		`+ def test_metrics(client):`
		`+ r = client.get('/api/v1.0/metrics')`
		`+`
		`+ assert r.status_code == 200`
		`+ assert len([l for l in r.get_data(as_text=True).splitlines()`
		`+ if l.startswith('# TYPE messaging_') and`
		`+ l.endswith(' counter')]) == 4`
		`+ assert len([l for l in r.get_data(as_text=True).splitlines()`
		`+ if l.startswith('# TYPE db_') and`
		`+ l.endswith(' counter')]) == 4`
		`+`
		`+`
		`+ def test_standalone_metrics_server_disabled_by_default():`
		`+ with pytest.raises(requests.exceptions.ConnectionError):`
		`+ requests.get('http://127.0.0.1:10040/metrics')`
		`+`
		`+`
		`+ def test_standalone_metrics_server():`
		`+ os.environ['MONITOR_STANDALONE_METRICS_SERVER_ENABLE'] = 'true'`
		`+ reload_module(waiverdb.monitor)`
		`+`
		`+ r = requests.get('http://127.0.0.1:10040/metrics')`
		`+`
		`+ assert len([l for l in r.text.splitlines()`
		`+ if l.startswith('# TYPE messaging_') and`
		`+ l.endswith(' counter')]) == 4`
		`+ assert len([l for l in r.text.splitlines()`
		`+ if l.startswith('# TYPE db_') and`
		`+ l.endswith(' counter')]) == 4`

tox.ini

file modified

+4 -1

		`@@ -15,7 +15,10 @@`
		`find`
		`commands =`
		`find -name *.pyc -delete`
		`- py.test tests/`
		`+ py.test {posargs}`
		`+`
		`+ [pytest]`
		`+ testpaths = tests/`

		`[testenv:docs]`
		`changedir = docs`

waiverdb/api_v1.py

file modified

		`@@ -607,9 +607,16 @@`
		`return {'version': __version__, 'auth_method': current_app.config['AUTH_METHOD']}`


		`+ class MonitorResource(Resource):`
		`+ def get(self):`
		`+ from waiverdb.monitor import MonitorAPI`
		`+ return MonitorAPI().get()`
		`+`
		`+`
		`# set up the Api resource routing here`
		`api.add_resource(WaiversResource, '/waivers/')`
		`api.add_resource(WaiverResource, '/waivers/<int:waiver_id>')`
		`api.add_resource(FilteredWaiversResource, '/waivers/+filtered')`
		`api.add_resource(GetWaiversBySubjectsAndTestcases, '/waivers/+by-subjects-and-testcases')`
		`api.add_resource(AboutResource, '/about')`
		`+ api.add_resource(MonitorResource, '/metrics')`

waiverdb/app.py

file modified

		`@@ -20,6 +20,7 @@`
		`from waiverdb.utils import json_error`
		`from flask_oidc import OpenIDConnect`
		`from werkzeug.exceptions import default_exceptions`
		`+ from waiverdb.monitor import db_hook_event_listeners`


		`def load_config(app):`
		`@@ -108,6 +109,9 @@`

		`app.after_request(insert_headers)`

		`+ # initialize DB event listeners from the monitor module`
		`+ app.before_first_request(db_hook_event_listeners)`
		`+`
		`return app`

waiverdb/events.py

file modified

+33 -9

		`@@ -15,10 +15,12 @@`
		`import fedmsg`
		`import stomp`
		`import json`
		`+ import waiverdb.monitor as monitor`
		`+`
		`+ from flask import current_app`
		`from waiverdb.fields import waiver_fields`
		`from waiverdb.models import Waiver`
		`from waiverdb.utils import stomp_connection`
		`- from flask import current_app`

		`_log = logging.getLogger(__name__)`

		`@@ -61,23 +63,45 @@`
		`"""`
		`_log.debug('The publish_new_waiver SQLAlchemy event has been activated (%r)',`
		`current_app.config['MESSAGE_PUBLISHER'])`
		`+`
		`if current_app.config['MESSAGE_PUBLISHER'] == 'stomp':`
		`with stomp_connection() as conn:`
		`stomp_configs = current_app.config.get('STOMP_CONFIGS')`
		`for row in session.identity_map.values():`
		`- if isinstance(row, Waiver):`
		`- _log.debug('Publishing a message for %r', row)`
		`- msg = json.dumps(marshal(row, waiver_fields))`
		`- kwargs = dict(body=msg, headers={}, destination=stomp_configs['destination'])`
		`- if stomp.__version__[0] < 4:`
		`- kwargs['message'] = kwargs.pop('body') # On EL7, different sig.`
		`+ monitor.messaging_tx_to_send_counter.inc()`
		`+ if not isinstance(row, Waiver):`
		`+ continue`
		`+ _log.debug('Publishing a message for %r', row)`
		`+ msg = json.dumps(marshal(row, waiver_fields))`
		`+ kwargs = dict(body=msg, headers={}, destination=stomp_configs['destination'])`
		`+ if stomp.__version__[0] < 4:`
		`+ kwargs['message'] = kwargs.pop('body') # On EL7, different sig.`
		`+ try:`
		`conn.send(**kwargs)`
		`+ monitor.messaging_tx_sent_ok_counter.inc()`
		`+ except Exception:`
		`+ _log.exception('Couldn\'t publish message via stomp')`
		`+ monitor.messaging_tx_failed_counter.inc()`
		`+ raise`
		`+`
		`elif current_app.config['MESSAGE_PUBLISHER'] == 'fedmsg':`
		`for row in session.identity_map.values():`
		`- if isinstance(row, Waiver):`
		`- _log.debug('Publishing a message for %r', row)`
		`+ monitor.messaging_tx_to_send_counter.inc()`
		`+ if not isinstance(row, Waiver):`
		`+ continue`
		`+ _log.debug('Publishing a message for %r', row)`
		`+ try:`
		`fedmsg.publish(topic='waiver.new', msg=marshal(row, waiver_fields))`
		`+ monitor.messaging_tx_sent_ok_counter.inc()`
		`+ except Exception:`
		`+ _log.exception('Couldn\'t publish message via fedmsg')`
		`+ monitor.messaging_tx_failed_counter.inc()`
		`+ raise`
		`+`
		`elif current_app.config['MESSAGE_PUBLISHER'] is None:`
		`_log.info('No message published. MESSAGE_PUBLISHER disabled.')`
		`+ monitor.messaging_tx_stopped_counter.inc()`
		`+`
		`else:`
		`_log.warning('Unhandled MESSAGE_PUBLISHER %r', current_app.config['MESSAGE_PUBLISHER'])`
		`+ monitor.messaging_tx_failed_counter.inc()`

waiverdb/monitor.py

file added

+97

		`@@ -0,0 +1,97 @@`
		`+ # SPDX-License-Identifier: GPL-2.0+`
		`+`
		`+ # For an up-to-date version of this module, see:`
		`+ # https://pagure.io/monitor-flask-sqlalchemy`
		`+`
		`+ # pylint: disable=W,unexpected-keyword-arg,no-value-for-parameter`
		`+`
		`+ import os`
		`+ import tempfile`
		`+`
		`+ from flask import Response`
		`+ from flask.views import MethodView`
		`+ from prometheus_client import ( # noqa: F401`
		`+ ProcessCollector, CollectorRegistry, Counter, multiprocess,`
		`+ Histogram, generate_latest, start_http_server, CONTENT_TYPE_LATEST)`
		`+ from sqlalchemy import event`
		`+`
		`+ # Service-specific imports`
		`+`
		`+`
		`+ if not os.environ.get('prometheus_multiproc_dir'):`
		`+ os.environ.setdefault('prometheus_multiproc_dir', tempfile.mkdtemp())`
		`+ registry = CollectorRegistry()`
		`+ ProcessCollector(registry=registry)`
		`+ multiprocess.MultiProcessCollector(registry)`
		`+ if os.getenv('MONITOR_STANDALONE_METRICS_SERVER_ENABLE', 'false') == 'true':`
		`+ port = os.getenv('MONITOR_STANDALONE_METRICS_SERVER_PORT', '10040')`
		`+ start_http_server(int(port), registry=registry)`
		`+`
		`+`
		`+ # Generic metrics`
		`+ messaging_tx_to_send_counter = Counter(`
		`+ 'messaging_tx_to_send',`
		`+ 'Total number of messages to send',`
		`+ registry=registry)`
		`+ messaging_tx_stopped_counter = Counter(`
		`+ 'messaging_tx_stopped',`
		`+ 'Number of messages, which were eventually stopped before sending',`
		`+ registry=registry)`
		`+ messaging_tx_sent_ok_counter = Counter(`
		`+ 'messaging_tx_sent_ok',`
		`+ 'Number of messages, which were sent successfully',`
		`+ registry=registry)`
		`+ messaging_tx_failed_counter = Counter(`
		`+ 'messaging_tx_failed',`
		`+ 'Number of messages, for which the sender failed',`
		`+ registry=registry)`
		`+`
		`+ db_dbapi_error_counter = Counter(`
		`+ 'db_dbapi_error',`
		`+ 'Number of DBAPI errors',`
		`+ registry=registry)`
		`+ db_engine_connect_counter = Counter(`
		`+ 'db_engine_connect',`
		`+ 'Number of \'engine_connect\' events',`
		`+ registry=registry)`
		`+ db_handle_error_counter = Counter(`
		`+ 'db_handle_error',`
		`+ 'Number of exceptions during connection',`
		`+ registry=registry)`
		`+ db_transaction_rollback_counter = Counter(`
		`+ 'db_transaction_rollback',`
		`+ 'Number of transactions, which were rolled back',`
		`+ registry=registry)`
		`+`
		`+ # Service-specific metrics`
		`+ # XXX: TODO`
		`+`
		`+`
		`+ def db_hook_event_listeners(target=None):`
		`+ # Service-specific import of db`
		`+ from waiverdb.models import db`
		`+`
		`+ if not target:`
		`+ target = db.engine`
		`+`
		`+ @event.listens_for(target, 'dbapi_error', named=True)`
		`+ def receive_dbapi_error(**kw):`
		`+ db_dbapi_error_counter.inc()`
		`+`
		`+ @event.listens_for(target, 'engine_connect')`
		`+ def receive_engine_connect(conn, branch):`
		`+ db_engine_connect_counter.inc()`
		`+`
		`+ @event.listens_for(target, 'handle_error')`
		`+ def receive_handle_error(exception_context):`
		`+ db_handle_error_counter.inc()`
		`+`
		`+ @event.listens_for(target, 'rollback')`
		`+ def receive_rollback(conn):`
		`+ db_transaction_rollback_counter.inc()`
		`+`
		`+`
		`+ class MonitorAPI(MethodView):`
		`+ def get(self):`
		`+ return Response(generate_latest(registry),`
		`+ content_type=CONTENT_TYPE_LATEST)`

fivaldi commented 5 years ago

This adds monitor module providing Prometheus export for the common set of metrics (as of now).

rebased onto 76bbe41a72ed4d0d8684ab9561a5fd7213122a40

5 years ago

rebased onto 56871d8edb2419dc0759de9610a3662ed9d6b45f

5 years ago

gnaponie commented on line 17 of waiverdb/api_v1.py 5 years ago

can we make this consistent with Greenwave? Usually WaiverDB and Greenwave have the same paths since are "twins" :) Greenwave exposes the metrics at /metrics

gnaponie commented on line 7 of waiverdb/events.py 5 years ago

Why is this import here and not at the begging of the file?

gnaponie commented on line 33 of waiverdb/events.py 5 years ago

What "messaging_sent_failed_counter" stands for? If it is the number of messages not sent I wouldn't call it "messaging sent", since it wasn't actually sent.

gnaponie commented on line 21 of waiverdb/events.py 5 years ago

Why do you have this counter if you already have the "messaging_sent_passed_counter"?

gnaponie commented on line 23 of waiverdb/events.py 5 years ago

You are increasing "messaging_sent_counter" before this, if it will hit the "continue" it won't publish a message and it wouldn't make sense to increase "messaging_sent_counter"

gnaponie commented on line 15 of waiverdb/events.py 5 years ago

why changing this logic?

gnaponie commented on line 34 of waiverdb/events.py 5 years ago

I'm not sure we want to actually raise an exception if an error occurs, if we don't raise an error it will just continue with the next item of the loop. It would be ok to log it.

gnaponie commented on line 25 of waiverdb/monitor.py 5 years ago

where is this used?

rebased onto 16f9e456e0d5f4b421a4a09452fd658f81714904

5 years ago

fivaldi commented on line 17 of waiverdb/api_v1.py 5 years ago

I can do it. I was thinking forward on the /monitor/ endpoint as a "home" for other things like healthchecks etc., as proposed in "Factory 2.0 Monitoring docs". If we do it the other way round, I'd prepare a PR for changing it in Greenwave.

fivaldi commented 5 years ago

What "messaging_sent_failed_counter" stands for? If it is the number of messages not sent I wouldn't call it "messaging sent", since it wasn't actually sent.

Why do you have this counter if you already have the "messaging_sent_passed_counter"?

You are increasing "messaging_sent_counter" before this, if it will hit the "continue" it won't publish a message and it wouldn't make sense to increase "messaging_sent_counter"

Sorry, the naming was already fixed in the monitor module, but not here.. -> Fixed now.

fivaldi commented 5 years ago

where is this used?

This is a generic set of metrics, not used on waiverdb.

fivaldi commented 5 years ago

why changing this logic?

There shouldn't be changes in logic. Just changes to de-indent:
if...else -> if not

As I added one more level, it started to be unreadable.

Edited 5 years ago by fivaldi

fivaldi commented 5 years ago

I'm not sure we want to actually raise an exception if an error occurs, if we don't raise an error it will just continue with the next item of the loop. It would be ok to log it.

There shouldn't be any difference. If the sender(s) raised an exception before, this just re-raises it... The only difference is that it gets caught and counters are incremented...

gnaponie commented on line 17 of waiverdb/api_v1.py 5 years ago

It is fine if we use this approach, but yeah, I would like it to be the same for both WaiverDB and Greenwave. We just have to make sure no one is already using /metrics in Greenwave (I don't think so).

gnaponie commented 5 years ago

where is this used?

This is a generic set of metrics, not used on waiverdb.

So is it useful to keep it if we don't use it?

fivaldi commented 5 years ago

where is this used?
This is a generic set of metrics, not used on waiverdb.

So is it useful to keep it if we don't use it?

It'd more difficult to maintain various versions of the monitor module for each service if the generic part would be different. Most of other services publish/consume messages. But WaiverDB just publishes (from what I know). I am open to ideas of keeping it slim but also maintainable.

fivaldi commented 5 years ago

It is fine if we use this approach, but yeah, I would like it to be the same for both WaiverDB and Greenwave. We just have to make sure no one is already using /metrics in Greenwave (I don't think so).

I think that internal deployment would need to make slight modifications to annotations in the templates, and that should be all.

rebased onto e0054e8008f31b6c34c53354478d6fa98b70eceb

5 years ago

rebased onto 1f6c2aea3220950c000ef3893a712c52cd2156d0

5 years ago

rebased onto 8c534bb85808701287f8230632db6ea75444e259

5 years ago

gnaponie commented 5 years ago

It'd more difficult to maintain various versions of the monitor module for each service if the generic part would be different. Most of other services publish/consume messages. But WaiverDB just publishes (from what I know). I am open to ideas of keeping it slim but also maintainable.

This is causing also failings on our CIs... https://jenkins-waiverdb-test.cloud.paas.upshift.redhat.com/job/waiverdb-test/job/waiverdb-test-waiverdb-dev/162/console
I understand your point, but wiaverdb is still a independent service and I'm not really happy about having stuff that we don't need in the code.

mprahl commented on line 43 of waiverdb/events.py 5 years ago

Optional: You could just use raise instead of needing the e variable

mprahl commented on line 112 of waiverdb/monitor.py 5 years ago

Why not just use the path /monitor/metrics that points directly to this view? I don't see any benefit from making path a variable to parse. It just adds complexity in my opinion.

rebased onto bc5a25cc1c0ac1de04a66fb46858dd5bfc22cafc

5 years ago

fivaldi commented 5 years ago

@mprahl @gnaponie I shrank the code to fit just the waiverdb needs. The etalon can be found in a separate repo, see monitor.py's header.

rebased onto 96aa03b62d5ff6d2bfb1f075ea4521fed7d33b3d

5 years ago

gnaponie commented 5 years ago

The CI is still failing... these errors:
08:43:09 [Invoke Pylint] E: 11, 0: Unable to import 'prometheus_client' (import-error)
08:43:09 [Invoke Pylint] W: 76, 0: Unused argument 'kw' (unused-argument)
08:43:09 [Invoke Pylint] W: 80,31: Unused argument 'conn' (unused-argument)
08:43:09 [Invoke Pylint] W: 80,37: Unused argument 'branch' (unused-argument)
08:43:09 [Invoke Pylint] W: 84,29: Unused argument 'exception_context' (unused-argument)
08:43:09 [Invoke Pylint] W: 88,25: Unused argument 'conn' (unused-argument)
08:43:09 [Invoke Pylint] W: 76, 4: Unused variable 'receive_dbapi_error' (unused-variable)
08:43:09 [Invoke Pylint] W: 80, 4: Unused variable 'receive_engine_connect' (unused-variable)
08:43:09 [Invoke Pylint] W: 84, 4: Unused variable 'receive_handle_error' (unused-variable)
08:43:09 [Invoke Pylint] W: 88, 4: Unused variable 'receive_rollback' (unused-variable)
08:43:09 [Invoke Pylint] W: 11, 0: Unused Histogram imported from prometheus_client (unused-import)

fivaldi commented 5 years ago

See PR#275 which block this PR on the CI. Then we can continue here...

mprahl commented on line 95 of waiverdb/monitor.py 5 years ago

You should add the keyword argument content_type=prometheus_client.CONTENT_TYPE_LATEST since this will set the content type of plain text with the metrics output version that python-prometheus_client is using.

fivaldi commented 5 years ago

You should add the keyword argument content_type=prometheus_client.CONTENT_TYPE_LATEST since this will set the content type of plain text with the metrics output version that python-prometheus_client is using.

Thanks for pointing this out. I have to do it F2.0-projects-wide. :)

rebased onto 391b5f0ea0ab052175cca632300e093047eb7ba1

5 years ago

rebased onto dbec3f98f5f38f15acc40421b25292f9e5493d77

5 years ago

1 new commit added

[WIP] Silence pylint-3

5 years ago

1 new commit added

Fix tests

5 years ago

1 new commit added

Debug failing tests in CI

5 years ago

1 new commit added

Just check our (F2.0) counters in the tests

5 years ago

gnaponie commented 5 years ago

The CI failed again, it looks like an error in Jenkins itself. Could you re-trigger the job?

rebased onto 4a731ea1b3e966b7b46b496a1aab8310bc1eb9ab

5 years ago

rebased onto 8e51a0534e317ca1896d3fd257cadba5a40822d5

5 years ago

fivaldi commented 5 years ago

@gnaponie Please for final review here. All comments have been addressed.

gnaponie commented on line 7 of waiverdb/api_v1.py 5 years ago

Why this is not imported at the beginning of the life?

gnaponie commented on line 5 of waiverdb/events.py 5 years ago

From PEP8 doc:
Imports should be grouped in the following order:

Standard library imports.
Related third party imports.
Local application/library specific imports.
You should put a blank line between each group of imports.

Why is that before "from flask import current_app"?
Is there some issue with circular imports?

fivaldi commented on line 7 of waiverdb/api_v1.py 5 years ago

Just to avoid cyclic imports. But the noqa seems to be some relic from previous revisions.

rebased onto 85cc21669c138fb09590f8a89b1f5eff60f44acb

5 years ago

gnaponie commented on line 43 of waiverdb/events.py 5 years ago

Can we add the log of the error here?

fivaldi commented on line 43 of waiverdb/events.py 5 years ago

Logging is not related to this PR, but I'll add some _log.exception there.

gnaponie commented 5 years ago

Beside the minor comments it looks good +1

1 new commit added