From 3d8a450f5eddb0fec4194119cfcbe389557b88a9 Mon Sep 17 00:00:00 2001
From: Rick Elrod
Date: Feb 28 2020 01:20:38 +0000
Subject: nagios: Add script and check for checking that a timestamp within a file is within a delta of now, and then use this for alerting when websites stop building


Signed-off-by: Rick Elrod

---

diff --git a/roles/nagios_client/files/scripts/check_timestamp_from_file b/roles/nagios_client/files/scripts/check_timestamp_from_file
new file mode 100644
index 0000000..9064337
--- /dev/null
+++ b/roles/nagios_client/files/scripts/check_timestamp_from_file
@@ -0,0 +1,85 @@
+#!/usr/bin/env python
+
+# Takes a path to a file and a delta. The file must simply contain an epoch
+# timestamp. It can be an integer or a float, as can the delta.
+#
+# Alerts critical if (now - timestamp contained in file) > delta.
+#
+# Exit codes follow the Nagios plugin convention: 0 = OK, 2 = CRITICAL,
+# 3 = UNKNOWN (bad arguments, or a missing/unreadable/malformed file).
+#
+# Rick Elrod
+# MIT
+
+import math
+import sys
+import time
+
+OK = 0
+CRITICAL = 2
+UNKNOWN = 3
+
+
+def read_timestamp(path):
+    """Return the epoch timestamp stored in the file at path, as a float.
+
+    Raises IOError/OSError if the file cannot be read, and ValueError if its
+    contents are not a finite number.
+    """
+    with open(path, 'r') as f:
+        value = float(f.read().strip())
+    # float() accepts 'nan' and 'inf'. Neither is a usable timestamp, and
+    # NaN or +inf would make the age comparison in main() report a false OK.
+    if math.isnan(value) or math.isinf(value):
+        raise ValueError('timestamp is not a finite number')
+    return value
+
+
+def main(argv):
+    """Run the check for argv: [program name, file path, delta].
+
+    Prints one Nagios status line and returns the matching exit code.
+    """
+    if len(argv) != 3:
+        print('UNKNOWN: Pass path to file and delta as parameters')
+        return UNKNOWN
+
+    filename = argv[1]
+
+    # A malformed delta must report UNKNOWN. Left uncaught, the ValueError
+    # would end the script with exit status 1, which Nagios reads as WARNING.
+    try:
+        delta = float(argv[2])
+        if math.isnan(delta):
+            raise ValueError('delta is NaN')
+    except ValueError:
+        print('UNKNOWN: Delta must be a number, got %r' % (argv[2],))
+        return UNKNOWN
+
+    try:
+        timestamp = read_timestamp(filename)
+    except (IOError, OSError) as e:
+        print('UNKNOWN: Unable to open/read file path (%s)' % (e,))
+        return UNKNOWN
+    except ValueError as e:
+        print('UNKNOWN: File does not contain a valid timestamp (%s)' % (e,))
+        return UNKNOWN
+
+    difference = round(time.time() - timestamp, 2)
+    if difference > delta:
+        print(
+            'CRITICAL: Timestamp in file (%.2f) exceeds delta (%.2f) by %.2f seconds' % (
+                timestamp,
+                delta,
+                difference - delta))
+        return CRITICAL
+
+    print('OK: Timestamp in file (%.2f) is within delta (%.2f) of now, by %.2f seconds' % (
+        timestamp,
+        delta,
+        abs(difference - delta)))
+    return OK
+
+
+if __name__ == '__main__':
+    sys.exit(main(sys.argv))
diff --git a/roles/nagios_client/tasks/main.yml b/roles/nagios_client/tasks/main.yml index 2e5e0df..8e71a3b 100644 --- a/roles/nagios_client/tasks/main.yml +++ b/roles/nagios_client/tasks/main.yml @@ -47,6 +47,7 @@ - check_osbs_api.py - check_ipa_replication - check_redis_queue.sh + - check_timestamp_from_file when: not inventory_hostname.startswith('noc') tags: - nagios_client @@ -226,6 +227,16 
@@ tags: - nagios_client +- name: install nrpe checks for sundries/websites + template: src={{ item }}.j2 dest=/etc/nrpe.d/{{ item }} owner=root group=root mode=0644 + with_items: + - check_websites_buildtime.cfg + when: inventory_hostname.startswith('sundries') + notify: + - restart nrpe + tags: + - nagios_client + - name: install nrpe config for the RabbitMQ checks template: src: "rabbitmq_args.ini.j2" diff --git a/roles/nagios_client/templates/check_websites_buildtime.cfg.j2 b/roles/nagios_client/templates/check_websites_buildtime.cfg.j2 new file mode 100644 index 0000000..ff5639d --- /dev/null +++ b/roles/nagios_client/templates/check_websites_buildtime.cfg.j2 @@ -0,0 +1,2 @@ +# Alert if websites haven't been built in 3 hours +command[check_websites_buildtime]={{ libdir }}/nagios/plugins/check_timestamp_from_file /srv/websites/getfedora.org/build.timestamp.txt 10800 diff --git a/roles/nagios_server/templates/nagios/services/websites.cfg.j2 b/roles/nagios_server/templates/nagios/services/websites.cfg.j2 index 85e8f8e..c8958d7 100644 --- a/roles/nagios_server/templates/nagios/services/websites.cfg.j2 +++ b/roles/nagios_server/templates/nagios/services/websites.cfg.j2 @@ -316,4 +316,14 @@ define service { use ppc-secondarytemplate } +## Auxiliary to websites but necessary to make them happen + +define service { + host_name sundries01.phx2.fedoraproject.org + service_description websites build happened recently + check_command check_by_nrpe!check_websites_buildtime + use websitetemplate +} + + {% endif %}