#167 Monitoring for bastion mail queue to redhat.com
Merged 3 years ago by kevin. Opened 3 years ago by darknao.
fedora-infra/ darknao/ansible feature/mail_queue  into  master

@@ -0,0 +1,67 @@ 

#!/usr/bin/python3
"""Nagios/NRPE check for the length of the local Postfix mail queue.

The script runs ``postqueue -j`` (one JSON object per queued message) and
counts the messages that have at least one recipient in the requested
domain.  In that mode, messages sitting in the ``active`` queue and
messages that arrived less than ``--ignore`` minutes ago are skipped, since
they are most likely still being delivered.  When the domain is the literal
string ``all``, every queued message is counted without any filtering.

The count is compared with the warning and critical thresholds and the
script exits with the conventional Nagios plugin status:
0 (OK), 1 (WARNING), 2 (CRITICAL) or 3 (UNKNOWN, the queue could not be
read or parsed).
"""

import subprocess
import json
import argparse
import sys

from datetime import datetime, timezone

POSTQUEUE = '/usr/sbin/postqueue'

# Nagios plugin exit statuses.
OK = 0
WARNING = 1
CRITICAL = 2
UNKNOWN = 3


def parse_args(argv=None):
    """Parse the command line (``sys.argv[1:]`` when *argv* is None)."""
    parser = argparse.ArgumentParser()
    parser.add_argument('domain', help="Required. Domain to check")
    parser.add_argument('-c', '--critical', dest='critical', type=int,
                        default=50, help="Critical threshold")
    parser.add_argument('-w', '--warning', dest='warning', type=int,
                        default=20, help="Warning threshold")
    parser.add_argument('-i', '--ignore', dest='ignore', type=int, default=5,
                        help="Ignore queues from the last X minutes "
                             "(default: 5)")
    return parser.parse_args(argv)


def read_queue():
    """Return the non-empty output lines of ``postqueue -j``.

    Raises:
        OSError: postqueue could not be executed.
        RuntimeError: postqueue exited with a non-zero status.
    """
    # stderr is captured separately so that diagnostics printed by postqueue
    # never end up mixed into the JSON stream.
    proc = subprocess.run([POSTQUEUE, '-j'],
                          stdin=subprocess.DEVNULL,
                          stdout=subprocess.PIPE,
                          stderr=subprocess.PIPE)
    if proc.returncode != 0:
        reason = proc.stderr.decode('utf-8', errors='replace').strip()
        raise RuntimeError("postqueue exited with status %s: %s"
                           % (proc.returncode, reason))
    text = proc.stdout.decode('utf-8', errors='replace')
    return [line for line in text.splitlines() if line.strip()]


def address_in_domain(address, domain):
    """Tell whether *address* belongs to *domain* or one of its subdomains.

    The comparison is case-insensitive and anchored on ``@`` or ``.`` so that
    ``redhat.com`` does not match ``user@notredhat.com``.  A *domain* that
    already starts with ``@`` or ``.`` is used as a plain suffix.
    """
    address = address.lower()
    domain = domain.lower()
    if domain.startswith(('@', '.')):
        return address.endswith(domain)
    return address.endswith(('@' + domain, '.' + domain))


def count_queued(lines, domain, ignore_minutes, now):
    """Count the queued messages described by *lines* that match *domain*.

    Args:
        lines: ``postqueue -j`` output, one JSON document per item.
        domain: recipient domain to look for, or ``'all'`` to count every
            line without filtering.
        ignore_minutes: messages younger than this many minutes are skipped.
        now: current time as seconds since the epoch.

    Returns:
        The number of matching messages; a message with several matching
        recipients is counted once.

    Raises:
        ValueError: a line is not valid JSON.
        KeyError, TypeError: a line lacks the expected fields.
    """
    if domain == 'all':
        return len(lines)

    count = 0
    for line in lines:
        entry = json.loads(line)
        if entry["queue_name"] == 'active':
            # Ignore Active queue
            continue

        # Epoch arithmetic keeps the age correct across DST changes.
        age_minutes = (now - entry["arrival_time"]) / 60
        if age_minutes < ignore_minutes:
            # Not old enough
            continue

        recipients = entry.get('recipients', [])
        if any(address_in_domain(rcpt['address'], domain)
               for rcpt in recipients):
            count += 1
    return count


def evaluate(count, domain, warning, critical):
    """Return ``(exit_status, message)`` for *count* against the thresholds.

    A threshold is exceeded only when *count* is strictly greater than it.
    """
    if count > critical:
        return CRITICAL, ("CRITICAL: Queue length for %s destination > %s (%s)"
                          % (domain, critical, count))
    if count > warning:
        return WARNING, ("WARNING: Queue length for %s destination > %s (%s)"
                         % (domain, warning, count))
    return OK, ("OK: Queue length for %s destination <= %s (%s)"
                % (domain, warning, count))


def main(argv=None):
    """Run the check, print the status line and return the exit status."""
    args = parse_args(argv)
    try:
        lines = read_queue()
        now = datetime.now(timezone.utc).timestamp()
        count = count_queued(lines, args.domain, args.ignore, now)
    except (OSError, RuntimeError, ValueError, KeyError, TypeError) as err:
        # An unreadable queue must not be reported as WARNING (status 1),
        # which is what an uncaught exception would produce.
        print("UNKNOWN: unable to read mail queue: %s" % err)
        return UNKNOWN

    status, msg = evaluate(count, args.domain, args.warning, args.critical)
    print(msg)
    return status


if __name__ == "__main__":
    sys.exit(main())

\ No newline at end of file

@@ -30,6 +30,7 @@ 

    with_items:

    - check_haproxy_conns.py

    - check_postfix_queue

+   - check_postfix_queue.py

    - check_raid.py

    - check_lock

    - check_fcomm_queue
@@ -125,6 +126,7 @@ 

    - check_disk.cfg

    - check_swap.cfg

    - check_postfix_queue.cfg

+   - check_postfix_redhat.cfg

    - check_lock.cfg

    - check_fedmsg_hub_proc.cfg

    - check_fedmsg_irc_proc.cfg

@@ -0,0 +1,1 @@ 

+ # Mail queued for redhat.com recipients: WARNING above 30 messages, CRITICAL above 50.

+ command[check_postfix_redhat]={{ libdir }}/nagios/plugins/check_postfix_queue.py redhat.com -w 30 -c 50

@@ -5,3 +5,11 @@ 

    max_check_attempts    7

    use                   defaulttemplate

  }

+ 

+ # Watch the Postfix queue for redhat.com recipients on bastion01 through the

+ # check_postfix_redhat NRPE command.

+ define service {

+   host_name             bastion01.iad2.fedoraproject.org

+   service_description   mail_queue_redhat

+   check_command         check_by_nrpe!check_postfix_redhat

+   max_check_attempts    7

+   use                   defaulttemplate

+ }

This is addressing fedora-infrastructure#5908
During my testing, I've seen the mail queue go from 0 to around 15 and, on rare occasions, over 20.
So I've set the initial warning threshold at 30, hoping it will not raise unnecessary alarms.
I'm not a nagios expert, so I hope I got everything right here.

This targets bastion01 only, as I'm not sure whether the other bastions need it too.

rebased onto bf533db

3 years ago

rebased onto bf533db

3 years ago

OK, looks good to me, let's give it a try!

Thanks for the PR @darknao

Pull-Request has been merged by kevin

3 years ago