From 7301d4356b19b55de892970a6311b901900e6af8 Mon Sep 17 00:00:00 2001 From: Mark Reynolds Date: Dec 05 2019 17:38:12 +0000 Subject: Issue 50747 - Port readnsstate to dsctl Description: Port the legacy tool readnsstate to dsctl, and add a healthcheck for local and remote offset that are close to triggering replication time skew errors relates: https://pagure.io/389-ds-base/issue/50747 Reviewed by: tbordaz(Thanks!) Revise lint messages per Thierry's requests adjust skew calculation Update man page --- diff --git a/src/lib389/cli/dsctl b/src/lib389/cli/dsctl index 8b86629..8484f86 100755 --- a/src/lib389/cli/dsctl +++ b/src/lib389/cli/dsctl @@ -1,7 +1,7 @@ #!/usr/bin/python3 # --- BEGIN COPYRIGHT BLOCK --- -# Copyright (C) 2016 Red Hat, Inc. +# Copyright (C) 2019 Red Hat, Inc. # All rights reserved. # # License: GPL (version 3 or any later version). @@ -21,6 +21,7 @@ from lib389.cli_ctl import instance as cli_instance from lib389.cli_ctl import dbtasks as cli_dbtasks from lib389.cli_ctl import tls as cli_tls from lib389.cli_ctl import health as cli_health +from lib389.cli_ctl import nsstate as cli_nsstate from lib389.cli_ctl.instance import instance_remove_all from lib389.cli_base import ( _get_arg, @@ -58,6 +59,7 @@ if not os.path.exists(DSRC_CONTAINER): cli_dbtasks.create_parser(subparsers) cli_tls.create_parser(subparsers) cli_health.create_parser(subparsers) +cli_nsstate.create_parser(subparsers) argcomplete.autocomplete(parser) diff --git a/src/lib389/lib389/cli_ctl/health.py b/src/lib389/lib389/cli_ctl/health.py index d8f3d73..a420163 100644 --- a/src/lib389/lib389/cli_ctl/health.py +++ b/src/lib389/lib389/cli_ctl/health.py @@ -1,5 +1,5 @@ # --- BEGIN COPYRIGHT BLOCK --- -# Copyright (C) 2016 Red Hat, Inc. +# Copyright (C) 2019 Red Hat, Inc. # All rights reserved. # # License: GPL (version 3 or any later version). 
# ---------------------------------------------------------------------------
# Patch context: src/lib389/lib389/cli_ctl/health.py
#   -from lib389.dseldif import FSChecks
#   +from lib389.dseldif import FSChecks, DSEldif
#   CHECK_OBJECTS gains DSEldif (inserted after Changelog5, before NssSsl).
#
# Patch context: src/lib389/lib389/dseldif.py gains these imports:
#   import sys / import base64 / import time
#   from struct import pack, unpack
#   from datetime import timedelta
#   from lib389.lint import (DSPERMLE0001, DSPERMLE0002,
#                            DSSKEWLE0001, DSSKEWLE0002, DSSKEWLE0003)
# ---------------------------------------------------------------------------

# src/lib389/lib389/cli_ctl/nsstate.py (new file)
# --- BEGIN COPYRIGHT BLOCK ---
# Copyright (C) 2019 Red Hat, Inc.
# All rights reserved.
#
# License: GPL (version 3 or any later version).
# See LICENSE for details.
# --- END COPYRIGHT BLOCK ---

import json

# (label, key) pairs in display order; the keys are those of the dicts
# returned by DSEldif.readNsState().
_NSSTATE_FIELDS = (
    ("Replica DN: ", 'dn'),
    ("Replica Suffix: ", 'suffix'),
    ("Replica ID: ", 'rid'),
    ("Gen Time: ", 'gen_time'),
    ("Gen Time String: ", 'gen_time_str'),
    ("Gen as CSN: ", 'gencsn'),
    ("Local Offset: ", 'local_offset'),
    ("Local Offset String: ", 'local_offset_str'),
    ("Remote Offset: ", 'remote_offset'),
    ("Remote Offset String: ", 'remote_offset_str'),
    ("Time Skew: ", 'time_skew'),
    ("Time Skew String: ", 'time_skew_str'),
    ("Seq Num: ", 'seq_num'),
    ("System Time: ", 'sys_time'),
    ("Diff in Seconds: ", 'diff_secs'),
    ("Diff in days/secs: ", 'diff_days_secs'),
    ("Endian: ", 'endian'),
)

_NSSTATE_DESCRIPTION = """Get the replication nsState in a human readable format

Replica DN:           The DN of the replication configuration entry
Replica Suffix:       The replicated suffix
Replica ID:           The Replica identifier
Gen Time:             The time the CSN generator was created
Gen Time String:      The time string of generator
Gen as CSN:           The generation CSN
Local Offset:         The offset due to the local clock being set back
Local Offset String:  The offset in a nice human format
Remote Offset:        The offset due to clock difference with remote systems
Remote Offset String: The offset in a nice human format
Time Skew:            The time skew between this server and its replicas
Time Skew String:     The time skew in a nice human format
Seq Num:              The number of multiple csns within a second
System Time:          The local system time
Diff in Seconds:      The time difference in seconds from the CSN generator creation to now
Diff in days/secs:    The time difference broken up into days and seconds
Endian:               Little/Big Endian
"""


def get_nsstate(inst, log, args):
    """Process the nsState attribute and log it.

    :param inst: the instance handed to ``DSEldif``
    :param log: logger used for all output (``log.info``)
    :param args: parsed CLI arguments; ``suffix``, ``flip`` and ``json`` are read
    """
    # Imported here so that this module can be loaded (and its parser built)
    # without pulling in the dse.ldif reader.
    from lib389.dseldif import DSEldif

    states = DSEldif(inst).readNsState(suffix=args.suffix, flip=args.flip)
    if args.json:
        log.info(json.dumps(states))
        return

    for state in states:
        for label, key in _NSSTATE_FIELDS:
            log.info(label + state[key])
        # Blank line between replicas.
        log.info("")


def create_parser(subparsers):
    """Register the ``get-nsstate`` sub-command on *subparsers*.

    :param subparsers: the object returned by ``ArgumentParser.add_subparsers()``
    """
    import argparse

    repl_get_nsstate = subparsers.add_parser(
        'get-nsstate',
        help="Get the replication nsState in a human readable format",
        # The field legend is pre-formatted; the raw formatter stops argparse
        # from re-flowing it into a single paragraph.
        description=_NSSTATE_DESCRIPTION,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    repl_get_nsstate.add_argument(
        '--suffix', default=None,
        help='The DN of the replication suffix to read the state from')
    # A plain switch: without ``store_true`` the option would demand a value
    # and any non-empty string (even "false") would be truthy.
    repl_get_nsstate.add_argument(
        '--flip', action='store_true', default=False,
        help='Flip between Little/Big Endian, this might be required for certain architectures')
    repl_get_nsstate.set_defaults(func=get_nsstate)
# ---------------------------------------------------------------------------
# Patch context: the code below is what this patch adds to
#   src/lib389/lib389/dseldif.py  (class DSEldif)
#   src/lib389/lib389/lint.py     (DSSKEWLE0001..3)
#   src/lib389/lib389/utils.py    (print_nice_time)
# Only members added by the patch are shown; everything else in those files
# lies outside the visible hunks and is unchanged.
# ---------------------------------------------------------------------------
import base64
import copy
import sys
import time
from datetime import timedelta
from struct import unpack


class DSEldif(object):
    """A class for working with dse.ldif file

    NOTE(review): only the methods added by this patch are reproduced here.
    ``__init__`` is not fully visible in the patch; the methods below read
    ``self._contents`` (the dse.ldif lines), ``self._instance.serverid`` and
    ``self._lint_functions`` (the patch adds
    ``self._lint_functions = [self._lint_nsstate]`` to the initialiser).
    """

    def lint(self):
        """Run every registered lint function and collect the reports.

        :returns: list of report dicts (``None`` results are dropped)
        """
        return [
            result
            for fn in self._lint_functions
            for result in fn()
            if result is not None
        ]

    def _lint_nsstate(self):
        """Yield a time-skew report for each replica whose skew is too large.

        Thresholds: 6 hours (DSSKEWLE0001), 12 hours (DSSKEWLE0002) and
        24 hours (DSSKEWLE0003, replication will break).
        """
        for state in self.readNsState():
            skew = int(state['time_skew'])
            if skew >= 86400:
                # 24 hours - replication will break
                template = DSSKEWLE0003
            elif skew >= 43200:
                # 12 hours
                template = DSSKEWLE0002
            elif skew >= 21600:
                # 6 hours
                template = DSSKEWLE0001
            else:
                continue

            # Deep copy so the shared module-level template is never mutated.
            report = copy.deepcopy(template)
            report['items'].append(state['suffix'])
            report['items'].append('Time Skew')
            report['items'].append('Skew: ' + state['time_skew_str'])
            report['fix'] = report['fix'].replace('YOUR_INSTANCE', self._instance.serverid)
            yield report

    # Read NsState helper functions
    def _flipend(self, end):
        """Return the opposite struct byte-order prefix ('<' <-> '>').

        Any other value yields ``None``.
        """
        return {'<': '>', '>': '<'}.get(end)

    def _getGenState(self, dn, replica_suffix, nsstate, flip):
        """Return a dict of all the nsState properties

        :param dn: DN of the replica configuration entry
        :param replica_suffix: the replicated suffix
        :param nsstate: the decoded (binary) nsState value
        :type nsstate: bytes
        :param flip: start decoding with the byte order opposite to the host's
        :returns: dict of strings describing the CSN generator state
        :raises struct.error: if *nsstate* does not have the expected length
        """
        # Detect the host byte order; the 'endian' label reports the host.
        if sys.byteorder == 'little':
            endian = "Little Endian"
            end = '<'
        else:
            endian = "Big Endian"
            end = '>'
        if flip:
            end = self._flipend(end)

        if len(nsstate) <= 20:
            pad = 2  # padding for short H values
            timefmt = 'I'  # timevals are unsigned 32-bit int
        else:
            pad = 6  # padding for short H values
            timefmt = 'Q'  # timevals are unsigned 64-bit int

        base_fmtstr = "H%dx3%sH%dx" % (pad, timefmt, pad)
        (rid, sampled_time, local_offset, remote_offset, seq_num) = unpack(end + base_fmtstr, nsstate)
        now = int(time.time())
        tdiff = now - sampled_time
        try:
            tdelta = timedelta(seconds=tdiff)
            # A sample time more than 10 years away from now (in either
            # direction) is almost certainly the wrong endianness.
            wrongendian = abs(tdelta.days) > 10 * 365
        except OverflowError:  # int overflow
            wrongendian = True

        if wrongendian:
            end = self._flipend(end)
            (rid, sampled_time, local_offset, remote_offset, seq_num) = unpack(end + base_fmtstr, nsstate)
            tdiff = now - sampled_time
            tdelta = timedelta(seconds=tdiff)

        skew = local_offset + remote_offset
        # print_nice_time is added to lib389.utils by this same patch; it is
        # defined further down in this listing.
        return {
            'dn': dn,
            'suffix': replica_suffix,
            'endian': endian,
            'rid': str(rid),
            'gen_time': str(sampled_time),
            'gencsn': "%08x%04d%04d0000" % (sampled_time, seq_num, rid),
            'gen_time_str': time.ctime(sampled_time),
            'local_offset': str(local_offset),
            'local_offset_str': print_nice_time(local_offset),
            'remote_offset': str(remote_offset),
            'remote_offset_str': print_nice_time(remote_offset),
            'time_skew': str(skew),
            'time_skew_str': print_nice_time(skew),
            'seq_num': str(seq_num),
            'sys_time': str(time.ctime(now)),
            'diff_secs': str(tdiff),
            'diff_days_secs': "%d:%d" % (tdelta.days, tdelta.seconds),
        }

    def readNsState(self, suffix=None, flip=False):
        """Look for the nsState attribute in replication configuration entries,
        then decode the base64 value and provide a dict of all stats it
        contains

        :param suffix: specific suffix to read nsState from; any false value
                       (``None``, ``False``, ``""``) means "all replicas"
        :type suffix: str
        :param flip: passed through to the decoder to flip the byte order
        :type flip: bool
        :returns: list of state dicts, one per matching replica entry
        """
        # The replica root is lower-cased when parsed, so compare like with like.
        wanted = suffix.lower().strip() if suffix else None
        states = []

        dn = ""
        in_replica = False
        replica_suffix = ""
        nsstate = b""

        for line in self._contents:
            if line.startswith("dn: "):
                # New entry: forget anything collected from the previous one.
                dn = line[4:].strip()
                in_replica = dn.startswith("cn=replica")
                replica_suffix = ""
                nsstate = b""
                continue
            if not in_replica:
                continue

            lowered = line.lower()
            if lowered.startswith("nsstate:: "):
                nsstate = base64.decodebytes(line[10:].strip().encode())
            elif lowered.startswith("nsds5replicaroot"):
                # Split once so a value containing ':' is kept whole.
                replica_suffix = lowered.split(':', 1)[1].strip()

            if replica_suffix and nsstate:
                # We have everything we need for this replica.
                if wanted is None or wanted == replica_suffix:
                    states.append(self._getGenState(dn, replica_suffix, nsstate, flip))
                    if wanted is not None:
                        # The requested suffix was found; nothing more to do.
                        break
                # Don't emit the same entry twice.
                in_replica = False

        return states


# NsState time skew issues
DSSKEWLE0001 = {
    'dsle': 'DSSKEWLE0001',
    'severity': 'Low',
    'items' : ['Replication'],
    'detail': """The time skew is over 6 hours. If this time skew continues to increase
to 24 hours then replication can potentially stop working. Please continue to
monitor the time skew offsets for increasing values.""",
    'fix' : """Monitor the time skew and avoid making changes to the system time.
Also look at https://access.redhat.com/documentation/en-us/red_hat_directory_server/11/html/administration_guide/managing_replication-troubleshooting_replication_related_problems
and find the paragraph "Too much time skew"."""
}

DSSKEWLE0002 = {
    'dsle': 'DSSKEWLE0002',
    'severity': 'Medium',
    'items' : ['Replication'],
    'detail': """The time skew is over 12 hours. If this time skew continues to increase
to 24 hours then replication can potentially stop working. Please continue to
monitor the time skew offsets for increasing values. Setting nsslapd-ignore-time-skew
to "on" on each replica will allow replication to continue, but if the time skew
continues to increase other more serious replication problems can occur.""",
    'fix' : """Monitor the time skew and avoid making changes to the system time.
If you get close to 24 hours of time skew replication may stop working.
In that case configure the server to ignore the time skew until the system
times can be fixed/synchronized:

    # dsconf slapd-YOUR_INSTANCE config replace nsslapd-ignore-time-skew=on

Also look at https://access.redhat.com/documentation/en-us/red_hat_directory_server/11/html/administration_guide/managing_replication-troubleshooting_replication_related_problems
and find the paragraph "Too much time skew"."""
}

DSSKEWLE0003 = {
    'dsle': 'DSSKEWLE0003',
    'severity': 'High',
    'items' : ['Replication'],
    'detail': """The time skew is over 24 hours. Setting nsslapd-ignore-time-skew
to "on" on each replica will allow replication to continue, but if the
time skew continues to increase other serious replication problems can
occur.""",
    'fix' : """Avoid making changes to the system time, and make sure the clocks
on all the replicas are correct. If you haven't set the server's
"ignore time skew" setting then do the following on all the replicas
until the time issues have been resolved:

    # dsconf slapd-YOUR_INSTANCE config replace nsslapd-ignore-time-skew=on

Also look at https://access.redhat.com/documentation/en-us/red_hat_directory_server/11/html/administration_guide/managing_replication-troubleshooting_replication_related_problems
and find the paragraph "Too much time skew"."""
}


def print_nice_time(seconds):
    """Convert seconds to a pretty format

    Leading units that are zero are omitted; seconds are always shown.
    A negative duration is rendered as its magnitude with a leading "-".

    :param seconds: duration in seconds (anything ``int()`` accepts)
    :returns: e.g. ``"1 day, 0 hours, 2 minutes, 1 second"``
    """
    seconds = int(seconds)
    sign = ""
    if seconds < 0:
        # divmod on a negative value would yield a negative day count and a
        # misleading positive remainder.
        sign = "-"
        seconds = -seconds

    days, rest = divmod(seconds, 24 * 60 * 60)
    hours, rest = divmod(rest, 60 * 60)
    minutes, secs = divmod(rest, 60)

    amounts = (days, hours, minutes, secs)
    names = ("day", "hour", "minute", "second")
    # Index of the first non-zero unit; fall back to seconds.
    first = next((i for i, amount in enumerate(amounts[:3]) if amount > 0), 3)
    parts = [
        f'{amount:d} {name}{"" if amount == 1 else "s"}'
        for amount, name in zip(amounts[first:], names[first:])
    ]
    return sign + ", ".join(parts)