#50765 Issue 50747 - Port readnsstate to dsctl
Closed 3 years ago by spichugi. Opened 4 years ago by mreynolds.
mreynolds/389-ds-base issue50747  into  master

file modified
+3 -1
@@ -1,7 +1,7 @@ 

  #!/usr/bin/python3

  

  # --- BEGIN COPYRIGHT BLOCK ---

- # Copyright (C) 2016 Red Hat, Inc.

+ # Copyright (C) 2019 Red Hat, Inc.

  # All rights reserved.

  #

  # License: GPL (version 3 or any later version).
@@ -21,6 +21,7 @@ 

  from lib389.cli_ctl import dbtasks as cli_dbtasks

  from lib389.cli_ctl import tls as cli_tls

  from lib389.cli_ctl import health as cli_health

+ from lib389.cli_ctl import nsstate as cli_nsstate

  from lib389.cli_ctl.instance import instance_remove_all

  from lib389.cli_base import (

      _get_arg,
@@ -58,6 +59,7 @@ 

  cli_dbtasks.create_parser(subparsers)

  cli_tls.create_parser(subparsers)

  cli_health.create_parser(subparsers)

+ cli_nsstate.create_parser(subparsers)

  

  argcomplete.autocomplete(parser)

  

@@ -1,5 +1,5 @@ 

  # --- BEGIN COPYRIGHT BLOCK ---

- # Copyright (C) 2016 Red Hat, Inc.

+ # Copyright (C) 2019 Red Hat, Inc.

  # All rights reserved.

  #

  # License: GPL (version 3 or any later version).
@@ -15,7 +15,7 @@ 

  from lib389.monitor import MonitorDiskSpace

  from lib389.replica import Replica, Changelog5

  from lib389.nss_ssl import NssSsl

- from lib389.dseldif import FSChecks

+ from lib389.dseldif import FSChecks, DSEldif

  from lib389 import plugins

  from lib389._constants import DSRC_HOME

  
@@ -33,6 +33,7 @@ 

      MonitorDiskSpace,

      Replica,

      Changelog5,

+     DSEldif,

      NssSsl,

  ]

  

@@ -0,0 +1,64 @@ 

+ # --- BEGIN COPYRIGHT BLOCK ---

+ # Copyright (C) 2019 Red Hat, Inc.

+ # All rights reserved.

+ #

+ # License: GPL (version 3 or any later version).

+ # See LICENSE for details.

+ # --- END COPYRIGHT BLOCK ---

+ 

+ import json

+ from lib389.dseldif import DSEldif

+ 

+ 

def get_nsstate(inst, log, args):
    """Report the decoded replication nsState values of an instance.

    The states are obtained from ``DSEldif(inst).readNsState()`` using
    ``args.suffix`` and ``args.flip``.  With ``args.json`` set the whole list
    is logged as one JSON document; otherwise every state is logged as one
    labelled line per field, followed by an empty line.
    """
    replica_states = DSEldif(inst).readNsState(suffix=args.suffix, flip=args.flip)
    if args.json:
        log.info(json.dumps(replica_states))
        return

    # (label, key) pairs in display order; the labels are padded to a common
    # width so the values line up in the output.
    layout = (
        ("Replica DN:           ", 'dn'),
        ("Replica Suffix:       ", 'suffix'),
        ("Replica ID:           ", 'rid'),
        ("Gen Time:             ", 'gen_time'),
        ("Gen Time String:      ", 'gen_time_str'),
        ("Gen as CSN:           ", 'gencsn'),
        ("Local Offset:         ", 'local_offset'),
        ("Local Offset String:  ", 'local_offset_str'),
        ("Remote Offset:        ", 'remote_offset'),
        ("Remote Offset String: ", 'remote_offset_str'),
        ("Time Skew:            ", 'time_skew'),
        ("Time Skew String:     ", 'time_skew_str'),
        ("Seq Num:              ", 'seq_num'),
        ("System Time:          ", 'sys_time'),
        ("Diff in Seconds:      ", 'diff_secs'),
        ("Diff in days/secs:    ", 'diff_days_secs'),
        ("Endian:               ", 'endian'),
    )
    for replica_state in replica_states:
        for label, key in layout:
            log.info(label + replica_state[key])
        log.info("")

+ 

+ 

def create_parser(subparsers):
    """Register the ``get-nsstate`` sub-command.

    :param subparsers: the object returned by ``ArgumentParser.add_subparsers()``
                       that the new sub-command is added to

    The sub-command accepts:

    * ``--suffix DN`` - optional replicated suffix; defaults to ``None``
    * ``--flip`` - boolean switch (no value); defaults to ``False``

    and dispatches to ``get_nsstate`` through the ``func`` default.
    """
    repl_get_nsstate = subparsers.add_parser('get-nsstate', help="""Get the replication nsState in a human readable format

Replica DN:           The DN of the replication configuration entry
Replica Suffix:       The replicated suffix
Replica ID:           The Replica identifier
Gen Time:             The time the CSN generator was created
Gen Time String:      The time string of generator
Gen as CSN:           The generation CSN
Local Offset:         The offset due to the local clock being set back
Local Offset String:  The offset in a nice human format
Remote Offset:        The offset due to clock difference with remote systems
Remote Offset String: The offset in a nice human format
Time Skew:            The time skew between this server and its replicas
Time Skew String:     The time skew in a nice human format
Seq Num:              The number of multiple csns within a second
System Time:          The local system time
Diff in Seconds:      The time difference in seconds from the CSN generator creation to now
Diff in days/secs:    The time difference broken up into days and seconds
Endian:               Little/Big Endian
""")
    # "No suffix given" is represented by None rather than False.
    repl_get_nsstate.add_argument('--suffix', default=None,
                                  help='The DN of the replication suffix to read the state from')
    # A real on/off switch: without action='store_true' argparse demands a
    # value after --flip and every supplied string (even "False") is truthy.
    repl_get_nsstate.add_argument('--flip', action='store_true', default=False,
                                  help='Flip between Little/Big Endian, this might be required for certain architectures')
    repl_get_nsstate.set_defaults(func=get_nsstate)

file modified
+162 -1
@@ -9,9 +9,22 @@ 

  

  import copy

  import os

+ import sys

+ import base64

+ import time

+ from struct import pack, unpack

+ from datetime import timedelta

  from stat import ST_MODE

+ # from lib389.utils import print_nice_time

  from lib389.paths import Paths

- from lib389.lint import DSPERMLE0001, DSPERMLE0002

+ from lib389.lint import (

+     DSPERMLE0001,

+     DSPERMLE0002,

+     DSSKEWLE0001,

+     DSSKEWLE0002,

+     DSSKEWLE0003

+ )

+ 

  

  class DSEldif(object):

      """A class for working with dse.ldif file
@@ -46,6 +59,37 @@ 

                          processed_line = line

                  else:

                      processed_line = processed_line[:-1] + line[1:]

+         self._lint_functions = [self._lint_nsstate]

+ 

def lint(self):
    """Run every registered lint check and collect the reports.

    Each callable in ``self._lint_functions`` is invoked without arguments
    and iterated; every item that is not ``None`` is kept, in order.

    :returns: list of the collected reports
    """
    return [
        report
        for check in self._lint_functions
        for report in check()
        if report is not None
    ]

+ 

+     def _lint_nsstate(self):

+         suffixes = self.readNsState()

+         for suffix in suffixes:

+             # Check the local offset first

+             report = None

+             skew = int(suffix['time_skew'])

+             if skew >= 86400:

+                 # 24 hours - replication will break

+                 report = copy.deepcopy(DSSKEWLE0003)

+             elif skew >= 43200:

+                 # 12 hours

+                 report = copy.deepcopy(DSSKEWLE0002)

+             elif skew >= 21600:

+                 # 6 hours

+                 report = copy.deepcopy(DSSKEWLE0001)

+             if report is not None:

+                 report['items'].append(suffix['suffix'])

+                 report['items'].append('Time Skew')

+                 report['items'].append('Skew: ' + suffix['time_skew_str'])

+                 report['fix'] = report['fix'].replace('YOUR_INSTANCE', self._instance.serverid)

+                 yield report

  

      def _update(self):

          """Update the dse.ldif with a new contents"""
@@ -159,6 +203,123 @@ 

          self.add(entry_dn, attr, value)

          self._update()

  

+     # Read NsState helper functions

+     def _flipend(self, end):

+         if end == '<':

+             return '>'

+         if end == '>':

+             return '<'

+ 

def _getGenState(self, dn, replica_suffix, nsstate, flip):
    """Return a dict of all the nsState properties

    :param dn: DN of the replication configuration entry
    :type dn: str
    :param replica_suffix: the replicated suffix
    :type replica_suffix: str
    :param nsstate: the raw (already base64-decoded) nsState value
    :type nsstate: bytes
    :param flip: start decoding with the opposite of the host byte order
    :type flip: bool
    :returns: dict with the keys dn, suffix, endian, rid, gen_time, gencsn,
              gen_time_str, local_offset, local_offset_str, remote_offset,
              remote_offset_str, time_skew, time_skew_str, seq_num,
              sys_time, diff_secs and diff_days_secs
    :raises ValueError: if the host byte order cannot be determined
    :raises struct.error: if the value's length does not match the layout
    """
    # Imported at call time; the module-level import of this helper is
    # commented out (presumably to avoid a circular import - confirm).
    from lib389.utils import print_nice_time

    # Determine the host byte order by comparing explicit and native packing
    if pack('<h', 1) == pack('=h', 1):
        endian = "Little Endian"
        end = '<'
    elif pack('>h', 1) == pack('=h', 1):
        endian = "Big Endian"
        end = '>'
    else:
        raise ValueError("Unknown endian, unable to proceed")
    if flip:
        # Must go through self: there is no module-level flipend()
        end = self._flipend(end)

    thelen = len(nsstate)
    if thelen <= 20:
        pad = 2  # padding for short H values
        timefmt = 'I'  # timevals are unsigned 32-bit int
    else:
        pad = 6  # padding for short H values
        timefmt = 'Q'  # timevals are unsigned 64-bit int

    # Layout: rid, padding, three time values, sequence number, padding
    base_fmtstr = "H%dx3%sH%dx" % (pad, timefmt, pad)
    fmtstr = end + base_fmtstr
    (rid, sampled_time, local_offset, remote_offset, seq_num) = unpack(fmtstr, nsstate)
    now = int(time.time())
    tdiff = now - sampled_time
    wrongendian = False
    try:
        tdelta = timedelta(seconds=tdiff)
        wrongendian = tdelta.days > 10 * 365
    except OverflowError:  # int overflow
        wrongendian = True

    # if the sampled time is more than 10 years in the past (or cannot be
    # represented at all), this is probably the wrong endianness
    if wrongendian:
        end = self._flipend(end)
        fmtstr = end + base_fmtstr
        (rid, sampled_time, local_offset, remote_offset, seq_num) = unpack(fmtstr, nsstate)
        tdiff = now - sampled_time
        tdelta = timedelta(seconds=tdiff)

    # NOTE: 'endian' reports the host byte order detected above, not the
    # byte order that was finally used to decode the value.
    return {
        'dn': dn,
        'suffix': replica_suffix,
        'endian': endian,
        'rid': str(rid),
        'gen_time': str(sampled_time),
        'gencsn': "%08x%04d%04d0000" % (sampled_time, seq_num, rid),
        'gen_time_str': time.ctime(sampled_time),
        'local_offset': str(local_offset),
        'local_offset_str': print_nice_time(local_offset),
        'remote_offset': str(remote_offset),
        'remote_offset_str': print_nice_time(remote_offset),
        'time_skew': str(local_offset + remote_offset),
        'time_skew_str': print_nice_time(local_offset + remote_offset),
        'seq_num': str(seq_num),
        'sys_time': str(time.ctime(now)),
        'diff_secs': str(tdiff),
        'diff_days_secs': "%d:%d" % (tdelta.days, tdelta.seconds),
    }

+ 

def readNsState(self, suffix=None, flip=False):
    """Look for the nsState attribute in replication configuration entries,
    then decode the base64 value and provide a dict of all stats it
    contains

    Only entries whose DN starts with ``cn=replica`` are considered, and an
    entry is decoded once both its nsState and its nsDS5ReplicaRoot have
    been seen.

    :param suffix: specific suffix to read nsState from (compared
                   case-insensitively); any false value (None, "", False)
                   returns the state of every replica
    :type suffix: str
    :param flip: handed to the decoder to start with the opposite of the
                 host byte order
    :type flip: bool
    :returns: list of dicts as returned by _getGenState; when a suffix is
              given the list holds at most that replica's state
    """
    wanted = str(suffix).strip().lower() if suffix else None
    states = []
    dn = None
    in_replica = False
    replica_suffix = None
    nsstate = None

    for line in self._contents:
        if line.startswith("dn: "):
            # A new entry starts: forget everything collected so far so
            # values can never be paired with another entry's data.
            dn = line[4:].strip()
            in_replica = dn.startswith("cn=replica")
            replica_suffix = None
            nsstate = None
            continue
        if not in_replica:
            # Attributes of other entries (agreements also carry
            # nsDS5ReplicaRoot) or lines before the first dn
            continue

        lowered = line.lower()
        if lowered.startswith("nsstate:: "):
            nsstate = base64.decodebytes(line[10:].strip().encode())
        elif lowered.startswith("nsds5replicaroot:"):
            value = line.partition(':')[2]
            if value.startswith(':'):
                # "attr:: value" means the value itself is base64 encoded
                value = base64.b64decode(value[1:].strip()).decode('utf-8', errors='replace')
            replica_suffix = value.strip().lower()

        if nsstate is None or replica_suffix is None:
            continue

        # We have everything we need for this entry
        in_replica = False
        if wanted is None:
            states.append(self._getGenState(dn, replica_suffix, nsstate, flip))
        elif wanted == replica_suffix:
            states.append(self._getGenState(dn, replica_suffix, nsstate, flip))
            break

    return states

+ 

  

  class FSChecks(object):

      """This is for the healthcheck feature, check commonly used system config files the

file modified
+52
@@ -344,3 +344,55 @@ 

  

      # chmod PERMS FILE"""

  }

+ 

+ # NsState time skew issues

# Healthcheck report template, severity Low: replication time skew of 6 hours
# or more.  DSEldif._lint_nsstate deep-copies it for skews of at least 21600
# seconds (and below the 12 hour threshold) and appends the affected suffix
# and the readable skew to 'items'.
DSSKEWLE0001 = {
    'dsle': 'DSSKEWLE0001',
    'severity': 'Low',
    'items' : ['Replication'],
    'detail': """The time skew is over 6 hours.  If this time skew continues to increase
to 24 hours then replication can potentially stop working.  Please continue to
monitor the time skew offsets for increasing values.""",
    'fix' : """Monitor the time skew and avoid making changes to the system time.
Also look at https://access.redhat.com/documentation/en-us/red_hat_directory_server/11/html/administration_guide/managing_replication-troubleshooting_replication_related_problems
and find the paragraph "Too much time skew"."""
}

+ 

# Healthcheck report template, severity Medium: replication time skew of 12
# hours or more.  DSEldif._lint_nsstate deep-copies it for skews of at least
# 43200 seconds (and below the 24 hour threshold) and replaces the
# YOUR_INSTANCE placeholder in 'fix' with the instance's server id.
DSSKEWLE0002 = {
    'dsle': 'DSSKEWLE0002',
    'severity': 'Medium',
    'items' : ['Replication'],
    'detail': """The time skew is over 12 hours.  If this time skew continues to increase
to 24 hours then replication can potentially stop working.  Please continue to
monitor the time skew offsets for increasing values.  Setting nsslapd-ignore-time-skew
to "on" on each replica will allow replication to continue, but if the time skew
continues to increase other more serious replication problems can occur.""",
    'fix' : """Monitor the time skew and avoid making changes to the system time.
If you get close to 24 hours of time skew replication may stop working.
In that case configure the server to ignore the time skew until the system
times can be fixed/synchronized:

    # dsconf slapd-YOUR_INSTANCE config replace nsslapd-ignore-time-skew=on

Also look at https://access.redhat.com/documentation/en-us/red_hat_directory_server/11/html/administration_guide/managing_replication-troubleshooting_replication_related_problems
and find the paragraph "Too much time skew"."""
}

+ 

# Healthcheck report template, severity High: replication time skew of 24
# hours or more.  DSEldif._lint_nsstate deep-copies it for skews of at least
# 86400 seconds and replaces the YOUR_INSTANCE placeholder in 'fix' with the
# instance's server id.
DSSKEWLE0003 = {
    'dsle': 'DSSKEWLE0003',
    'severity': 'High',
    'items' : ['Replication'],
    'detail': """The time skew is over 24 hours.  Setting nsslapd-ignore-time-skew
to "on" on each replica will allow replication to continue, but if the
time skew continues to increase other serious replication problems can
occur.""",
    'fix' : """Avoid making changes to the system time, and make sure the clocks
on all the replicas are correct.  If you haven't set the server's
"ignore time skew" setting then do the following on all the replicas
until the time issues have been resolved:

    # dsconf slapd-YOUR_INSTANCE config replace nsslapd-ignore-time-skew=on

Also look at https://access.redhat.com/documentation/en-us/red_hat_directory_server/11/html/administration_guide/managing_replication-troubleshooting_replication_related_problems
and find the paragraph "Too much time skew"."""
}

@@ -1327,3 +1327,31 @@ 

      else:

          raise RuntimeError('Running with Python 2 is unsupported')

  

+ 

def print_nice_time(seconds):
    """Convert seconds to a pretty format

    The value is broken down into days, hours, minutes and seconds.  Leading
    units that are zero are omitted, the seconds are always shown, and each
    unit is pluralised unless its value is exactly 1.  A negative duration
    is rendered as its magnitude with a leading "-".

    :param seconds: the duration; anything accepted by ``int()``
    :returns: e.g. "1 day, 0 hours, 2 minutes, 1 second"
    :rtype: str
    """
    seconds = int(seconds)
    # divmod() floors, so splitting a negative number directly would produce
    # a negative day count with large positive remainders; split the
    # magnitude and put the sign back in front instead.
    sign = "-" if seconds < 0 else ""
    days, remainder = divmod(abs(seconds), 24 * 60 * 60)
    hours, remainder = divmod(remainder, 60 * 60)
    minutes, secs = divmod(remainder, 60)

    parts = [(days, "day"), (hours, "hour"), (minutes, "minute"), (secs, "second")]
    # Drop leading zero-valued units, but always keep the seconds
    while len(parts) > 1 and parts[0][0] == 0:
        parts.pop(0)

    return sign + ", ".join(
        f"{value:d} {unit}{'' if value == 1 else 's'}" for value, unit in parts
    )

Description:

Port the legacy tool readnsstate to dsctl, and add a healthcheck for local and remote offsets that are close to triggering replication time skew errors.

relates: https://pagure.io/389-ds-base/issue/50747

This server will generate new_csn = now + local_offset + remote_offset
This server is fine, but others may refuse updates from this server if new_csn - now > 24h.

So I think messages should say that replication from this server is potentially at risk. Potentially, because if csngen on others servers have the same offset, it will be fine.

Also, I am not sure it is useful to separate the test of local/remote offset rather than testing local+remote. At the end of the day, replication is potentially at risk since local_offset+remote_offset > 24h.

Also I am not sure what we can say about the system time (behind/ahead) of the server itself. I would rather say something like that the mechanism to handle the time skew may reach a limit.

This server will generate new_csn = now + local_offset + remote_offset
This server is fine, but others may refuse updates from this server if new_csn - now > 24h.
So I think messages should say that replication from this server is potentially at risk. Potentially, because if csngen on others servers have the same offset, it will be fine.

Well we can not look at the other server's nsState in healthcheck. So I was looking at csngen_adjust_time(), and if the offset is greater than CSN_MAX_TIME_ADJUST we return an error CSN_LIMIT_EXCEEDED.

Should I be checking the offset differential differently?

Or, what can I learn from nsState that is worth reporting in the health check in regards to clock skew?

Also, I am not sure it is useful to separate the test of local/remote offset rather than testing local+remote. At the end of the day, replication is potentially at risk since local_offset+remote_offset > 24h.

Well I thought it would be useful to know if the local server's clock is off, or a remote server clock is off. We want the customer to be able to fix what is broken, so I was trying to point them in the right direction.

Also I am not sure what we can say about the system time (behind/ahead) of the server itself. I would rather say something like that the mechanism to handle the time skew may reach a limit.

Which messages are you referring to?

IMHO, if nsState reveals a high remote and local offset, it is worth having the healthcheck tool report that there is a potential risk. I agree with 6h=low, 12h=medium and 24h=high, as a 24h offset may break replication. But whether it actually breaks depends on the csngen offsets of the consumers, which we do not know, and on the ignore-time-skew config, which we do not know either.

The investigation an admin could undertake is to monitor the time skew and, if it continues to increase, identify why (is a server's system time jumping, is nscd making frequent changes...). A protective action is to configure ignore-time-skew=on across the topology if the risk becomes high, even if the risk is only potential.

I was thinking of the messages DSSKEWLE000[1-3] that start with 'MSG has a system time that is over xxx hours behind this server...'. I think that healthcheck may detect offsets even if the system times look in sync (because the time was set forward/backward for a short period of time). So the message should just say there is a possible time skew limit, without reference to the current system time.

1 new commit added

  • Revise lint messages per Thierry's requests
4 years ago

@tbordaz I revised the lint messages. I'm not sure it's exactly what you want. So please review it again, and I can make further changes. Thanks!

@mreynolds the messages look perfect to me!

The only remaining concern is regarding test of local and remote offsets in _lint_nsstate.
IMHO it is useless to separate them in your test. We should rather have only one value tested: local_offset+remote_offset.

rebased onto 30cf0c462dfabdf0a36d5c2cbbba23c4f1747f63

4 years ago

@mreynolds the messages look perfect to me!
The only remaining concern is regarding test of local and remote offsets in _lint_nsstate.
IMHO it is useless to separate them in your test. We should rather have only one value tested: local_offset+remote_offset.

Done! Added a time_skew field to the get-nsstate output as well. Please review...

Thanks @mreynolds . The patch looks good to me. ACK

rebased onto 7301d43

4 years ago

Pull-Request has been merged by mreynolds

4 years ago

389-ds-base is moving from Pagure to Github. This means that new issues and pull requests
will be accepted only in 389-ds-base's github repository.

This pull request has been cloned to Github as issue and is available here:
- https://github.com/389ds/389-ds-base/issues/3820

If you want to continue to work on the PR, please navigate to the github issue,
download the patch from the attachments and file a new pull request.

Thank you for understanding. We apologize for all inconvenience.

Pull-Request has been closed by spichugi

3 years ago