#50447 Issue 49602 - Revise replication status messages
Closed 3 years ago by spichugi. Opened 4 years ago by mreynolds.
mreynolds/389-ds-base ticket49602  into  master

@@ -23,6 +23,7 @@ 

                                  REPLICATION_BIND_METHOD, REPLICATION_TRANSPORT, DEFAULT_BACKUPDIR,

                                  RA_NAME, RA_BINDDN, RA_BINDPW, RA_METHOD, RA_TRANSPORT_PROT,

                                  defaultProperties)

+ import json

  

  pytestmark = pytest.mark.tier1

  
@@ -95,7 +96,7 @@ 

      consumer.start()

  

      log.info("Make a search for mail attribute in attempt to crash server")

-     consumer.search_s(DEFAULT_SUFFIX, ldap.SCOPE_SUBTREE, "(mail=testuser@redhat.com)", ["mail"])

+     c_user.get_attr_val("mail")

  

      log.info("Make sure that server hasn't crashed")

      repl.test_replication(master, consumer)
@@ -111,11 +112,13 @@ 

          1. Check nsds5replicaLastUpdateStart value

          2. Check nsds5replicaLastUpdateEnd value

          3. Check nsds5replicaLastUpdateStatus value

+         4. Check nsds5replicaLastUpdateStatusJSON is parsable

      :expectedresults:

          1. nsds5replicaLastUpdateStart should be equal to 0

          2. nsds5replicaLastUpdateEnd should be equal to 0

          3. nsds5replicaLastUpdateStatus should not be equal

-            to "0 Replica acquired successfully: Incremental update started"

+            to "Replica acquired successfully: Incremental update started"

+         4. Success

      """

  

      master = topo_nr.ins["standalone1"]
@@ -139,11 +142,15 @@ 

      with pytest.raises(Exception):

          repl.wait_for_replication(master, consumer, timeout=5)

  

-     assert agmt.get_attr_val_bytes('nsds5replicaLastUpdateStart') == b"19700101000000Z"

-     assert agmt.get_attr_val_bytes("nsds5replicaLastUpdateEnd") == b"19700101000000Z"

-     assert b"Replica acquired successfully" not in agmt.get_attr_val_bytes("nsds5replicaLastUpdateStatus")

- 

+     assert agmt.get_attr_val_utf8('nsds5replicaLastUpdateStart') == "19700101000000Z"

+     assert agmt.get_attr_val_utf8("nsds5replicaLastUpdateEnd") == "19700101000000Z"

+     assert "replica acquired successfully" not in agmt.get_attr_val_utf8_l("nsds5replicaLastUpdateStatus")

  

+     # make sure the JSON attribute is parsable

+     json_status = agmt.get_attr_val_utf8("nsds5replicaLastUpdateStatusJSON")

+     if json_status is not None:

+         json_obj = json.loads(json_status)

+         log.debug("JSON status message: {}".format(json_obj))

  

  if __name__ == '__main__':

      # Run isolated

@@ -312,6 +312,8 @@ 

  attributeTypes: ( 2.16.840.1.113730.3.1.2344 NAME 'nsslapd-tls-check-crl' DESC 'Check CRL when opening outbound TLS connections. Valid options are none, peer, all.' SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 SINGLE-VALUE X-ORIGIN '389 Directory Server' )

  attributeTypes: ( 2.16.840.1.113730.3.1.2353 NAME 'nsslapd-encryptionalgorithm' DESC 'The encryption algorithm used to encrypt the changelog' SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 SINGLE-VALUE X-ORIGIN '389 Directory Server' )

  attributeTypes: ( 2.16.840.1.113730.3.1.2084 NAME 'nsSymmetricKey' DESC 'A symmetric key - currently used by attribute encryption' SYNTAX 1.3.6.1.4.1.1466.115.121.1.40 SINGLE-VALUE X-ORIGIN 'attribute encryption' )

+ attributeTypes: ( 2.16.840.1.113730.3.1.2364 NAME 'nsds5replicaLastInitStatusJSON' DESC 'Netscape defined attribute type' SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 SINGLE-VALUE NO-USER-MODIFICATION X-ORIGIN 'Netscape Directory Server' )

+ attributeTypes: ( 2.16.840.1.113730.3.1.2365 NAME 'nsds5replicaLastUpdateStatusJSON' DESC 'Netscape defined attribute type' SYNTAX 1.3.6.1.4.1.1466.115.121.1.15 SINGLE-VALUE NO-USER-MODIFICATION X-ORIGIN 'Netscape Directory Server' )

  #

  # objectclasses

  #

@@ -60,7 +60,11 @@ 

  #define DEFAULT_TIMEOUT 120             /* (seconds) default outbound LDAP connection */

  #define DEFAULT_FLOWCONTROL_WINDOW 1000 /* #entries sent without acknowledgment */

  #define DEFAULT_FLOWCONTROL_PAUSE 2000  /* msec of pause when #entries sent witout acknowledgment */

- #define STATUS_LEN 1024

+ #define STATUS_LEN 2048

+ #define STATUS_GOOD "green"

+ #define STATUS_WARNING "amber"

+ #define STATUS_BAD "red"

+ 

  

  struct changecounter

  {
@@ -93,11 +97,13 @@ 

      time_t last_update_start_time;       /* Local start time of last update session */

      time_t last_update_end_time;         /* Local end time of last update session */

      char last_update_status[STATUS_LEN]; /* Status of last update. Format = numeric code <space> textual description */

+     char last_update_status_json[STATUS_LEN];

      PRBool update_in_progress;

      PRBool is_enabled;

      time_t last_init_start_time;       /* Local start time of last total init */

      time_t last_init_end_time;         /* Local end time of last total init */

      char last_init_status[STATUS_LEN]; /* Status of last total init. Format = numeric code <space> textual description */

+     char last_init_status_json[STATUS_LEN];

      PRLock *lock;

      Object *consumerRUV;     /* last RUV received from the consumer - used for changelog purging */

      CSN *consumerSchemaCSN;  /* last schema CSN received from the consumer */
@@ -2443,6 +2449,21 @@ 

      }

  }

  

+ static void

+ agmt_set_last_update_status_json(Repl_Agmt *ra, char *state, int ldaprc, int replrc)

+ {

+     char ts[SLAPI_TIMESTAMP_BUFSIZE];

+     time_t now;

+ 

+     time(&now);

+     strftime(ts, sizeof ts, "%FT%TZ", gmtime(&now));

+     PR_snprintf(ra->last_update_status_json, STATUS_LEN,

+             "{\"state\": \"%s\", \"ldap_rc\": \"%d\", \"ldap_rc_text\": \"%s\", "

+             "\"repl_rc\": \"%d\", \"repl_rc_text\": \"%s\", \"date\": \"%s\", \"message\": \"%s\"}",

+             state, ldaprc, ldap_err2string(ldaprc), replrc, protocol_response2string(replrc),

+             ts, ra->last_update_status);

+ }

+ 

  void

  agmt_set_last_update_status(Repl_Agmt *ra, int ldaprc, int replrc, const char *message)

  {
@@ -2463,19 +2484,29 @@ 

              PR_snprintf(ra->last_update_status, STATUS_LEN, "Error (%d) %s%s - LDAP error: %s%s%s%s",

                          ldaprc, message ? message : "", message ? "" : " - ",

                          slapi_err2string(ldaprc), replmsg ? " (" : "", replmsg ? replmsg : "", replmsg ? ")" : "");

+             agmt_set_last_update_status_json(ra, STATUS_BAD, ldaprc, replrc);

          }

          /* ldaprc == LDAP_SUCCESS */

          else if (replrc != 0) {

              if (replrc == NSDS50_REPL_REPLICA_BUSY) {

                  PR_snprintf(ra->last_update_status, STATUS_LEN,

-                             "Error (%d) Can't acquire busy replica", replrc);

+                             "Error (%d) Can't acquire busy replica (%s)",

+                             replrc, message ? message : "");

+                 agmt_set_last_update_status_json(ra, STATUS_WARNING, ldaprc, replrc);

+             } else if (replrc == NSDS50_REPL_TRANSIENT_ERROR  || replrc == NSDS50_REPL_BACKOFF) {

+                 PR_snprintf(ra->last_update_status, STATUS_LEN,

+                             "Error (%d) Can't acquire replica (%s)",

+                             replrc, message ? message : "");

+                 agmt_set_last_update_status_json(ra, STATUS_WARNING, ldaprc, replrc);

              } else if (replrc == NSDS50_REPL_REPLICA_RELEASE_SUCCEEDED) {

                  PR_snprintf(ra->last_update_status, STATUS_LEN, "Error (0) Replication session successful");

+                 agmt_set_last_update_status_json(ra, STATUS_GOOD, ldaprc, replrc);

              } else if (replrc == NSDS50_REPL_DISABLED) {

                  PR_snprintf(ra->last_update_status, STATUS_LEN, "Error (%d) Incremental update aborted: "

                                                                  "Replication agreement for %s\n can not be updated while the replica is disabled.\n"

                                                                  "(If the suffix is disabled you must enable it then restart the server for replication to take place).",

                              replrc, ra->long_name ? ra->long_name : "a replica");

+                 agmt_set_last_update_status_json(ra, STATUS_BAD, ldaprc, replrc);

                  /* Log into the errors log, as "ra->long_name" is not accessible from the caller */

                  slapi_log_err(SLAPI_LOG_ERR, repl_plugin_name,

                                "Incremental update aborted: Replication agreement for \"%s\" "
@@ -2487,17 +2518,35 @@ 

                  PR_snprintf(ra->last_update_status, STATUS_LEN,

                              "Error (%d) Replication error acquiring replica: %s%s(%s)",

                              replrc, message ? message : "", message ? " " : "", protocol_response2string(replrc));

+                 agmt_set_last_update_status_json(ra, STATUS_BAD, ldaprc, replrc);

              }

          } else if (message != NULL) /* replrc == NSDS50_REPL_REPLICA_READY == 0 */

          {

              PR_snprintf(ra->last_update_status, STATUS_LEN,

                          "Error (0) Replica acquired successfully: %s", message);

+             agmt_set_last_update_status_json(ra, STATUS_GOOD, ldaprc, replrc);

          } else { /* agmt_set_last_update_status(0,0,NULL) to reset agmt */

              ra->last_update_status[0] = '\0';

+             ra->last_update_status_json[0] = '\0';

          }

      }

  }

  

+ static void

+ agmt_set_last_init_status_json(Repl_Agmt *ra, char *state, int ldaprc, int replrc, int connrc)

+ {

+     char ts[SLAPI_TIMESTAMP_BUFSIZE];

+     time_t now;

+ 

+     time(&now);

+     strftime(ts, sizeof ts, "%FT%TZ", gmtime(&now));

+     PR_snprintf(ra->last_init_status_json, STATUS_LEN,

+             "{\"state\": \"%s\", \"ldap_rc\": \"%d\", \"ldap_rc_text\": \"%s\", \"repl_rc\": \"%d\", \"repl_rc_text\": \"%s\", "

+             "\"conn_rc\": \"%d\", \"conn_rc_text\": \"%s\", \"date\": \"%s\", \"message\": \"%s\"}",

+             state, ldaprc, ldap_err2string(ldaprc), replrc, protocol_response2string(replrc),

+             connrc, conn_result2string(connrc), ts, ra->last_init_status);

+ }

+ 

  void

  agmt_set_last_init_status(Repl_Agmt *ra, int ldaprc, int replrc, int connrc, const char *message)

  {
@@ -2523,16 +2572,16 @@ 

                      replmsg = NULL;

                  }

              }

-             PR_snprintf(ra->last_init_status, STATUS_LEN, "Error (%d) %s%sLDAP error: %s%s%s%s%s",

+             PR_snprintf(ra->last_init_status, STATUS_LEN, "Error (%d)%s%sLDAP error: %s%s%s%s%s",

                          ldaprc, message ? message : "", message ? "" : " - ",

                          slapi_err2string(ldaprc), replmsg ? " - " : "", replmsg ? replmsg : "",

                          connrc ? " - " : "", connrc ? connmsg : "");

+             agmt_set_last_init_status_json(ra, STATUS_BAD, ldaprc, replrc, connrc);

          }

          /* ldaprc == LDAP_SUCCESS */

          else if (replrc != 0) {

              if (replrc == NSDS50_REPL_REPLICA_RELEASE_SUCCEEDED) {

-                 PR_snprintf(ra->last_init_status, STATUS_LEN, "Error (%d) %s",

-                             ldaprc, "Replication session successful");

+                 PR_snprintf(ra->last_init_status, STATUS_LEN, "Replication session successful");

              } else if (replrc == NSDS50_REPL_DISABLED) {

                  if (agmt_is_enabled(ra)) {

                      slapi_log_err(SLAPI_LOG_ERR, repl_plugin_name, "Total update aborted: "
@@ -2543,6 +2592,7 @@ 

                                                                    "Replication agreement for \"%s\" can not be updated while the suffix is disabled.\n"

                                                                    "You must enable it then restart the server for replication to take place).",

                                  replrc, ra->long_name ? ra->long_name : "a replica");

+                     agmt_set_last_init_status_json(ra, STATUS_BAD, ldaprc, replrc, connrc);

                  } else {

                      /* You do not need to restart the server after enabling the agreement */

                      slapi_log_err(SLAPI_LOG_ERR, repl_plugin_name, "Total update aborted: "
@@ -2551,6 +2601,7 @@ 

                      PR_snprintf(ra->last_init_status, STATUS_LEN, "Error (%d) Total update aborted: "

                                                                    "Replication agreement for \"%s\" can not be updated while the agreement is disabled.",

                                  replrc, ra->long_name ? ra->long_name : "a replica");

+                     agmt_set_last_init_status_json(ra, STATUS_BAD, ldaprc, replrc, connrc);

                  }

              } else {

                  PR_snprintf(ra->last_init_status, STATUS_LEN,
@@ -2558,19 +2609,21 @@ 

                              replrc, protocol_response2string(replrc),

                              message ? " - " : "", message ? message : "",

                              connrc ? " - " : "", connrc ? connmsg : "");

+                 agmt_set_last_init_status_json(ra, STATUS_BAD, ldaprc, replrc, connrc);

              }

          } else if (connrc != CONN_OPERATION_SUCCESS) {

              PR_snprintf(ra->last_init_status, STATUS_LEN,

                          "Error (%d) connection error: %s%s%s",

                          connrc, connmsg,

                          message ? " - " : "", message ? message : "");

-         } else if (message != NULL) /* replrc == NSDS50_REPL_REPLICA_READY == 0 */

-         {

+             agmt_set_last_init_status_json(ra, STATUS_BAD, ldaprc, replrc, connrc);

+         } else if (message != NULL) { /* replrc == NSDS50_REPL_REPLICA_READY == 0 */

              PR_snprintf(ra->last_init_status, STATUS_LEN,

-                         "Error (%d) %s",

-                         ldaprc, message);

+                         "Error (%d) %s", ldaprc, message);

+             agmt_set_last_init_status_json(ra, STATUS_GOOD, ldaprc, replrc, connrc);

          } else { /* agmt_set_last_init_status(0,0,NULL) to reset agmt */

-             PR_snprintf(ra->last_init_status, STATUS_LEN, "Error (%d)", ldaprc);

+             ra->last_init_status[0] = '\0';

+             ra->last_init_status_json[0] = '\0';

          }

      }

  }
@@ -2705,10 +2758,20 @@ 

          agmt_get_changecount_string(ra, changecount_string, sizeof(changecount_string));

          slapi_entry_add_string(e, "nsds5replicaChangesSentSinceStartup", changecount_string);

          if (ra->last_update_status[0] == '\0') {

+             char status_msg[STATUS_LEN];

+             char ts[SLAPI_TIMESTAMP_BUFSIZE];

+             time_t now;

+             time(&now);

+             strftime(ts, sizeof ts, "%FT%TZ", gmtime(&now));

              slapi_entry_add_string(e, "nsds5replicaLastUpdateStatus",

                                     "Error (0) No replication sessions started since server startup");

+             PR_snprintf(status_msg, STATUS_LEN,

+                     "{\"state\": \"green\", \"ldap_rc\": \"0\", \"ldap_rc_text\": \"success\", \"repl_rc\": \"0\", \"repl_rc_text\": \"replica acquired\", "

+                     "\"date\": \"%s\", \"message\": \"Error (0) No replication sessions started since server startup\"}", ts);

+             slapi_entry_add_string(e, "nsds5replicaLastUpdateStatusJSON", status_msg);

          } else {

              slapi_entry_add_string(e, "nsds5replicaLastUpdateStatus", ra->last_update_status);

+             slapi_entry_add_string(e, "nsds5replicaLastUpdateStatusJSON", ra->last_update_status_json);

          }

          slapi_entry_add_string(e, "nsds5replicaUpdateInProgress", ra->update_in_progress ? "TRUE" : "FALSE");

  
@@ -2724,6 +2787,7 @@ 

  

          if (ra->last_init_status[0] != '\0') {

              slapi_entry_add_string(e, "nsds5replicaLastInitStatus", ra->last_init_status);

+             slapi_entry_add_string(e, "nsds5replicaLastInitStatusJSON", ra->last_init_status_json);

          }

      }

  bail:

@@ -374,13 +374,13 @@ 

                                        "has the same Replica ID as this one. "

                                        "Replication is aborting.\n",

                                        agmt_get_long_name(prp->agmt));

-                         agmt_set_last_update_status(prp->agmt, 0, 0,

-                                                     "Unable to aquire replica: the replica has the same "

+                         agmt_set_last_update_status(prp->agmt, 0, NSDS50_REPL_REPLICAID_ERROR,

+                                                     "Unable to acquire replica: the replica has the same "

                                                      "Replica ID as this one. Replication is aborting.");

                          return_value = ACQUIRE_FATAL_ERROR;

                          break;

                      case NSDS50_REPL_BACKOFF:

-                         /* A replication sesssion hook on the replica

+                         /* A replication session hook on the replica

                           * wants us to go into backoff mode. */

                          slapi_log_err(SLAPI_LOG_ERR, repl_plugin_name,

                                        "acquire_replica - "
@@ -487,9 +487,8 @@ 

                            "%s: Unable to obtain current CSN. "

                            "Replication is aborting.\n",

                            agmt_get_long_name(prp->agmt));

-             agmt_set_last_update_status(prp->agmt, 0, 0,

-                                         "Unable to obtain current CSN. "

-                                         "Replication is aborting.");

+             agmt_set_last_update_status(prp->agmt, 0, NSDS50_REPL_INTERNAL_ERROR,

+                                         "Unable to obtain current CSN. Replication is aborting.");

              return_value = ACQUIRE_FATAL_ERROR;

          }

      }
@@ -665,6 +664,8 @@ 

          return "transient warning";

      case NSDS50_REPL_RUV_ERROR:

          return "RUV error";

+     case NSDS50_REPL_REPLICA_NO_RESPONSE:

+         return "no response received";

      default:

          return "unknown error";

      }

Bug Description: All agreement status messages start with "Error (##)" followed
by a text string. Even success states start with "Error", and
this is confusing.

             Instead use new keywords:  Good, Warning, Bad

             Also add new attributers to display the status in a JSON format
             for easier parsing for applications:

                 replicaLastUpdateStatusJSON
                 replicaLastInitStatusJSON

Design Doc: https://www.port389.org/docs/389ds/design/repl-agmt-status-design.html

https://pagure.io/389-ds-base/issue/49602

Minor nitpick, if you have time can we use snprintf rather than PR_snprintf? No issue if you don't want to change.

As discussed, 8601 timestamps may be better (I think access log does them?)

And finally, good maybe better as "operational" or "green" etc. Email has more. Otherwise looks good!

Ahh one more point - when we update the json field moving between healthy -> warning/error, we should write to the error log at the same time, and if we move from warning/error -> healthy, we should log that too (but subsequent "healthy" messages shuold not be logged ...).

This could be a future addition though, but it would help gss diagnose issues in retrospect rather than relying on monitoring the attribute.

As discussed, 8601 timestamps may be better (I think access log does them?)

I added the ISO 8601 timestamp already - is this not what you wanted? I'm confused

And finally, good maybe better as "operational" or "green" etc. Email has more. Otherwise looks good!

This is getting silly about the names. Perhaps this ticket should be pushed out to 1.4.2 as its going to miss the deadline anyway (today) --> which means once again we missed ALL the rfe's for the next release.

I did just say pick one and be done ... :(

As discussed, 8601 timestamps may be better (I think access log does them?)

I added the ISO 8601 timestamp already - is this not what you wanted? I'm confused

Maybe I misinterpreted this becausu in your email you said you wourd remove them?

As discussed, 8601 timestamps may be better (I think access log does them?)
I added the ISO 8601 timestamp already - is this not what you wanted? I'm confused

Maybe I misinterpreted this becausu in your email you said you wourd remove them?

Well I looked into it and it was trivial - so I added it and updated the design doc.

I'm really sorry about this. I should have check the code more carefully, and I'm really sorry for the miscommunication. I'd say in the future I should do better in my review to check this, we should have kept the communication to a single source (maybe just the pr), and that it's also a good idea to comment the code to address comments that were made in the code.

Sorry this was delayed, :(

rebased onto de38d2eb4bd1cdc2ca9cfd09bd7e8cb8a23c9c85

4 years ago

rebased onto 35f5d6db6517bc8a64f34c994b21888f5f3254dc

4 years ago

rebased onto b4b1df6eb8ae12c0c8ca091fdc44d63d65998eab

4 years ago

ack - thanks so much for your patience

rebased onto bd80a4f

4 years ago

Pull-Request has been merged by mreynolds

4 years ago

389-ds-base is moving from Pagure to Github. This means that new issues and pull requests
will be accepted only in 389-ds-base's github repository.

This pull request has been cloned to Github as issue and is available here:
- https://github.com/389ds/389-ds-base/issues/3505

If you want to continue to work on the PR, please navigate to the github issue,
download the patch from the attachments and file a new pull request.

Thank you for understanding. We apologize for all inconvenience.

Pull-Request has been closed by spichugi

3 years ago