#49759 Ticket 49734 - Fix various issues with Disk Monitoring
Closed 3 years ago by spichugi. Opened 5 years ago by mreynolds.
mreynolds/389-ds-base ticket49734  into  master

file modified
+41 -38
@@ -572,8 +572,8 @@ 

               */

              if (passed_threshold) {

                  if (logs_disabled) {

-                     slapi_log_err(SLAPI_LOG_INFO, "disk_monitoring_thread", "Disk space is now within acceptable levels.  "

-                                                                             "Restoring the log settings.\n");

+                     slapi_log_err(SLAPI_LOG_INFO, "disk_monitoring_thread",

+                             "Disk space is now within acceptable levels.  Restoring the log settings.\n");

                      if (using_accesslog) {

                          config_set_accesslog_enabled(LOGGING_ON);

                      }
@@ -599,25 +599,27 @@ 

           *  Check if we are already critical

           */

          if (disk_space < 4096) { /* 4 k */

-             slapi_log_err(SLAPI_LOG_ALERT, "disk_monitoring_thread", "Disk space is critically low on disk (%s), "

-                                                                      "remaining space: %" PRIu64 " Kb.  Signaling slapd for shutdown...\n",

-                           dirstr, (disk_space / 1024));

-             g_set_shutdown(SLAPI_SHUTDOWN_EXIT);

+             slapi_log_err(SLAPI_LOG_ALERT, "disk_monitoring_thread",

+                     "Disk space is critically low on disk (%s), remaining space: %" PRIu64 " Kb.  Signaling slapd for shutdown...\n",

+                     dirstr, (disk_space / 1024));

+             g_set_shutdown(SLAPI_SHUTDOWN_DISKFULL);

              return;

          }

          /*

           *  If we are low, see if we are using verbose error logging, and turn it off

           *  if logging is not critical

           */

-         if (verbose_logging != 0 && verbose_logging != LDAP_DEBUG_ANY) {

-             slapi_log_err(SLAPI_LOG_ALERT, "disk_monitoring_thread", "Disk space is low on disk (%s), remaining space: "

-                                                                      "%" PRIu64 " Kb, temporarily setting error loglevel to the default level(%d).\n",

-                           dirstr,

-                           (disk_space / 1024), SLAPD_DEFAULT_ERRORLOG_LEVEL);

+         if (verbose_logging != 0 &&

+             verbose_logging != LDAP_DEBUG_ANY &&

+             verbose_logging != SLAPD_DEFAULT_FE_ERRORLOG_LEVEL &&

+             verbose_logging != SLAPD_DEFAULT_ERRORLOG_LEVEL)

+         {

+             slapi_log_err(SLAPI_LOG_ALERT, "disk_monitoring_thread",

+                     "Disk space is low on disk (%s), remaining space: %" PRIu64 " Kb, "

+                     "temporarily setting error loglevel to the default level.\n",

+                     dirstr, (disk_space / 1024));

              /* Setting the log level back to zero, actually sets the value to LDAP_DEBUG_ANY */

-             config_set_errorlog_level(CONFIG_LOGLEVEL_ATTRIBUTE,

-                                       STRINGIFYDEFINE(SLAPD_DEFAULT_ERRORLOG_LEVEL),

-                                       NULL, CONFIG_APPLY);

+             config_set_errorlog_level(CONFIG_LOGLEVEL_ATTRIBUTE, "0", NULL, CONFIG_APPLY);

              continue;

          }

          /*
@@ -625,9 +627,9 @@ 

           *  access/audit logs, log another error, and continue.

           */

          if (!logs_disabled && !logging_critical) {

-             slapi_log_err(SLAPI_LOG_ALERT, "disk_monitoring_thread", "Disk space is too low on disk (%s), remaining "

-                                                                      "space: %" PRIu64 " Kb, disabling access and audit logging.\n",

-                           dirstr, (disk_space / 1024));

+             slapi_log_err(SLAPI_LOG_ALERT, "disk_monitoring_thread",

+                     "Disk space is too low on disk (%s), remaining space: %" PRIu64 " Kb, disabling access and audit logging.\n",

+                     dirstr, (disk_space / 1024));

              config_set_accesslog_enabled(LOGGING_OFF);

              config_set_auditlog_enabled(LOGGING_OFF);

              config_set_auditfaillog_enabled(LOGGING_OFF);
@@ -639,9 +641,9 @@ 

           *  access/audit logging, then delete the rotated logs, log another error, and continue.

           */

          if (!deleted_rotated_logs && !logging_critical) {

-             slapi_log_err(SLAPI_LOG_ALERT, "disk_monitoring_thread", "Disk space is too low on disk (%s), remaining "

-                                                                      "space: %" PRIu64 " Kb, deleting rotated logs.\n",

-                           dirstr, (disk_space / 1024));

+             slapi_log_err(SLAPI_LOG_ALERT, "disk_monitoring_thread",

+                     "Disk space is too low on disk (%s), remaining space: %" PRIu64 " Kb, deleting rotated logs.\n",

+                     dirstr, (disk_space / 1024));

              log__delete_rotated_logs();

              deleted_rotated_logs = 1;

              continue;
@@ -650,22 +652,21 @@ 

           *  Ok, we've done what we can, log a message if we continue to lose available disk space

           */

          if (disk_space < previous_mark) {

-             slapi_log_err(SLAPI_LOG_ALERT, "disk_monitoring_thread", "Disk space is too low on disk (%s), remaining "

-                                                                      "space: %" PRIu64 " Kb\n",

-                           dirstr, (disk_space / 1024));

+             slapi_log_err(SLAPI_LOG_ALERT, "disk_monitoring_thread",

+                     "Disk space is too low on disk (%s), remaining space: %" PRIu64 " Kb\n",

+                     dirstr, (disk_space / 1024));

          }

          /*

-          *

           *  If we are below the halfway mark, and we did everything else,

           *  go into shutdown mode. If the disk space doesn't get critical,

           *  wait for the grace period before shutting down.  This gives an

           *  admin the chance to clean things up.

-          *

           */

          if (disk_space < halfway) {

-             slapi_log_err(SLAPI_LOG_ALERT, "disk_monitoring_thread", "Disk space on (%s) is too far below the threshold(%" PRIu64 " bytes).  "

-                                                                      "Waiting %d minutes for disk space to be cleaned up before shutting slapd down...\n",

-                           dirstr, threshold, (grace_period / 60));

+             slapi_log_err(SLAPI_LOG_ALERT, "disk_monitoring_thread",

+                     "Disk space on (%s) is too far below the threshold(%" PRIu64 " bytes).  "

+                     "Waiting %d minutes for disk space to be cleaned up before shutting slapd down...\n",

+                     dirstr, threshold, (grace_period / 60));

              start = slapi_current_utc_time();

              now = start;

              while ((now - start) < grace_period) {
@@ -685,9 +686,9 @@ 

                      /*

                       *  Excellent, we are back to acceptable levels, reset everything...

                       */

-                     slapi_log_err(SLAPI_LOG_INFO, "disk_monitoring_thread", "Available disk space is now "

-                                                                             "acceptable (%" PRIu64 " bytes).  Aborting shutdown, and restoring the log settings.\n",

-                                   disk_space);

+                     slapi_log_err(SLAPI_LOG_INFO, "disk_monitoring_thread",

+                             "Available disk space is now acceptable (%" PRIu64 " bytes).  Aborting shutdown, and restoring the log settings.\n",

+                             disk_space);

                      if (logs_disabled && using_accesslog) {

                          config_set_accesslog_enabled(LOGGING_ON);

                      }
@@ -709,9 +710,9 @@ 

                      /*

                       *  Disk space is critical, log an error, and shut it down now!

                       */

-                     slapi_log_err(SLAPI_LOG_ALERT, "disk_monitoring_thread", "Disk space is critically low "

-                                                                              "on disk (%s), remaining space: %" PRIu64 " Kb.  Signaling slapd for shutdown...\n",

-                                   dirstr, (disk_space / 1024));

+                     slapi_log_err(SLAPI_LOG_ALERT, "disk_monitoring_thread",

+                             "Disk space is critically low on disk (%s), remaining space: %" PRIu64 " Kb.  Signaling slapd for shutdown...\n",

+                             dirstr, (disk_space / 1024));

                      g_set_shutdown(SLAPI_SHUTDOWN_DISKFULL);

                      return;

                  }
@@ -727,9 +728,9 @@ 

              /*

               *  If disk space was freed up we would of detected in the above while loop.  So shut it down.

               */

-             slapi_log_err(SLAPI_LOG_ALERT, "disk_monitoring_thread", "Disk space is still too low "

-                                                                      "(%" PRIu64 " Kb).  Signaling slapd for shutdown...\n",

-                           (disk_space / 1024));

+             slapi_log_err(SLAPI_LOG_ALERT, "disk_monitoring_thread",

+                     "Disk space is still too low (%" PRIu64 " Kb).  Signaling slapd for shutdown...\n",

+                     (disk_space / 1024));

              g_set_shutdown(SLAPI_SHUTDOWN_DISKFULL);

  

              return;
@@ -2717,7 +2718,9 @@ 

  #if 0

      slapi_log_err(SLAPI_LOG_INFO, "slapd_daemon", "slapd got shutdown signal\n");

  #endif

-     g_set_shutdown(SLAPI_SHUTDOWN_SIGNAL);

+     if (g_get_shutdown() == 0) {

+         g_set_shutdown(SLAPI_SHUTDOWN_SIGNAL);

+     }

  #ifndef LINUX

      /* don't mess with USR1/USR2 on linux, used by libpthread */

      (void)SIGNAL(SIGUSR2, set_shutdown);

@@ -121,6 +121,7 @@ 

  #include <arpa/inet.h>

  #include <netdb.h>

  #include <unistd.h>

+ #include <signal.h>

  #include <pwd.h> /* pwdnam */

  #ifdef USE_SYSCONF

  #include <unistd.h>
@@ -1376,6 +1377,7 @@ 

  g_set_shutdown(int reason)

  {

      slapd_shutdown = reason;

+     raise(SIGTERM);

  }

  

  int

file modified
+43 -4
@@ -3086,6 +3086,7 @@ 

  log__delete_rotated_logs()

  {

      struct logfileinfo *logp = NULL;

+     struct logfileinfo *prev_log = NULL;

      char buffer[BUFSIZ];

      char tbuf[TBUFSIZE];

  
@@ -3097,17 +3098,20 @@ 

          tbuf[0] = buffer[0] = '\0';

          log_convert_time(logp->l_ctime, tbuf, 1);

          PR_snprintf(buffer, sizeof(buffer), "%s.%s", loginfo.log_access_file, tbuf);

- 

-         slapi_log_err(SLAPI_LOG_ERR, "log__delete_rotated_logs",

-                       "Deleted Rotated Log: %s\n", buffer);

- 

          if (PR_Delete(buffer) != PR_SUCCESS) {

+             PRErrorCode prerr = PR_GetError();

+             slapi_log_err(SLAPI_LOG_ERR, "log__delete_rotated_logs",

+                           "Unable to remove file: %s - error %d (%s)\n",

+                           buffer, prerr, slapd_pr_strerror(prerr));

              logp = logp->l_next;

              continue;

          }

+         prev_log = logp;

          loginfo.log_numof_access_logs--;

          logp = logp->l_next;

+         slapi_ch_free((void **)&prev_log);

      }

+ 

      /*

       *  Audit Log

       */
@@ -3117,12 +3121,41 @@ 

          log_convert_time(logp->l_ctime, tbuf, 1);

          PR_snprintf(buffer, sizeof(buffer), "%s.%s", loginfo.log_audit_file, tbuf);

          if (PR_Delete(buffer) != PR_SUCCESS) {

+             PRErrorCode prerr = PR_GetError();

+             slapi_log_err(SLAPI_LOG_ERR, "log__delete_rotated_logs",

+                           "Unable to remove file: %s - error %d (%s)\n",

+                           buffer, prerr, slapd_pr_strerror(prerr));

              logp = logp->l_next;

              continue;

          }

+         prev_log = logp;

          loginfo.log_numof_audit_logs--;

          logp = logp->l_next;

+         slapi_ch_free((void **)&prev_log);

      }

+ 

+     /*

+      *  Audit Fail Log

+      */

+     logp = loginfo.log_auditfail_logchain;

+     while (logp) {

+         tbuf[0] = buffer[0] = '\0';

+         log_convert_time(logp->l_ctime, tbuf, 1);

+         PR_snprintf(buffer, sizeof(buffer), "%s.%s", loginfo.log_auditfail_file, tbuf);

+         if (PR_Delete(buffer) != PR_SUCCESS) {

+             PRErrorCode prerr = PR_GetError();

+             slapi_log_err(SLAPI_LOG_ERR, "log__delete_rotated_logs",

+                           "Unable to remove file: %s - error %d (%s)\n",

+                           buffer, prerr, slapd_pr_strerror(prerr));

+             logp = logp->l_next;

+             continue;

+         }

+         prev_log = logp;

+         loginfo.log_numof_auditfail_logs--;

+         logp = logp->l_next;

+         slapi_ch_free((void **)&prev_log);

+     }

+ 

      /*

       *  Error log

       */
@@ -3132,11 +3165,17 @@ 

          log_convert_time(logp->l_ctime, tbuf, 1);

          PR_snprintf(buffer, sizeof(buffer), "%s.%s", loginfo.log_error_file, tbuf);

          if (PR_Delete(buffer) != PR_SUCCESS) {

+             PRErrorCode prerr = PR_GetError();

+             slapi_log_err(SLAPI_LOG_ERR, "log__delete_rotated_logs",

+                           "Unable to remove file: %s - error %d (%s)\n",

+                           buffer, prerr, slapd_pr_strerror(prerr));

              logp = logp->l_next;

              continue;

          }

+         prev_log = logp;

          loginfo.log_numof_error_logs--;

          logp = logp->l_next;

+         slapi_ch_free((void **)&prev_log);

      }

  }

  

Bug Description: The first issue what the internal represenation of
the default error code changed since the feature was
added. This caused the disk monitoring thread to loop
and never actually attempt to stop the server. The
other issue is that with nunc-stans g_set_shutdown()
is no longer stops the server.

Fix Description: Change the defaulterror logging level to be more robust
and accept all the default level values. Also, we needed
to free the rotated logs when we delete them. Finally
add raise() to g_set_downdow(), and make sure set_shutdown()
does not overwrite the slapd_shutdown value.

https://pagure.io/389-ds-base/issue/49734

Reviewed by: ?

The code LGTM and it fixes the above-mentioned issues (verified internally).
I'll leave it to other team members to check more.

That is good to me. Ack

rebased onto 1c5831c

5 years ago

Pull-Request has been merged by mreynolds

5 years ago

389-ds-base is moving from Pagure to Github. This means that new issues and pull requests
will be accepted only in 389-ds-base's github repository.

This pull request has been cloned to Github as issue and is available here:
- https://github.com/389ds/389-ds-base/issues/2818

If you want to continue to work on the PR, please navigate to the github issue,
download the patch from the attachments and file a new pull request.

Thank you for understanding. We apologize for all inconvenience.

Pull-Request has been closed by spichugi

3 years ago