#50811 Issue 50599 - Remove db region files prior to db recovery
Closed 3 years ago by spichugi. Opened 4 years ago by mreynolds.
mreynolds/389-ds-base issue50599  into  master

@@ -1657,7 +1657,7 @@ 

               * sessions.

               */

  

-             if (slapi_disordely_shutdown(PR_FALSE)) {

+             if (slapi_disorderly_shutdown(PR_FALSE)) {

                  slapi_log_err(SLAPI_LOG_WARNING, repl_plugin_name, "replica_check_for_data_reload - "

                                                                     "Disorderly shutdown for replica %s. Check if DB RUV needs to be updated\n",

                                slapi_sdn_get_dn(r->repl_root));
@@ -1701,7 +1701,7 @@ 

                                    slapi_sdn_get_dn(r->repl_root));

                      rc = 0;

                  }

-             } /* slapi_disordely_shutdown */

+             } /* slapi_disorderly_shutdown */

  

              object_release(ruv_obj);

          } else /* we have no changes currently logged for this replica */

@@ -15,6 +15,8 @@ 

  #include <prclist.h>

  #include <sys/types.h>

  #include <sys/statvfs.h>

+ #include <glob.h>

+ 

  

  #define DB_OPEN(oflags, db, txnid, file, database, type, flags, mode, rval)                                     \

      {                                                                                                           \
@@ -990,10 +992,9 @@ 

      return_value = dblayer_grok_directory(region_dir,

                                            DBLAYER_DIRECTORY_READWRITE_ACCESS);

      if (0 != return_value) {

-         slapi_log_err(SLAPI_LOG_CRIT, "bdb_start", "Can't start because the database "

-                                                        "directory \"%s\" either doesn't exist, or is not "

-                                                        "accessible\n",

-                       region_dir);

+         slapi_log_err(SLAPI_LOG_CRIT, "bdb_start",

+                 "Can't start because the database directory \"%s\" either doesn't exist, or is not accessible\n",

+                 region_dir);

          return return_value;

      }

  
@@ -1003,10 +1004,9 @@ 

          return_value = dblayer_grok_directory(log_dir,

                                                DBLAYER_DIRECTORY_READWRITE_ACCESS);

          if (0 != return_value) {

-             slapi_log_err(SLAPI_LOG_CRIT, "bdb_start", "Can't start because the log "

-                                                            "directory \"%s\" either doesn't exist, or is not "

-                                                            "accessible\n",

-                           log_dir);

+             slapi_log_err(SLAPI_LOG_CRIT, "bdb_start",

+                     "Can't start because the log directory \"%s\" either doesn't exist, or is not accessible\n",

+                     log_dir);

              return return_value;

          }

      }
@@ -1057,15 +1057,27 @@ 

          if (conf->bdb_recovery_required) {

              open_flags |= DB_RECOVER;

              if (DBLAYER_RESTORE_MODE & dbmode) {

-                 slapi_log_err(SLAPI_LOG_NOTICE, "bdb_start", "Recovering database after restore "

-                                                                  "from archive.\n");

+                 slapi_log_err(SLAPI_LOG_NOTICE, "bdb_start",

+                         "Recovering database after restore from archive.\n");

              } else if (DBLAYER_CLEAN_RECOVER_MODE & dbmode) {

-                 slapi_log_err(SLAPI_LOG_NOTICE, "bdb_start", "Clean up db environment and start "

-                                                                  "from archive.\n");

+                 slapi_log_err(SLAPI_LOG_NOTICE, "bdb_start",

+                         "Clean up db environment and start from archive.\n");

              } else {

-                 slapi_log_err(SLAPI_LOG_NOTICE, "bdb_start", "Detected Disorderly Shutdown last "

-                                                                  "time Directory Server was running, recovering database.\n");

-                 slapi_disordely_shutdown(PR_TRUE);

+                 glob_t globbuf;

+                 char file_pattern[MAXPATHLEN];

+ 

+                 slapi_log_err(SLAPI_LOG_NOTICE, "bdb_start",

+                         "Detected Disorderly Shutdown last time Directory Server was running, recovering database.\n");

+                 slapi_disorderly_shutdown(PR_TRUE);

+ 

+                 /* Better wipe out the region files to help ensure a clean start */

+                 PR_snprintf(file_pattern, MAXPATHLEN, "%s/%s", region_dir, "__db.*");

+                 if (glob(file_pattern, GLOB_DOOFFS, NULL, &globbuf) == 0) {

+                     for (size_t i = 0; i < globbuf.gl_pathc; i++) {

+                         remove(globbuf.gl_pathv[i]);

+                     }

+                     globfree(&globbuf);

+                 }

              }

          }

          switch (dbmode & DBLAYER_RESTORE_MASK) {
@@ -1121,7 +1133,7 @@ 

               */

              if (conf->bdb_lock_config <= BDB_LOCK_NB_MIN) {

                  slapi_log_err(SLAPI_LOG_NOTICE, "bdb_start", "New max db lock count is too small.  "

-                                                                  "Resetting it to the default value %d.\n",

+                               "Resetting it to the default value %d.\n",

                                BDB_LOCK_NB_MIN);

                  conf->bdb_lock_config = BDB_LOCK_NB_MIN;

              }
@@ -1165,29 +1177,26 @@ 

      if ((open_flags & DB_RECOVER) || (open_flags & DB_RECOVER_FATAL)) {

          /* Recover, then close, then open again */

          int recover_flags = open_flags & ~DB_THREAD;

- 

          if (DBLAYER_CLEAN_RECOVER_MODE & dbmode) /* upgrade case */

          {

              DB_ENV *thisenv = pEnv->bdb_DB_ENV;

              return_value = thisenv->remove(thisenv, region_dir, DB_FORCE);

              if (0 != return_value) {

-                 slapi_log_err(SLAPI_LOG_CRIT,

-                               "bdb_start", "Failed to remove old db env "

-                                                "in %s: %s\n",

-                               region_dir,

-                               dblayer_strerror(return_value));

+                 slapi_log_err(SLAPI_LOG_CRIT, "bdb_start",

+                         "Failed to remove old db env in %s: %s\n",

+                         region_dir, dblayer_strerror(return_value));

                  return return_value;

              }

              dbmode = DBLAYER_NORMAL_MODE;

  

              if ((return_value = bdb_make_env(&pEnv, li)) != 0) {

-                 slapi_log_err(SLAPI_LOG_CRIT,

-                               "bdb_start", "Failed to create DBENV (returned: %d).\n",

-                               return_value);

+                 slapi_log_err(SLAPI_LOG_CRIT, "bdb_start",

+                         "Failed to create DBENV (returned: %d).\n", return_value);

                  return return_value;

              }

          }

  

+ 

          return_value = (pEnv->bdb_DB_ENV->open)(

              pEnv->bdb_DB_ENV,

              region_dir,
@@ -1201,27 +1210,25 @@ 

                   */

                  slapi_log_err(SLAPI_LOG_CRIT,

                                "bdb_start", "mmap in opening database environment (recovery mode) "

-                                                "failed trying to allocate %" PRIu64 " bytes. (OS err %d - %s)\n",

+                               "failed trying to allocate %" PRIu64 " bytes. (OS err %d - %s)\n",

                                li->li_dbcachesize, return_value, dblayer_strerror(return_value));

                  bdb_free_env(&priv->dblayer_env);

                  priv->dblayer_env = CATASTROPHIC;

              } else {

                  slapi_log_err(SLAPI_LOG_CRIT, "bdb_start", "Database Recovery Process FAILED. "

-                                                                "The database is not recoverable. err=%d: %s\n",

+                               "The database is not recoverable. err=%d: %s\n",

                                return_value, dblayer_strerror(return_value));

-                 slapi_log_err(SLAPI_LOG_CRIT,

-                               "bdb_start", "Please make sure there is enough disk space for "

-                                                "dbcache (%" PRIu64 " bytes) and db region files\n",

-                               li->li_dbcachesize);

+                 slapi_log_err(SLAPI_LOG_CRIT, "bdb_start",

+                         "Please make sure there is enough disk space for dbcache (%" PRIu64 " bytes) and db region files\n",

+                         li->li_dbcachesize);

              }

              return return_value;

          } else {

              open_flags &= ~(DB_RECOVER | DB_RECOVER_FATAL);

              pEnv->bdb_DB_ENV->close(pEnv->bdb_DB_ENV, 0);

              if ((return_value = bdb_make_env(&pEnv, li)) != 0) {

-                 slapi_log_err(SLAPI_LOG_CRIT,

-                               "bdb_start", "Failed to create DBENV (returned: %d).\n",

-                               return_value);

+                 slapi_log_err(SLAPI_LOG_CRIT, "bdb_start",

+                         "Failed to create DBENV (returned: %d).\n", return_value);

                  return return_value;

              }

              bdb_free_env(&priv->dblayer_env);
@@ -1288,16 +1295,15 @@ 

                   * https://blackflag.mcom.com/show_bug.cgi?id=557319

                   * Crash ns-slapd while running scalab01 after restart slapd

                   */

-                 slapi_log_err(SLAPI_LOG_CRIT,

-                               "bdb_start", "mmap in opening database environment "

-                                                "failed trying to allocate %" PRIu64 " bytes. (OS err %d - %s)\n",

-                               li->li_dbcachesize, return_value, dblayer_strerror(return_value));

+                 slapi_log_err(SLAPI_LOG_CRIT, "bdb_start",

+                         "mmap in opening database environment failed trying to allocate %" PRIu64 " bytes. (OS err %d - %s)\n",

+                         li->li_dbcachesize, return_value, dblayer_strerror(return_value));

                  bdb_free_env(&priv->dblayer_env);

                  priv->dblayer_env = CATASTROPHIC;

              } else {

-                 slapi_log_err(SLAPI_LOG_CRIT,

-                               "bdb_start", "Opening database environment (%s) failed. err=%d: %s\n",

-                               region_dir, return_value, dblayer_strerror(return_value));

+                 slapi_log_err(SLAPI_LOG_CRIT, "bdb_start",

+                         "Opening database environment (%s) failed. err=%d: %s\n",

+                         region_dir, return_value, dblayer_strerror(return_value));

              }

          }

          return return_value;

file modified
+4 -4
@@ -4383,14 +4383,14 @@ 

  }

  

  PRBool

- slapi_disordely_shutdown(PRBool set)

+ slapi_disorderly_shutdown(PRBool set)

  {

-     static PRBool is_disordely_shutdown = PR_FALSE;

+     static PRBool is_disorderly_shutdown = PR_FALSE;

  

      if (set) {

-         is_disordely_shutdown = PR_TRUE;

+         is_disorderly_shutdown = PR_TRUE;

      }

-     return (is_disordely_shutdown);

+     return (is_disorderly_shutdown);

  }

  

  /*

@@ -7901,7 +7901,7 @@ 

  

  void slapi_set_plugin_open_rootdn_bind(Slapi_PBlock *pb);

  

- PRBool slapi_disordely_shutdown(PRBool set);

+ PRBool slapi_disorderly_shutdown(PRBool set);

  

  /*

   * Public entry extension getter/setter functions

Bug Description:

If the server crashes then the region files can become corrupted and this prevents the server from starting.

Fix Description:

If we encounter a disorderly shutdown, then remove the region files so there is a clean slate to start with.

Also fixed some indenting issues around log functions...

relates: https://pagure.io/389-ds-base/issue/50599

Looks reasonable to me, @lkrispen I think you agree with this change after we discussed it?

yes, it looks safe to delete the files in these places, but :-)

In theory there can be more or fewer than the 001-003 files.
I think with a very large dbcache it can be split over more files, and then there will be more.
Also, if you run with private memory, there is either no region file or only the 001 file.

Could there be a generic delete of __db.* ?

rebased onto fa1f69a

4 years ago

yes, it looks safe to delete the files in these places, but :-)
In theory there can be more or fewer than the 001-003 files.
I think with a very large dbcache it can be split over more files, and then there will be more.
Also, if you run with private memory, there is either no region file or only the 001 file.
Could there be a generic delete of __db.* ?

Yes, it's done now, please review...

looks good and disorderly finally spelled correctly :-)

ACK

Pull-Request has been merged by mreynolds

4 years ago

389-ds-base is moving from Pagure to GitHub. This means that new issues and pull requests
will be accepted only in 389-ds-base's GitHub repository.

This pull request has been cloned to GitHub as an issue and is available here:
- https://github.com/389ds/389-ds-base/issues/3865

If you want to continue to work on the PR, please navigate to the github issue,
download the patch from the attachments and file a new pull request.

Thank you for understanding. We apologize for any inconvenience.

Pull-Request has been closed by spichugi

3 years ago