#50693 Issue 50689 - Failed db restore task does not report an error
Closed a month ago by spichugi. Opened a year ago by mreynolds.
mreynolds/389-ds-base issue50689 into master

@@ -0,0 +1,73 @@
+ import logging
+ import pytest
+ import os
+ from datetime import datetime
+ from lib389._constants import DEFAULT_SUFFIX, INSTALL_LATEST_CONFIG
+ from lib389.properties import BACKEND_SAMPLE_ENTRIES, TASK_WAIT
+ from lib389.topologies import topology_st as topo
+ from lib389.backend import Backend
+ from lib389.tasks import BackupTask, RestoreTask
+
+ DEBUGGING = os.getenv("DEBUGGING", default=False)
+ if DEBUGGING:
+     logging.getLogger(__name__).setLevel(logging.DEBUG)
+ else:
+     logging.getLogger(__name__).setLevel(logging.INFO)
+ log = logging.getLogger(__name__)
+
+
+ def test_missing_backend(topo):
+     """Test that an error is returned when a restore is performed for a
+     backend that is no longer present.
+
+     :id: 889b8028-35cf-41d7-91f6-bc5193683646
+     :setup: Standalone Instance
+     :steps:
+         1. Create a second backend
+         2. Perform a back up
+         3. Remove one of the backends from the config
+         4. Perform a restore
+     :expectedresults:
+         1. Success
+         2. Success
+         3. Success
+         4. Failure
+     """
+
+     # Create a new backend
+     BE_NAME = 'backupRoot'
+     BE_SUFFIX = 'dc=back,dc=up'
+     props = {
+         'cn': BE_NAME,
+         'nsslapd-suffix': BE_SUFFIX,
+         BACKEND_SAMPLE_ENTRIES: INSTALL_LATEST_CONFIG
+     }
+     be = Backend(topo.standalone)
+     backend_entry = be.create(properties=props)
+
+     # perform backup
+     backup_dir_name = "backup-%s" % datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
+     archive = os.path.join(topo.standalone.ds_paths.backup_dir, backup_dir_name)
+     backup_task = BackupTask(topo.standalone)
+     task_properties = {'nsArchiveDir': archive}
+     backup_task.create(properties=task_properties)
+     backup_task.wait()
+     assert backup_task.get_exit_code() == 0
+
+     # Remove new backend
+     backend_entry.delete()
+
+     # Restore the backup - it should fail
+     restore_task = RestoreTask(topo.standalone)
+     task_properties = {'nsArchiveDir': archive}
+     restore_task.create(properties=task_properties)
+     restore_task.wait()
+     assert restore_task.get_exit_code() != 0
+
+
+ if __name__ == '__main__':
+     # Run isolated
+     # -s for DEBUG mode
+     CURRENT_FILE = os.path.realpath(__file__)
+     pytest.main(["-s", CURRENT_FILE])
+

@@ -19,14 +19,14 @@
  ldbm_back_archive2ldbm(Slapi_PBlock *pb)
  {
      struct ldbminfo *li;
-     char *rawdirectory = NULL; /* -a <directory> */
-     char *directory = NULL;    /* normalized */
-     int return_value = -1;
-     int task_flags = 0;
-     int run_from_cmdline = 0;
      Slapi_Task *task;
-     int is_old_to_new = 0;
      ldbm_instance *inst = NULL;
+     char *rawdirectory = NULL; /* -a <directory> */
+     char *directory = NULL;    /* normalized */
+     int32_t return_value = -1;
+     int32_t task_flags = 0;
+     int32_t run_from_cmdline = 0;
+     int32_t is_old_to_new = 0;

      slapi_pblock_get(pb, SLAPI_PLUGIN_PRIVATE, &li);
      slapi_pblock_get(pb, SLAPI_SEQ_VAL, &rawdirectory);
@@ -165,8 +165,7 @@
                                                  "the backup set.  error=%d (%s)\n",
                        return_value, dblayer_strerror(return_value));
          if (task) {
-             slapi_task_log_notice(task, "Failed to read the backup file set "
-                                         "from %s",
+             slapi_task_log_notice(task, "Failed to read the backup file set from %s",
                                    directory);
          }
      }
@@ -177,7 +176,7 @@
              char *p;
              char c;
              char *bakup_dir = NULL;
-             int skipinit = SLAPI_UPGRADEDB_SKIPINIT;
+             int32_t skipinit = SLAPI_UPGRADEDB_SKIPINIT;

              p = strrchr(directory, '/');
              if (NULL == p) {
@@ -204,25 +203,26 @@
              return_value = ldbm_back_upgradedb(pb);
          }
      } else {
-         ldbm_instance *inst;
          Object *inst_obj;
-         int ret;
+         int32_t ret;

          if (0 != return_value) {
-             /* error case (607331)
-              * just to go back to the previous state if possible */
-             if ((return_value = dblayer_start(li, DBLAYER_NORMAL_MODE))) {
+             /*
+              * error case (607331)
+              * just to go back to the previous state if possible (preserve
+              * original error for now)
+              */
+             if ((ret = dblayer_start(li, DBLAYER_NORMAL_MODE))) {
                  slapi_log_err(SLAPI_LOG_ERR,
                                "ldbm_back_archive2ldbm", "Unable to to start database in [%s]\n",
                                li->li_directory);
                  if (task) {
-                     slapi_task_log_notice(task, "Failed to start the database in "
-                                                 "%s",
+                     slapi_task_log_notice(task, "Failed to start the database in %s",
                                            li->li_directory);
                  }
-                 goto out;
              }
          }
+
          /* bring all backends and changelog back online */
          plugin_call_plugins(pb, SLAPI_PLUGIN_BE_POST_OPEN_FN);
          for (inst_obj = objset_first_obj(li->li_instance_set); inst_obj;
@@ -234,8 +234,7 @@
                        "ldbm_back_archive2ldbm", "Unable to restart '%s'\n",
                        inst->inst_name);
                  if (task) {
-                     slapi_task_log_notice(task, "Unable to restart '%s'",
-                                           inst->inst_name);
+                     slapi_task_log_notice(task, "Unable to restart '%s'", inst->inst_name);
                  }
              } else {
                  slapi_mtn_be_enable(inst->inst_be);
@@ -243,6 +242,7 @@
              }
          }
      }
+
  out:
      if (run_from_cmdline && (0 == return_value)) {
          dblayer_restore_file_update(li, directory);

@@ -5294,8 +5294,7 @@
                                                          "exist.\n",
                        src_dir);
          if (task) {
-             slapi_task_log_notice(task, "Restore: backup directory %s does not "
-                                         "exist.\n",
+             slapi_task_log_notice(task, "Restore: backup directory %s does not exist.",
                                    src_dir);
          }
          return LDAP_UNWILLING_TO_PERFORM;
@@ -5304,8 +5303,7 @@
                                                          "a directory.\n",
                        src_dir);
          if (task) {
-             slapi_task_log_notice(task, "Restore: backup directory %s is not "
-                                         "a directory.\n",
+             slapi_task_log_notice(task, "Restore: backup directory %s is not a directory.",
                                    src_dir);
          }
          return LDAP_UNWILLING_TO_PERFORM;
@@ -5345,12 +5343,13 @@
                      inst = ldbm_instance_find_by_name(li, (char *)direntry->name);
                      if (inst == NULL) {
                          slapi_log_err(SLAPI_LOG_ERR,
-                                       "bdb_restore", "Target server has no %s configured\n",
+                                       "bdb_restore", "Target server has no backend (%s) configured\n",
                                        direntry->name);
                          if (task) {
                              slapi_task_log_notice(task,
-                                                   "bdb_restore - Target server has no %s configured\n",
+                                                   "bdb_restore - Target server has no backend (%s) configured",
                                                    direntry->name);
+                             slapi_task_cancel(task, LDAP_UNWILLING_TO_PERFORM);
                          }
                          PR_CloseDir(dirhandle);
                          return_value = LDAP_UNWILLING_TO_PERFORM;
@@ -5363,7 +5362,7 @@
                                        src_dir, inst->inst_parent_dir_name);
                          if (task) {
                              slapi_task_log_notice(task,
-                                                   "Restore: backup dir %s and target dir %s are identical\n",
+                                                   "Restore: backup dir %s and target dir %s are identical",
                                                    src_dir, inst->inst_parent_dir_name);
                          }
                          PR_CloseDir(dirhandle);
@@ -5398,7 +5397,7 @@
                        "bdb_restore", "Failed to open the directory \"%s\"\n", real_src_dir);
          if (task) {
              slapi_task_log_notice(task,
-                                   "Restore: failed to open the directory \"%s\"\n", real_src_dir);
+                                   "Restore: failed to open the directory \"%s\"", real_src_dir);
          }
          return_value = -1;
          goto error_out;
@@ -5431,7 +5430,7 @@
                                        changelogdir);
                          if (task) {
                              slapi_task_log_notice(task,
-                                                   "Restore: broken changelog dir path %s\n",
+                                                   "Restore: broken changelog dir path %s",
                                                    changelogdir);
                          }
                          goto error_out;
@@ -5540,8 +5539,7 @@
          char *dataversion = NULL;

          if (bdb_version_read(li, home_dir, &ldbmversion, &dataversion) != 0) {
-             slapi_log_err(SLAPI_LOG_WARNING, "bdb_restore", "Unable to read dbversion "
-                                                                 "file in %s\n",
+             slapi_log_err(SLAPI_LOG_WARNING, "bdb_restore", "Unable to read dbversion file in %s\n",
                            home_dir);
          } else {
              adjust_idl_switch(ldbmversion, li);

@@ -26,11 +26,12 @@
      task = inst.restore_online(archive=args.archive, db_type=args.db_type)
      task.wait()
      result = task.get_exit_code()
+     task_log = task.get_task_log()

      if task.is_complete() and result == 0:
          log.info("The backup restore task has finished successfully")
      else:
-         raise ValueError("The backup restore task has failed with the error code: ({})".format(result))
+         raise ValueError("The backup restore task has failed with the error code: {}\n{}".format(result, task_log))


  def create_parser(subparsers):

Bug Description:

If you restore a backup that contains a backend that is no longer configured, the restore fails, but a success return code is returned to the client. This happens because the return code is overwritten after the failure.

Fix Description:

Preserve the error code upon failure and properly update the task exit code.
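
The core of the fix is easiest to see in isolation. Below is a minimal, self-contained C sketch of the pattern; `restore_backup()` and `start_database()` are hypothetical stand-ins for the real dblayer_restore()/dblayer_start() calls in ldbm_back_archive2ldbm():

```c
#include <stdio.h>

/* Hypothetical stand-ins for the real back-ldbm calls. */
static int restore_backup(void) { return -1; } /* fails: backend is missing */
static int start_database(void) { return 0; }  /* recovery attempt succeeds */

int main(void)
{
    int return_value = restore_backup(); /* original failure: -1 */
    int ret;

    /* Before the fix, the recovery attempt reused return_value:
     *     if ((return_value = start_database())) { ... }
     * A successful start_database() therefore erased the restore failure,
     * and the task reported exit code 0 to the client. */

    /* After the fix, the recovery attempt uses its own variable, so the
     * original error survives to become the task exit code. */
    if ((ret = start_database())) {
        fprintf(stderr, "recovery also failed: %d\n", ret);
    }

    printf("task exit code: %d\n", return_value); /* still -1 */
    return return_value == 0 ? 0 : 1;
}
```

The other half of the fix is the slapi_task_cancel(task, LDAP_UNWILLING_TO_PERFORM) call added in bdb_restore, which records the failure on the task itself so the client sees a non-zero exit code.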

relates: https://pagure.io/389-ds-base/issue/50689

You set the return value, but no longer skip the next part of the code. Is this intentional?

Correct: the next part of the code sets the backends as "not busy"; otherwise all backends are inaccessible until you restart the server.
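
To make that control flow concrete, here is a minimal sketch (the backend names are assumed for illustration; the real code walks li->li_instance_set and calls slapi_mtn_be_enable() per instance) of why the failure path now falls through instead of jumping straight to `out:`:

```c
#include <stdio.h>

/* Assumed backend names for illustration only. */
static const char *backends[] = {"userRoot", "backupRoot"};

int main(void)
{
    int return_value = -1; /* the restore already failed */

    /* No early "goto out" here: even on failure we fall through, so every
     * backend is marked "not busy" again; otherwise they would all stay
     * inaccessible until the server is restarted. */
    for (size_t i = 0; i < sizeof(backends) / sizeof(backends[0]); i++) {
        printf("re-enabling backend '%s'\n", backends[i]);
    }

    printf("reported exit code: %d\n", return_value); /* failure preserved */
    return 0;
}
```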

I think I am missing something.
We are here because dblayer_restore failed (return_value != 0) before dblayer_start. This will overwrite the dblayer_restore failure with the error from dblayer_start; should we not keep the original error code?

> I think I am missing something.

:-) I know the feeling

> We are here because dblayer_restore failed (return_value != 0) before dblayer_start. This will overwrite the dblayer_restore failure with the error from dblayer_start; should we not keep the original error code?

Actually, part of the fix prevents dblayer_start from overwriting the original error code. What was happening originally was that we hit an error, but then dblayer_start would succeed, and the function would return dblayer_start()'s success code.

I'm not 100% sure why we even try to run dblayer_start() after a failure, or what the intent of overwriting the original error code was (in the original source code). So if it fails, which error should we return? I guess it makes sense to keep the original error code, and not the new one from dblayer_start() - I'll change that...

1 new commit added

  • Never rewrite the original error code
a year ago

I think that if dblayer_recover failed, there is a call to dblayer_start as an attempt to repair something... it is old code and I do not really know if it is still working.
I agree with your comment; it would also make sense to 'goto out' as soon as we detect a failure.

The patch looks good to me. ACK

> I'm not 100% sure why we even try to run dblayer_start() after a failure.

Well, I think the idea could have been that when the task is run, the db is closed before running the task and has to be started afterwards, assuming that in the best of all worlds the original state is still available - which I think is not the case.
And why we start only on failure I don't know - this code needs to be investigated more and cleaned up :-(

rebased onto 1747f91

a year ago

> I'm not 100% sure why we even try to run dblayer_start() after a failure.

> Well, I think the idea could have been that when the task is run, the db is closed before running the task and has to be started afterwards, assuming that in the best of all worlds the original state is still available - which I think is not the case.
> And why we start only on failure I don't know - this code needs to be investigated more and cleaned up :-(

Well, I think it was added as a concept/theory and was not added to address a real problem scenario - but I really don't know for sure. I do know that with the old code those failures were basically ignored if dblayer_start() succeeded (as in this case), so I don't think it ever really worked as designed.

For this patch I will leave it as is, but I suspect we could rewrite this function and remove that dblayer_start() code.

Pull-Request has been merged by mreynolds

a year ago

389-ds-base is moving from Pagure to GitHub. This means that new issues and pull requests
will be accepted only in 389-ds-base's GitHub repository.

This pull request has been cloned to GitHub as an issue and is available here:
- https://github.com/389ds/389-ds-base/issues/3748

If you want to continue to work on the PR, please navigate to the GitHub issue,
download the patch from the attachments, and file a new pull request.

Thank you for understanding. We apologize for any inconvenience.

Pull-Request has been closed by spichugi

a month ago