#8074 Nightly test failure in test_backup_and_restore.py::TestBackupAndRestoreWithReplica::()::test_full_backup_and_restore_with_replica
Closed: worksforme 3 years ago by frenaud. Opened 4 years ago by frenaud.

Issue

The nightly test test_backup_and_restore.py::TestBackupAndRestoreWithReplica::()::test_full_backup_and_restore_with_replica failed during [testing_ipa-4.7] Nightly PR #3706.
The failure needs to be investigated.

Logs available at the following location.


Metadata Update from @pcech:
- Issue tagged with: Falcon

4 years ago

Also happened on testing_ipa-4-8 PR #3875

And on testing_master_latest PR #4125

And on testing_master_previous PR #4477

Also on testing_master_previous PR #4579: logs
extract:

self = <ipatests.test_integration.test_backup_and_restore.TestBackupAndRestoreWithReplica object at 0x7fbc85b4fad0>
cert_sign_request = {'master.ipa.test': '/tmp/tmp.hm76IxMjHb', 'replica0.ipa.test': '/tmp/tmp.HaIOsi4lSl', 'replica1.ipa.test': '/tmp/tmp.jlMVqZjJpj'}

    def test_full_backup_and_restore_with_replica(self, cert_sign_request):
        # check prerequisites
        self.check_replication_success(self.master)
        self.check_replication_success(self.replica1)

        self.master.run_command(
            ['ipa', 'service-add', 'TEST/' + self.master.hostname])

        tasks.user_add(self.master, 'test1_master')
        tasks.user_add(self.replica1, 'test1_replica')

        with restore_checker(self.master):
            backup_path = tasks.get_backup_dir(self.master)

            # change data after backup
            self.master.run_command(['ipa', 'user-del', 'test1_master'])
            self.replica1.run_command(['ipa', 'user-del', 'test1_replica'])
            tasks.user_add(self.master, 'test2_master')
            tasks.user_add(self.replica1, 'test2_replica')

            # simulate master crash
            self.master.run_command(['ipactl', 'stop'])
            tasks.uninstall_master(self.master, clean=False)

            logger.info("Stopping and disabling oddjobd service")
            self.master.run_command([
                "systemctl", "stop", "oddjobd"
            ])
            self.master.run_command([
                "systemctl", "disable", "oddjobd"
            ])

            self.master.run_command(['ipa-restore', '-U', backup_path])

        status = self.master.run_command([
            "systemctl", "status", "oddjobd"
        ])
        assert "active (running)" in status.stdout_text

        # replication should not work after restoration
        # create users to force master and replica to try to replicate
        tasks.user_add(self.master, 'test3_master')
        tasks.user_add(self.replica1, 'test3_replica')
        self.check_replication_error(self.master)
        self.check_replication_error(self.replica1)
        assert {'admin', 'test1_master', 'test1_replica', 'test3_master'} == \
            self.get_users(self.master)
        assert {'admin', 'test2_master', 'test2_replica', 'test3_replica'} == \
            self.get_users(self.replica1)

        # reestablish and check replication
        self.replica1.run_command(['ipa-replica-manage', 're-initialize',
                                  '--from', self.master.hostname])
        # create users to force master and replica to try to replicate
        tasks.user_add(self.master, 'test4_master')
        tasks.user_add(self.replica1, 'test4_replica')
        self.check_replication_success(self.master)
>       self.check_replication_success(self.replica1)

test_integration/test_backup_and_restore.py:616: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
test_integration/test_backup_and_restore.py:547: in check_replication_success
    raise_on_timeout=True)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 

ldap = <ipatests.pytest_ipa.integration.host.LDAPClientWithoutCertCheck object at 0x7fbc844f72d0>
timeout = 30
target_status_re = 'Error \\(0\\) Replica acquired successfully: Incremental update succeeded'
raise_on_timeout = True

    def wait_for_replication(ldap, timeout=30,
                             target_status_re=r'^0 |^Error \(0\) ',
                             raise_on_timeout=False):
        """Wait for all replication agreements to reach desired state

        With defaults waits until updates on all replication agreements are
        done (or failed) and exits without exception
        :param ldap: LDAP client
            autenticated with necessary rights to read the mapping tree
        :param timeout: Maximum time to wait, in seconds
        :param target_status_re: Regexp of status to wait for
        :param raise_on_timeout: if True, raises AssertionError if status not
            reached in specified time

        Note that this waits for updates originating on this host, not those
        coming from other hosts.
        """
        logger.debug('Waiting for replication to finish')
        start = time.time()
        while True:
            status_attr = 'nsds5replicaLastUpdateStatus'
            progress_attr = 'nsds5replicaUpdateInProgress'
            entries = ldap.get_entries(
                DN(('cn', 'mapping tree'), ('cn', 'config')),
                filter='(objectclass=nsds5replicationagreement)',
                attrs_list=[status_attr, progress_attr])
            logger.debug('Replication agreements: \n%s', _entries_to_ldif(entries))
            statuses = [entry.single_value[status_attr] for entry in entries]
            wrong_statuses = [s for s in statuses
                              if not re.match(target_status_re, s)]
            if any(e.single_value[progress_attr] == 'TRUE' for e in entries):
                msg = 'Replication not finished'
                logger.debug(msg)
            elif wrong_statuses:
                msg = 'Unexpected replication status: %s' % wrong_statuses[0]
                logger.debug(msg)
            else:
                logger.debug('Replication finished')
                return
            if time.time() - start > timeout:
                logger.error('Giving up wait for replication to finish')
                if raise_on_timeout:
>                   raise AssertionError(msg)
E                   AssertionError: Unexpected replication status: Error (11) Replication error acquiring replica: Unable to acquire replica: the replica has the same Replica ID as this one. Replication is aborting. (duplicate replica ID detected)

The failure has not happened in the last 10+ runs on the master branch, so closing. Feel free to re-open if a new failure is seen.

Metadata Update from @frenaud:
- Issue close_status updated to: worksforme
- Issue status updated to: Closed (was: Open)

3 years ago

Login to comment on this ticket.

Metadata