#51261 Ticket 51260 - fix potential syncrepl data corruption
Closed 3 years ago by spichugi. Opened 3 years ago by firstyear.
firstyear/389-ds-base 51171-entryuuid-syncrepl  into  master

@@ -21,6 +21,8 @@ 

  

  log = logging.getLogger(__name__)

  

+ OU_PEOPLE = "ou=people,%s" % DEFAULT_SUFFIX

+ 

  class ISyncRepl(DirSrv, SyncreplConsumer):

      """

      This implements a test harness for checking syncrepl, and allowing us to check various actions or
@@ -55,6 +57,7 @@ 

          self.delete = []

          self.present = []

          self.entries = {}

+         self.refdel = False

          self.next_cookie = None

          # Start the sync

          # If cookie is none, will call "get_cookie" we have.
@@ -89,6 +92,9 @@ 

  

      def syncrepl_present(self, uuids, refreshDeletes=False):

          log.debug(f'=====> refdel -> {refreshDeletes} uuids -> {uuids}')

+         if refreshDeletes:

+             # Indicate we recieved a refdel in the process.

+             self.refdel = True

          if uuids is not None:

              self.present = self.present + uuids

  
@@ -105,8 +111,9 @@ 

  

  def syncstate_assert(st, sync):

      # How many entries do we have?

+     # We setup sync under ou=people so we can modrdn out of the scope.

      r = st.search_ext_s(

-         base=DEFAULT_SUFFIX,

+         base=OU_PEOPLE,

          scope=ldap.SCOPE_SUBTREE,

          filterstr='(objectClass=*)',

          attrsonly=1,
@@ -115,49 +122,154 @@ 

  

      # Initial sync

      log.debug("*test* initial")

-     sync.syncrepl_search()

+     sync.syncrepl_search(base=OU_PEOPLE)

      sync.syncrepl_complete()

      # check we caught them all

      assert len(r) == len(sync.entries.keys())

      assert len(r) == len(sync.present)

      assert 0 == len(sync.delete)

+     if sync.openldap:

+         assert True == sync.refdel

+     else:

+         assert False == sync.refdel

  

      # Add a new entry

- 

      account = nsUserAccounts(st, DEFAULT_SUFFIX).create_test_user()

+ 

+     # Find the primary uuid we expect to see in syncrepl.

+     # This will be None if not present.

+     acc_uuid = account.get_attr_val_utf8('entryuuid')

+     if acc_uuid is None:

+         nsid = account.get_attr_val_utf8('nsuniqueid')

+         # nsunique has a diff format, so we change it up.

+         # 431cf081-b44311ea-83fdb082-f24d490e

+         # Add a hyphen V

+         # 431cf081-b443-11ea-83fdb082-f24d490e

+         nsid_a = nsid[:13] + '-' + nsid[13:]

+         #           Add a hyphen V

+         # 431cf081-b443-11ea-83fd-b082-f24d490e

+         nsid_b = nsid_a[:23] + '-' + nsid_a[23:]

+         #             Remove a hyphen V

+         # 431cf081-b443-11ea-83fd-b082-f24d490e

+         acc_uuid = nsid_b[:28] + nsid_b[29:]

+         # Tada!

+         # 431cf081-b443-11ea-83fd-b082f24d490e

+         log.debug(f"--> expected sync uuid (from nsuniqueid): {acc_uuid}")

+     else:

+         log.debug(f"--> expected sync uuid (from entryuuid): {acc_uuid}")

+ 

      # Check

      log.debug("*test* add")

-     sync.syncrepl_search()

+     sync.syncrepl_search(base=OU_PEOPLE)

      sync.syncrepl_complete()

      sync.check_cookie()

+     log.debug(f"sd: {sync.delete}, sp: {sync.present} sek: {sync.entries.keys()}")

+ 

      assert 1 == len(sync.entries.keys())

      assert 1 == len(sync.present)

+     ####################################

+     ## assert sync.present == [acc_uuid]

      assert 0 == len(sync.delete)

+     if sync.openldap:

+         assert True == sync.refdel

+     else:

+         assert False == sync.refdel

  

      # Mod

      account.replace('description', 'change')

      # Check

      log.debug("*test* mod")

-     sync.syncrepl_search()

+     sync.syncrepl_search(base=OU_PEOPLE)

+     sync.syncrepl_complete()

+     sync.check_cookie()

+     log.debug(f"sd: {sync.delete}, sp: {sync.present} sek: {sync.entries.keys()}")

+     assert 1 == len(sync.entries.keys())

+     assert 1 == len(sync.present)

+     ####################################

+     ## assert sync.present == [acc_uuid]

+     assert 0 == len(sync.delete)

+     if sync.openldap:

+         assert True == sync.refdel

+     else:

+         assert False == sync.refdel

+ 

+     ## ModRdn (remain in scope)

+     account.rename('uid=test1_modrdn')

+     # newsuperior=None

+     # Check

+     log.debug("*test* modrdn (in scope)")

+     sync.syncrepl_search(base=OU_PEOPLE)

      sync.syncrepl_complete()

      sync.check_cookie()

+     log.debug(f"sd: {sync.delete}, sp: {sync.present} sek: {sync.entries.keys()}")

      assert 1 == len(sync.entries.keys())

      assert 1 == len(sync.present)

+     ####################################

+     ## assert sync.present == [acc_uuid]

      assert 0 == len(sync.delete)

+     if sync.openldap:

+         assert True == sync.refdel

+     else:

+         assert False == sync.refdel

+ 

+     ## Modrdn (out of scope, then back into scope)

+     account.rename('uid=test1_modrdn', newsuperior=DEFAULT_SUFFIX)

+ 

+     # Check it's gone.

+     log.debug("*test* modrdn (move out of scope)")

+     sync.syncrepl_search(base=OU_PEOPLE)

+     sync.syncrepl_complete()

+     sync.check_cookie()

+     log.debug(f"sd: {sync.delete}, sp: {sync.present} sek: {sync.entries.keys()}")

+     assert 0 == len(sync.entries.keys())

+     assert 0 == len(sync.present)

+     assert 0 == len(sync.delete)

+     if sync.openldap:

+         assert True == sync.refdel

+     else:

+         assert True == sync.refdel

+ 

+     # Put it back

+     account.rename('uid=test1_modrdn', newsuperior=OU_PEOPLE)

+     log.debug("*test* modrdn (move in to scope)")

+     sync.syncrepl_search(base=OU_PEOPLE)

+     sync.syncrepl_complete()

+     sync.check_cookie()

+     log.debug(f"sd: {sync.delete}, sp: {sync.present} sek: {sync.entries.keys()}")

+     assert 1 == len(sync.entries.keys())

+     assert 1 == len(sync.present)

+     ####################################

+     ## assert sync.present == [acc_uuid]

+     assert 0 == len(sync.delete)

+     if sync.openldap:

+         assert True == sync.refdel

+     else:

+         assert False == sync.refdel

  

      ## Delete

      account.delete()

  

+     # import time

+     # print("attach now ....")

+     # time.sleep(45)

+ 

      # Check

      log.debug("*test* del")

-     sync.syncrepl_search()

+     sync.syncrepl_search(base=OU_PEOPLE)

      sync.syncrepl_complete()

      # In a delete, the cookie isn't updated (?)

      sync.check_cookie()

      log.debug(f'{sync.entries.keys()}')

      log.debug(f'{sync.present}')

      log.debug(f'{sync.delete}')

+     log.debug(f"sd: {sync.delete}, sp: {sync.present} sek: {sync.entries.keys()}")

      assert 0 == len(sync.entries.keys())

      assert 0 == len(sync.present)

-     assert 1 == len(sync.delete)

+     assert 0 == len(sync.delete)

+     ####################################

+     ## assert sync.delete == [acc_uuid]

+     if sync.openldap:

+         assert True == sync.refdel

+     else:

+         assert True == sync.refdel

  

@@ -0,0 +1,42 @@ 

+ With support from a techwriter at SUSE, we managed to translate RCF4533

+ into something we can understand.

+ 

+ * Server issues Sync Operation to search.

+ * Search returns entries

+ * Each entry has a state add which includes the UUID of the entry

+ * Distinguished names can change, but UUIDs are stable.

+ * At the end of the search results, a sync done control cookie is sent to indicate that the session is done.

+ 

+ * Clients issue a Sync Operation to search, and include previously returned cookie.

+ * Server determines what would be returned in a normal search operation.

+ * Server uses cookie to work out what was previously returned.

+ * Server issues results of only the things that have changed since last search.

+ * Each result sent also includes info about the status (changed or unchanged)

+ 

+ * Server has two phases to sync deleted entries: present or delete.

+ * Each phase ends with a sync done control cookie.

+ * The present phase ends with sync done control value of searchresultdone=false

+ * the delete phase ends with sync done control value of searchresultdone=true

+ * The present phase can be followed by the delete phase.

+ * Each phase is complete only after a sync info message of refreshdone=false

+ 

+ * During the present phase, the server sends an empty entry with state=present for each unchanged entry.

+ * During the present phase, the client is changed to match the server state, in preparation for the delete phase.

+ * During the delete phase, the server determines which entries in the client copy are no longer present. It also checks that the the number of changed entries is less than the number of unchanged entries.

+  For each entry that is no longer present, the server sends a state=delete. It does not* return a state=present for each present entry.

+ 

+ * The server can send sync info messages that contain the list of UUIDs of either unchanged present entries, or deleted entries. This is instead of sending individual messages for each entry.

+ * If refreshDeletes=false the UUIDs of unchanged present entries are included in the syncUUIDs set.

+ * If refreshDeletes=true the UUIDs of unchanged deleted entries are included in the syncUUIDs set.

+ * Optionally, you can include a syncIdSet cookie to indicate the state of the content in the syncUUIDs set.

+ 

+ * If the syncDoneValue is refreshDeletes=false the new copy includes:

+ - All changed entries returned by the current sync

+ - All unchanged entries from a previous sync that have been confirmed to still be present

+ - Unchanged entries confirmed as deleted are deleted from the client. In this case, they are assumed to have been deleted or moved.

+ 

+ * If the syncDoneValue is refreshDeletes=true the new copy includes:

+ - All changed entries returned by the current sync

+ - All entries from a previous sync that have not been marked as deleted.

+ 

+ * Clients can request that the synchronized copy is refreshed at any time.

@@ -234,6 +234,8 @@ 

           * Point is, if we set refresh to true for openldap mode, it works, and if

           * it's false, the moment we send a single intermediate delete message, we

           * delete literally everything 🔥.

+          *

+          * See README.md for more about how this works.

           */

          if (info->cookie->openldap_compat) {

              sync_create_sync_done_control(&ctrl[0], 1, cookiestr);
@@ -323,8 +325,25 @@ 

      cb_data.orig_pb = pb;

      cb_data.change_start = client_cookie->cookie_change_info;

  

+     /*

+      * The client has already seen up to AND including change_info, so this should

+      * should reflect that. originally was:

+      *

+      *  filter = slapi_ch_smprintf("(&(changenumber>=%lu)(changenumber<=%lu))",

+      *                             client_cookie->cookie_change_info,

+      *                             server_cookie->cookie_change_info);

+      *

+      * which would create a situation where if the previous cn was say 5, and the next

+      * is 6, we'd get both 5 and 6, even though the client has already seen 5. But worse

+      * if 5 was an "add" of the entry, and 6 was a "delete" of the same entry then sync

+      * would over-optimise and remove the sync value because it things the add/delete was

+      * in the same operation so we'd never send it. But the client HAD seen the add, and

+      * now we'd never send the delete so this would be a bug. This created some confusion

+      * for me in the tests, but the sync repl tests now correctly work and reflect the behaviour

+      * expected.

+      */

      filter = slapi_ch_smprintf("(&(changenumber>=%lu)(changenumber<=%lu))",

-                                client_cookie->cookie_change_info,

+                                client_cookie->cookie_change_info + 1,

I agree with that change. My understanding is that starting client_cookie+1 conforms RFC

In each phase, for each entry or reference that has been added to the
content or been changed since the previous Sync Operation indicated
by the cookie, the server returns a SearchResultEntry or
SearchResultReference Message, ....

You may add this in your comment

                                 server_cookie->cookie_change_info);

'chg_count' should be server_cookie - client_cookie.
In case chg_count == 0 then the routine would exit immediately (saving internal search and lot of computation)

      slapi_search_internal_set_pb(

          seq_pb,

Bug Description: The cookie encodes which changelog entries we
have seen up to and including. However, the sync process would then
re-send the cl item from the cookie number. This could cause corruption
in some cases as some combinations of actions between two points
are no-oped in the server.

Fix Description: Fix the changelog search to always process that
entries of the CL must be greater than, but not equal to the
already seen CL items from the cookie.

fixes #51260
https://pagure.io/389-ds-base/issue/51260

Author: William Brown <william@blackhats.net.au>

NOTE: This is split into two commits. One which is supplementary improvements such as test and documentation expansion around sync repl. The second is the fix to the issue, which is isolated to allow easier backporting.

2 new commits added

  • Ticket 51260 - improve tests and improve readme re refdel
  • Ticket 51260 - fix potential syncrepl data corruption
3 years ago

I agree with that change. My understanding is that starting client_cookie+1 conforms RFC

In each phase, for each entry or reference that has been added to the
content or been changed since the previous Sync Operation indicated
by the cookie, the server returns a SearchResultEntry or
SearchResultReference Message, ....

You may add this in your comment

'chg_count' should be server_cookie - client_cookie.
In case chg_count == 0 then the routine would exit immediately (saving internal search and lot of computation)

The fix looks good to me. Let's wait feedback from @elkris and others.

Yep. I'd also want @mreynolds to comment if we should backport this fix to older versions too since I think this would affect many versions ....

Pull-Request has been closed by spichugi

3 years ago

Pull-Request has been reopened by spichugi

3 years ago

Pull-Request has been closed by spichugi

3 years ago

Pull-Request has been closed by spichugi

3 years ago

Pull-Request has been closed by spichugi

3 years ago

389-ds-base is moving from Pagure to Github. This means that new issues and pull requests
will be accepted only in 389-ds-base's github repository.

This pull request has been cloned to Github as issue and is available here:
- https://github.com/389ds/389-ds-base/issues/4314

If you want to continue to work on the PR, please navigate to the github issue,
download the patch from the attachments and file a new pull request.

Thank you for understanding. We apologize for all inconvenience.

Pull-Request has been closed by spichugi

3 years ago