#49989 Ticket 49967 - entry cache corruption after failed MODRDN
Closed 3 years ago by spichugi. Opened 5 years ago by tbordaz.
tbordaz/389-ds-base ticket_49967  into  master

@@ -508,6 +508,166 @@ 

      _find_memberof(inst, dn1, g2n, True)

      _find_memberof(inst, dn2, g2n, True)

  

+ def _config_memberof_entrycache_on_modrdn_failure(server):

+ 

+     server.plugins.enable(name=PLUGIN_MEMBER_OF)

+     peoplebase = 'ou=people,%s' % SUFFIX

+     MEMBEROF_PLUGIN_DN = ('cn=' + PLUGIN_MEMBER_OF + ',cn=plugins,cn=config')

+     server.modify_s(MEMBEROF_PLUGIN_DN, [(ldap.MOD_REPLACE, 'memberOfAllBackends', b'on'),

+                                          (ldap.MOD_REPLACE, 'memberOfEntryScope', peoplebase.encode())])

+ 

+ def _disable_auto_oc_memberof(server):

+     MEMBEROF_PLUGIN_DN = ('cn=' + PLUGIN_MEMBER_OF + ',cn=plugins,cn=config')

+     server.modify_s(MEMBEROF_PLUGIN_DN,

+         [(ldap.MOD_REPLACE, 'memberOfAutoAddOC', b'nsContainer')])

+ 

+ @pytest.mark.ds49967

+ def test_entrycache_on_modrdn_failure(topology_st):

+     """Specify a test case purpose or name here

+ 

+     :id: a4d8ac0b-2448-406a-9dc2-5a72851e30b6

+     :setup: Standalone Instance

+     :steps:

+         1. configure memberof to only scope ou=people,SUFFIX

+         2. Creates 10 users

+         3. Create groups0 (in peoplebase) that contain user0 and user1

+         4. Check user0 and user1 have memberof=group0.dn

+         5. Create group1 (OUT peoplebase) that contain user0 and user1

+         6. Check user0 and user1 have NOT memberof=group1.dn

+         7. Move group1 IN peoplebase and check users0 and user1 HAVE memberof=group1.dn

+         8. Create group2 (OUT peoplebase) that contain user2 and user3. Group2 contains a specific description value

+         9. Check user2 and user3 have NOT memberof=group2.dn

+         10. configure memberof so that added objectclass does not allow 'memberof' attribute

+         11. Move group2 IN peoplebase and check move failed OPERATIONS_ERROR (because memberof failed)

+         12. Search all groups and check that the group, having the specific description value,

+             has the original DN of group2.dn

+     :expectedresults:

+         1. should succeed

+         2. should succeed

+         3. should succeed

+         4. should succeed

+         5. should succeed

+         6. should succeed

+         7. should succeed

+         8. should succeed

+         9. should succeed

+         10. should succeed

+         11. should fail OPERATION_ERROR because memberof plugin fails to add 'memberof' to members.

+         12. should succeed

+ 

+     """

+ 

+     # only scopes peoplebase

+     _config_memberof_entrycache_on_modrdn_failure(topology_st.standalone)

+     topology_st.standalone.restart(timeout=10)

+ 

+     # create 10 users

+     peoplebase = 'ou=people,%s' % SUFFIX

+     for i in range(10):

+         cn = 'user%d' % i

+         dn = 'cn=%s,%s' % (cn, peoplebase)

+         log.fatal('Adding user (%s): ' % dn)

+         topology_st.standalone.add_s(Entry((dn, {'objectclass': ['top', 'person'],

+                              'sn': 'user_%s' % cn,

+                              'description': 'add on standalone'})))

+ 

+     # Check that members of group0 (in the scope) have 'memberof

+     group0_dn = 'cn=group_in0,%s' % peoplebase

+     topology_st.standalone.add_s(Entry((group0_dn, {'objectclass': ['top', 'groupofnames'],

+                              'member': [

+                                    'cn=user0,%s' % peoplebase,

+                                    'cn=user1,%s' % peoplebase,

+                                    ],

+                              'description': 'mygroup'})))

+ 

+     # Check the those entries have memberof with group0

+     for i in range(2):

+         user_dn = 'cn=user%d,%s' % (i, peoplebase)

+         ent = topology_st.standalone.getEntry(user_dn, ldap.SCOPE_BASE, "(objectclass=*)", ['memberof'])

+         assert ent.hasAttr('memberof')

+         found = False

+         for val in ent.getValues('memberof'):

+             topology_st.standalone.log.info("!!!!!!! %s: memberof->%s (vs %s)" % (user_dn, val, group0_dn.encode().lower()))

+             if val.lower() == group0_dn.encode().lower():

+                 found = True

+                 break

+         assert found

+ 

+     # Create a group1 out of the scope

+     group1_dn = 'cn=group_out1,%s' % SUFFIX

+     topology_st.standalone.add_s(Entry((group1_dn, {'objectclass': ['top', 'groupofnames'],

+                              'member': [

+                                    'cn=user0,%s' % peoplebase,

+                                    'cn=user1,%s' % peoplebase,

+                                    ],

+                              'description': 'mygroup'})))

+ 

+     # Check the those entries have not memberof with group1

+     for i in range(2):

+         user_dn = 'cn=user%d,%s' % (i, peoplebase)

+         ent = topology_st.standalone.getEntry(user_dn, ldap.SCOPE_BASE, "(objectclass=*)", ['memberof'])

+         assert ent.hasAttr('memberof')

+         found = False

+         for val in ent.getValues('memberof'):

+             topology_st.standalone.log.info("!!!!!!! %s: memberof->%s (vs %s)" % (user_dn, val, group1_dn.encode().lower()))

+             if val.lower() == group1_dn.encode().lower():

+                 found = True

+                 break

+         assert not found

+ 

+     # move group1 into the scope and check user0 and user1 are memberof group1

+     topology_st.standalone.rename_s(group1_dn, 'cn=group_in1', newsuperior=peoplebase, delold=0)

+     new_group1_dn = 'cn=group_in1,%s' % peoplebase

+     for i in range(2):

+         user_dn = 'cn=user%d,%s' % (i, peoplebase)

+         ent = topology_st.standalone.getEntry(user_dn, ldap.SCOPE_BASE, "(objectclass=*)", ['memberof'])

+         assert ent.hasAttr('memberof')

+         found = False

+         for val in ent.getValues('memberof'):

+             topology_st.standalone.log.info("!!!!!!! %s: memberof->%s (vs %s)" % (user_dn, val, new_group1_dn.encode().lower()))

+             if val.lower() == new_group1_dn.encode().lower():

+                 found = True

+                 break

+         assert found

+ 

+     # Create a group2 out of the scope with a SPECIFIC description value

+     entry_description = "this is to check that the entry having this description has the appropriate DN"

+     group2_dn = 'cn=group_out2,%s' % SUFFIX

+     topology_st.standalone.add_s(Entry((group2_dn, {'objectclass': ['top', 'groupofnames'],

+                              'member': [

+                                    'cn=user2,%s' % peoplebase,

+                                    'cn=user3,%s' % peoplebase,

+                                    ],

+                              'description': entry_description})))

+ 

+     # Check the those entries have not memberof with group2

+     for i in (2, 3):

+         user_dn = 'cn=user%d,%s' % (i, peoplebase)

+         ent = topology_st.standalone.getEntry(user_dn, ldap.SCOPE_BASE, "(objectclass=*)", ['memberof'])

+         assert not ent.hasAttr('memberof')

+ 

+     # memberof will not add the missing objectclass

+     _disable_auto_oc_memberof(topology_st.standalone)

+     topology_st.standalone.restart(timeout=10)

+ 

+     # move group2 into the scope and check it fails

+     try:

+         topology_st.standalone.rename_s(group2_dn, 'cn=group_in2', newsuperior=peoplebase, delold=0)

+         topology_st.standalone.log.info("This is unexpected, modrdn should fail as the member entry have not the appropriate objectclass")

+         assert False

+     except ldap.OPERATIONS_ERROR:

+         pass

+ 

+     # retrieve the entry having the specific description value

+     # check that the entry DN is the original group2 DN

+     ents = topology_st.standalone.search_s(DEFAULT_SUFFIX, ldap.SCOPE_SUBTREE, '(cn=gr*)')

+     found = False

+     for ent in ents:

+         topology_st.standalone.log.info("retrieve: %s with desc=%s" % (ent.dn, ent.getValue('description')))

+         if ent.getValue('description') == entry_description.encode():

+             found = True

+             assert ent.dn == group2_dn

+     assert found

  

  if __name__ == '__main__':

      # Run isolated

@@ -1400,6 +1400,19 @@ 

                  }

              }

          }

+ 

+         if (ec && retval) {

+             /* if the operation failed, the destination entry does not exist

+              * but it has been added in dncache during cache_add_tentative

+              * we need to remove it. Else a retrieval from ep_id can give the wrong DN

+              */

+             struct backdn *bdn = dncache_find_id(&inst->inst_dncache, ec->ep_id);

+             slapi_log_err(SLAPI_LOG_CACHE, "ldbm_back_modrdn",

+                                       "operation failed, the target entry is cleared from dncache (%s)\n", slapi_entry_get_dn(ec->ep_entry));

+             CACHE_REMOVE(&inst->inst_dncache, bdn);

+             CACHE_RETURN(&inst->inst_dncache, &bdn);

+         }

+ 

          /* remove the new entry from the cache if the op failed -

             otherwise, leave it in */

          if (ec && inst) {

Bug Description:
During a MODRDN the DN cache is updated to replace
source DN with the target DN (modrdn_rename_entry_update_indexes)
If later a failure occurs (for example if BETXN_POSTOP fails) and
the txn is aborted, the target DN (for the specific entryID) remains
in the DN cache.

If the entry is returned in a search, to build the DN there is
a lookup of the DN cache with the entryID. It retrieves the target DN
rather than the source DN

Fix Description:
In case of failure of the operation, the entry (from the entryID)
need to be cleared from the DN cache

https://pagure.io/389-ds-base/issue/49967

Reviewed by: ?

Platforms tested: F27

Flag Day: no

Doc impact: no

This solves another customer's crash. ACK!

rebased onto ab4af68

5 years ago

Pull-Request has been merged by tbordaz

5 years ago

@tbordaz so your fix cleans up the DN cache, but it does not perform the same process for entry cache. In the customer case I have its the entry cache, not the dn cache, that causes the crash.

I wonder if we also have to do something like:

struct backentry *be = cache_find_id(&inst->inst_cache, ec->ep_id);
CACHE_REMOVE(&inst->inst_cache, be);
CACHE_RETURN(&inst->inst_cache, &be);

Thoughts?

@mreynolds, IMHO I think that upon failure, the destination entry in the entrycache is removed few lines below (comment "remove the new entry from the cache if the op failed...").

The entry should be 'cache_is_in_cache' because it is flagged ENTRY_STATE_CREATING at that time.

The difficulty here is that the number of possible paths is so large that without a testcase we can not be sure.

@mreynolds, IMHO I think that upon failure, the destination entry in the entrycache is removed few lines below (comment "remove the new entry from the cache if the op failed...").

I guess I was wondering if the call below that you mentioned is NOT releasing the correct entry (cache_is_in_cache returning "no"). Perhaps it should be retrieved again (cache_find_id) before we check if its in the cache? Sorry still looking through the code so maybe this is not possible, but since it could happen with the dn cache, then why not the entry cache? :)

but for the ec it is done just after this,

if (ec && inst) ....
just the removal of the dncache was missing in modrdn.

I think we need to look into possible issues in ldbm_delete

In ldbm_back_delete we have remove_e_from_cache to trigger a removal from the ec in some conditions, what about dn cache in that case ?

Mark, would this be a candiate for your debug logging ?

but for the ec it is done just after this,
if (ec && inst) ....
just the removal of the dncache was missing in modrdn.
I think we need to look into possible issues in ldbm_delete

But the crash in delete seems to happen because of a blotched modrdn cache entry. When we call id2entry_add in ldbm_delete it's finding the "previous DN" entry in the cache (which was freed but not remove from the table). Well that was the original crash - not sure hows it's crashing now with Thierry's fix. The new debug patch will tell us more...

but I didn't see a MODRDN for the entry in the latest data set :-(

In ldbm_back_delete we have remove_e_from_cache to trigger a removal from the ec in some conditions, what about dn cache in that case ?
Mark, would this be a candiate for your debug logging ?

Unfortunately it looks like that does not come into play until after the crash. It's outside of the txn at that point. But it doesn't hurt to add some logging to see what DN it is...

but I didn't see a MODRDN for the entry in the latest data set :-(

Hmmm, well I guess we need to see what the new debug patch tells us...

389-ds-base is moving from Pagure to Github. This means that new issues and pull requests
will be accepted only in 389-ds-base's github repository.

This pull request has been cloned to Github as issue and is available here:
- https://github.com/389ds/389-ds-base/issues/3048

If you want to continue to work on the PR, please navigate to the github issue,
download the patch from the attachments and file a new pull request.

Thank you for understanding. We apologize for all inconvenience.

Pull-Request has been closed by spichugi

3 years ago