#50269 Ticket 50260 - backend txn plugins can corrupt entry cache
Closed 3 years ago by spichugi. Opened 5 years ago by mreynolds.
mreynolds/389-ds-base ticket50260  into  master

@@ -7,22 +7,23 @@ 

  # --- END COPYRIGHT BLOCK ---

  #

  import pytest

- import six

  import ldap

  from lib389.tasks import *

  from lib389.utils import *

  from lib389.topologies import topology_st

- 

- from lib389.plugins import SevenBitCheckPlugin, AttributeUniquenessPlugin, MemberOfPlugin

- 

+ from lib389.plugins import (SevenBitCheckPlugin, AttributeUniquenessPlugin,

+                             MemberOfPlugin, ManagedEntriesPlugin,

+                             ReferentialIntegrityPlugin, MEPTemplates,

+                             MEPConfigs)

  from lib389.idm.user import UserAccounts, TEST_USER_PROPERTIES

- from lib389.idm.group import Groups

- 

- from lib389._constants import DEFAULT_SUFFIX, PLUGIN_7_BIT_CHECK, PLUGIN_ATTR_UNIQUENESS, PLUGIN_MEMBER_OF

+ from lib389.idm.organizationalunit import OrganizationalUnits

+ from lib389.idm.group import Groups, Group

+ from lib389._constants import DEFAULT_SUFFIX

  

  logging.getLogger(__name__).setLevel(logging.DEBUG)

  log = logging.getLogger(__name__)

  

+ 

  def test_betxt_7bit(topology_st):

      """Test that the 7-bit plugin correctly rejects an invalid update

  
@@ -52,7 +53,6 @@ 

      sevenbc.enable()

      topology_st.standalone.restart()

  

- 

      users = UserAccounts(topology_st.standalone, basedn=DEFAULT_SUFFIX)

      user = users.create(properties=TEST_USER_PROPERTIES)

  
@@ -69,7 +69,7 @@ 

  

      user_check = users.get("testuser")

  

-     assert user_check.dn == user.dn

+     assert user_check.dn.lower() == user.dn.lower()

  

      #

      # Cleanup - remove the user
@@ -100,9 +100,6 @@ 

              5. Test user entry should be removed

      """

  

-     USER1_DN = 'uid=test_entry1,' + DEFAULT_SUFFIX

-     USER2_DN = 'uid=test_entry2,' + DEFAULT_SUFFIX

- 

      attruniq = AttributeUniquenessPlugin(topology_st.standalone)

      attruniq.enable()

      topology_st.standalone.restart()
@@ -110,26 +107,22 @@ 

      users = UserAccounts(topology_st.standalone, basedn=DEFAULT_SUFFIX)

      user1 = users.create(properties={

          'uid': 'testuser1',

-         'cn' : 'testuser1',

-         'sn' : 'user1',

-         'uidNumber' : '1001',

-         'gidNumber' : '2001',

-         'homeDirectory' : '/home/testuser1'

+         'cn': 'testuser1',

+         'sn': 'user1',

+         'uidNumber': '1001',

+         'gidNumber': '2001',

+         'homeDirectory': '/home/testuser1'

      })

  

-     try:

-         user2 = users.create(properties={

+     with pytest.raises(ldap.LDAPError):

+         users.create(properties={

              'uid': ['testuser2', 'testuser1'],

-             'cn' : 'testuser2',

-             'sn' : 'user2',

-             'uidNumber' : '1002',

-             'gidNumber' : '2002',

-             'homeDirectory' : '/home/testuser2'

+             'cn': 'testuser2',

+             'sn': 'user2',

+             'uidNumber': '1002',

+             'gidNumber': '2002',

+             'homeDirectory': '/home/testuser2'

          })

-         log.fatal('test_betxn_attr_uniqueness: The second entry was incorrectly added.')

-         assert False

-     except ldap.LDAPError as e:

-         log.error('test_betxn_attr_uniqueness: Failed to add test user as expected:')

  

      user1.delete()

  
@@ -191,8 +184,8 @@ 

      log.info('test_betxn_memberof: PASSED')

  

  

- def test_betxn_modrdn_memberof(topology_st):

-     """Test modrdn operartions and memberOf

+ def test_betxn_modrdn_memberof_cache_corruption(topology_st):

+     """Test modrdn operations and memberOf

  

      :id: 70d0b96e-b693-4bf7-bbf5-102a66ac5994

  
@@ -227,18 +220,18 @@ 

  

      # Create user and add it to group

      users = UserAccounts(topology_st.standalone, basedn=DEFAULT_SUFFIX)

-     user = users.create(properties=TEST_USER_PROPERTIES)

+     user = users.ensure_state(properties=TEST_USER_PROPERTIES)

      if not ds_is_older('1.3.7'):

          user.remove('objectClass', 'nsMemberOf')

  

      group.add_member(user.dn)

  

      # Attempt modrdn that should fail, but the original entry should stay in the cache

-     with pytest.raises(ldap.OBJECTCLASS_VIOLATION):

+     with pytest.raises(ldap.OBJECT_CLASS_VIOLATION):

          group.rename('cn=group_to_people', newsuperior=peoplebase)

  

      # Should fail, but not with NO_SUCH_OBJECT as the original entry should still be in the cache

-     with pytest.raises(ldap.OBJECTCLASS_VIOLATION):

+     with pytest.raises(ldap.OBJECT_CLASS_VIOLATION):

          group.rename('cn=group_to_people', newsuperior=peoplebase)

  

      #
@@ -247,6 +240,108 @@ 

      log.info('test_betxn_modrdn_memberof: PASSED')

  

  

+ def test_ri_and_mep_cache_corruption(topology_st):

+     """Test RI plugin aborts change after MEP plugin fails.

+     This is really testing the entry cache for corruption

+ 

+     :id: 70d0b96e-b693-4bf7-bbf5-102a66ac5995

+ 

+     :setup: Standalone instance

+ 

+     :steps: 1. Enable and configure mep and ri plugins

+             2. Add user and add it to a group

+             3. Disable MEP plugin and remove MEP group

+             4. Delete user

+             5. Check that user is still a member of the group

+ 

+     :expectedresults:

+             1. Success

+             2. Success

+             3. Success

+             4. It fails with NO_SUCH_OBJECT

+             5. Success

+ 

+     """

+     # Start plugins

+     topology_st.standalone.config.set('nsslapd-dynamic-plugins', 'on')

+     mep_plugin = ManagedEntriesPlugin(topology_st.standalone)

+     mep_plugin.enable()

+     ri_plugin = ReferentialIntegrityPlugin(topology_st.standalone)

+     ri_plugin.enable()

+ 

+     # Add our org units

+     ous = OrganizationalUnits(topology_st.standalone, DEFAULT_SUFFIX)

+     ou_people = ous.create(properties={'ou': 'managed_people'})

+     ou_groups = ous.create(properties={'ou': 'managed_groups'})

+ 

+     # Configure MEP

+     mep_templates = MEPTemplates(topology_st.standalone, DEFAULT_SUFFIX)

+     mep_template1 = mep_templates.create(properties={

+         'cn': 'MEP template',

+         'mepRDNAttr': 'cn',

+         'mepStaticAttr': 'objectclass: posixGroup|objectclass: extensibleObject'.split('|'),

+         'mepMappedAttr': 'cn: $cn|uid: $cn|gidNumber: $uidNumber'.split('|')

+     })

+     mep_configs = MEPConfigs(topology_st.standalone)

+     mep_configs.create(properties={'cn': 'config',

+                                                 'originScope': ou_people.dn,

+                                                 'originFilter': 'objectclass=posixAccount',

+                                                 'managedBase': ou_groups.dn,

+                                                 'managedTemplate': mep_template1.dn})

+ 

+     # Add an entry that meets the MEP scope

+     users = UserAccounts(topology_st.standalone, DEFAULT_SUFFIX,

+                          rdn='ou={}'.format(ou_people.rdn))

+     user = users.create(properties={

+         'uid': 'test-user1',

+         'cn': 'test-user',

+         'sn': 'test-user',

+         'uidNumber': '10011',

+         'gidNumber': '20011',

+         'homeDirectory': '/home/test-user1'

+     })

+ 

+     # Add group

+     groups = Groups(topology_st.standalone, DEFAULT_SUFFIX)

+     user_group = groups.ensure_state(properties={'cn': 'group', 'member': user.dn})

+ 

+     # Check if a managed group entry was created

+     mep_group = Group(topology_st.standalone, dn='cn={},{}'.format(user.rdn, ou_groups.dn))

+     if not mep_group.exists():

+         log.fatal("MEP group was not created for the user")

+         assert False

+ 

+     # Mess with MEP so it fails

+     mep_plugin.disable()

+     mep_group.delete()

+     mep_plugin.enable()

+ 

+     # Add another group for verify entry cache is not corrupted

+     test_group = groups.create(properties={'cn': 'test_group'})

+ 

+     # Delete user, should fail, and user should still be a member

+     with pytest.raises(ldap.NO_SUCH_OBJECT):

+         user.delete()

+ 

+     # Verify membership is intact

+     if not user_group.is_member(user.dn):

+         log.fatal("Member was incorrectly removed from the group!!  Or so it seems")

+ 

+         # Restart server and test again in case this was a cache issue

+         topology_st.standalone.restart()

+         if user_group.is_member(user.dn):

+             log.info("The entry cache was corrupted")

+             assert False

+ 

+         assert False

+ 

+     # Verify test group is still found in entry cache by deleting it

+     test_group.delete()

+ 

+     # Success

+     log.info("Test PASSED")

+ 

+ 

  if __name__ == '__main__':

      # Run isolated

      # -s for DEBUG mode

@@ -18,7 +18,7 @@ 

  from lib389.plugins import *

  from lib389._constants import *

  from lib389.dseldif import DSEldif

- from lib389.idm.user import UserAccounts, UserAccount

+ from lib389.idm.user import UserAccounts

  from lib389.idm.group import Groups

  from lib389.idm.organizationalunit import OrganizationalUnits

  from lib389.idm.domain import Domain

@@ -312,48 +312,52 @@ 

  

  struct backcommon

  {

-     int ep_type;                   /* to distinguish backdn from backentry */

-     struct backcommon *ep_lrunext; /* for the cache */

-     struct backcommon *ep_lruprev; /* for the cache */

-     ID ep_id;                      /* entry id */

-     char ep_state;                 /* state in the cache */

- #define ENTRY_STATE_DELETED    0x1 /* entry is marked as deleted */

- #define ENTRY_STATE_CREATING   0x2 /* entry is being created; don't touch it */

- #define ENTRY_STATE_NOTINCACHE 0x4 /* cache_add failed; not in the cache */

-     int ep_refcnt;                 /* entry reference cnt */

-     size_t ep_size;                /* for cache tracking */

+     int32_t ep_type;                /* to distinguish backdn from backentry */

+     struct backcommon *ep_lrunext;  /* for the cache */

+     struct backcommon *ep_lruprev;  /* for the cache */

+     ID ep_id;                       /* entry id */

+     uint8_t ep_state;               /* state in the cache */

+ #define ENTRY_STATE_DELETED    0x1  /* entry is marked as deleted */

+ #define ENTRY_STATE_CREATING   0x2  /* entry is being created; don't touch it */

+ #define ENTRY_STATE_NOTINCACHE 0x4  /* cache_add failed; not in the cache */

+ #define ENTRY_STATE_INVALID    0x8  /* cache entry is invalid and needs to be removed */

+     int32_t ep_refcnt;              /* entry reference cnt */

+     size_t ep_size;                 /* for cache tracking */

+     struct timespec ep_create_time; /* the time the entry was added to the cache */

  };

  

- /* From ep_type through ep_size MUST be identical to backcommon */

+ /* From ep_type through ep_create_time MUST be identical to backcommon */

  struct backentry

  {

-     int ep_type;                   /* to distinguish backdn from backentry */

-     struct backcommon *ep_lrunext; /* for the cache */

-     struct backcommon *ep_lruprev; /* for the cache */

-     ID ep_id;                      /* entry id */

-     char ep_state;                 /* state in the cache */

-     int ep_refcnt;                 /* entry reference cnt */

-     size_t ep_size;                /* for cache tracking */

-     Slapi_Entry *ep_entry;         /* real entry */

+     int32_t ep_type;                /* to distinguish backdn from backentry */

+     struct backcommon *ep_lrunext;  /* for the cache */

+     struct backcommon *ep_lruprev;  /* for the cache */

+     ID ep_id;                       /* entry id */

+     uint8_t ep_state;               /* state in the cache */

+     int32_t ep_refcnt;              /* entry reference cnt */

+     size_t ep_size;                 /* for cache tracking */

+     struct timespec ep_create_time; /* the time the entry was added to the cache */

+     Slapi_Entry *ep_entry;          /* real entry */

      Slapi_Entry *ep_vlventry;

-     void *ep_dn_link;     /* linkage for the 3 hash */

-     void *ep_id_link;     /*     tables used for */

-     void *ep_uuid_link;   /*     looking up entries */

-     PRMonitor *ep_mutexp; /* protection for mods; make it reentrant */

+     void *ep_dn_link;               /* linkage for the 3 hash */

+     void *ep_id_link;               /*     tables used for */

+     void *ep_uuid_link;             /*     looking up entries */

+     PRMonitor *ep_mutexp;           /* protection for mods; make it reentrant */

  };

  

- /* From ep_type through ep_size MUST be identical to backcommon */

+ /* From ep_type through ep_create_time MUST be identical to backcommon */

  struct backdn

  {

-     int ep_type;                   /* to distinguish backdn from backentry */

-     struct backcommon *ep_lrunext; /* for the cache */

-     struct backcommon *ep_lruprev; /* for the cache */

-     ID ep_id;                      /* entry id */

-     char ep_state;                 /* state in the cache; share ENTRY_STATE_* */

-     int ep_refcnt;                 /* entry reference cnt */

-     uint64_t ep_size;                /* for cache tracking */

+     int32_t ep_type;                /* to distinguish backdn from backentry */

+     struct backcommon *ep_lrunext;  /* for the cache */

+     struct backcommon *ep_lruprev;  /* for the cache */

+     ID ep_id;                       /* entry id */

+     uint8_t ep_state;               /* state in the cache; share ENTRY_STATE_* */

+     int32_t ep_refcnt;              /* entry reference cnt */

+     uint64_t ep_size;               /* for cache tracking */

+     struct timespec ep_create_time; /* the time the entry was added to the cache */

      Slapi_DN *dn_sdn;

-     void *dn_id_link; /* for hash table */

+     void *dn_id_link;               /* for hash table */

  };

  

  /* for the in-core cache of entries */

@@ -23,7 +23,8 @@ 

          return;

      }

      ep = *bep;

-     PR_ASSERT(ep->ep_state & (ENTRY_STATE_DELETED | ENTRY_STATE_NOTINCACHE));

+ 

+     PR_ASSERT(ep->ep_state & (ENTRY_STATE_DELETED | ENTRY_STATE_NOTINCACHE | ENTRY_STATE_INVALID));

      if (ep->ep_entry != NULL) {

          slapi_entry_free(ep->ep_entry);

      }

@@ -56,11 +56,14 @@ 

  #define LOG(...)

  #endif

  

- #define LRU_DETACH(cache, e) lru_detach((cache), (void *)(e))

+ typedef enum {

+     ENTRY_CACHE,

+     DN_CACHE,

+ } CacheType;

  

+ #define LRU_DETACH(cache, e) lru_detach((cache), (void *)(e))

  #define CACHE_LRU_HEAD(cache, type) ((type)((cache)->c_lruhead))

  #define CACHE_LRU_TAIL(cache, type) ((type)((cache)->c_lrutail))

- 

  #define BACK_LRU_NEXT(entry, type) ((type)((entry)->ep_lrunext))

  #define BACK_LRU_PREV(entry, type) ((type)((entry)->ep_lruprev))

  
@@ -185,6 +188,7 @@ 

  int

  add_hash(Hashtable *ht, void *key, uint32_t keylen, void *entry, void **alt)

  {

+     struct backcommon *back_entry = (struct backcommon *)entry;

      u_long val, slot;

      void *e;

  
@@ -202,6 +206,7 @@ 

          e = HASH_NEXT(ht, e);

      }

      /* ok, it's not already there, so add it */

+     back_entry->ep_create_time = slapi_current_rel_time_hr();

      HASH_NEXT(ht, entry) = ht->slot[slot];

      ht->slot[slot] = entry;

      return 1;
@@ -492,6 +497,89 @@ 

      }

  }

  

+ /*

+  * Helper function for flush_hash() to calculate if the entry should be

+  * removed from the cache.

+  */

+ static int32_t

+ flush_remove_entry(struct timespec *entry_time, struct timespec *start_time)

+ {

+     struct timespec diff;

+ 

+     slapi_timespec_diff(entry_time, start_time, &diff);

+     if (diff.tv_sec >= 0) {

+         return 1;

+     } else {

+         return 0;

+     }

+ }

+ 

+ /*

+  * Flush all the cache entries that were added after the "start time"

+  * This is called when a backend transaction plugin fails, and we need

+  * to remove all the possible invalid entries in the cache.

+  *

+  * If the ref count is 0, we can straight up remove it from the cache, but

+  * if the ref count is greater than 1, then the entry is currently in use.

+  * In the latter case we set the entry state to ENTRY_STATE_INVALID, and

+  * when the owning thread cache_returns() the cache entry is automatically

+  * removed so another thread can not use/lock the invalid cache entry.

+  */

+ static void

+ flush_hash(struct cache *cache, struct timespec *start_time, int32_t type)

+ {

+     void *e, *laste = NULL;

+     Hashtable *ht = cache->c_idtable;

+ 

+     cache_lock(cache);

+ 

+     for (size_t i = 0; i < ht->size; i++) {

+         e = ht->slot[i];

+         while (e) {

+             struct backcommon *entry = (struct backcommon *)e;

+             uint64_t remove_it = 0;

+             if (flush_remove_entry(&entry->ep_create_time, start_time)) {

+                 /* Mark the entry to be removed */

+                 slapi_log_err(SLAPI_LOG_CACHE, "flush_hash", "[%s] Removing entry id (%d)\n",

+                         type ? "DN CACHE" : "ENTRY CACHE", entry->ep_id);

+                 remove_it = 1;

+             }

+             laste = e;

+             e = HASH_NEXT(ht, e);

+ 

+             if (remove_it) {

+                 /* since we have the cache lock we know we can trust refcnt */

+                 entry->ep_state |= ENTRY_STATE_INVALID;

+                 if (entry->ep_refcnt == 0) {

+                     entry->ep_refcnt++;

+                     lru_delete(cache, laste);

+                     if (type == ENTRY_CACHE) {

+                         entrycache_remove_int(cache, laste);

+                         entrycache_return(cache, (struct backentry **)&laste);

+                     } else {

+                         dncache_remove_int(cache, laste);

+                         dncache_return(cache, (struct backdn **)&laste);

+                     }

+                 } else {

+                     /* Entry flagged for removal */

+                     slapi_log_err(SLAPI_LOG_CACHE, "flush_hash",

+                             "[%s] Flagging entry to be removed later: id (%d) refcnt: %d\n",

+                             type ? "DN CACHE" : "ENTRY CACHE", entry->ep_id, entry->ep_refcnt);

+                 }

+             }

+         }

+     }

+ 

+     cache_unlock(cache);

+ }

+ 

+ void

+ revert_cache(ldbm_instance *inst, struct timespec *start_time)

+ {

+     flush_hash(&inst->inst_cache, start_time, ENTRY_CACHE);

+     flush_hash(&inst->inst_dncache, start_time, DN_CACHE);

+ }

+ 

  /* initialize the cache */

  int

  cache_init(struct cache *cache, uint64_t maxsize, int64_t maxentries, int type)
@@ -1142,7 +1230,7 @@ 

      } else {

          ASSERT(e->ep_refcnt > 0);

          if (!--e->ep_refcnt) {

-             if (e->ep_state & ENTRY_STATE_DELETED) {

+             if (e->ep_state & (ENTRY_STATE_DELETED | ENTRY_STATE_INVALID)) {

I think you should not test INVALID at this point, and just let the entry go in the LRU (lru_add)

                  const char *ndn = slapi_sdn_get_ndn(backentry_get_sdn(e));

                  if (ndn) {

                      /*
@@ -1154,6 +1242,13 @@ 

                          LOG("entrycache_return -Failed to remove %s from dn table\n", ndn);

                      }

                  }

+                 if (e->ep_state & ENTRY_STATE_INVALID) {

+                     /* Remove it from the hash table before we free the back entry */

+                     slapi_log_err(SLAPI_LOG_CACHE, "entrycache_return",

+                             "Finally flushing invalid entry: %d (%s)\n",

+                             e->ep_id, backentry_get_ndn(e));

+                     entrycache_remove_int(cache, e);

+                 }

                  backentry_free(bep);

              } else {

                  lru_add(cache, e);
@@ -1535,7 +1630,7 @@ 

  

      /* make sure entry hasn't been deleted now */

      cache_lock(cache);

-     if (e->ep_state & (ENTRY_STATE_DELETED | ENTRY_STATE_NOTINCACHE)) {

+     if (e->ep_state & (ENTRY_STATE_DELETED | ENTRY_STATE_NOTINCACHE | ENTRY_STATE_INVALID)) {

          cache_unlock(cache);

          PR_ExitMonitor(e->ep_mutexp);

          LOG("<= cache_lock_entry (DELETED)\n");
@@ -1696,7 +1791,14 @@ 

      } else {

          ASSERT((*bdn)->ep_refcnt > 0);

          if (!--(*bdn)->ep_refcnt) {

-             if ((*bdn)->ep_state & ENTRY_STATE_DELETED) {

+             if ((*bdn)->ep_state & (ENTRY_STATE_DELETED | ENTRY_STATE_INVALID)) {

+                 if ((*bdn)->ep_state & ENTRY_STATE_INVALID) {

+                     /* Remove it from the hash table before we free the back dn */

+                     slapi_log_err(SLAPI_LOG_CACHE, "dncache_return",

+                             "Finally flushing invalid entry: %d (%s)\n",

+                             (*bdn)->ep_id, slapi_sdn_get_dn((*bdn)->dn_sdn));

+                     dncache_remove_int(cache, (*bdn));

+                 }

                  backdn_free(bdn);

              } else {

                  lru_add(cache, (void *)*bdn);

@@ -97,6 +97,8 @@ 

      PRUint64 conn_id;

      int op_id;

      int result_sent = 0;

+     int32_t parent_op = 0;

+     struct timespec parent_time;

  

      if (slapi_pblock_get(pb, SLAPI_CONN_ID, &conn_id) < 0) {

          conn_id = 0; /* connection is NULL */
@@ -147,6 +149,13 @@ 

      slapi_entry_delete_values(e, numsubordinates, NULL);

  

      dblayer_txn_init(li, &txn);

+ 

+     if (txn.back_txn_txn == NULL) {

+         /* This is the parent operation, get the time */

+         parent_op = 1;

+         parent_time = slapi_current_rel_time_hr();

+     }

+ 

      /* the calls to perform searches require the parent txn if any

         so set txn to the parent_txn until we begin the child transaction */

      if (parent_txn) {
@@ -1212,6 +1221,11 @@ 

              slapi_pblock_set(pb, SLAPI_PLUGIN_OPRETURN, ldap_result_code ? &ldap_result_code : &retval);

          }

          slapi_pblock_get(pb, SLAPI_PB_RESULT_TEXT, &ldap_result_message);

+ 

+         /* Revert the caches if this is the parent operation */

+         if (parent_op) {

+             revert_cache(inst, &parent_time);

+         }

          goto error_return;

      }

  

@@ -79,6 +79,8 @@ 

      ID tomb_ep_id = 0;

      int result_sent = 0;

      Connection *pb_conn;

+     int32_t parent_op = 0;

+     struct timespec parent_time;

  

      if (slapi_pblock_get(pb, SLAPI_CONN_ID, &conn_id) < 0) {

          conn_id = 0; /* connection is NULL */
@@ -100,6 +102,13 @@ 

      dblayer_txn_init(li, &txn);

      /* the calls to perform searches require the parent txn if any

         so set txn to the parent_txn until we begin the child transaction */

+ 

+     if (txn.back_txn_txn == NULL) {

+         /* This is the parent operation, get the time */

+         parent_op = 1;

+         parent_time = slapi_current_rel_time_hr();

+     }

+ 

      if (parent_txn) {

          txn.back_txn_txn = parent_txn;

      } else {
@@ -1270,6 +1279,11 @@ 

              slapi_pblock_set(pb, SLAPI_PLUGIN_OPRETURN, &retval);

          }

          slapi_pblock_get(pb, SLAPI_PB_RESULT_TEXT, &ldap_result_message);

+ 

+         /* Revert the caches if this is the parent operation */

+         if (parent_op) {

+             revert_cache(inst, &parent_time);

+         }

          goto error_return;

      }

      if (parent_found) {

@@ -412,6 +412,8 @@ 

      int fixup_tombstone = 0;

      int ec_locked = 0;

      int result_sent = 0;

+     int32_t parent_op = 0;

+     struct timespec parent_time;

  

      slapi_pblock_get(pb, SLAPI_BACKEND, &be);

      slapi_pblock_get(pb, SLAPI_PLUGIN_PRIVATE, &li);
@@ -426,6 +428,13 @@ 

      dblayer_txn_init(li, &txn); /* must do this before first goto error_return */

      /* the calls to perform searches require the parent txn if any

         so set txn to the parent_txn until we begin the child transaction */

+ 

+     if (txn.back_txn_txn == NULL) {

+         /* This is the parent operation, get the time */

+         parent_op = 1;

+         parent_time = slapi_current_rel_time_hr();

+     }

+ 

      if (parent_txn) {

          txn.back_txn_txn = parent_txn;

      } else {
@@ -864,6 +873,11 @@ 

              slapi_pblock_set(pb, SLAPI_PLUGIN_OPRETURN, ldap_result_code ? &ldap_result_code : &retval);

          }

          slapi_pblock_get(pb, SLAPI_PB_RESULT_TEXT, &ldap_result_message);

+ 

+         /* Revert the caches if this is the parent operation */

+         if (parent_op) {

+             revert_cache(inst, &parent_time);

+         }

          goto error_return;

      }

      retval = plugin_call_mmr_plugin_postop(pb, NULL,SLAPI_PLUGIN_BE_TXN_POST_MODIFY_FN);

@@ -97,6 +97,8 @@ 

      int op_id;

      int result_sent = 0;

      Connection *pb_conn = NULL;

+     int32_t parent_op = 0;

+     struct timespec parent_time;

  

      if (slapi_pblock_get(pb, SLAPI_CONN_ID, &conn_id) < 0) {

          conn_id = 0; /* connection is NULL */
@@ -134,6 +136,13 @@ 

  

      /* dblayer_txn_init needs to be called before "goto error_return" */

      dblayer_txn_init(li, &txn);

+ 

+     if (txn.back_txn_txn == NULL) {

+         /* This is the parent operation, get the time */

+         parent_op = 1;

+         parent_time = slapi_current_rel_time_hr();

+     }

+ 

      /* the calls to perform searches require the parent txn if any

         so set txn to the parent_txn until we begin the child transaction */

      if (parent_txn) {
@@ -1208,6 +1217,11 @@ 

              slapi_pblock_set(pb, SLAPI_PLUGIN_OPRETURN, ldap_result_code ? &ldap_result_code : &retval);

          }

          slapi_pblock_get(pb, SLAPI_PB_RESULT_TEXT, &ldap_result_message);

+ 

+         /* Revert the caches if this is the parent operation */

+         if (parent_op) {

+             revert_cache(inst, &parent_time);

+         }

          goto error_return;

      }

  	retval = plugin_call_mmr_plugin_postop(pb, NULL,SLAPI_PLUGIN_BE_TXN_POST_MODRDN_FN);
@@ -1353,8 +1367,13 @@ 

                      slapi_pblock_set(pb, SLAPI_PLUGIN_OPRETURN, ldap_result_code ? &ldap_result_code : &retval);

                  }

                  slapi_pblock_get(pb, SLAPI_PB_RESULT_TEXT, &ldap_result_message);

+ 

+                 /* Revert the caches if this is the parent operation */

+                 if (parent_op) {

+                     revert_cache(inst, &parent_time);

+                 }

              }

- 	retval = plugin_call_mmr_plugin_postop(pb, NULL,SLAPI_PLUGIN_BE_TXN_POST_MODRDN_FN);

+             retval = plugin_call_mmr_plugin_postop(pb, NULL,SLAPI_PLUGIN_BE_TXN_POST_MODRDN_FN);

  

              /* Release SERIAL LOCK */

              dblayer_txn_abort(be, &txn); /* abort crashes in case disk full */
@@ -1411,17 +1430,6 @@ 

                                        "operation failed, the target entry is cleared from dncache (%s)\n", slapi_entry_get_dn(ec->ep_entry));

              CACHE_REMOVE(&inst->inst_dncache, bdn);

              CACHE_RETURN(&inst->inst_dncache, &bdn);

-             /*

-              * If the new/invalid entry (ec) is in the cache, that means we need to

-              * swap it out with the original entry (e) --> to undo the swap that

-              * modrdn_rename_entry_update_indexes() did.

-              */

-             if (cache_is_in_cache(&inst->inst_cache, ec)) {

-                 if (cache_replace(&inst->inst_cache, ec, e) != 0) {

-                         slapi_log_err(SLAPI_LOG_ALERT, "ldbm_back_modrdn",

-                                 "failed to replace cache entry after error\n");

-                  }

-             }

          }

  

          if (ec && inst) {

@@ -55,6 +55,7 @@ 

  int cache_replace(struct cache *cache, void *oldptr, void *newptr);

  int cache_has_otherref(struct cache *cache, void *bep);

  int cache_is_in_cache(struct cache *cache, void *ptr);

+ void revert_cache(ldbm_instance *inst, struct timespec *start_time);

  

  #ifdef CACHE_DEBUG

  void check_entry_cache(struct cache *cache, struct backentry *e);

@@ -6748,6 +6748,12 @@ 

   */

  struct timespec slapi_current_time_hr(void);

  /**

+  * Returns the current system time as a hr clock

+  *

+  * \return timespec of the current monotonic time.

+  */

+ struct timespec slapi_current_rel_time_hr(void);

+ /**

   * Returns the current system time as a hr clock in UTC timezone.

   * This clock adjusts with ntp steps, and should NOT be

   * used for timer information.

Bug Description:

If a nested backend txn plugin fails, any updates it made that went into the entry cache still persist after the database transaction is aborted.

Fix Description:

In order to be sure the entry cache is not corrupted after a backend txn plugin failure we need to flush all the cache entries that were added to the cache after the parent operation was started.

To do this we record the start time of the original operation (or parent operation), and we record the time any entry is added to the cache. Then on failure we do a comparison and remove the entry from the cache if it's not in use. If it is in use we add an "invalid" state flag which triggers the entry to be removed when the cache entry is returned by the owner.

https://pagure.io/389-ds-base/issue/50260

CI tested and ASAN approved.

Could this be changed to "with pytest.raises"?

Could we name the test "test_ri_and_mep_cache_corruption" then? :)

This set of defines could become an enum type instead? Not a hard request though, feel free to ignore.

Maybe a more descriptive name then? Like ep_create_time?

For my sanity, this is checking that if an entry is returned from the cache, it must not be in any of the three listed states?

Also a possible enum candidate? enums are nice because they give a "type" to where the ENTRY_CACHE/DN_CACHE can be used and means we can do better type checking of those values.

So I'm assuming here that this means "if the entry was added in the same second or newer", purge it? There is a risk here of flushing an entry from a previous operation, but I'd rather be "too broad" than miss an entry.

We could also set the "start time" to 1 sec less for paranoia, but I think this is fine since rel_time is always advancing.

Is it possible that another thread could change the content of ep_state but behind a different lock (entry lock rather than cache lock?)

Actually, there is a cache coherency bug here perhaps. If we do:

thread 1           thread 2

cache lock
   entry lock
cache unlock
                   cache lock
                        entry->ep_state invalid
                   cache unlock

read entry->ep_state

in this case, because thread 1 has not issued a new lock, no memory barrier was applied, so the value of entry->ep_state may still reflect the value from when the entry was locked, and will NOT show the invalidation of the entry. So I think we need to be careful here ... We may need to lock the entry to change ep_state, but it HIGHLY depends on the usage of entry reads in other threads and how they assert safety.

Sorry for all the comments. Anyway, this is great work, especially isolating a test case that can prove entry cache corruption conditions. Awesome stuff!

Commenting on my own PR, is setting the state while another thread owns this entry safe? Perhaps a special uint64_t flag in the backend entry would be safer?

EDIT: Errr sorry missed @firstyears comments here because of Pagure webhooks not reloading the page. Okay, so we all agreed this needs more attention :-)

rebased onto e9c8a3a9318264d8d4f6a0c2514048b70568aac5

5 years ago

I have addressed @firstyear's concerns. I now use a uint64_t to mark the entry invalid, and use atomic store/load to set/check the value. This should address the memory barrier issues (but please correct me if I am wrong, as I am not an expert on such things)

Please review...

rebased onto 6dd224997cfc6abc6b1813a9f8665a3738d68c24

5 years ago

Little nitpick (but not your fault at all) this should be uint8_t instead of char?

Ahhhh @mreynolds ... you know how you allegedly liked me or respected me ... well, I think you'll hate me for this.

I was thinking a lot about this fix this morning, like, for a good three hours or so. And I think that even with the atomic changes, it may not work, there may still be a potential corruption condition.

I might make this comment a bit longer so that my understanding of this is correct and described. So there are two levels of locking - the cache lock that describes "what is and is not in the cache", IE it protects the hashtable. And each entry in the cache has an "entry lock" to protect its content from threaded alterations. Each entry uses a reference count to determine if it is still in use. I'm not sure of how reads use this (whether they maintain the lock during an operation, or if they copy the entry for the operation).

It's pretty clear from this design, that a goal of the entry cache was to be concurrent, both to reads and writes. Now for this thought exercise, we'll assume there is only a single BE_TXN write thread (which I hope is true, but ... it may not be), and there are multiple concurrent read threads.

So let's assume we have two threads now, one writer, one reader.

thread read            thread write
---                    ---
cache lock
  get entry a
    lock a, refcnt++
    unlock a
cache unlock           
                       cache lock
                         get entry a
                           lock a, refcnt ++
                           unlock a
                       cache unlock

                       lock a
                         modify a (corrupted)
                       unlock a
lock a
  read a
unlock a

                       cache lock
                         atomic set invalid
                         lock a, refcnt --
                         unlock a
                       cache unlock

cache lock
  lock a, refcnt -- 
  unlock a
cache unlock

So the thing is how do we actually protect our cache from this behaviour, where the read that happens exactly the same time as the corrupted write lands, is isolated? I think our cache is not designed with isolation in mind at all, so as a result, I think this situation can still happen, even with the atomic invalid now being present (because there is time between the corruption and the invalidation occurring).

So in general, I think mark's patch is good to move the bar further, but it doesn't completely fix the corruption. We are butting our heads against the fact our cache is designed for "fast, simple, correct" not "correct, simple, fast", which probably means no amount of patching will ever be able to solve this bug until we rearchitect the cache - at the minimum to guarantee a single be_txn writer thread, and that entries that are "written" are in a staging cache in parallel then only committed when we can guarantee they are valid.

More specific to this patch, although the atomics do protect from the race condition I mentioned, the issue is that atomics are costly, so this will have a performance impact on the cache I think, which will be pretty significant (worse on numa systems).

If I were a compromising william, I think that I would do the non-atomic version of the fix (we still have corruption races, but they exist anyway, because of our cache design), because we don't pay the atomic cost. We'd commit that, then immediately start to work on the "staging cache" for writes, and making be_txn writes linearised.

Does any of this seem sane? Have I missed something or misunderstood something? Hope it helps,

@firstyear, correct there is no way to prevent the cache from corrupted entries (in regards to this fix). They will exist and be available to other threads until we realize the cache is potentially corrupted and we remove them. While this window is very small the threat is still real, but this fix is not about preventing the corruption but about safely removing the corrupted entries asap.

I'm still kind of leaning towards using atomics here because although they are costly we know there are other bottlenecks in the cache and in the server in general. So at the end of the day are we actually impacting overall server performance with the use of atomics in the entry cache? I don't know, maybe not. But I think integrity is more important than speed.

You would know this better, but from what I understand using uint64_t somewhat works like an atomic, but I don't know if that applies in regards to memory barriers (probably not). If it does we just use the new flag but without calling atomic load/store (which is what you are suggesting anyway).

So like you said, today this fix solves a number of problems (it has value), but it does not address the underlying fault in the cache design.

Yes, we will be affecting the performance, it has a much higher cost when they are contended (and they will be contended in this case given how many threads will be hitting this). I agree that integrity is more important than speed, but I worry this will be a really big hit ...

uint64_t doesn't really work like an atomic I don't think. Best way to think of it is that memory is like a "timeline" when whenever you read that memory you are looking at "any value from the present to the past" but you don't know where on the slider that is. But with an atomic you are guaranteeing that the memory you see really really is the exact present value. So this is why the non-atomic version is not too "bad", because the refcnt is still safe, it's just that we have a subtle race where an invalidated entry may be seen as "valid" briefly by another thread. As you have said yourself this is already the case given the cache design, so having atomic vs non-atomic, doesn't really change that situation much - we still have a potential ability to perceive an invalid entry.

So I think I rescind my comment, lets do the non-atomic design, because it's "better than nothing" but cache redesign is probably on the cards sooner than later. :)

I have a doubt we should rely on the timestamp (to determine all updated entries to clear) because system time can change during a TXN.
On replicated environment we could rely on original CSN.

According to the comment on top of the struct, backentry/backcommon should be identical up to ep_size. Should not we add ep_create_time and ep_invalid before ep_size ?

I have a doubt we should rely on the timestamp (to determine all updated entries to clear) because system time can change during a TXN.
On replicated environment we could rely on original CSN.

I haven't reviewed and read all of the comments, so just jumping in. I think the EC contains entries added with and without being transacted, so in the case of searches adding entries you don't have a CSN, and then would need to decide whether to remove them or not.
And it should be a rare condition having a transaction aborted because of a plugin failure AND setting back system time, maybe we can just take the risk

I have a doubt we should rely on the timestamp (to determine all updated entries to clear) because system time can change during a TXN.
On replicated environment we could rely on original CSN.

But here we just set the monotonic time when we add an entry to the cache. Are you worried someone will change the system time intentionally in between an operation failing and the server purging the cache when it fails? That is a very small window.

According to the comment on top of the struct, backentry/backcommon should be identical up to ep_size. Should not we add ep_create_time and ep_invalid before ep_size ?

What that really means is that both structs need to be identical for the common/shared elements. So in this case the comment should be changed to say "they need to be identical up to ep_create_time".

I have a doubt we should rely on the timestamp (to determine all updated entries to clear) because system time can change during a TXN.
On replicated environment we could rely on original CSN.

I haven't reviewed and read all of the comments, so just jumping in. I think the EC contains entries added with and without being transacted, so in the case of searches adding entries you don't have a CSN, and then would need to decide whether to remove them or not.

I'm not using CSN's, I'm just taking the monotonic clock time when we start an operation and when an entry is added to the cache.

What that really means is that both structs need to be identical for the common/shared elements. So in this case the comment should be changed to say "they need to be identical up to ep_create_time".
I agree, I was initially thinking that the struct could be copy up to ep_size offset. But it is only used with pointer, so changing the comment is enough.

rebased onto c7c8af4fd7c08ff119ab5598e8e9c530043bd303

5 years ago

I've reverted the patch back to using ep_state for tracking invalid entries, and I fixed the comments in back-ldbm.h about the shared struct elements.

Please review...

I would expect 'laste' (or 'entry') to already be on the lru. Is on LRU all entries with refcnt==0

instead of 'entry' why not using 'laste'

Would it be possibly to simply set the state to ENTRY_STATE_INVALID, whether or not refcnt is 0.
Any access to an invalid entry would be ignored and we can let the LRU do its refcnt jobs.

It looks like 'find_hash' can return an entry even if it is state_invalid. Should it test the state and do not return the entry even if it finds it in the hash ?

I would expect 'laste' (or 'entry') to already be on the lru. Is on LRU all entries with refcnt==0

You are correct, but we need to explicitly remove it because it does not get flushed from the LRU otherwise, and at shutdown the server silently crashes because the backend entry was already freed.

instead of 'entry' why not using 'laste'

Because laste is a void pointer and can not be dereferenced

It looks like 'find_hash' can return an entry even if it is state_invalid. Should it test the state and do not return the entry even if it finds it in the hash ?

It's already checked in the cache find function because the ep_state is not 0

Is it still used?

Looks like pagure did not put this comment in the diff view. What are you referring to?

rebased onto 17b961afdc7b5fdc4f5a454c511053b4940ccbc7

5 years ago

Would it be possibly to simply set the state to ENTRY_STATE_INVALID, whether or not refcnt is 0.
Any access to an invalid entry would be ignored and we can let the LRU do its refcnt jobs.

I set the flag regardless of the refcnt and the tests still pass. (but I need to keep the remaining code or things start to fail or crash)

PR is rebased...

I've reverted the patch back to using ep_state for tracking invalid entries, and I fixed the comments in back-ldbm.h about the shared struct elements.
Please review...

Thanks mate, sorry for the run around :(

Is it still use ?

Looks like pagure did not put this comment in the diff view. What are you referring to?

I was referring to the enum type 'CacheType'

I would expect 'laste' (or 'entry') to already be on the lru. Is on LRU all entries with refcnt==0

You are correct, but we need to explicitly remove it because it does not get flushed from the LRU otherwise, and at shutdown the server silently crashes because the backend entry was already freed.

I think the backend entry should not be freed by the flush_hash. flush_hash should only set it to invalid state. Sooner or later, the entry will be on the LRU (refcnt=0) and if there is need to reduce cache memory footprint then the LRU will free it.
The only thing is to make sure that lookup of the entry (find_hash) will not return an entry in invalid state

It looks like 'find_hash' can return an entry even if it is state_invalid. Should it test the state and do not return the entry even if it finds it in the hash ?

It's already checked in the cache find function because the ep_state is not 0

You are right.
Each time we call find_hash we are testing the ep_state of the returned entry to ignore the entry if its ep_state is not 0.
you may also move this test inside find_hash.

I would expect 'laste' (or 'entry') to already be on the lru. Is on LRU all entries with refcnt==0
You are correct, but we need to explicitly remove it because it does not get flushed from the LRU otherwise, and at shutdown the server silently crashes because the backend entry was already freed.

I think the backend entry should not be freed by the flush_hash. flush_hash should only set it to invalid state. Sooner or later, the entry will be on the LRU (refcnt=0) and if there is need to reduce cache memory footprint then the LRU will free it.

But that never happens. Just marking it invalid does not work in the case the refcnt is already zero. If I don't free it exactly the way I do it all kinds of things break. Tests fail and the server crashes at shutdown.

It looks like 'find_hash' can return an entry even if it is state_invalid. Should it test the state and do not return the entry even if it finds it in the hash ?
It's already checked in the cache find function because the ep_state is not 0

You are right.
Each time we call find_hash we are testing the ep_state of the returned entry to ignore the entry if its ep_state is not 0.
you may also move this test inside find_hash.

Umm it does do this check in find_hash(), are you referring to a different function?

Is it still use ?
Looks like pagure did not put this comment in the diff view. What are you referring to?

I was referring to the enum type 'CacheType'

This was one of @firstyear requests

I would expect 'laste' (or 'entry') to already be on the lru. Is on LRU all entries with refcnt==0
You are correct, but we need to explicitly remove it because it does not get flushed from the LRU otherwise, and at shutdown the server silently crashes because the backend entry was already freed.
I think the backend entry should not be freed by the flush_hash. flush_hash should only set it to invalid state. Sooner or later, the entry will be on the LRU (refcnt=0) and if there is need to reduce cache memory footprint then the LRU will free it.

But that never happens. Just marking it invalid does not work in the case the refcnt is already zero. If I don't free it exactly the way I do it all kinds of things break. Tests fail and the server crashes at shutdown.

The issue why just setting it invalid doesn't work is because the entry was already returned to the cache. Then looking for it never finds it again because it is invalid, and it's never removed from the cache or LRU once its invalid. So we have to do the purge ourselves in this case where it already has a refcount of zero. To do what you want requires a bit of hacking of other cache functions and to be honest feels a little riskier. I will see if I can get it to work, but I'm worried it will get a bit messy (we'll see)...

It looks like 'find_hash' can return an entry even if it is state_invalid. Should it test the state and do not return the entry even if it finds it in the hash ?
It's already checked in the cache find function because the ep_state is not 0
You are right.
Each time we call find_hash we are testing the ep_state of the returned entry to ignore the entry if its ep_state is not 0.
you may also move this test inside find_hash.

Umm it does do this check in find_hash(), are you referring to a different function?

Oh ? sorry I do not see that test in find_hash(). Do you mean it is part of 'testfn' callback ?

I would expect 'laste' (or 'entry') to already be on the lru. Is on LRU all entries with refcnt==0
You are correct, but we need to explicitly remove it because it does not get flushed from the LRU otherwise, and at shutdown the server silently crashes because the backend entry was already freed.
I think the backend entry should not be freed by the flush_hash. flush_hash should only set it to invalid state. Sooner or later, the entry will be on the LRU (refcnt=0) and if there is need to reduce cache memory footprint then the LRU will free it.
But that never happens. Just marking it invalid does not work in the case the refcnt is already zero. If I don't free it exactly the way I do it all kinds of things break. Tests fail and the server crashes at shutdown.

The issue why just setting it invalid doesn't work is because the entry was already returned to the cache. Then looking for it never finds it again because it is invalid, and it's never removed from the cache or LRU once its invalid. So we have to do the purge ourselves in this case where it already has a refcount of zero. To do what you want requires a bit of hacking of other cache functions and to be honest feels a little riskier. I will see if I can get it to work, but I'm worried it will get a bit messy (we'll see)...

In my mind I wanted to simplify the fix, letting the LRU free the entry when it has the opportunity to do so.

The LRU is a list of entries that are still in the hash but with refcnt=0.
find_hash can find the invalid entry but do not return it, the entry will stay in the LRU.
Later when the LRU will need some memory it will just remove the entry from the hash and free it.

It looks like 'find_hash' can return an entry even if it is state_invalid. Should it test the state and do not return the entry even if it finds it in the hash ?
It's already checked in the cache find function because the ep_state is not 0
You are right.
Each time we call find_hash we are testing the ep_state of the returned entry to ignore the entry if its ep_state is not 0.
you may also move this test inside find_hash.
Umm it does do this check in find_hash(), are you referring to a different function?

Oh ? sorry I do not see that test in find_hash(). Do you mean it is part of 'testfn' callback ?

Sorry it's in cache_find_id()

I think you should not test INVALID at the point, and just let the entry go in the LRU (lru_add)

With offline discussion/test, the solution I was thinking about (letting the LRU remove/free the invalid entry) is still giving an entry cache corruption.

About the idea to move the test of ep_state into find_hash, it is not possible because find_hash does not know the structure of the entry.

Thanks Mark for your patience. You have my ACK

For the record, you have my ack too :)

rebased onto 7ba8a80

5 years ago

Pull-Request has been merged by mreynolds

5 years ago

389-ds-base is moving from Pagure to Github. This means that new issues and pull requests
will be accepted only in 389-ds-base's github repository.

This pull request has been cloned to Github as issue and is available here:
- https://github.com/389ds/389-ds-base/issues/3328

If you want to continue to work on the PR, please navigate to the github issue,
download the patch from the attachments and file a new pull request.

Thank you for understanding. We apologize for all inconvenience.

Pull-Request has been closed by spichugi

3 years ago