From 4561c4cda15754f98028910f129a951df98e92ff Mon Sep 17 00:00:00 2001 From: Rich Megginson Date: Sep 07 2012 16:28:51 +0000 Subject: Ticket #374 - consumer can go into total update mode for no reason https://fedorahosted.org/389/ticket/374 Reviewed by: rmeggins Contributed by: telackey (Thanks!) Branch: master Fix Description: In the consumer replication extop code, ensure exclusive access to the connection extension by checking the new fields, connid and opid. If the connid and opid in the extension are not the same as the current thread, just return an LDAP_BUSY to the supplier. Otherwise, set the connid and opid of the current thread in the extension. This should ensure that no other thread can access the connection extension. Includes some test code that should help cause the problem during an incremental update. Set the environment variable export SLAPD_TEST_TICKET_374=1 in /etc/sysconfig/dirsrv-INST and restart the server. Platforms tested: RHEL6 x86_64 Flag Day: no Doc impact: no (cherry picked from commit f1e01ac448ad5eb3a42b380d8a1f53d35bcf9fed) --- diff --git a/ldap/servers/plugins/replication/repl.h b/ldap/servers/plugins/replication/repl.h index 433be34..802e69c 100644 --- a/ldap/servers/plugins/replication/repl.h +++ b/ldap/servers/plugins/replication/repl.h @@ -290,7 +290,13 @@ void ldapi_initialize_changenumbers(chglog4Info *cl4, changeNumber first, change #define REPL_PROTOCOL_40 1 #define REPL_PROTOCOL_50_INCREMENTAL 2 #define REPL_PROTOCOL_50_TOTALUPDATE 3 -#define REPL_PROTOCOL_71_TOTALUPDATE 4 +/* TEL 20120529: REPL_PROTOCOL_71_TOTALUPDATE is never used in the code, and the + * equivalent code checking the 7.1 OID for incremental updates assigns the + * protocol version as REPL_PROTOCOL_50_INCREMENTAL. One or the other of these + * is wrong, but there are many tests for REPL_PROTOCOL_50_TOTALUPDATE that would + * need to be rewritten to to make use of REPL_PROTOCOL_71_TOTALUPDATE, so it + * seems safer and simpler to take this definition out. */ +/* #define REPL_PROTOCOL_71_TOTALUPDATE 4 */ /* In repl_globals.c */ int decrement_repl_active_threads(); @@ -355,12 +361,18 @@ typedef struct consumer_connection_extension void *supplier_ruv; /* RUV* */ int isreplicationsession; Slapi_Connection *connection; + PRLock *lock; /* protects entire structure */ + int in_use_opid; /* the id of the operation actively using this, else -1 */ } consumer_connection_extension; /* extension construct/destructor */ void* consumer_connection_extension_constructor (void *object,void *parent); void consumer_connection_extension_destructor (void* ext,void *object,void *parent); +/* extension helpers for managing exclusive access */ +consumer_connection_extension* consumer_connection_extension_acquire_exclusive_access(void* conn, PRUint64 connid, int opid); +int consumer_connection_extension_relinquish_exclusive_access(void* conn, PRUint64 connid, int opid, PRBool force); + /* mapping tree extension - stores replica object */ typedef struct multimaster_mtnode_extension { diff --git a/ldap/servers/plugins/replication/repl5_plugins.c b/ldap/servers/plugins/replication/repl5_plugins.c index 964dd88..f7677dd 100644 --- a/ldap/servers/plugins/replication/repl5_plugins.c +++ b/ldap/servers/plugins/replication/repl5_plugins.c @@ -1372,6 +1372,9 @@ static const char *replica_get_purl_for_op (const Replica *r, Slapi_PBlock *pb, Slapi_Connection *conn; consumer_connection_extension *connext; slapi_pblock_get(pb, SLAPI_CONNECTION, &conn); + /* TEL 20120531: There is a slim chance we want to take exclusive access + * to this instead. However, it isn't clear to me that it is worth the + * risk of changing this working code. */ connext = (consumer_connection_extension *)repl_con_get_ext( REPL_CON_EXT_CONN, conn); if (NULL == connext || NULL == connext->supplier_ruv) diff --git a/ldap/servers/plugins/replication/repl5_replica.c b/ldap/servers/plugins/replication/repl5_replica.c index b79280c..9e0fb24 100644 --- a/ldap/servers/plugins/replication/repl5_replica.c +++ b/ldap/servers/plugins/replication/repl5_replica.c @@ -494,9 +494,11 @@ replica_relinquish_exclusive_access(Replica *r, PRUint64 connid, int opid) } else { slapi_log_error(SLAPI_LOG_REPL, repl_plugin_name, "conn=%" NSPRIu64 " op=%d repl=\"%s\": " - "Released replica\n", + "Released replica held by locking_purl=%s\n", connid, opid, - slapi_sdn_get_dn(r->repl_root)); + slapi_sdn_get_dn(r->repl_root), + r->locking_purl); + slapi_ch_free_string(&r->locking_purl); r->repl_state_flags &= ~(REPLICA_IN_USE); if (isInc) diff --git a/ldap/servers/plugins/replication/repl_bind.c b/ldap/servers/plugins/replication/repl_bind.c index dbc77a6..c9b9229 100644 --- a/ldap/servers/plugins/replication/repl_bind.c +++ b/ldap/servers/plugins/replication/repl_bind.c @@ -72,6 +72,7 @@ legacy_preop_bind( Slapi_PBlock *pb ) slapi_log_error(SLAPI_LOG_REPL, REPLICATION_SUBSYSTEM, "legacy_preop_bind: begin\n"); #endif slapi_pblock_get( pb, SLAPI_CONNECTION, &conn ); + /* TEL 20120529 - Is there any reason we must protect this connext access? */ connext = (consumer_connection_extension*) repl_con_get_ext (REPL_CON_EXT_CONN, conn); if (NULL != connext) { diff --git a/ldap/servers/plugins/replication/repl_connext.c b/ldap/servers/plugins/replication/repl_connext.c index 2969b15..1430aad 100644 --- a/ldap/servers/plugins/replication/repl_connext.c +++ b/ldap/servers/plugins/replication/repl_connext.c @@ -70,6 +70,15 @@ void* consumer_connection_extension_constructor (void *object, void *parent) ext->isreplicationsession= 0; ext->supplier_ruv = NULL; ext->connection = NULL; + ext->in_use_opid = -1; + ext->lock = PR_NewLock(); + if (NULL == ext->lock) + { + slapi_log_error( SLAPI_LOG_PLUGIN, repl_plugin_name, "unable to create replication consumer connection extension lock - out of memory\n" ); + /* no need to go through the full destructor, but still need to free up this memory */ + slapi_ch_free((void **)&ext); + ext = NULL; + } } return ext; @@ -125,7 +134,154 @@ void consumer_connection_extension_destructor (void *ext, void *object, void *pa { ruv_destroy ((RUV **)&connext->supplier_ruv); } + + if (connext->lock) + { + PR_DestroyLock(connext->lock); + connext->lock = NULL; + } + + connext->in_use_opid = -1; + connext->connection = NULL; slapi_ch_free((void **)&ext); } } + +/* Obtain exclusive access to this connection extension. + * Returns the consumer_connection_extension* if successful, else NULL. + * + * This is similar to obtaining exclusive access to the replica, but not identical. + * For the connection extension, you only want to hold on to exclusive access as + * long as it is being actively used to process an operation. Mainly that means + * while processing either a 'start' or an 'end' extended operation. This makes + * certain that if another 'start' or 'end' operation is received on the connection, + * the ops will not trample on each other's state. As soon as you are done with + * that single operation, it is time to relinquish the connection extension. + * That differs from acquiring exclusive access to the replica, which is held over + * after the 'start' operation and relinquished during the 'end' operation. + */ +consumer_connection_extension* +consumer_connection_extension_acquire_exclusive_access(void* conn, PRUint64 connid, int opid) +{ + consumer_connection_extension* ret = NULL; + + /* step 1, grab the connext */ + consumer_connection_extension* connext = (consumer_connection_extension*) + repl_con_get_ext(REPL_CON_EXT_CONN, conn); + + if (NULL != connext) + { + /* step 2, acquire its lock */ + PR_Lock(connext->lock); + + /* step 3, see if it is not in use, or in use by us */ + if (0 > connext->in_use_opid) + { + /* step 4, take it! */ + connext->in_use_opid = opid; + ret = connext; + slapi_log_error(SLAPI_LOG_REPL, repl_plugin_name, + "conn=%" NSPRIu64 " op=%d Acquired consumer connection extension\n", + connid, opid); + } + else if (opid == connext->in_use_opid) + { + ret = connext; + slapi_log_error(SLAPI_LOG_REPL, repl_plugin_name, + "conn=%" NSPRIu64 " op=%d Reacquired consumer connection extension\n", + connid, opid); + } + else + { + slapi_log_error(SLAPI_LOG_REPL, repl_plugin_name, + "conn=%" NSPRIu64 " op=%d Could not acquire consumer connection extension; it is in use by op=%d\n", + connid, opid, connext->in_use_opid); + } + + /* step 5, drop the lock */ + PR_Unlock(connext->lock); + } + else + { + slapi_log_error(SLAPI_LOG_REPL, repl_plugin_name, + "conn=%" NSPRIu64 " op=%d Could not acquire consumer extension, it is NULL!\n", + connid, opid); + } + + return ret; +} + +/* Relinquish exclusive access to this connection extension. + * Returns 0 if exclusive access could NOT be relinquished, and non-zero if it was. + * Specifically: + * 1 if the extension was in use and was relinquished. + * 2 if the extension was not in use to begin with. + * + * The extension will only be relinquished if it was first acquired by this op, + * or if 'force' is TRUE. Do not use 'force' without a legitimate reason, such + * as when destroying the parent connection. + * + * cf. consumer_connection_extension_acquire_exclusive_access() for details on how, + * when, and why you would want to acquire and relinquish exclusive access. + */ +int +consumer_connection_extension_relinquish_exclusive_access(void* conn, PRUint64 connid, int opid, PRBool force) +{ + int ret = 0; + + /* step 1, grab the connext */ + consumer_connection_extension* connext = (consumer_connection_extension*) + repl_con_get_ext(REPL_CON_EXT_CONN, conn); + + if (NULL != connext) + { + /* step 2, acquire its lock */ + PR_Lock(connext->lock); + + /* step 3, see if it is in use */ + if (0 > connext->in_use_opid) + { + slapi_log_error(SLAPI_LOG_REPL, repl_plugin_name, + "conn=%" NSPRIu64 " op=%d Consumer connection extension is not in use\n", + connid, opid); + ret = 2; + } + else if (opid == connext->in_use_opid) + { + /* step 4, relinquish it (normal) */ + slapi_log_error(SLAPI_LOG_REPL, repl_plugin_name, + "conn=%" NSPRIu64 " op=%d Relinquishing consumer connection extension\n", + connid, opid); + connext->in_use_opid = -1; + ret = 1; + } + else if (force) + { + /* step 4, relinquish it (forced) */ + slapi_log_error(SLAPI_LOG_REPL, repl_plugin_name, + "conn=%" NSPRIu64 " op=%d Forced to relinquish consumer connection extension held by op=%d\n", + connid, opid, connext->in_use_opid); + connext->in_use_opid = -1; + ret = 1; + } + else + { + slapi_log_error(SLAPI_LOG_REPL, repl_plugin_name, + "conn=%" NSPRIu64 " op=%d Not relinquishing consumer connection extension, it is held by op=%d!\n", + connid, opid, connext->in_use_opid); + } + + /* step 5, drop the lock */ + PR_Unlock(connext->lock); + } + else + { + slapi_log_error(SLAPI_LOG_REPL, repl_plugin_name, + "conn=%" NSPRIu64 " op=%d Could not relinquish consumer extension, it is NULL!\n", + connid, opid); + } + + return ret; +} + diff --git a/ldap/servers/plugins/replication/repl_extop.c b/ldap/servers/plugins/replication/repl_extop.c index 334b86b..ca8fc55 100644 --- a/ldap/servers/plugins/replication/repl_extop.c +++ b/ldap/servers/plugins/replication/repl_extop.c @@ -660,12 +660,16 @@ multimaster_extop_StartNSDS50ReplicationRequest(Slapi_PBlock *pb) * make sure it's there. */ slapi_pblock_get(pb, SLAPI_CONNECTION, &conn); - connext = (consumer_connection_extension *)repl_con_get_ext( - REPL_CON_EXT_CONN, conn); + connext = consumer_connection_extension_acquire_exclusive_access(conn, connid, opid); if (NULL == connext) { - /* Something bad happened. Don't go any further */ - response = NSDS50_REPL_INTERNAL_ERROR; + /* TEL 20120531: This used to be a much worse and unexpected thing + * before acquiring exclusive access to the connext. Now it should + * be highly unusual, but not completely unheard of. We don't want to + * return an internal error here as before, because it will eventually + * result in a fatal error on the other end. Better to tell it + * we are busy instead--which is also probably true. */ + response = NSDS50_REPL_REPLICA_BUSY; goto send_response; } @@ -733,7 +737,7 @@ multimaster_extop_StartNSDS50ReplicationRequest(Slapi_PBlock *pb) /* Stash info that this is a total update session */ if (NULL != connext) { - connext->repl_protocol_version = REPL_PROTOCOL_71_TOTALUPDATE; + connext->repl_protocol_version = REPL_PROTOCOL_50_TOTALUPDATE; } slapi_log_error(SLAPI_LOG_REPL, repl_plugin_name, "conn=%" NSPRIu64 " op=%d repl=\"%s\": Begin 7.1 total protocol\n", @@ -897,6 +901,28 @@ multimaster_extop_StartNSDS50ReplicationRequest(Slapi_PBlock *pb) replica_object = NULL; } + /* remove this code once ticket 374 is fixed */ +#define ENABLE_TEST_TICKET_374 +#ifdef ENABLE_TEST_TICKET_374 + if (getenv("SLAPD_TEST_TICKET_374") && (opid > 20)) { + int i = 0; + int max = 480 * 5; + slapi_log_error(SLAPI_LOG_REPL, repl_plugin_name, + "conn=%d op=%d repl=\"%s\": " + "374 - Starting sleep: connext->repl_protocol_version == %d\n", + connid, opid, repl_root, connext->repl_protocol_version); + + while (REPL_PROTOCOL_50_INCREMENTAL == connext->repl_protocol_version && i++ < max) { + usleep(200000); + } + + slapi_log_error(SLAPI_LOG_REPL, repl_plugin_name, + "conn=%d op=%d repl=\"%s\": " + "374 - Finished sleep: connext->repl_protocol_version == %d\n", + connid, opid, repl_root, connext->repl_protocol_version); + } +#endif + /* If this is incremental protocol get replica's ruv to return to the supplier */ if (connext->repl_protocol_version == REPL_PROTOCOL_50_INCREMENTAL) { @@ -921,12 +947,14 @@ multimaster_extop_StartNSDS50ReplicationRequest(Slapi_PBlock *pb) */ ruv_copy_and_destroy(&supplier_ruv, (RUV **)&connext->supplier_ruv); + /* incremental update protocol */ if (connext->repl_protocol_version == REPL_PROTOCOL_50_INCREMENTAL) { /* The supplier ruv may have changed, so let's update the referrals */ consumer5_set_mapping_tree_state_for_replica(replica, connext->supplier_ruv); } - else /* full protocol */ + /* total update protocol */ + else if (connext->repl_protocol_version == REPL_PROTOCOL_50_TOTALUPDATE) { char *mtnstate = slapi_mtn_get_state(repl_root_sdn); char **mtnreferral = slapi_mtn_get_referral(repl_root_sdn); @@ -969,6 +997,26 @@ multimaster_extop_StartNSDS50ReplicationRequest(Slapi_PBlock *pb) charray_free(mtnreferral); mtnreferral = NULL; } + /* something unexpected at this point, like REPL_PROTOCOL_UNKNOWN */ + else + { + /* TEL 20120529: This condition isn't supposed to happen, but it + * has been observed in the past when the consumer is under such + * stress that the supplier sends additional start extops before + * the consumer has finished processing an earlier one. Fixing + * the underlying race should prevent this from happening in the + * future at all, but just in case it is still worth testing the + * requested protocol explictly and returning an error here rather + * than assuming a total update was requested. + * https://fedorahosted.org/389/ticket/374 */ + response = NSDS50_REPL_INTERNAL_ERROR; + slapi_log_error(SLAPI_LOG_REPL, repl_plugin_name, + "conn=%" NSPRIu64 " op=%d repl=\"%s\": " + "Unexpected update protocol received: %d. " + "Expected incremental or total.\n", + connid, opid, repl_root, connext->repl_protocol_version); + goto send_response; + } response = NSDS50_REPL_REPLICA_READY; /* Set the "is replication session" flag in the connection extension */ @@ -1148,6 +1196,13 @@ send_response: { ber_bvecfree(ruv_bervals); } + /* connext (our hold on it at least) */ + if (NULL != connext) + { + /* don't free it, just let go of it */ + consumer_connection_extension_relinquish_exclusive_access(conn, connid, opid, PR_FALSE); + connext = NULL; + } return return_value; } @@ -1186,8 +1241,15 @@ multimaster_extop_EndNSDS50ReplicationRequest(Slapi_PBlock *pb) */ /* Get a hold of the connection extension object */ slapi_pblock_get(pb, SLAPI_CONNECTION, &conn); - connext = (consumer_connection_extension *)repl_con_get_ext( - REPL_CON_EXT_CONN, conn); + slapi_pblock_get (pb, SLAPI_OPERATION_ID, &opid); + if (opid) slapi_pblock_get (pb, SLAPI_CONN_ID, &connid); + + /* TEL 20120531: unlike the replica, exclusive access to the connext should + * have been dropped at the end of the 'start' op. the only reason we couldn't + * get access to it would be if some other start or end op currently has it. + * if that is the case, the result of our getting it would be unpredictable anyway. + */ + connext = consumer_connection_extension_acquire_exclusive_access(conn, connid, opid); if (NULL != connext && NULL != connext->replica_acquired) { int zero= 0; @@ -1265,8 +1327,6 @@ multimaster_extop_EndNSDS50ReplicationRequest(Slapi_PBlock *pb) } /* Relinquish control of the replica */ - slapi_pblock_get (pb, SLAPI_OPERATION_ID, &opid); - if (opid) slapi_pblock_get (pb, SLAPI_CONN_ID, &connid); replica_relinquish_exclusive_access(r, connid, opid); object_release ((Object*)connext->replica_acquired); connext->replica_acquired = NULL; @@ -1310,6 +1370,13 @@ free_and_return: { ber_bvfree(resp_bval); } + /* connext (our hold on it at least) */ + if (NULL != connext) + { + /* don't free it, just let go of it */ + consumer_connection_extension_relinquish_exclusive_access(conn, connid, opid, PR_FALSE); + connext = NULL; + } return return_value; }