From b05d4f5b97db5662bb617b312818fb20b8433d28 Mon Sep 17 00:00:00 2001 From: Rich Megginson Date: Jul 08 2011 21:07:17 +0000 Subject: Bug 697694 - rhds82 - incr update state stop_fatal_error "requires administrator action", with extop_result: 9 https://bugzilla.redhat.com/show_bug.cgi?id=697694 Resolves: bug 697694 Bug Description: incr update state stop_fatal_error "requires administrator action", with extop_result: 9 Reviewed by: nkinder, nhosoi (Thanks!) Branch: Directory_Server_8_2_Branch Fix Description: Calling ldap_result with LDAP_RES_ANY will return the first msgid available. Because the operation sending and receiving is async done in separate threads, we may not get the msgid corresponding to the request we sent. This is the cause of the "Bad parameter to an LDAP routine" errors. We call ldap_parse_result expecting an EXTENDED operation but we get some other operation. This causes hard failures to eventually propagate up and halt replication with fatal errors. The solution is to call ldap_result with the actual msgid corresponding to the operation that was sent, instead of LDAP_RES_ANY. This is hard to reproduce. One way I have found to consistently reproduce the error is to set up a 4-way MMR and run one of the masters in the debugger. Break the debugger and let the server sit idle for several minutes until you see errors in the errors logs of the other masters. Break and continue like this several times, and you will eventually see "Bad parameter to an LDAP routine" errors. Platforms tested: RHEL6 x86_64 Flag Day: no Doc impact: no --- diff --git a/ldap/servers/plugins/replication/repl5.h b/ldap/servers/plugins/replication/repl5.h index 6be21ce..6c322c2 100644 --- a/ldap/servers/plugins/replication/repl5.h +++ b/ldap/servers/plugins/replication/repl5.h @@ -398,7 +398,7 @@ void conn_set_timeout(Repl_Connection *conn, long timeout); long conn_get_timeout(Repl_Connection *conn); void conn_set_agmt_changed(Repl_Connection *conn); ConnResult conn_read_result(Repl_Connection *conn, int *message_id); -ConnResult conn_read_result_ex(Repl_Connection *conn, char **retoidp, struct berval **retdatap, LDAPControl ***returned_controls, int *message_id, int noblock); +ConnResult conn_read_result_ex(Repl_Connection *conn, char **retoidp, struct berval **retdatap, LDAPControl ***returned_controls, int send_msgid, int *resp_msgid, int noblock); /* In repl5_protocol.c */ typedef struct repl_protocol Repl_Protocol; diff --git a/ldap/servers/plugins/replication/repl5_connection.c b/ldap/servers/plugins/replication/repl5_connection.c index 927fb20..11bf3f7 100644 --- a/ldap/servers/plugins/replication/repl5_connection.c +++ b/ldap/servers/plugins/replication/repl5_connection.c @@ -284,7 +284,7 @@ conn_get_error_ex(Repl_Connection *conn, int *operation, int *error, char **erro /* The _ex version handles a bunch of parameters (retoidp et al) that were present in the original * sync operation functions, but were never actually used) */ ConnResult -conn_read_result_ex(Repl_Connection *conn, char **retoidp, struct berval **retdatap, LDAPControl ***returned_controls, int *message_id, int block) +conn_read_result_ex(Repl_Connection *conn, char **retoidp, struct berval **retdatap, LDAPControl ***returned_controls, int send_msgid, int *resp_msgid, int block) { LDAPMessage *res = NULL; int setlevel = 0; @@ -323,7 +323,7 @@ conn_read_result_ex(Repl_Connection *conn, char **retoidp, struct berval **retda break; } - rc = ldap_result(conn->ld, LDAP_RES_ANY , 1, &local_timeout, &res); + rc = ldap_result(conn->ld, send_msgid, 1, &local_timeout, &res); PR_Unlock(conn->lock); if (0 != rc) @@ -412,9 +412,9 @@ conn_read_result_ex(Repl_Connection *conn, char **retoidp, struct berval **retda char **referrals = NULL; char *matched = NULL; - if (message_id) + if (resp_msgid) { - *message_id = ldap_msgid(res); + *resp_msgid = ldap_msgid(res); } rc = ldap_parse_result(conn->ld, res, &err, &matched, @@ -483,7 +483,7 @@ conn_read_result_ex(Repl_Connection *conn, char **retoidp, struct berval **retda ConnResult conn_read_result(Repl_Connection *conn, int *message_id) { - return conn_read_result_ex(conn,NULL,NULL,NULL,message_id,1); + return conn_read_result_ex(conn,NULL,NULL,NULL,LDAP_RES_ANY,message_id,1); } /* Because the SDK isn't really thread-safe (it can deadlock between diff --git a/ldap/servers/plugins/replication/repl5_inc_protocol.c b/ldap/servers/plugins/replication/repl5_inc_protocol.c index d999d3b..4de9fc7 100644 --- a/ldap/servers/plugins/replication/repl5_inc_protocol.c +++ b/ldap/servers/plugins/replication/repl5_inc_protocol.c @@ -293,7 +293,7 @@ static void repl5_inc_result_threadmain(void *param) while (!finished) { - conres = conn_read_result_ex(conn, NULL, NULL, NULL, &message_id, 0); + conres = conn_read_result_ex(conn, NULL, NULL, NULL, LDAP_RES_ANY, &message_id, 0); slapi_log_error(SLAPI_LOG_REPL, NULL, "repl5_inc_result_threadmain: read result for message_id %d\n", message_id); /* Timeout here means that we didn't block, not a real timeout */ if (CONN_TIMEOUT == conres) diff --git a/ldap/servers/plugins/replication/repl5_protocol_util.c b/ldap/servers/plugins/replication/repl5_protocol_util.c index 3fbc978..54f2a22 100644 --- a/ldap/servers/plugins/replication/repl5_protocol_util.c +++ b/ldap/servers/plugins/replication/repl5_protocol_util.c @@ -212,6 +212,7 @@ acquire_replica(Private_Repl_Protocol *prp, char *prot_oid, RUV **ruv) current_csn = get_current_csn(replarea_sdn); if (NULL != current_csn) { + int send_msgid = 0; struct berval *payload = NSDS50StartReplicationRequest_new( prot_oid, slapi_sdn_get_ndn(replarea_sdn), NULL /* XXXggood need to provide referral(s) */, current_csn); @@ -219,7 +220,7 @@ acquire_replica(Private_Repl_Protocol *prp, char *prot_oid, RUV **ruv) csn_free(¤t_csn); current_csn = NULL; crc = conn_send_extended_operation(conn, - REPL_START_NSDS50_REPLICATION_REQUEST_OID, payload, NULL /* update control */, NULL /* Message ID */); + REPL_START_NSDS50_REPLICATION_REQUEST_OID, payload, NULL /* update control */, &send_msgid /* Message ID */); if (CONN_OPERATION_SUCCESS != crc) { int operation, error; @@ -234,7 +235,7 @@ acquire_replica(Private_Repl_Protocol *prp, char *prot_oid, RUV **ruv) error ? ldap_err2string(error) : "unknown error"); } /* Since the operation request is async, we need to wait for the response here */ - crc = conn_read_result_ex(conn,&retoid,&retdata,NULL,NULL,1); + crc = conn_read_result_ex(conn,&retoid,&retdata,NULL,send_msgid,NULL,1); ber_bvfree(payload); payload = NULL; /* Look at the response we got. */ @@ -482,7 +483,7 @@ release_replica(Private_Repl_Protocol *prp) goto error; } /* Since the operation request is async, we need to wait for the response here */ - conres = conn_read_result_ex(prp->conn,&retoid,&retdata,NULL,&ret_message_id,1); + conres = conn_read_result_ex(prp->conn,&retoid,&retdata,NULL,sent_message_id,&ret_message_id,1); if (CONN_OPERATION_SUCCESS != conres) { int operation, error; diff --git a/ldap/servers/plugins/replication/repl5_tot_protocol.c b/ldap/servers/plugins/replication/repl5_tot_protocol.c index 7bd6e25..8ceee7a 100644 --- a/ldap/servers/plugins/replication/repl5_tot_protocol.c +++ b/ldap/servers/plugins/replication/repl5_tot_protocol.c @@ -153,7 +153,7 @@ static void repl5_tot_result_threadmain(void *param) while (!finished) { - conres = conn_read_result_ex(conn, NULL, NULL, NULL, &message_id, 0); + conres = conn_read_result_ex(conn, NULL, NULL, NULL, LDAP_RES_ANY, &message_id, 0); /* Timeout here means that we didn't block, not a real timeout */ if (CONN_TIMEOUT == conres) {