#654 Handling NSS request after reconnect and memory hierarchy

Created 6 years ago by sbose
Modified 3 months ago

There are two related issues in the NSS responder:
- requests are not handled after the data provider dies and the responder reconnects, they will timeout after some time
- there is a memory hierarchy bug which lets the NSS responder die if the not handled requests timeout

The following patch shall help to reproduce both issues:

diff --git a/src/providers/ldap/ldap_id.c b/src/providers/ldap/ldap_id.c
index 135e370..af69566 100644
--- a/src/providers/ldap/ldap_id.c
+++ b/src/providers/ldap/ldap_id.c
@@ -657,6 +657,8 @@ void sdap_account_info_handler(struct be_req *breq)
     const char *err = "Unknown Error";
     int ret = EOK;

+    ar->entry_type = 123;
+
     ctx = talloc_get_type(breq->be_ctx->bet_info[BET_ID].pvt_bet_data, struct sdap_id_ctx);

     if (be_is_offline(ctx->be)) {
diff --git a/src/responder/common/responder_dp.c b/src/responder/common/responder_dp.c
index fd7b976..5cd1e49 100644
--- a/src/responder/common/responder_dp.c
+++ b/src/responder/common/responder_dp.c
@@ -256,6 +256,8 @@ int sss_dp_send_acct_req(struct resp_ctx *rctx, TALLOC_CTX *callback_memctx,
     struct sss_dp_req *sdp_req = NULL;
     struct sss_dp_callback *cb;

+    timeout = 10;
+
     /* either, or, not both */
     if (opt_name && opt_id) {
         return EINVAL;

The first hunk makes the LDAP ID provider die on every request. The second hunk reduces the timeout to 10s so you do not have to wait too much.

I would like to propose a fix similar to the next patch

diff --git a/src/responder/common/responder.h b/src/responder/common/responder.h
index 980e561..783f9e4 100644
--- a/src/responder/common/responder.h
+++ b/src/responder/common/responder.h
@@ -161,6 +161,8 @@ struct cli_protocol_version *register_cli_protocol_version(void);
 typedef void (*sss_dp_callback_t)(uint16_t err_maj, uint32_t err_min,
                                   const char *err_msg, void *ptr);

+void handle_requests_after_reconnect(void);
+
 int sss_dp_send_acct_req(struct resp_ctx *rctx, TALLOC_CTX *callback_memctx,
                          sss_dp_callback_t callback, void *callback_ctx,
                          int timeout, const char *domain,
diff --git a/src/responder/common/responder_dp.c b/src/responder/common/responder_dp.c
index fd7b976..0f1f7a8 100644
--- a/src/responder/common/responder_dp.c
+++ b/src/responder/common/responder_dp.c
@@ -107,6 +107,25 @@ static int sss_dp_req_destructor(void *ptr)
     return 0;
 }

+static bool reconnect_handler(hash_entry_t *item, void *user_data)
+{
+    struct sss_dp_req *sdp_req = talloc_get_type(item->value.ptr,
+                                                 struct sss_dp_req);
+
+    return (talloc_free(sdp_req) == EOK ? true : false);
+}
+
+void handle_requests_after_reconnect(void)
+{
+    int ret;
+
+    ret = hash_iterate(dp_requests, reconnect_handler, NULL);
+    if (ret != HASH_SUCCESS) {
+        DEBUG(1, ("hash_iterate failed, "
+                  "not all request might be handled after reconnect.\n"));
+    }
+}
+
 static void sdp_req_timeout(struct tevent_context *ev,
                             struct tevent_timer *te,
                             struct timeval t, void *ptr)
@@ -288,7 +307,7 @@ int sss_dp_send_acct_req(struct resp_ctx *rctx, TALLOC_CTX *callback_memctx,

     if (dp_requests == NULL) {
         /* Create a hash table to handle queued update requests */
-        ret = hash_create(10, &dp_requests, NULL, NULL);
+        ret = sss_hash_create(NULL, 10, &dp_requests);
         if (ret != HASH_SUCCESS) {
             fprintf(stderr, "cannot create hash table (%s)\n", hash_error_string(ret));
             return EIO;
@@ -336,7 +355,7 @@ int sss_dp_send_acct_req(struct resp_ctx *rctx, TALLOC_CTX *callback_memctx,
                 goto done;
             }

-            cb = talloc_zero(callback_memctx, struct sss_dp_callback);
+            cb = talloc_zero(sdp_req, struct sss_dp_callback);
             if (!cb) {
                 ret = ENOMEM;
                 goto done;
@@ -482,7 +501,7 @@ static int sss_dp_send_acct_req_create(struct resp_ctx *rctx,
     sdp_req->pending_reply = pending_reply;

     if (callback) {
-        cb = talloc_zero(callback_memctx, struct sss_dp_callback);
+        cb = talloc_zero(sdp_req, struct sss_dp_callback);
         if (!cb) {
             talloc_zfree(sdp_req);
             return ENOMEM;
diff --git a/src/responder/nss/nsssrv.c b/src/responder/nss/nsssrv.c
index 4f5aa62..dfb0312 100644
--- a/src/responder/nss/nsssrv.c
+++ b/src/responder/nss/nsssrv.c
@@ -39,6 +39,7 @@
 #include "dbus/dbus.h"
 #include "sbus/sssd_dbus.h"
 #include "responder/common/responder_packet.h"
+#include "responder/common/responder.h"
 #include "providers/data_provider.h"
 #include "monitor/monitor_interfaces.h"
 #include "sbus/sbus_client.h"
@@ -143,7 +144,10 @@ static void nss_dp_reconnect_init(struct sbus_connection *conn,
                                 DATA_PROVIDER_VERSION,
                                 "NSS");
         /* all fine */
-        if (ret == EOK) return;
+        if (ret == EOK) {
+            handle_requests_after_reconnect();
+            return;
+        }
     }

     /* Failed to reconnect */

This patch makes the sdp_req the parent of the callbacks (yes, some additional cleanup is needed because callback_memctx is not needed anymore) and introduces a reconnect handler which just terminates the current request.

I think it is better to just kill the running requests instead of resending them, because one of these requests might have caused the termination of the DP and resending it will kill the DP again and again. And the DP has to be fixed wither way.

And last but not least it might be a good time to find a place for dp_requests in the memory hierarchy.

Segfault issues are blockers. This needs to be fixed for 1.4.1.

milestone: NEEDS_TRIAGE => SSSD 1.4.1
owner: somebody => sbose
priority: major => blocker

I think I figured out how this double free issue works:
- cb is allocated on cmdctx
- if a timeout occurs sdp_req_timeout() calls talloc_free(sdp_req)
- the sdp destructor sss_dp_req_destructor() calls one callback after the other and talloc_free(cb) after the callback is run
- the callback itself, e.g. nss_cmd_getgrnam_dp_callback() calls sss_cmd_done() which frees cmdctx
- so if the callback returns to sss_dp_req_destructor() cb is already freed, because it is a child of cmdctx and talloc_free(cb) must fail

I'm just wondering why we do not see this more often

description: There are two related issues in the NSS responder:
- requests are not handled after the data provider dies and the responder reconnects, they will timeout after some time
- there is a memory hierarchy bug which lets the NSS responder die if the not handled requests timeout

The following patch shall help to reproduce both issues:

{{{
diff --git a/src/providers/ldap/ldap_id.c b/src/providers/ldap/ldap_id.c
index 135e370. af69566 100644
--- a/src/providers/ldap/ldap_id.c
+++ b/src/providers/ldap/ldap_id.c
@@ -657,6 +657,8 @@ void sdap_account_info_handler(struct be_req breq)
const char
err = "Unknown Error";
int ret = EOK;

  • ar->entry_type = 123;
    +
    ctx = talloc_get_type(breq->be_ctx->bet_info[BET_ID].pvt_bet_data, struct sdap_id_ctx);

    if (be_is_offline(ctx->be)) {
    diff --git a/src/responder/common/responder_dp.c b/src/responder/common/responder_dp.c
    index fd7b976. 5cd1e49 100644
    --- a/src/responder/common/responder_dp.c
    +++ b/src/responder/common/responder_dp.c
    @@ -256,6 +256,8 @@ int sss_dp_send_acct_req(struct resp_ctx rctx, TALLOC_CTX callback_memctx,
    struct sss_dp_req sdp_req = NULL;
    struct sss_dp_callback
    cb;

  • timeout = 10;
    +
    / either, or, not both /
    if (opt_name && opt_id) {
    return EINVAL;
    }}}

The first hunk makes the LDAP ID provider die on every request. The second hunk reduces the timeout to 10s so you do not have to wait too much.

I would like to propose a fix similar to the next patch

{{{
diff --git a/src/responder/common/responder.h b/src/responder/common/responder.h
index 980e561. 783f9e4 100644
--- a/src/responder/common/responder.h
+++ b/src/responder/common/responder.h
@@ -161,6 +161,8 @@ struct cli_protocol_version register_cli_protocol_version(void);
typedef void (
sss_dp_callback_t)(uint16_t err_maj, uint32_t err_min,
const char err_msg, void ptr);

+void handle_requests_after_reconnect(void);
+
int sss_dp_send_acct_req(struct resp_ctx rctx, TALLOC_CTX callback_memctx,
sss_dp_callback_t callback, void callback_ctx,
int timeout, const char
domain,
diff --git a/src/responder/common/responder_dp.c b/src/responder/common/responder_dp.c
index fd7b976. 0f1f7a8 100644
--- a/src/responder/common/responder_dp.c
+++ b/src/responder/common/responder_dp.c
@@ -107,6 +107,25 @@ static int sss_dp_req_destructor(void *ptr)
return 0;
}

+static bool reconnect_handler(hash_entry_t item, void user_data)
+{
+ struct sss_dp_req sdp_req = talloc_get_type(item->value.ptr,
+ struct sss_dp_req);
+
+ return (talloc_free(sdp_req) == EOK ? true : false);
+}
+
+void handle_requests_after_reconnect(void)
+{
+ int ret;
+
+ ret = hash_iterate(dp_requests, reconnect_handler, NULL);
+ if (ret != HASH_SUCCESS) {
+ DEBUG(1, ("hash_iterate failed, "
+ "not all request might be handled after reconnect.\n"));
+ }
+}
+
static void sdp_req_timeout(struct tevent_context
ev,
struct tevent_timer te,
struct timeval t, void
ptr)
@@ -288,7 +307,7 @@ int sss_dp_send_acct_req(struct resp_ctx rctx, TALLOC_CTX callback_memctx,

 if (dp_requests == NULL) {
     /* Create a hash table to handle queued update requests */
  • ret = hash_create(10, &dp_requests, NULL, NULL);
  • ret = sss_hash_create(NULL, 10, &dp_requests);
    if (ret != HASH_SUCCESS) {
    fprintf(stderr, "cannot create hash table (%s)\n", hash_error_string(ret));
    return EIO;
    @@ -336,7 +355,7 @@ int sss_dp_send_acct_req(struct resp_ctx rctx, TALLOC_CTX callback_memctx,
    goto done;
    }

  • cb = talloc_zero(callback_memctx, struct sss_dp_callback);

  • cb = talloc_zero(sdp_req, struct sss_dp_callback);
    if (!cb) {
    ret = ENOMEM;
    goto done;
    @@ -482,7 +501,7 @@ static int sss_dp_send_acct_req_create(struct resp_ctx *rctx,
    sdp_req->pending_reply = pending_reply;

    if (callback) {
    - cb = talloc_zero(callback_memctx, struct sss_dp_callback);
    + cb = talloc_zero(sdp_req, struct sss_dp_callback);
    if (!cb) {
    talloc_zfree(sdp_req);
    return ENOMEM;
    diff --git a/src/responder/nss/nsssrv.c b/src/responder/nss/nsssrv.c
    index 4f5aa62. dfb0312 100644
    --- a/src/responder/nss/nsssrv.c
    +++ b/src/responder/nss/nsssrv.c
    @@ -39,6 +39,7 @@
    #include "dbus/dbus.h"
    #include "sbus/sssd_dbus.h"
    #include "responder/common/responder_packet.h"
    +#include "responder/common/responder.h"
    #include "providers/data_provider.h"
    #include "monitor/monitor_interfaces.h"
    #include "sbus/sbus_client.h"
    @@ -143,7 +144,10 @@ static void nss_dp_reconnect_init(struct sbus_connection conn,
    DATA_PROVIDER_VERSION,
    "NSS");
    /
    all fine */
    - if (ret == EOK) return;
    + if (ret == EOK) {
    + handle_requests_after_reconnect();
    + return;
    + }
    }

    / Failed to reconnect /
    }}}

This patch makes the sdp_req the parent of the callbacks (yes, some additional cleanup is needed because callback_memctx is not needed anymore) and introduces a reconnect handler which just terminates the current request.

I think it is better to just kill the running requests instead of resending them, because one of these requests might have caused the termination of the DP and resending it will kill the DP again and again. And the DP has to be fixed wither way.

And last but not least it might be a good time to find a place for dp_requests in the memory hierarchy. => There are two related issues in the NSS responder:
- requests are not handled after the data provider dies and the responder reconnects, they will timeout after some time
- there is a memory hierarchy bug which lets the NSS responder die if the not handled requests timeout

The following patch shall help to reproduce both issues:

{{{
diff --git a/src/providers/ldap/ldap_id.c b/src/providers/ldap/ldap_id.c
index 135e370. af69566 100644
--- a/src/providers/ldap/ldap_id.c
+++ b/src/providers/ldap/ldap_id.c
@@ -657,6 +657,8 @@ void sdap_account_info_handler(struct be_req breq)
const char
err = "Unknown Error";
int ret = EOK;

  • ar->entry_type = 123;
    +
    ctx = talloc_get_type(breq->be_ctx->bet_info[BET_ID].pvt_bet_data, struct sdap_id_ctx);

    if (be_is_offline(ctx->be)) {
    diff --git a/src/responder/common/responder_dp.c b/src/responder/common/responder_dp.c
    index fd7b976. 5cd1e49 100644
    --- a/src/responder/common/responder_dp.c
    +++ b/src/responder/common/responder_dp.c
    @@ -256,6 +256,8 @@ int sss_dp_send_acct_req(struct resp_ctx rctx, TALLOC_CTX callback_memctx,
    struct sss_dp_req sdp_req = NULL;
    struct sss_dp_callback
    cb;

  • timeout = 10;
    +
    / either, or, not both /
    if (opt_name && opt_id) {
    return EINVAL;
    }}}

The first hunk makes the LDAP ID provider die on every request. The second hunk reduces the timeout to 10s so you do not have to wait too much.

I would like to propose a fix similar to the next patch

{{{
diff --git a/src/responder/common/responder.h b/src/responder/common/responder.h
index 980e561. 783f9e4 100644
--- a/src/responder/common/responder.h
+++ b/src/responder/common/responder.h
@@ -161,6 +161,8 @@ struct cli_protocol_version register_cli_protocol_version(void);
typedef void (
sss_dp_callback_t)(uint16_t err_maj, uint32_t err_min,
const char err_msg, void ptr);

+void handle_requests_after_reconnect(void);
+
int sss_dp_send_acct_req(struct resp_ctx rctx, TALLOC_CTX callback_memctx,
sss_dp_callback_t callback, void callback_ctx,
int timeout, const char
domain,
diff --git a/src/responder/common/responder_dp.c b/src/responder/common/responder_dp.c
index fd7b976. 0f1f7a8 100644
--- a/src/responder/common/responder_dp.c
+++ b/src/responder/common/responder_dp.c
@@ -107,6 +107,25 @@ static int sss_dp_req_destructor(void *ptr)
return 0;
}

+static bool reconnect_handler(hash_entry_t item, void user_data)
+{
+ struct sss_dp_req sdp_req = talloc_get_type(item->value.ptr,
+ struct sss_dp_req);
+
+ return (talloc_free(sdp_req) == EOK ? true : false);
+}
+
+void handle_requests_after_reconnect(void)
+{
+ int ret;
+
+ ret = hash_iterate(dp_requests, reconnect_handler, NULL);
+ if (ret != HASH_SUCCESS) {
+ DEBUG(1, ("hash_iterate failed, "
+ "not all request might be handled after reconnect.\n"));
+ }
+}
+
static void sdp_req_timeout(struct tevent_context
ev,
struct tevent_timer te,
struct timeval t, void
ptr)
@@ -288,7 +307,7 @@ int sss_dp_send_acct_req(struct resp_ctx rctx, TALLOC_CTX callback_memctx,

 if (dp_requests == NULL) {
     /* Create a hash table to handle queued update requests */
  • ret = hash_create(10, &dp_requests, NULL, NULL);
  • ret = sss_hash_create(NULL, 10, &dp_requests);
    if (ret != HASH_SUCCESS) {
    fprintf(stderr, "cannot create hash table (%s)\n", hash_error_string(ret));
    return EIO;
    @@ -336,7 +355,7 @@ int sss_dp_send_acct_req(struct resp_ctx rctx, TALLOC_CTX callback_memctx,
    goto done;
    }

  • cb = talloc_zero(callback_memctx, struct sss_dp_callback);

  • cb = talloc_zero(sdp_req, struct sss_dp_callback);
    if (!cb) {
    ret = ENOMEM;
    goto done;
    @@ -482,7 +501,7 @@ static int sss_dp_send_acct_req_create(struct resp_ctx *rctx,
    sdp_req->pending_reply = pending_reply;

    if (callback) {
    - cb = talloc_zero(callback_memctx, struct sss_dp_callback);
    + cb = talloc_zero(sdp_req, struct sss_dp_callback);
    if (!cb) {
    talloc_zfree(sdp_req);
    return ENOMEM;
    diff --git a/src/responder/nss/nsssrv.c b/src/responder/nss/nsssrv.c
    index 4f5aa62. dfb0312 100644
    --- a/src/responder/nss/nsssrv.c
    +++ b/src/responder/nss/nsssrv.c
    @@ -39,6 +39,7 @@
    #include "dbus/dbus.h"
    #include "sbus/sssd_dbus.h"
    #include "responder/common/responder_packet.h"
    +#include "responder/common/responder.h"
    #include "providers/data_provider.h"
    #include "monitor/monitor_interfaces.h"
    #include "sbus/sbus_client.h"
    @@ -143,7 +144,10 @@ static void nss_dp_reconnect_init(struct sbus_connection conn,
    DATA_PROVIDER_VERSION,
    "NSS");
    /
    all fine */
    - if (ret == EOK) return;
    + if (ret == EOK) {
    + handle_requests_after_reconnect();
    + return;
    + }
    }

    / Failed to reconnect /
    }}}

This patch makes the sdp_req the parent of the callbacks (yes, some additional cleanup is needed because callback_memctx is not needed anymore) and introduces a reconnect handler which just terminates the current request.

I think it is better to just kill the running requests instead of resending them, because one of these requests might have caused the termination of the DP and resending it will kill the DP again and again. And the DP has to be fixed wither way.

And last but not least it might be a good time to find a place for dp_requests in the memory hierarchy.

Fixed by:
- 98e3c547d6ebc7a5906c49af2bd47f0aeff80a3f
- c475a1ce4d407f722551f3d35d2f9e50fe139d95
- 09aa3d91f8fa667b09494612d5696da2b8461cb2

fixedin: => 1.4.1
resolution: => fixed
status: new => closed
tests: 0 => 1

Fields changed

coverity: =>
patch: => 0
tests: 1 => 0
upgrade: => 0

Fields changed

rhbz: => 0

3 months ago

Metadata Update from @sbose:
- Issue assigned to sbose
- Issue set to the milestone: SSSD 1.4.1

Login to comment on this ticket.

defect

NSS

1.4.0

0

0

0

cancel