#4104 Improve "unlock" time when user session already active
Opened 4 months ago by simo. Modified 3 months ago
SSSD/ simo/sssd i4098  into  master

@@ -121,6 +121,7 @@ 

                   enum cache_req_dom_type req_dom_type)

  {

      struct cache_req *cr;

+     bool bypass_cache;

      errno_t ret;

  

      cr = talloc_zero(mem_ctx, struct cache_req);

@@ -144,15 +145,22 @@ 

          return NULL;

      }

  

-     cr->cache_first = rctx->cache_first;

-     cr->bypass_cache = cr->plugin->bypass_cache || cr->data->bypass_cache;

-     cr->bypass_dp = cr->data->bypass_dp;

-     if (cr->bypass_cache && cr->bypass_dp) {

+     bypass_cache = cr->plugin->bypass_cache || cr->data->bypass_cache;

+     if (bypass_cache && cr->data->bypass_dp) {

          CACHE_REQ_DEBUG(SSSDBG_CRIT_FAILURE, cr,

                          "Cannot bypass cache and dp at the same time!");

          talloc_free(cr);

          return NULL;

      }

+     if (rctx->cache_first) {

+         cr->cache_behavior = CACHE_REQ_CACHE_FIRST;

+     }

+     /* it is ok to override cache_first here */

+     if (bypass_cache) {

+         cr->cache_behavior = CACHE_REQ_BYPASS_CACHE;

+     } else if (cr->data->bypass_dp) {

+         cr->cache_behavior = CACHE_REQ_BYPASS_PROVIDER;

+     }

  

      return cr;

  }

@@ -493,8 +501,8 @@ 

      subreq = cache_req_search_send(state,

                                     state->ev,

                                     state->cr,

-                                    false,       /* Don't bypass cache */

-                                    true);       /* Do bypass DP */

+                                    false,

+                                    true);

      if (subreq == NULL) {

          ret = ENOMEM;

          goto immediately;

@@ -677,8 +685,8 @@ 

      size_t num_results;

      bool check_next;

      bool dp_success;

-     bool bypass_cache;

-     bool bypass_dp;

+     bool first_iteration;

+     enum cache_req_behavior cache_behavior;

  };

  

  static errno_t cache_req_search_domains_next(struct tevent_req *req);

@@ -695,9 +703,7 @@ 

                                struct cache_req *cr,

                                struct cache_req_domain *cr_domain,

                                bool check_next,

-                               bool first_iteration,

-                               bool bypass_cache,

-                               bool bypass_dp)

+                               bool first_iteration)

  {

      struct tevent_req *req;

      struct cache_req_search_domains_state *state = NULL;

@@ -717,12 +723,12 @@ 

      state->req_domains = cr_domain;

      state->check_next = check_next;

      state->dp_success = true;

-     state->bypass_cache = bypass_cache;

-     state->bypass_dp = bypass_dp;

+     state->first_iteration = first_iteration;

  

      if (cr->plugin->dp_get_domain_send_fn != NULL

              && ((state->check_next && cr_domain->next != NULL)

-                 || (state->bypass_cache && !first_iteration))) {

+                 || ((state->cr->cache_behavior == CACHE_REQ_CACHE_FIRST)

+                     && !first_iteration))) {

          /* If the request is not qualified with a domain name AND

           * there are multiple domains to search OR if this is the second

           * pass during the "check-cache-first" schema, it makes sense

@@ -814,7 +820,8 @@ 

          }

  

          subreq = cache_req_search_send(state, state->ev, cr,

-                                        state->bypass_cache, state->bypass_dp);

+                                        state->first_iteration,

+                                        false);

          if (subreq == NULL) {

              return ENOMEM;

          }

@@ -1018,66 +1025,6 @@ 

      return EOK;

  }

  

- /**

-  * Return true if we should issue another search.

-  */

- static bool cache_req_search_schema(struct cache_req *cr,

-                                     const char *input_domain,

-                                     bool first_iteration,

-                                     bool *_bypass_cache,

-                                     bool *_bypass_dp)

- {

-     bool bypass_cache;

-     bool bypass_dp;

- 

-     if (cr->bypass_cache) {

-         /* The caller wants to contact Data Provider first

-          * or it is inferred by cache_req plug-in. */

-         bypass_cache = true;

-         bypass_dp = false;

- 

-         if (!first_iteration) {

-             return false;

-         }

-     } else if (cr->bypass_dp) {

-         /* The caller wants to lookup only in the cache */

-         bypass_cache = false;

-         bypass_dp = true;

- 

-         if (!first_iteration) {

-             return false;

-         }

-     } else if (input_domain != NULL) {

-         /* We will search only one domain. */

-         bypass_cache = false;

-         bypass_dp = false;

- 

-         if (!first_iteration) {

-             return false;

-         }

-      } else if (!cr->cache_first) {

-         /* We will search cache and on cache-miss

-          * contact domain provider sequentially. */

-         bypass_cache = false;

-         bypass_dp = false;

- 

-         if (!first_iteration) {

-             return false;

-         }

-     } else {

-         /* We will first search the cache in all domains. If we don't get

-          * any match we will then contact Data Provider starting with the

-          * first domain again. */

-         bypass_cache = first_iteration ? false : true;

-         bypass_dp = first_iteration ? true : false;

-     }

- 

-     *_bypass_cache = bypass_cache;

-     *_bypass_dp = bypass_dp;

- 

-     return true;

- }

- 

  struct cache_req_state {

      /* input data */

      struct tevent_context *ev;

@@ -1111,9 +1058,7 @@ 

  static errno_t

  cache_req_search_domains(struct tevent_req *req,

                           struct cache_req_domain *oredered_domain,

-                          bool check_next,

-                          bool bypass_cache,

-                          bool bypass_dp);

+                          bool check_next);

  

  static void cache_req_process_result(struct tevent_req *subreq);

  

@@ -1337,19 +1282,17 @@ 

      struct cache_req_state *state = NULL;

      struct cache_req_domain *cr_domain;

      bool check_next;

-     bool bypass_cache;

-     bool bypass_dp;

-     bool search;

      errno_t ret;

  

      state = tevent_req_data(req, struct cache_req_state);

  

-     search = cache_req_search_schema(state->cr, domain_name,

-                                      state->first_iteration,

-                                      &bypass_cache, &bypass_dp);

-     if (!search) {

-         /* We're done here. */

-         return EOK;

+     if ((state->cr->cache_behavior != CACHE_REQ_CACHE_FIRST)

+         || (domain_name != NULL)) {

+ 

+         if (!state->first_iteration) {

+             /* We're done here. */

+             return EOK;

+         }

      }

  

      ret = cache_req_domain_copy_cr_domains(state,

@@ -1378,31 +1321,46 @@ 

          check_next = true;

      }

  

-     return cache_req_search_domains(req, cr_domain, check_next,

-                                     bypass_cache, bypass_dp);

+     return cache_req_search_domains(req, cr_domain, check_next);

  }

  

  static errno_t

  cache_req_search_domains(struct tevent_req *req,

                           struct cache_req_domain *cr_domain,

-                          bool check_next,

-                          bool bypass_cache,

-                          bool bypass_dp)

+                          bool check_next)

  {

      struct tevent_req *subreq;

      struct cache_req_state *state = NULL;

+     const char *cache_action;

+     const char *provider_action;

  

      state = tevent_req_data(req, struct cache_req_state);

  

+     switch (state->cr->cache_behavior) {

+     case CACHE_REQ_CACHE_FIRST:

+         cache_action = (state->first_iteration) ? "check" : "bypass";

+         provider_action = (state->first_iteration) ? "bypass" : "check";

+         break;

+     case CACHE_REQ_BYPASS_CACHE:

+         cache_action = "bypass";

+         provider_action = "check";

+         break;

+     case CACHE_REQ_BYPASS_PROVIDER:

+         cache_action = "check";

+         provider_action = "bypass";

+         break;

+     default:

+         cache_action = "check";

+         provider_action = "check";

+         break;

+     }

      CACHE_REQ_DEBUG(SSSDBG_TRACE_FUNC, state->cr,

                      "Search will %s the cache and %s the data provider\n",

-                     bypass_cache ? "bypass" : "check",

-                     bypass_dp ? "bypass" : "check");

+                     cache_action, provider_action);

  

      subreq = cache_req_search_domains_send(state, state->ev, state->cr,

                                             cr_domain, check_next,

-                                            state->first_iteration,

-                                            bypass_cache, bypass_dp);

+                                            state->first_iteration);

      if (subreq == NULL) {

          return ENOMEM;

      }

@@ -73,6 +73,17 @@ 

      CACHE_REQ_ANY_DOM

  };

  

+ /* Controls behavior about how to use cached information during

+  * a lookup, this is to fine tune some behaviors for specific

+  * situations

+  */

+ enum cache_req_behavior {

+     CACHE_REQ_NORMAL,

+     CACHE_REQ_CACHE_FIRST,

+     CACHE_REQ_BYPASS_CACHE,

+     CACHE_REQ_BYPASS_PROVIDER,

+ };

+ 

  /* Input data. */

  

  struct cache_req_data;

@@ -40,9 +40,10 @@ 

  

      /* Domain related information. */

      struct sss_domain_info *domain;

-     bool cache_first;

-     bool bypass_cache;

-     bool bypass_dp;

+ 

+     /* wanted cache behavior */

+     enum cache_req_behavior cache_behavior;

+ 

      /* Only contact domains with this type */

      enum cache_req_dom_type req_dom_type;

  

@@ -99,8 +100,8 @@ 

  cache_req_search_send(TALLOC_CTX *mem_ctx,

                        struct tevent_context *ev,

                        struct cache_req *cr,

-                       bool bypass_cache,

-                       bool bypass_dp);

+                       bool first_iteration,

+                       bool cache_only_override);

  

  errno_t cache_req_search_recv(TALLOC_CTX *mem_ctx,

                                struct tevent_req *req,

@@ -301,12 +301,14 @@ 

  cache_req_search_send(TALLOC_CTX *mem_ctx,

                        struct tevent_context *ev,

                        struct cache_req *cr,

-                       bool bypass_cache,

-                       bool bypass_dp)

+                       bool first_iteration,

+                       bool cache_only_override)

  {

      struct cache_req_search_state *state;

      enum cache_object_status status;

      struct tevent_req *req;

+     bool bypass_cache = false;

+     bool bypass_dp = false;

      errno_t ret;

  

      req = tevent_req_create(mem_ctx, &state, struct cache_req_search_state);

@@ -325,6 +327,25 @@ 

          goto done;

      }

  

+     if (cache_only_override) {

+         bypass_dp = true;

+     } else {

+         switch (cr->cache_behavior) {

+         case CACHE_REQ_CACHE_FIRST:

+             bypass_cache = first_iteration ? false : true;

+             bypass_dp = first_iteration ? true : false;

+             break;

+         case CACHE_REQ_BYPASS_CACHE:

+             bypass_cache = true;

+             break;

+         case CACHE_REQ_BYPASS_PROVIDER:

+             bypass_dp = true;

+             break;

+         default:

+             break;

+         }

+     }

+ 

      /* If bypass_cache is enabled we always contact data provider before

       * searching the cache. Thus we set expiration status to missing,

       * which will trigger data provider request later.

file modified
+109 -23

@@ -23,6 +23,7 @@ 

  #include <time.h>

  #include "util/util.h"

  #include "util/auth_utils.h"

+ #include "util/find_uid.h"

  #include "db/sysdb.h"

  #include "confdb/confdb.h"

  #include "responder/common/responder_packet.h"

@@ -1846,12 +1847,16 @@ 

      pam_check_user_done(preq, ret);

  }

  

- static void pam_dp_send_acct_req_done(struct tevent_req *req);

+ static void pam_check_user_search_next(struct tevent_req *req);

+ static void pam_check_user_search_lookup(struct tevent_req *req);

+ static void pam_check_user_search_done(struct pam_auth_req *preq, int ret,

+                                        struct cache_req_result *result);

+ 

+ /* lookup the user uid from the cache first,

+  * then we'll refresh initgroups if needed */

  static int pam_check_user_search(struct pam_auth_req *preq)

  {

-     int ret;

      struct tevent_req *dpreq;

-     struct pam_ctx *pctx;

      struct cache_req_data *data;

  

      data = cache_req_data_name(preq,

@@ -1861,23 +1866,8 @@ 

          return ENOMEM;

      }

  

-     pctx = talloc_get_type(preq->cctx->rctx->pvt_ctx, struct pam_ctx);

- 

-     /* The initgr cache is used to make sure that during a single PAM session

-      * (auth, acct_mgtm, ....) the backend is contacted only once. logon_name

-      * is the name provided by the PAM client and will not be modified during

-      * the request, so it makes sense to use it here instead od the pd->user. */

-     ret = pam_initgr_check_timeout(pctx->id_table, preq->pd->logon_name);

-     if (ret == EOK) {

-         /* Entry is still valid, force to lookup in the cache first */

-         cache_req_data_set_bypass_cache(data, false);

-     } else if (ret == ENOENT) {

-         /* Call the data provider first */

-         cache_req_data_set_bypass_cache(data, true);

-     } else {

-         DEBUG(SSSDBG_OP_FAILURE, "Could not look up initgroup timeout\n");

-         return EIO;

-     }

+     cache_req_data_set_bypass_cache(data, false);

+     cache_req_data_set_bypass_dp(data, true);

  

      dpreq = cache_req_send(preq,

                             preq->cctx->rctx->ev,

@@ -1893,17 +1883,19 @@ 

          return ENOMEM;

      }

  

-     tevent_req_set_callback(dpreq, pam_dp_send_acct_req_done, preq);

+     tevent_req_set_callback(dpreq, pam_check_user_search_next, preq);

  

      /* tell caller we are in an async call */

      return EAGAIN;

  }

  

- static void pam_dp_send_acct_req_done(struct tevent_req *req)

+ static void pam_check_user_search_next(struct tevent_req *req)

  {

-     struct cache_req_result *result;

      struct pam_auth_req *preq;

      struct pam_ctx *pctx;

+     struct cache_req_result *result;

+     struct cache_req_data *data;

+     struct tevent_req *dpreq;

      int ret;

  

      preq = tevent_req_callback_data(req, struct pam_auth_req);

@@ -1919,6 +1911,100 @@ 

      }

  

      if (ret == EOK) {

+         bool user_has_session = false;

+         uid_t uid = ldb_msg_find_attr_as_uint64(result->msgs[0], SYSDB_UIDNUM, 0);

+         if (!uid) {

+             DEBUG(SSSDBG_CRIT_FAILURE, "A user with no UID?\n");

+             talloc_zfree(preq->cctx);

+             return;

+         }

+ 

+         /* If a user already has a session on the system, we take the

+          * cache for granted and do not force an online lookup. This is

+          * because in most cases the user is just trying to authenticate

+          * but not create a new session (sudo, lockscreen, polkit, etc.)

+          * An online refresh in this situation would just delay operations

+          * without providing any useful additional information.

+          */

+         (void)check_if_uid_is_active(uid, &user_has_session);

+ 

+         /* The initgr cache is used to make sure that during a single PAM

+          * session (auth, acct_mgtm, ....) the backend is contacted only

+          * once. logon_name is the name provided by the PAM client and

+          * will not be modified during the request, so it makes sense to

+          * use it here instead od the pd->user.

+          */

+         ret = pam_initgr_check_timeout(pctx->id_table, preq->pd->logon_name);

+         if (ret != EOK && ret != ENOENT) {

+             DEBUG(SSSDBG_OP_FAILURE, "Could not look up initgroup timeout\n");

+         }

+ 

+         if ((ret == EOK) || user_has_session) {

+             pam_check_user_search_done(preq, EOK, result);

+             return;

+         }

+     }

+ 

+     /* If we get here it means the user was not found or does not have a

+      * session, or initgr has not been cached before, so we force a new

+      * online lookup */

+     data = cache_req_data_name(preq,

+                                CACHE_REQ_INITGROUPS,

+                                preq->pd->logon_name);

+     if (data == NULL) {

+         DEBUG(SSSDBG_CRIT_FAILURE, "Out of memory\n");

+         talloc_zfree(preq->cctx);

+         return;

+     }

+     cache_req_data_set_bypass_cache(data, true);

+     cache_req_data_set_bypass_dp(data, false);

+ 

+     dpreq = cache_req_send(preq,

+                            preq->cctx->rctx->ev,

+                            preq->cctx->rctx,

+                            preq->cctx->rctx->ncache,

+                            0,

+                            preq->req_dom_type,

+                            NULL,

+                            data);

+     if (!dpreq) {

+         DEBUG(SSSDBG_CRIT_FAILURE,

+               "Out of memory sending data provider request\n");

+         talloc_zfree(preq->cctx);

+         return;

+     }

+ 

+     tevent_req_set_callback(dpreq, pam_check_user_search_lookup, preq);

+ }

+ 

+ static void pam_check_user_search_lookup(struct tevent_req *req)

+ {

+     struct cache_req_result *result;

+     struct pam_auth_req *preq;

+     int ret;

+ 

+     preq = tevent_req_callback_data(req, struct pam_auth_req);

+ 

+     ret = cache_req_single_domain_recv(preq, req, &result);

+     talloc_zfree(req);

+     if (ret != EOK && ret != ENOENT) {

+         DEBUG(SSSDBG_CRIT_FAILURE,

+               "Fatal error, killing connection!\n");

+         talloc_zfree(preq->cctx);

+         return;

+     }

+ 

+     pam_check_user_search_done(preq, ret, result);

+ }

+ 

+ static void pam_check_user_search_done(struct pam_auth_req *preq, int ret,

+                                        struct cache_req_result *result)

+ {

+     struct pam_ctx *pctx;

+ 

+     pctx = talloc_get_type(preq->cctx->rctx->pvt_ctx, struct pam_ctx);

+ 

+     if (ret == EOK) {

          preq->user_obj = result->msgs[0];

          pd_set_primary_name(preq->user_obj, preq->pd);

          preq->domain = result->domain;

@@ -1950,8 +1950,9 @@ 

      assert_int_equal(ret, EOK);

  

      mock_input_pam_ex(pam_test_ctx, "upn@"TEST_DOM_NAME, "12345", NULL, NULL,

-                       true);

+                       false);

      mock_account_recv_simple();

+     mock_parse_inp("upn@"TEST_DOM_NAME, NULL, EOK);

  

      will_return(__wrap_sss_packet_get_cmd, SSS_PAM_AUTHENTICATE);

      will_return(__wrap_sss_packet_get_body, WRAP_CALL_REAL);

@@ -1991,6 +1992,8 @@ 

                        true);

      mock_account_recv_simple();

  

+     mock_parse_inp("pamuser", NULL, EOK);

+ 

      will_return(__wrap_sss_packet_get_cmd, SSS_PAM_AUTHENTICATE);

      will_return(__wrap_sss_packet_get_body, WRAP_CALL_REAL);

  

@@ -2270,6 +2273,8 @@ 

                          test_lookup_by_cert_cb, SSSD_TEST_CERT_0001, false);

      mock_account_recv_simple();

      mock_parse_inp("pamuser", NULL, EOK);

+     mock_account_recv_simple();

+     mock_parse_inp("pamuser", NULL, EOK);

  

      will_return(__wrap_sss_packet_get_cmd, SSS_PAM_PREAUTH);

      will_return(__wrap_sss_packet_get_body, WRAP_CALL_REAL);

@@ -2496,6 +2501,8 @@ 

  

      mock_account_recv_simple();

      mock_parse_inp("pamuser", NULL, EOK);

+     mock_account_recv_simple();

+     mock_parse_inp("pamuser", NULL, EOK);

  

      will_return(__wrap_sss_packet_get_cmd, SSS_PAM_AUTHENTICATE);

      will_return(__wrap_sss_packet_get_body, WRAP_CALL_REAL);

@@ -2928,6 +2935,7 @@ 

  

      /* The domain is POSIX, the request will skip over it */

      mock_input_pam_ex(pam_test_ctx, "pamuser", NULL, NULL, "app_svc", false);

+     mock_parse_inp("pamuser", NULL, EOK);

      pam_test_ctx->exp_pam_status = PAM_USER_UNKNOWN;

  

      will_return(__wrap_sss_packet_get_cmd, SSS_PAM_AUTHENTICATE);

@@ -3012,6 +3020,7 @@ 

  

      /* A different service than the app one can authenticate against a POSIX domain */

      mock_input_pam_ex(pam_test_ctx, "pamuser", NULL, NULL, "not_app_svc", false);

+     mock_parse_inp("pamuser", NULL, EOK);

  

      pam_test_ctx->exp_pam_status = PAM_USER_UNKNOWN;

  

These two patches change the beahvior of the sssd pam provider to avoid costly online initgroups calls when a user session is already active on the system.

The reasoning is that if a user session is active then most of the authentications happening for the user are for tools that are not instantiating a new login sessions, therefore paying the price of an online initgroups is not worth it. And for the rare actual login session (say ssh into the machine from another) then we know that initgroups was already recently run and we get the same answer making sessions consistent.

If an admin change groups this is not a security issue because an active session means the user already has whatever group memebrships an can still do any operation on the machine. Admins already know that if they want to take away privileges they need to purge any user process from the machine first.

I tested personally these two patches on my machine, they reduced login time from 10-20 seconds down to a few seconds.
note that this also reduces load on a central LDAP/IPA/NIS/whatever server so it is a new win-win in all cases.

Hi,

thank you for the patch. I like the changes but since this is a change of behavior I think an option to return to the old behavior would be useful.

We regular have questions about e.g. updating sudo rules more often. Also we currently try to refresh HBAC rules whenever there is a new access control check requested. Both often depend on group memberships and are tested by opening a new session, often while there is another session still running. While it would be possible to tell SSSD to update the cached initgroups data more often by tuning entry_cache_user_timeout this might have the negative effect that now all user data is updated more often. So being able to return to the old behavior seems to be better.

Additionally with such an option it would be possible to allow that the cached data is always used for every PAM operation. E.g. an option pam_id_use_cache can have the values never (old behavior), if_user_session_exists (your patch and new default) and always.

bye,
Sumit

Hi Sumit,
can you think of a case where the old behavior is desirable ?
initgroups does not update sudo or hbac rules.

I am really not a fan of adding more knobs here, I would actually love to start deprecating options in sssd, the code is drowning and the test matrix can't possibly cover all cases already.

One thing I thought we may do is to run a initgroups call in the background, when a cached one is done (Ie not waiting on it to finish but still triggering it). Would that achieve your goal and avoid the need for an option ?

Hi Sumit,
can you think of a case where the old behavior is desirable ?
initgroups does not update sudo or hbac rules.

No, but the hbac rules are updated during every login attempt during the access control step and since the group memberships of a user are typically part of the rules it would be good to have the current group memberships as well so that the evaluation of the rules lead to the expected result.

My point is just that it is currently expected behavior that the group memberships are updated for every new login independent if you are already logged in or not and people might rely on this. E.g. you ask helpdesk to be added to a new group and then check with 'su - yourusername' if you are are now a member of the group.

I am really not a fan of adding more knobs here, I would actually love to start deprecating options in sssd, the code is drowning and the test matrix can't possibly cover all cases already.

I think at least in this case it might be acceptable because the option will only tell the cache_req in which order backend and cache should be checked, which is already covered by existing tests.

One thing I thought we may do is to run a initgroups call in the background, when a cached one is done (Ie not waiting on it to finish but still triggering it). Would that achieve your goal and avoid the need for an option ?

But this would mean that only a second login attempt will the updated group memberships. Btw, Jakub recently improved and fixed the background refresh where one of the steps was that initgroups is handled properly as well, which would make sure that the cached entries do not expire.

bye,
Sumit

I think that if update of groups is desired at every login that we should have it done asynchronously when a session is active.
The problem is that it currently blocks the login leading to unacceptable user experience for interactive sessions.
I also think that should be done as a separate improvement though.

If you insist (then say so :) I can add an option to control this behavior that defaults to this behavior being enabled, but I am really not too happy to add more and more options. I would rather have heuristics that work well and do not need options to tweak if at all possible,

Hi,

Yes, please add such an option or let me know if I should add it. Since it is a change of behavior I still think we need a way to switch back to the old behavior although the new one might be better for most use-cases.

bye,
Sumit

FYI, previously I backported these patches to RHEL 8.0 (SSSD 2.0) and all was good, now I did it for RHEL 8.1 (SSSD 2.2) and these patches seem not sufficient anymore. Either something else is forcing a lookup of some kind in the authentication path, or something is not working in detecting an active session.