#50885 WIP Ticket 50618 - support cgroupv2
Closed 2 years ago by spichugi. Opened 2 years ago by firstyear.
firstyear/389-ds-base 50618-cgroup-v2  into  master

file modified
+116 -17
@@ -27,6 +27,9 @@ 

  #include <sys/time.h>

  #include <sys/resource.h>

  

+ /* directory check */

+ #include <dirent.h>

+ 

  #ifdef OS_solaris

  #include <sys/procfs.h>

  #endif
@@ -35,6 +38,14 @@ 

  #include <sys/pstat.h>

  #endif

  

+ #include <sys/param.h>

+ 

+ /* This warns if we have less than 128M avail */

+ #define SPAL_WARN_MIN_BYTES 134217728

+ 

+ #define CG2_HEADER_FORMAT "0::"

+ #define CG2_HEADER_LEN strlen(CG2_HEADER_FORMAT)

+ 

  static int_fast32_t

  _spal_rlimit_get(int resource, uint64_t *soft_limit, uint64_t *hard_limit)

  {
@@ -101,6 +112,48 @@ 

      return retval;

  }

  

+ static int_fast32_t

+ _spal_dir_exist(char *path)

+ {

+     DIR* dir = opendir(path);

+     if (dir) {

+         closedir(dir);

+         return 1;

+     }

+     return 0;

+ }

+ 

+ static char *

+ _spal_cgroupv2_path() {

+     FILE *f;

+     char s[256] = {0};

+     char *res = NULL;

+     /* We discover our path by looking at /proc/self/cgroup */

+     f = fopen("/proc/self/cgroup", "r");

+     if (!f) {

+         int errsrv = errno;

+         slapi_log_err(SLAPI_LOG_ERR, "_spal_get_uint64_t_file", "Unable to open file \"/proc/self/cgroup\". errno=%d\n", errsrv);

+         return NULL;

+     }

+ 

+     if (feof(f) == 0) {

+         if (fgets(s, MAXPATHLEN, f) != NULL) {

+             /* we now have a path in s, and the last byte must be NULL */

+             if ((strlen(s) >= CG2_HEADER_LEN) && strncmp(s, CG2_HEADER_FORMAT, CG2_HEADER_LEN) == 0) {

+                 res = slapi_ch_calloc(1, MAXPATHLEN + 17);

+                 snprintf(res, MAXPATHLEN + 16, "/sys/fs/cgroup%s", s + CG2_HEADER_LEN);

+                 /* This always has a new line, so replace it if possible. */

+                 size_t nl = strlen(res) - 1;

+                 res[nl] = '\0';

+             }

+         }

+     }

+     /* Will return something like /sys/fs/cgroup/system.slice/system-dirsrv.slice/dirsrv@standalone1.service */

+ 

+     fclose(f);

+     return res;

+ }

+ 

  

  slapi_pal_meminfo *

  spal_meminfo_get()
@@ -172,34 +225,73 @@ 

      uint64_t cg_mem_usage = 0;

      uint64_t cg_mem_soft_avail = 0;

  

-     char *f_cg_soft = "/sys/fs/cgroup/memory/memory.soft_limit_in_bytes";

-     char *f_cg_hard = "/sys/fs/cgroup/memory/memory.limit_in_bytes";

-     char *f_cg_usage = "/sys/fs/cgroup/memory/memory.usage_in_bytes";

+     /* We have cgroup v1, so attempt to use it. */

+     if (_spal_dir_exist("/sys/fs/cgroup/memory")) {

+         char *f_cg_soft = "/sys/fs/cgroup/memory/memory.soft_limit_in_bytes";

+         char *f_cg_hard = "/sys/fs/cgroup/memory/memory.limit_in_bytes";

+         char *f_cg_usage = "/sys/fs/cgroup/memory/memory.usage_in_bytes";

+         slapi_log_err(SLAPI_LOG_INFO, "spal_meminfo_get", "Found cgroup v1\n");

  

-     if (_spal_uint64_t_file_get(f_cg_soft, NULL, &cg_mem_soft)) {

-         slapi_log_err(SLAPI_LOG_WARNING, "spal_meminfo_get", "Unable to retrieve %s. There may be no cgroup support on this platform\n", f_cg_soft);

-     }

+         if (_spal_uint64_t_file_get(f_cg_soft, NULL, &cg_mem_soft)) {

+             slapi_log_err(SLAPI_LOG_WARNING, "spal_meminfo_get", "Unable to retrieve %s. There may be no cgroup support on this platform\n", f_cg_soft);

+         }

  

-     if (_spal_uint64_t_file_get(f_cg_hard, NULL, &cg_mem_hard)) {

-         slapi_log_err(SLAPI_LOG_WARNING, "spal_meminfo_get", "Unable to retrieve %s. There may be no cgroup support on this platform\n", f_cg_hard);

-     }

+         if (_spal_uint64_t_file_get(f_cg_hard, NULL, &cg_mem_hard)) {

+             slapi_log_err(SLAPI_LOG_WARNING, "spal_meminfo_get", "Unable to retrieve %s. There may be no cgroup support on this platform\n", f_cg_hard);

+         }

  

-     if (_spal_uint64_t_file_get(f_cg_usage, NULL, &cg_mem_usage)) {

-         slapi_log_err(SLAPI_LOG_WARNING, "spal_meminfo_get", "Unable to retrieve %s. There may be no cgroup support on this platform\n", f_cg_hard);

+         if (_spal_uint64_t_file_get(f_cg_usage, NULL, &cg_mem_usage)) {

+             slapi_log_err(SLAPI_LOG_WARNING, "spal_meminfo_get", "Unable to retrieve %s. There may be no cgroup support on this platform\n", f_cg_usage);

+         }

+ 

+     } else {

+         /* We might have cgroup v2. Attempt to get the controller path ... */

+         char *ctrlpath = _spal_cgroupv2_path();

+         if (ctrlpath != NULL) {

+ 

+             char s[MAXPATHLEN + 33] = {0};

+             slapi_log_err(SLAPI_LOG_INFO, "spal_meminfo_get", "Found cgroup v2 -> %s\n", ctrlpath);

+             /* There are now three files we care about - memory.current, memory.high and memory.max */

+             /* For simplicity we re-use soft and hard */

+             /* If _spal_uint64_t_file_get() hit's "max" then these remain at 0 */

+             snprintf(s, MAXPATHLEN + 32, "%s/memory.current", ctrlpath);

+             if (_spal_uint64_t_file_get(s, NULL, &cg_mem_usage)) {

+                 slapi_log_err(SLAPI_LOG_WARNING, "spal_meminfo_get", "Unable to retrieve %s. There may be no cgroup support on this platform\n", s);

+             }

+ 

+             snprintf(s, MAXPATHLEN + 32, "%s/memory.high", ctrlpath);

+             if (_spal_uint64_t_file_get(s, NULL, &cg_mem_soft)) {

+                 slapi_log_err(SLAPI_LOG_WARNING, "spal_meminfo_get", "Unable to retrieve %s. There may be no cgroup support on this platform\n", s);

+             }

+ 

+             snprintf(s, MAXPATHLEN + 32, "%s/memory.max", ctrlpath);

+             if (_spal_uint64_t_file_get(s, NULL, &cg_mem_hard)) {

+                 slapi_log_err(SLAPI_LOG_WARNING, "spal_meminfo_get", "Unable to retrieve %s. There may be no cgroup support on this platform\n", s);

+             }

+ 

+             slapi_ch_free_string(&ctrlpath);

+         } else {

+             slapi_log_err(SLAPI_LOG_WARNING, "spal_meminfo_get", "cgroups v1 or v2 unable to be read - may not be on this platform ...\n");

+         }

      }

  

      /*

       * In some conditions, like docker, we only have a *hard* limit set.

       * This obviously breaks our logic, so we need to make sure we correct this

       */

- 

-     if (cg_mem_hard != 0 && cg_mem_soft != 0 && cg_mem_hard < cg_mem_soft) {

-         /* Right, we only have a hard limit. Impose a 10% watermark. */

-         cg_mem_soft = cg_mem_hard * 0.9;

+     if ((cg_mem_hard != 0 && cg_mem_soft == 0) || (cg_mem_hard < cg_mem_soft)) {

+         /* Right, we only have a hard limit. Impose a 20% watermark. */

+         cg_mem_soft = cg_mem_hard * 0.8;

      }

  

-     if (cg_mem_soft != 0 && cg_mem_usage != 0 && cg_mem_soft > cg_mem_usage) {

-         cg_mem_soft_avail = cg_mem_soft - cg_mem_usage;

+     if (cg_mem_usage != 0 && (cg_mem_soft != 0 || cg_mem_hard != 0)) {

+         if (cg_mem_soft > cg_mem_usage) {

+             cg_mem_soft_avail = cg_mem_soft - cg_mem_usage;

+         } else if (cg_mem_hard > cg_mem_usage) {

+             cg_mem_soft_avail = cg_mem_hard - cg_mem_usage;

+         } else {

+             slapi_log_err(SLAPI_LOG_CRIT, "spal_meminfo_get", "Your cgroup memory usage exceeds your hard limit?");

+         }

      }

  

  
@@ -253,6 +345,13 @@ 

          return NULL;

      }

  

+     if (mi->system_available_bytes < SPAL_WARN_MIN_BYTES) {

+         slapi_log_err(SLAPI_LOG_CRIT, "spal_meminfo_get", "Your system is reporting %" PRIu64" bytes available, which is less than the minimum recommended %" PRIu64 " bytes\n",

+             mi->system_available_bytes, SPAL_WARN_MIN_BYTES);

+         slapi_log_err(SLAPI_LOG_CRIT, "spal_meminfo_get", "This indicates heavy memory pressure or incorrect system resource allocation\n");

+         slapi_log_err(SLAPI_LOG_CRIT, "spal_meminfo_get", "Directory Server *may* crash as a result!!!\n");

+     }

+ 

      slapi_log_err(SLAPI_LOG_TRACE, "spal_meminfo_get", "{pagesize_bytes = %" PRIu64 ", system_total_pages = %" PRIu64 ", system_total_bytes = %" PRIu64 ", process_consumed_pages = %" PRIu64 ", process_consumed_bytes = %" PRIu64 ", system_available_pages = %" PRIu64 ", system_available_bytes = %" PRIu64 "},\n",

                    mi->pagesize_bytes, mi->system_total_pages, mi->system_total_bytes, mi->process_consumed_pages, mi->process_consumed_bytes, mi->system_available_pages, mi->system_available_bytes);

  

Work in progress (on pagure so I can send to a test machine)

Bug Description: fedora 31 changes to cgroup v2 and I expect suse
to do the same soon. We should support this natively as part of
the memory limit detection.

Fix Description: Add support for cgroup v2

https://pagure.io/389-ds-base/issue/50618

Author: William Brown william@blackhats.net.au

Review by: ???

1 new commit added

  • Fix
2 years ago

1 new commit added

  • Fix
2 years ago

1 new commit added

  • Fix
2 years ago

1 new commit added

  • Fix
2 years ago

rebased onto adf0e7d217fcf2592fb0aa86ac1dcff7016c1597

2 years ago

255 looks good but any reason why not using MAXPATHLEN ?

Could define HEADER_FORMAT="0::" and strlen

Adding a comment that on cgroupv2 system it will return something like '/sys/fs/cgroup/system.slice/system-dirsrv.slice/dirsrv@standalone1.service'

Not expert at all on vgroup. I noticed in my tests that memory.max and memory.high were 'max' (not uint)

In case cg_mem* are unchanged because it is neither cgroup V1/V2. Is there fallback values ?

Is this possible ? If current vsz overpass the process limit, I would expect it exits at the next malloc.

Not expert at all on vgroup. I noticed in my tests that memory.max and memory.high were 'max' (not uint)

If these are max, then the _spal_uint64_t_file_get() call to sscanf will fail, causing dest to be left as 0, indicating no limit.

In case cg_mem* are unchanged because it is neither cgroup V1/V2. Is there fallback values ?

Yes, it remains at 0, so we bypass those checks.

Is this possible ? If current vsz overpass the process limit, I would expect it exits at the next malloc.

This is saying "if the softlimit has room given our current usage". So if the soft or hardlimit is say 1GB and we are using 256mb, then only 768mb is available, rather than the full gb. So these are actually checks to make sure that we don't underflow to a huge value of memory available, or over allocate based on the current cgroup usage.

So the logic is correct, and the final } else { here catches when usage > soft/hard limits, to give a warning about the condition.

2 new commits added

  • thierry feedback
  • thierry feedback
2 years ago

CHanges to your other comments have been made :) thanks for the thorough review mate

"..now two files.." -> "..now three files.."

Something like this would be stronger check we are in CGv2
if ((strcmp(s, CG2_HEADER_FORMAT, CG2_HEADER_LEN) && (strlen(s) >= CG2_HEADER_LEN))...

Not expert at all on vgroup. I noticed in my tests that memory.max and memory.high were 'max' (not uint)

If these are max, then the _spal_uint64_t_file_get() call to sscanf will fail, causing dest to be left as 0, indicating no limit.

In case cg_mem* are unchanged because it is neither cgroup V1/V2. Is there fallback values ?

Yes, it remains at 0, so we bypass those checks.

Is this possible ? If current vsz overpass the process limit, I would expect it exits at the next malloc.

This is saying "if the softlimit has room given our current usage". So if the soft or hardlimit is say 1GB and we are using 256mb, then only 768mb is available, rather than the full gb. So these are actually checks to make sure that we don't underflow to a huge value of memory available, or over allocate based on the current cgroup usage.
So the logic is correct, and the final } else { here catches when usage > soft/hard limits, to give a warning about the condition.

I agree the logic is correct. My remark was a question, how is it possible to have memory.usage > memory.soft_limit. I would expect some alloc failure if usage>soft_limit

If max/soft limit are 'max', then cg_mem_soft = cg_mem_hard = 0.
So we will get this wrong alarming message in error log.

@firstyear, beside those cosmetic comments the patch looks valid to me. You have my ACK

Something like this would be stronger check we are in CGv2
if ((strcmp(s, CG2_HEADER_FORMAT, CG2_HEADER_LEN) && (strlen(s) >= CG2_HEADER_LEN))...

True, that would be a better check, because I don't know if that header ever changes, it seems like the whole format is a bit undocumented :(

If max/soft limit are 'max', then cg_mem_soft = cg_mem_hard = 0.
So we will get this wrong alarming message in error log.

Yep, corrected by checking that soft or hard are non zero now.

I'll ask you to review again to be 100% sure given I changed a bit of logic in a few places @tbordaz

rebased onto 3702f858d2530918ed9ff301f691670db5787908

2 years ago

Thanks @firstyear , the patch looks good to me. Ack

rebased onto 0e6a04a

2 years ago

Thanks for your very thorough review @tbordaz I really appreciate it!

Pull-Request has been merged by firstyear

2 years ago

389-ds-base is moving from Pagure to Github. This means that new issues and pull requests
will be accepted only in 389-ds-base's github repository.

This pull request has been cloned to Github as issue and is available here:
- https://github.com/389ds/389-ds-base/issues/3938

If you want to continue to work on the PR, please navigate to the github issue,
download the patch from the attachments and file a new pull request.

Thank you for understanding. We apologize for all inconvenience.

Pull-Request has been closed by spichugi

2 years ago
Metadata