#51073 Ticket 51072 - improve autotune defaults
Closed 3 years ago by spichugi. Opened 3 years ago by firstyear.
firstyear/389-ds-base 51072-improve-autotuning into master

@@ -1398,7 +1398,7 @@
      {CONFIG_DB_DEBUG_CHECKPOINTING, CONFIG_TYPE_ONOFF, "off", &bdb_config_db_debug_checkpointing_get, &bdb_config_db_debug_checkpointing_set, 0},
      {CONFIG_DB_HOME_DIRECTORY, CONFIG_TYPE_STRING, "", &bdb_config_db_home_directory_get, &bdb_config_db_home_directory_set, 0},
      {CONFIG_IMPORT_CACHE_AUTOSIZE, CONFIG_TYPE_INT, "-1", &bdb_config_import_cache_autosize_get, &bdb_config_import_cache_autosize_set, CONFIG_FLAG_ALWAYS_SHOW | CONFIG_FLAG_ALLOW_RUNNING_CHANGE},
-     {CONFIG_CACHE_AUTOSIZE, CONFIG_TYPE_INT, "10", &bdb_config_cache_autosize_get, &bdb_config_cache_autosize_set, CONFIG_FLAG_ALWAYS_SHOW | CONFIG_FLAG_ALLOW_RUNNING_CHANGE},
+     {CONFIG_CACHE_AUTOSIZE, CONFIG_TYPE_INT, "25", &bdb_config_cache_autosize_get, &bdb_config_cache_autosize_set, CONFIG_FLAG_ALWAYS_SHOW | CONFIG_FLAG_ALLOW_RUNNING_CHANGE},
      {CONFIG_CACHE_AUTOSIZE_SPLIT, CONFIG_TYPE_INT, "25", &bdb_config_cache_autosize_split_get, &bdb_config_cache_autosize_split_set, CONFIG_FLAG_ALWAYS_SHOW | CONFIG_FLAG_ALLOW_RUNNING_CHANGE},
      {CONFIG_IMPORT_CACHESIZE, CONFIG_TYPE_UINT64, "16777216", &bdb_config_import_cachesize_get, &bdb_config_import_cachesize_set, CONFIG_FLAG_ALWAYS_SHOW | CONFIG_FLAG_ALLOW_RUNNING_CHANGE},
      {CONFIG_BYPASS_FILTER_TEST, CONFIG_TYPE_STRING, "on", &bdb_config_get_bypass_filter_test, &bdb_config_set_bypass_filter_test, CONFIG_FLAG_ALWAYS_SHOW | CONFIG_FLAG_ALLOW_RUNNING_CHANGE},

@@ -193,7 +193,7 @@
           * sane defaults and populate these values, but it's only on first run.
           */
          msg = "This can be corrected by altering the values of nsslapd-dbcachesize, nsslapd-cachememsize and nsslapd-dncachememsize\n";
-         autosize_percentage = 10;
+         autosize_percentage = 25;
      } else {
          /* In this case we really are setting the values each start up, so
           * change the msg.
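To put the change from 10% to 25% in perspective, here is a rough Python sketch (not the server's C code) of how the percentages could translate into cache budgets, assuming nsslapd-cache-autosize is taken as a percentage of total system RAM and nsslapd-cache-autosize-split is the share of that budget handed to the BDB dbcache, with the remainder left for the entry/DN caches:

```python
# Illustrative only: compare the old 10% and new 25% defaults on a 16 GiB host.
def autosize_budget(total_ram_bytes, autosize_pct, split_pct=25):
    """Return (dbcache_bytes, entry_cache_pool_bytes) for the given percentages."""
    pool = total_ram_bytes * autosize_pct // 100   # nsslapd-cache-autosize
    dbcache = pool * split_pct // 100              # nsslapd-cache-autosize-split
    return dbcache, pool - dbcache                 # remainder for entry/DN caches

GIB = 1024 ** 3
for pct in (10, 25):  # old default vs. new default
    dbcache, entry_pool = autosize_budget(16 * GIB, pct)
    print(f"{pct}%: dbcache {dbcache / GIB:.1f} GiB, entry cache pool {entry_pool / GIB:.1f} GiB")
```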

file modified, +5 -28
@@ -1491,34 +1491,11 @@
      long hw_threads = sysconf(_SC_NPROCESSORS_ONLN);
      long threads = 0;
      slapi_log_err(SLAPI_LOG_TRACE, "util_get_hardware_threads", "Detected %lu hardware threads\n", threads);
-     /*
-      * Now we determine the number to run with based on threads. Initially, for
-      * low processor counts we ramp up quickly, we plateau a little, then, we
-      * at high numbers start to plateau and increase slowly.
-      * Should be
-      * 1 -> 16
-      * 2 -> 16
-      * 4 -> 24
-      * 8 -> 32
-      * 16 -> 48
-      * 32 -> 64
-      * 64 -> 96
-      * 128 -> 192
-      * 256 -> 384
-      * 512 -> 512
-      * 1024 -> 512
-      * 2048 -> 512
-      */
-
-     if (hw_threads >= 0 && hw_threads < 4) {
-         threads = 16;
-     } else if (hw_threads >= 4 && hw_threads < 32) {
-         threads = 16 + (hw_threads * 2);
-     } else if (hw_threads >= 32 && hw_threads < 64) {
-         threads = (hw_threads * 2);
-     } else if (hw_threads >= 64 && hw_threads < 512) {
-         /* Same as *1.5 */
-         threads = (hw_threads * 2) - (hw_threads / 2);
+     if (hw_threads == 0) {
+         /* Error! */
+         threads = -1;
+     } else if (hw_threads < 512) {
+         threads = hw_threads;
      } else {
          /* Cap at 512 for now ... */
          threads = 512;
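Rendered as Python, the new selection above amounts to the following sketch (illustrative only; the server does this in C with sysconf(_SC_NPROCESSORS_ONLN), for which os.cpu_count() is the closest analogue):

```python
import os

def autotune_threads(cap=512):
    """Match nsslapd-threadnumber to the hardware thread count, capped at 512."""
    hw_threads = os.cpu_count()   # logical CPUs, like sysconf(_SC_NPROCESSORS_ONLN)
    if not hw_threads:            # detection failed
        return -1                 # signal the error to the caller
    return min(hw_threads, cap)   # the cap is an upper bound, not a minimum

print(autotune_threads())
```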

@@ -42,6 +42,7 @@
  from lib389.passwd import password_generate
  from lib389.nss_ssl import NssSsl, CERT_NAME
  from lib389.paths import Paths
+ from lib389.config import LDBMConfig
  from lib389._constants import (
      DSRC_CONTAINER,
      CONTAINER_TLS_SERVER_KEY,
@@ -81,10 +82,21 @@
      # TODO: Should we reset cn=Directory Manager from env?
      dm_pass = os.getenv("DS_DM_PASSWORD", None)
      if dm_pass is not None:
+         log.debug("Setting Directory Manager Password ...")
          dm = DirectoryManager(inst)
          dm.change_password(dm_pass)
      # TODO: Should we set replica id from env?
      # TODO: Should we set replication agreements from env?
+     autotune_pct = os.getenv("DS_MEMORY_PERCENTAGE", None)
+     if autotune_pct is not None:
+         try:
+             autotune_pct = int(autotune_pct)
+         except:
+             log.error("Invalid DS_MEMORY_PERCENTAGE - resetting to system default value")
+             autotune_pct = 0
+         log.debug("Setting LDBM Autotune Percentage to: %s", autotune_pct)
+         ldbmconfig = LDBMConfig(inst)
+         ldbmconfig.set("nsslapd-cache-autosize", str(autotune_pct))

      inst.close()
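As a standalone illustration of the DS_MEMORY_PERCENTAGE handling added above (a sketch, not the dscontainer code itself: the helper name is invented and the bare except is narrowed to ValueError):

```python
import logging
import os

log = logging.getLogger(__name__)

def resolve_memory_percentage(environ=os.environ):
    """Return the nsslapd-cache-autosize value to apply, or None to leave it untouched."""
    raw = environ.get("DS_MEMORY_PERCENTAGE")
    if raw is None:
        return None    # env var not set: keep the existing/default autosize
    try:
        pct = int(raw)
    except ValueError:
        log.error("Invalid DS_MEMORY_PERCENTAGE - resetting to system default value")
        pct = 0        # per the log message above, 0 asks the server to use its default
    return pct

# With the instance handle from the diff, a non-None result is then applied via:
#     ldbmconfig = LDBMConfig(inst)
#     ldbmconfig.set("nsslapd-cache-autosize", str(pct))
```

In a container deployment, the operator would export DS_MEMORY_PERCENTAGE (for example, 50) in the container's environment before dscontainer starts.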

  

Bug Description: we have learnt that the CPU autotuning is too aggressive, potentially
decreasing throughput due to overhead in context switching and lock contention, and
that our memory tuning is not aggressive enough, at only 10% of the system memory.
Additionally, in containers we have access to explicit memory limits and reservations,
so we can choose to be even more aggressive in our selection.

Fix Description: Change thread tuning to match the number of hardware threads available
on the system. Change memory tuning to 25% of system memory by default. Finally, add an
environment variable, DS_MEMORY_PERCENTAGE, that allows more aggressive tuning to be set
for containers. Later this could be given a higher default value.

https://pagure.io/389-ds-base/issue/51072

Author: William Brown william@blackhats.net.au

Review by: ???

It's worth noting that, even on basic tests, this was significantly faster on my machine:

before:
35 passed, 3 skipped, 150 warnings in 239.14s (0:03:59)
after:
35 passed, 3 skipped, 150 warnings in 173.03s (0:02:53)

That's roughly a 25% speedup.

Is 'threads' the final nsslapd-threadnumber?
If yes, it looks very high.

The fix looks good, but I have a doubt. The default was 30. Unless workers are very slow or doing very long jobs, I would expect 30 workers to be enough for most loads with rapid operations. By any chance, do you have searchrate/modrate numbers that show #workers > 50 or 100 are beneficial?

@tbordaz it sets the number of threads based on how many CPU hardware threads are presented by the OS. So if you have a 4-core machine, it's 4. If it's 256, it's 256. If you have an 8-core machine with hyperthreads, it would be 16.

The < 512 there is a cap that we don't exceed, not a minimum.
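To make that concrete, a small illustrative comparison of the old formula (the one removed in the diff above) with the new mapping, for the machine sizes discussed here (the helper names are made up):

```python
def old_autotune(hw):
    # Formula removed by this PR (see the diff above).
    if hw < 4:
        return 16
    if hw < 32:
        return 16 + hw * 2
    if hw < 64:
        return hw * 2
    if hw < 512:
        return (hw * 2) - (hw // 2)   # roughly hw * 1.5
    return 512

def new_autotune(hw):
    return min(hw, 512) if hw else -1

for hw in (4, 16, 256):
    print(f"{hw} hardware threads: old default {old_autotune(hw)}, new default {new_autotune(hw)}")
```

On the 4-CPU machine in the benchmark below, the old default therefore works out to 24 worker threads and the new default to 4.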

Another data point: search rate on a 4 CPU machine, 1 client with 1-10 threads:

client threads    4 worker threads    24 worker threads
     1                5837.800             6160.100
     2                8549.308            11939.750
     3                7687.200            17349.333
     4                9962.475             7191.825
     5               16807.538             8386.949
     6               17051.641             9447.744
     7               17788.385             9990.026
     8               18838.359            10417.077
     9               22419.949            10889.000
    10               23950.950            11561.769

@vashirov wow, those numbers are stunning. I think the scaling at the high end (10 client threads) is more important than the low numbers, since we do need to consider high concurrency as a key workload for us.

Yes, it looks good at first sight. But you need to see what happens under a mixed load; I think write operations can easily block 4 threads and delay all binds and searches.

We should not optimize for one specific load pattern.

I think that this may not be true going forward: with lmdb and a concurrent cache design we can only have a single active writer, which means that we could distinguish between read operations and write operations in the thread pool to guarantee that bind/read is always separate.

The flip side of this is that we could also have many, many readers stalling writers, causing them to stall too.

That said, I still agree that @vashirov can do some more of his excellent load testing to check this patch. :)

Thanks for all these runs!

@vashirov for the mixed load, is it a sync operation? Is it accounting MOD+SEARCH as one operation? It looks like the search rate is hidden by the mod rate.

For MODs, at the moment we cannot really conclude that there is a benefit from higher/lower worker counts. For searches, there is still an unexpected, significant negative impact from #workers.


It was a SRCH followed by the MOD:

[12/May/2020:08:23:26.169046247 +0200] conn=1868 op=3715 SRCH base="dc=example,dc=com" scope=2 filter="(uid=user.8306)" attrs="givenName sn mail"
[12/May/2020:08:23:26.169258473 +0200] conn=1868 op=3715 RESULT err=0 tag=101 nentries=1 etime=0.000289852
[12/May/2020:08:23:26.169500575 +0200] conn=1868 op=3716 MOD dn="uid=user.8306,ou=People,dc=example,dc=com"
[12/May/2020:08:23:26.181633127 +0200] conn=1868 op=3716 RESULT err=0 tag=103 nentries=0 etime=0.012312424

I will add another test with async SRCH and MOD.

My analysis of what this shows is that search seems to improve with more threads, but something causes contention that leads to the loss at higher concurrency; lower thread counts mean less contention, which is why the CPU-matched thread count gives better search throughput and latency. It appears that in the mixed workload our writes are heavily impacting the searches, so I think our write path is likely preventing further search performance improvement. Regardless, it didn't make it worse, so I'm of course in favour of this change :)

IMHO LGTM; like William said, it's not hurting the numbers. It's definitely an improvement, and if we need to fine-tune at a later date, so be it.

Actually, there is something I'd like to see tested with this change: a machine with more CPUs/cores.

We tested a 4-core machine and setting the thread number to 4 was great, but what about a 16-core system with varying worker threads? Do we see the same improvement if we set the thread number to 16 vs 32 or 8 or 4?

@vashirov - would it be hard to reserve a system with this hardware and run one more round of tests?

When I did an investigation for another potential customer a few years ago, I also saw that setting the thread number to the number of cores gave the best performance. I think this is definitely an improvement over what we had, ack.

rebased onto 5eacf45e7caa50de2721f85d7fbee58767bcb8f0 (3 years ago)

rebased onto 9a06935 (3 years ago)

Pull-Request has been merged by firstyear (3 years ago)

389-ds-base is moving from Pagure to GitHub. This means that new issues and pull requests
will be accepted only in 389-ds-base's GitHub repository.

This pull request has been cloned to GitHub as an issue and is available here:
- https://github.com/389ds/389-ds-base/issues/4126

If you want to continue to work on the PR, please navigate to the GitHub issue,
download the patch from the attachments and file a new pull request.

Thank you for understanding. We apologize for any inconvenience.

Pull-Request has been closed by spichugi (3 years ago)