From f49110b426222c931e604e63ee38552d549d0eac Mon Sep 17 00:00:00 2001 From: David Teigland Date: May 08 2020 20:45:03 +0000 Subject: sanlock: warn about scheduling delay Two new config file settings: sched_warn_main_ms = sched_warn_renew_sec = If the main loop check interval is delayed more than milliseconds longer than intended, log a warning. If the lockspace thread sleep between renewals is seconds longer than intended, log a warning. --- diff --git a/src/cmd.c b/src/cmd.c index 23522ba..5c91181 100644 --- a/src/cmd.c +++ b/src/cmd.c @@ -2259,6 +2259,8 @@ static int print_state_daemon(char *str) "debug_clients=%d " "debug_cmds=0x%llx " "renewal_history_size=%d " + "sched_warn_main_ms=%u " + "sched_warn_renew_sec=%u " "gid=%d " "uid=%d " "sh_retries=%d " @@ -2285,6 +2287,8 @@ static int print_state_daemon(char *str) com.debug_clients, (unsigned long long)com.debug_cmds, com.renewal_history_size, + com.sched_warn_main_ms, + com.sched_warn_renew_sec, com.gid, com.uid, com.sh_retries, diff --git a/src/lockspace.c b/src/lockspace.c index 41f96cb..dd4875f 100644 --- a/src/lockspace.c +++ b/src/lockspace.c @@ -670,6 +670,8 @@ static void *lockspace_thread(void *arg_in) struct space *sp; struct leader_record leader; uint64_t delta_begin, last_success = 0; + uint64_t wait_interval_begin = 0, wait_interval_want = 0; + uint64_t now; int sector_size = 0; int align_size = 0; int max_hosts = 0; @@ -815,9 +817,24 @@ static void *lockspace_thread(void *arg_in) /* * wait between each renewal + * + * T=10 delta renew begins and ends, wrote leader.timesamp=10, last_success=10 + * T=30 delta renew begins, write leader.timestamp=30 + * T=35 delta renew ends after write being delayed + * renewal_interval = leader.timestamp - last_success (30-10) = 20 + * last_success = leader.timestamp = 30 + * + * now - last_success (35 - 30) = 5 < id_renewal_seconds (20) + * so delay for 15 sec before starting next renewal + * delay = id_renewal_seconds - (now - last_success) */ + now = monotime(); - if (monotime() - last_success < id_renewal_seconds) { + if (now - last_success < id_renewal_seconds) { + if (!wait_interval_begin) { + wait_interval_begin = now; + wait_interval_want = id_renewal_seconds - (now - last_success); + } sleep(1); continue; } else { @@ -826,6 +843,14 @@ static void *lockspace_thread(void *arg_in) usleep(500000); } + /* Warn about scheduling delays */ + if (com.sched_warn_renew_sec && wait_interval_begin && wait_interval_want && + (now - wait_interval_begin > wait_interval_want + com.sched_warn_renew_sec)) { + log_warns(sp, "renew interval is %u sec want %u warn %u", + (uint32_t)(now - wait_interval_begin), (uint32_t)wait_interval_want, com.sched_warn_renew_sec); + } + wait_interval_begin = 0; + wait_interval_want = 0; /* * do a renewal, measuring length of time spent in renewal, diff --git a/src/main.c b/src/main.c index 8c6eef8..4afd4db 100644 --- a/src/main.c +++ b/src/main.c @@ -823,6 +823,10 @@ static int main_loop(void) poll_timeout = check_interval - ms; continue; } + if (ms >= (check_interval + com.sched_warn_main_ms)) { + log_warn("main check interval is %u ms want %u warn %u", + ms, check_interval, com.sched_warn_main_ms); + } last_check = now; check_interval = STANDARD_CHECK_INTERVAL; @@ -2872,6 +2876,14 @@ static void read_config_file(void) } else { log_error("ignore unknown max_sectors_kb %s", str); } + + } else if (!strcmp(str, "sched_warn_main_ms")) { + get_val_int(line, &val); + com.sched_warn_main_ms = val; + + } else if (!strcmp(str, "sched_warn_renew_sec")) { + get_val_int(line, &val); + com.sched_warn_renew_sec = val; } } @@ -3860,6 +3872,8 @@ int main(int argc, char *argv[]) com.renewal_read_extend_sec_set = 0; com.renewal_read_extend_sec = 0; com.renewal_history_size = DEFAULT_RENEWAL_HISTORY_SIZE; + com.sched_warn_main_ms = 2000; + com.sched_warn_renew_sec = 2; com.paxos_debug_all = 0; com.max_sectors_kb_ignore = DEFAULT_MAX_SECTORS_KB_IGNORE; com.max_sectors_kb_align = DEFAULT_MAX_SECTORS_KB_ALIGN; diff --git a/src/sanlock.8 b/src/sanlock.8 index d75211d..0ee8d95 100644 --- a/src/sanlock.8 +++ b/src/sanlock.8 @@ -1365,6 +1365,19 @@ for that command. Special values +all and -all can be used to enable or disable all commands, and can be used before or after other debug_cmd lines. +.IP \[bu] 2 +sched_warn_main_ms = +.br +Log a warning if the intended check interval of the main processing +loop is run this many milliseconds later than intended. Set to 0 +to disable. + +.IP \[bu] 2 +sched_warn_renew_sec = +.br +Log a warning if the intended delay between lockspace renewals +is this many seconds later than intended. Set to 0 to disable. + .SH SEE ALSO .BR wdmd (8) diff --git a/src/sanlock.conf b/src/sanlock.conf index e52da2b..5d47ad1 100644 --- a/src/sanlock.conf +++ b/src/sanlock.conf @@ -63,3 +63,9 @@ # debug_cmd = - # ... # command line: n/a +# +# sched_warn_main_ms = +# command line: n/a +# +# sched_warn_renew_sec = +# command line: n/a diff --git a/src/sanlock_internal.h b/src/sanlock_internal.h index cd4d23a..b103209 100644 --- a/src/sanlock_internal.h +++ b/src/sanlock_internal.h @@ -385,6 +385,8 @@ struct command_line { int renewal_history_size; int renewal_read_extend_sec_set; /* 1 if renewal_read_extend_sec is configured */ uint32_t renewal_read_extend_sec; + uint32_t sched_warn_main_ms; + uint32_t sched_warn_renew_sec; char our_host_name[SANLK_NAME_LEN+1]; char *file_path; char *dump_path;