From b04d58c76001f3b78be147324f976daef14a7e0b Mon Sep 17 00:00:00 2001 From: David Teigland Date: Jan 10 2024 22:46:35 +0000 Subject: wdmd: fix timing for iTCO_wdt iTCO_wdt does not fire until two successive timeouts, so the values for set/get need to be adjusted by a factor of 2 to make the watchdog fire at the correct time. --- diff --git a/wdmd/main.c b/wdmd/main.c index 3c60b4e..bbb4356 100644 --- a/wdmd/main.c +++ b/wdmd/main.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -55,6 +56,7 @@ #define DEFAULT_SOCKET_MODE (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP) #define WDPATH_SIZE 64 +#define WD_ID_SIZE 64 static int standard_test_interval = DEFAULT_TEST_INTERVAL; static int test_interval= DEFAULT_TEST_INTERVAL; @@ -72,6 +74,7 @@ static char lockfile_path[PATH_MAX]; static int test_loop_enable; static int dev_fd = -1; static int shm_fd; +static int itco; /* watchdog_identity is "iTCO_wdt" */ static int allow_scripts; static int kill_script_sec; @@ -79,6 +82,7 @@ static const char *scripts_dir = "/etc/wdmd.d"; static char watchdog_path[WDPATH_SIZE]; static char option_path[WDPATH_SIZE]; static char saved_path[WDPATH_SIZE]; +static char watchdog_identity[WD_ID_SIZE]; struct script_status { uint64_t start; @@ -115,11 +119,10 @@ static int client_size = 0; static struct client *client = NULL; static struct pollfd *pollfd = NULL; - #define log_debug(fmt, args...) \ do { \ if (daemon_debug) \ - fprintf(stderr, "%llu " fmt "\n", (unsigned long long)time(NULL), ##args); \ + fprintf(stderr, "%llu %s " fmt "\n", (unsigned long long)time(NULL), time_str(), ##args); \ } while (0) #define log_error(fmt, args...) \ @@ -146,6 +149,19 @@ static uint64_t monotime(void) return ts.tv_sec; } +char time_str_buf[128]; + +static char *time_str(void) +{ + struct timeval cur_time; + struct tm time_info; + + gettimeofday(&cur_time, NULL); + localtime_r(&cur_time.tv_sec, &time_info); + strftime(time_str_buf, sizeof(time_str_buf), "%Y-%m-%d %H:%M:%S ", &time_info); + return time_str_buf; +} + /* * test clients */ @@ -282,12 +298,13 @@ static void dump_debug(int fd) now = monotime(); memset(line, 0, sizeof(line)); - snprintf(line, 255, "wdmd %d socket_gid %d high_priority %d now %llu last_keepalive %llu last_closeunclean %llu allow_scripts %d kill_script_sec %d fire_timeout %d\n", + snprintf(line, 255, "wdmd %d socket_gid %d high_priority %d now %llu last_keepalive %llu last_closeunclean %llu allow_scripts %d kill_script_sec %d fire_timeout %d identity \"%s\"\n", getpid(), socket_gid, high_priority, (unsigned long long)now, (unsigned long long)last_keepalive, (unsigned long long)last_closeunclean, - allow_scripts, kill_script_sec, fire_timeout); + allow_scripts, kill_script_sec, fire_timeout, + watchdog_identity); line_len = strlen(line); strncat(debug_buf, line, LINE_SIZE); @@ -416,11 +433,93 @@ static void pet_watchdog(void) log_debug("keepalive %d", rv); } +static int _open_watchdog_itco(struct wdmd_header *h) +{ + int get_timeout_itco, get_timeout_real, set_timeout_itco, set_timeout_real; + int rv; + + /* Don't check dev_fd for -1 because dev_fd will be closed + and set to -1 prior to timeout in close_watchdog_unclean(). */ + + if (test_loop_enable) + return 0; + + if (!h->fire_timeout) + return -1; + + rv = open_dev(); + if (rv < 0) + return -1; + + get_timeout_real = 0; + get_timeout_itco = 0; + + rv = ioctl(dev_fd, WDIOC_GETTIMEOUT, &get_timeout_itco); + if (rv < 0) { + log_error("open_watchdog gettimeout error %d", errno); + close_watchdog(); + return -1; + } + + get_timeout_real = get_timeout_itco * 2; + + if (get_timeout_real == h->fire_timeout) { + /* success, requested value matches the default value */ + fire_timeout = get_timeout_real; + _init_test_interval(); + log_error("%s open with timeout %d", watchdog_path, get_timeout_real); + pet_watchdog(); + test_loop_enable = 1; + return 0; + } + + set_timeout_real = h->fire_timeout; + set_timeout_itco = set_timeout_real / 2; + + rv = ioctl(dev_fd, WDIOC_SETTIMEOUT, &set_timeout_itco); + if (rv < 0) { + log_error("open_watchdog settimeout %d error %d", set_timeout_real, errno); + close_watchdog(); + return -1; + } + + get_timeout_real = 0; + get_timeout_itco = 0; + + rv = ioctl(dev_fd, WDIOC_GETTIMEOUT, &get_timeout_itco); + if (rv < 0) { + log_error("open_watchdog gettimeout check error %d", errno); + close_watchdog(); + return -1; + } + + get_timeout_real = get_timeout_itco * 2; + + if (get_timeout_real == set_timeout_real) { + /* success setting a custom timeout */ + fire_timeout = get_timeout_real; + _init_test_interval(); + log_error("%s open with timeout %d", watchdog_path, get_timeout_real); + pet_watchdog(); + test_loop_enable = 1; + return 0; + } + + /* failed to set a custom timeout */ + log_error("open_watchdog gettimeout value real %d itco %d expect real %d", + get_timeout_real, get_timeout_itco, set_timeout_real); + close_watchdog(); + return -1; +} + static int _open_watchdog(struct wdmd_header *h) { int get_timeout, set_timeout; int rv; + if (itco) + return _open_watchdog_itco(h); + /* Don't check dev_fd for -1 because dev_fd will be closed and set to -1 prior to timeout in close_watchdog_unclean(). */ @@ -1112,6 +1211,49 @@ static int test_scripts(void) return fail_count; } +static int setup_identity(char *wdpath) +{ + char sysfs_path[PATH_MAX] = { 0 }; + char *base, *p; + int fd, rv; + + /* + * This function will be called multiple times when probing + * different watchdog paths for one that works. + */ + itco = 0; + memset(watchdog_identity, 0, sizeof(watchdog_identity)); + + /* + * $ cat /sys/class/watchdog/watchdog0/identity + * iTCO_wdt + */ + if (!(base = basename(wdpath))) + return -1; + + snprintf(sysfs_path, PATH_MAX-1, "/sys/class/watchdog/%s/identity", base); + + if ((fd = open(sysfs_path, O_RDONLY)) < 0) + return -1; + + rv = read(fd, watchdog_identity, WD_ID_SIZE-1); + + close(fd); + + if (rv <= 0) + return -1; + + if ((p = strchr(watchdog_identity, '\n'))) + *p = '\0'; + + log_debug("%s %s %s", wdpath, sysfs_path, watchdog_identity); + + if (!strcmp(watchdog_identity, "iTCO_wdt")) + itco = 1; + + return 0; +} + static int _setup_watchdog(char *path) { struct stat buf; @@ -1148,6 +1290,7 @@ static int _setup_watchdog(char *path) } /* + * Success: returns 0 with watchdog_path set. * Order of preference: * . saved path (path used before daemon restart) * . command line option (-w) @@ -1208,11 +1351,133 @@ static int setup_watchdog(void) } +/* + * iTCO_wdt actual firing timeout is double the value used in get/set! + * https://bugzilla.kernel.org/show_bug.cgi?id=213809 + */ +static int _try_timeout_itco(const char *path) +{ + struct stat buf; + int try_timeout_real, try_timeout_itco, get_timeout_real, get_timeout_itco, set_timeout_real, set_timeout_itco; + int unused, fd, err, rv, rv2; + + rv = stat(path, &buf); + if (rv < 0) { + fprintf(stderr, "%s stat error %d\n", path, errno); + return -1; + } + + fd = open(path, O_WRONLY | O_CLOEXEC); + if (fd < 0) { + fprintf(stderr, "%s open error %d\n", path, errno); + return fd; + } + + printf("%s %s open fd %d\n", time_str(), path, fd); + + get_timeout_real = 0; + get_timeout_itco = 0; + + rv = ioctl(fd, WDIOC_GETTIMEOUT, &get_timeout_itco); + if (rv < 0) { + fprintf(stderr, "%s gettimeout error %d\n", path, errno); + rv = -1; + goto out; + } + + get_timeout_real = get_timeout_itco * 2; + + printf("%s %s gettimeout real %d itco %d\n", time_str(), path, get_timeout_real, get_timeout_itco); + + if (get_timeout_real == try_timeout) + goto keepalive; + + try_timeout_real = try_timeout; + try_timeout_itco = try_timeout_real / 2; + set_timeout_real = try_timeout; + set_timeout_itco = set_timeout_real / 2; + + rv = ioctl(fd, WDIOC_SETTIMEOUT, &set_timeout_itco); + if (rv < 0) { + fprintf(stderr, "%s settimeout real %d itco %d error %d\n", path, set_timeout_real, set_timeout_itco, errno); + rv = -1; + goto out; + } + + set_timeout_real = set_timeout_itco * 2; + + printf("%s %s settimeout real %d itco %d result real %d itco %d\n", time_str(), path, + try_timeout_real, try_timeout_itco, set_timeout_real, set_timeout_itco); + + if (set_timeout_itco != try_timeout_itco) { + fprintf(stderr, "%s settimeout real %d itco %d failed\n", path, try_timeout_real, try_timeout_itco); + rv = -1; + goto out; + } + + get_timeout_real = 0; + get_timeout_itco = 0; + + rv = ioctl(fd, WDIOC_GETTIMEOUT, &get_timeout_itco); + if (rv < 0) { + fprintf(stderr, "%s gettimeout error %d\n", path, errno); + rv = -1; + goto out; + } + + get_timeout_real = get_timeout_itco * 2; + + printf("%s %s gettimeout real %d itco %d\n", time_str(), path, get_timeout_real, get_timeout_itco); + + keepalive: + + rv = ioctl(fd, WDIOC_KEEPALIVE, &unused); + if (rv < 0) { + fprintf(stderr, "%s keepalive error %d\n", path, errno); + rv = -1; + goto out; + } + + printf("%s %s keepalive fd %d result %d\n", time_str(), path, fd, rv); + + if (forcefire) { + int sleep_sec = 0; + int i; + setbuf(stdout, NULL); + printf("%s waiting for watchdog to reset machine:\n", time_str()); + for (i = 1; i < get_timeout_real + 5; i++) { + sleep(1); + sleep_sec++; + if (sleep_sec >= get_timeout_real+1) + printf("%s %d %s failed to fire after timeout %d seconds\n", time_str(), i, path, get_timeout_real); + else + printf("%s %d\n", time_str(), i); + } + } + + rv = 0; + out: + err = write(fd, "V", 1); + if (err < 0) { + fprintf(stderr, "trytimeout failed to disarm %s error %d %d\n", path, err, errno); + openlog("wdmd", LOG_CONS | LOG_PID, LOG_DAEMON); + syslog(LOG_ERR, "trytimeout failed to disarm %s error %d %d\n", path, err, errno); + } + + printf("%s %s disarm write V fd %d result %d\n", time_str(), path, fd, rv); + + rv2 = close(fd); + + printf("%s %s close fd %d result %d\n", time_str(), path, fd, rv2); + + return rv; +} + static int _try_timeout(const char *path) { struct stat buf; int get_timeout, set_timeout; - int unused, fd, err, rv; + int unused, fd, err, rv, rv2; rv = stat(path, &buf); if (rv < 0) { @@ -1226,6 +1491,8 @@ static int _try_timeout(const char *path) return fd; } + printf("%s %s open fd %d\n", time_str(), path, fd); + get_timeout = 0; rv = ioctl(fd, WDIOC_GETTIMEOUT, &get_timeout); @@ -1235,7 +1502,10 @@ static int _try_timeout(const char *path) goto out; } - printf("%s gettimeout %d\n", path, get_timeout); + printf("%s %s gettimeout %d\n", time_str(), path, get_timeout); + + if (get_timeout == try_timeout) + goto keepalive; set_timeout = try_timeout; @@ -1246,7 +1516,7 @@ static int _try_timeout(const char *path) goto out; } - printf("%s settimeout %d result %d\n", path, try_timeout, set_timeout); + printf("%s %s settimeout %d result %d\n", time_str(), path, try_timeout, set_timeout); if (set_timeout != try_timeout) { fprintf(stderr, "%s settimeout %d failed\n", path, try_timeout); @@ -1263,7 +1533,9 @@ static int _try_timeout(const char *path) goto out; } - printf("%s gettimeout %d\n", path, get_timeout); + printf("%s %s gettimeout %d\n", time_str(), path, get_timeout); + + keepalive: rv = ioctl(fd, WDIOC_KEEPALIVE, &unused); if (rv < 0) { @@ -1272,22 +1544,20 @@ static int _try_timeout(const char *path) goto out; } + printf("%s %s keepalive fd %d result %d\n", time_str(), path, fd, rv); + if (forcefire) { int sleep_sec = 0; int i; setbuf(stdout, NULL); - printf("waiting for watchdog to reset machine:\n"); + printf("%s waiting for watchdog to reset machine:\n", time_str()); for (i = 1; i < get_timeout + 5; i++) { sleep(1); sleep_sec++; - if (sleep_sec == get_timeout+1) { - printf("\n"); - printf("%d %s failed to fire after timeout %d seconds\n", i, path, get_timeout); - } else if (sleep_sec > get_timeout+1) { - printf("%d %s failed to fire after timeout %d seconds\n", i, path, get_timeout); - } else { - printf("%d ", i); - } + if (sleep_sec >= get_timeout+1) + printf("%s %d %s failed to fire after timeout %d seconds\n", time_str(), i, path, get_timeout); + else + printf("%s %d\n", time_str(), i); } } @@ -1300,6 +1570,79 @@ static int _try_timeout(const char *path) syslog(LOG_ERR, "trytimeout failed to disarm %s error %d %d\n", path, err, errno); } + printf("%s %s disarm write V fd %d result %d\n", time_str(), path, fd, rv); + + rv2 = close(fd); + + printf("%s %s close fd %d result %d\n", time_str(), path, fd, rv2); + + return rv; +} + +static int _probe_dev_itco(const char *path) +{ + struct stat buf; + int fd, err, rv, timeout_real, timeout_itco; + + rv = stat(path, &buf); + if (rv < 0) { + fprintf(stderr, "error %d stat %s\n", errno, path); + return -1; + } + + fd = open(path, O_WRONLY | O_CLOEXEC); + if (fd < 0) { + fprintf(stderr, "error %d open %s\n", errno, path); + return fd; + } + + timeout_real = 0; + timeout_itco = 0; + + rv = ioctl(fd, WDIOC_GETTIMEOUT, &timeout_itco); + if (rv < 0) { + fprintf(stderr, "error %d ioctl gettimeout %s\n", errno, path); + rv = -1; + goto out; + } + + timeout_real = timeout_itco * 2; + + if (timeout_real == fire_timeout) { + printf("%s\n", path); + rv = 0; + goto out; + } + + timeout_real = fire_timeout; + timeout_itco = timeout_real / 2; + + rv = ioctl(fd, WDIOC_SETTIMEOUT, &timeout_itco); + if (rv < 0) { + fprintf(stderr, "error %d ioctl settimeout %s\n", errno, path); + rv = -1; + goto out; + } + + timeout_real = timeout_itco * 2; + + if (timeout_real != fire_timeout) { + fprintf(stderr, "error %d invalid timeout %s\n", errno, path); + rv = -1; + goto out; + } + + printf("%s\n", path); + rv = 0; + + out: + err = write(fd, "V", 1); + if (err < 0) { + fprintf(stderr, "probe failed to disarm %s error %d %d\n", path, err, errno); + openlog("wdmd", LOG_CONS | LOG_PID, LOG_DAEMON); + syslog(LOG_ERR, "probe failed to disarm %s error %d %d\n", path, err, errno); + } + close(fd); return rv; } @@ -1366,14 +1709,37 @@ static int _probe_dev(const char *path) return rv; } -static int probe_dev(const char *path) +static int probe_dev(const char *wdpath) { - if (try_timeout) - return _try_timeout(path); - else - return _probe_dev(path); + char *path = (char *)wdpath; + + setup_identity(path); /* sets itco=1 if iTCO_wdt */ + + if (try_timeout) { + /* + * Used to test support for a given timeout with: wdmd -t + * or to test firing for a given timeout with: wdmd -F -t + */ + if (itco) + return _try_timeout_itco(path); + else + return _try_timeout(path); + } else { + /* + * Used to print on stdout just the path of the watchdog device + * that wdmd would use with: wdmd -p + */ + if (itco) + return _probe_dev_itco(path); + else + return _probe_dev(path); + } } +/* + * Confusingly, this is the top level function for both + * wdmd -t (test timeout) and wdmd -p (print functional watchdog device). + */ static int probe_watchdog(void) { int rv; @@ -1861,6 +2227,11 @@ int main(int argc, char *argv[]) } } + if (forcefire && !do_probe) { + fprintf(stderr, "Use force fire (-F) with a timeout (-t).\n"); + exit(EXIT_FAILURE); + } + if (do_probe) { rv = setup_shm(); if (rv < 0) { @@ -1891,9 +2262,6 @@ int main(int argc, char *argv[]) openlog("wdmd", LOG_CONS | LOG_PID, LOG_DAEMON); - log_error("wdmd started S%d H%d G%d", allow_scripts, high_priority, - socket_gid); - setup_priority(); rv = lockfile(); @@ -1920,10 +2288,17 @@ int main(int argc, char *argv[]) if (rv < 0) goto out_files; + /* Sets watchdog_path */ rv = setup_watchdog(); if (rv < 0) goto out_clients; + /* Sets watchdog_identity and itco */ + setup_identity(watchdog_path); + + log_error("wdmd started S%d H%d G%d using %s \"%s\"", allow_scripts, high_priority, + socket_gid, watchdog_path, watchdog_identity[0] ? watchdog_identity : "unknown"); + rv = test_loop(); close_watchdog();