From 148e37e0f0d71abd4d0060959a1e8a9323eb173d Mon Sep 17 00:00:00 2001 From: David Teigland Date: Aug 08 2012 21:57:48 +0000 Subject: wdmd: close device when test fails Instead of just not petting the device after a test fails, close the device. Because the close generates a ping, we want to get it done early, otherwise if wdmd exited (e.g. crash or sigkill) just before the device was ready to fire, the close generated by the kernel extends the life of the machine by an extra 60 sec. This means we need to re-open the device if we want to resume petting it. So, depending on whether the tests happen just prior to the expiry or just after the expiry, the watchdog will fire between 60 and 70 seconds after the expiry time. It would be 70 seconds if: we do the check just before the expiration, the client expires, 10 seconds (TEST_INTERVAL) later, we see the expiration, close the device, which generates a ping, which causes the firing to be 60 seconds after the close, which is already 10 seconds after the expiration. It would be 60 seconds if: we do the check just after the expiration, we see the expiration, close the device, which generates a ping, which causes the firing to be 60 seconds after the close, which is just after the expiration time. Previously, the assumption was that the host would be reset between 50 and 60 seconds from the expiration time, but this did not account for the fact that the daemon could exit just before the host reset, which would lead the kernel to generate a new ping. If we can patch the kernel so that a device close does not generate a ping, then we do not need to close the device when a test fails, but we can simply not pet the device, as we've been doing. 
Signed-off-by: David Teigland --- diff --git a/wdmd/main.c b/wdmd/main.c index 5ed2cd6..eafbf03 100644 --- a/wdmd/main.c +++ b/wdmd/main.c @@ -58,7 +58,7 @@ static int daemon_debug; static int socket_gid; static time_t last_keepalive; static char lockfile_path[PATH_MAX]; -static int dev_fd; +static int dev_fd = -1; static int shm_fd; struct script_status { @@ -657,10 +657,46 @@ static int test_scripts(void) { return 0; } #endif /* TEST_SCRIPTS */ +static int open_dev(void) +{ + int fd; + + if (dev_fd != -1) { + log_error("/dev/watchdog already open fd %d", dev_fd); + return -1; + } + + fd = open("/dev/watchdog", O_WRONLY | O_CLOEXEC); + if (fd < 0) { + log_error("no /dev/watchdog, load a watchdog driver"); + return fd; + } + + dev_fd = fd; + return 0; +} + +static void close_watchdog_unclean(void) +{ + if (dev_fd == -1) { + log_debug("close_watchdog_unclean already closed"); + return; + } + + log_error("/dev/watchdog closed unclean"); + close(dev_fd); + dev_fd = -1; +} + static void close_watchdog(void) { int rv; + if (dev_fd == -1) { + log_error("close_watchdog already closed"); + return; + } + rv = write(dev_fd, "V", 1); if (rv < 0) log_error("/dev/watchdog disarm write error %d", errno); @@ -668,17 +704,16 @@ static void close_watchdog(void) log_error("/dev/watchdog disarmed"); close(dev_fd); + dev_fd = -1; } static int setup_watchdog(void) { int rv, timeout; - dev_fd = open("/dev/watchdog", O_WRONLY | O_CLOEXEC); - if (dev_fd < 0) { - log_error("no /dev/watchdog, load a watchdog driver"); - return dev_fd; - } + rv = open_dev(); + if (rv < 0) + return -1; timeout = 0; @@ -844,8 +879,20 @@ static int test_loop(void) fail_count += test_scripts(); fail_count += test_clients(); - if (!fail_count) - pet_watchdog(); + if (!fail_count) { + if (dev_fd == -1) { + log_error("/dev/watchdog reopen"); + open_dev(); + } else { + pet_watchdog(); + } + } else { + /* If we can patch the kernel so that close + does not generate a ping, then we can skip + this close, and just not 
pet the device in + this case. */ + close_watchdog_unclean(); + } } sleep_seconds = test_time + test_interval - monotime();