This is an automated email from the git hooks/post-receive script.
teigland pushed a commit to branch master
in repository sanlock.
commit b04d58c76001f3b78be147324f976daef14a7e0b
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Wed Jan 10 16:46:35 2024 -0600
wdmd: fix timing for iTCO_wdt
iTCO_wdt does not fire until two successive timeouts, so the
values for set/get need to be adjusted by a factor of 2 to
make the watchdog fire at the correct time.
---
wdmd/main.c | 425 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++----
1 file changed, 400 insertions(+), 25 deletions(-)
diff --git a/wdmd/main.c b/wdmd/main.c
index 3c60b4e..bbb4356 100644
--- a/wdmd/main.c
+++ b/wdmd/main.c
@@ -31,6 +31,7 @@
#include <sys/un.h>
#include <sys/stat.h>
#include <sys/mman.h>
+#include <sys/time.h>
#include <sys/signalfd.h>
#include <linux/watchdog.h>
@@ -55,6 +56,7 @@
#define DEFAULT_SOCKET_MODE (S_IRUSR|S_IWUSR|S_IRGRP|S_IWGRP)
#define WDPATH_SIZE 64
+#define WD_ID_SIZE 64
static int standard_test_interval = DEFAULT_TEST_INTERVAL;
static int test_interval= DEFAULT_TEST_INTERVAL;
@@ -72,6 +74,7 @@ static char lockfile_path[PATH_MAX];
static int test_loop_enable;
static int dev_fd = -1;
static int shm_fd;
+static int itco; /* watchdog_identity is "iTCO_wdt" */
static int allow_scripts;
static int kill_script_sec;
@@ -79,6 +82,7 @@ static const char *scripts_dir = "/etc/wdmd.d";
static char watchdog_path[WDPATH_SIZE];
static char option_path[WDPATH_SIZE];
static char saved_path[WDPATH_SIZE];
+static char watchdog_identity[WD_ID_SIZE];
struct script_status {
uint64_t start;
@@ -115,11 +119,10 @@ static int client_size = 0;
static struct client *client = NULL;
static struct pollfd *pollfd = NULL;
-
#define log_debug(fmt, args...) \
do { \
if (daemon_debug) \
- fprintf(stderr, "%llu " fmt "\n", (unsigned long long)time(NULL),
##args); \
+ fprintf(stderr, "%llu %s " fmt "\n", (unsigned long
long)time(NULL), time_str(), ##args); \
} while (0)
#define log_error(fmt, args...) \
@@ -146,6 +149,19 @@ static uint64_t monotime(void)
return ts.tv_sec;
}
+char time_str_buf[128];
+
+static char *time_str(void)
+{
+ struct timeval cur_time;
+ struct tm time_info;
+
+ gettimeofday(&cur_time, NULL);
+ localtime_r(&cur_time.tv_sec, &time_info);
+ strftime(time_str_buf, sizeof(time_str_buf), "%Y-%m-%d %H:%M:%S ",
&time_info);
+ return time_str_buf;
+}
+
/*
* test clients
*/
@@ -282,12 +298,13 @@ static void dump_debug(int fd)
now = monotime();
memset(line, 0, sizeof(line));
- snprintf(line, 255, "wdmd %d socket_gid %d high_priority %d now %llu last_keepalive
%llu last_closeunclean %llu allow_scripts %d kill_script_sec %d fire_timeout %d\n",
+ snprintf(line, 255, "wdmd %d socket_gid %d high_priority %d now %llu last_keepalive
%llu last_closeunclean %llu allow_scripts %d kill_script_sec %d fire_timeout %d identity
\"%s\"\n",
getpid(), socket_gid, high_priority,
(unsigned long long)now,
(unsigned long long)last_keepalive,
(unsigned long long)last_closeunclean,
- allow_scripts, kill_script_sec, fire_timeout);
+ allow_scripts, kill_script_sec, fire_timeout,
+ watchdog_identity);
line_len = strlen(line);
strncat(debug_buf, line, LINE_SIZE);
@@ -416,11 +433,93 @@ static void pet_watchdog(void)
log_debug("keepalive %d", rv);
}
+static int _open_watchdog_itco(struct wdmd_header *h)
+{
+ int get_timeout_itco, get_timeout_real, set_timeout_itco, set_timeout_real;
+ int rv;
+
+ /* Don't check dev_fd for -1 because dev_fd will be closed
+ and set to -1 prior to timeout in close_watchdog_unclean(). */
+
+ if (test_loop_enable)
+ return 0;
+
+ if (!h->fire_timeout)
+ return -1;
+
+ rv = open_dev();
+ if (rv < 0)
+ return -1;
+
+ get_timeout_real = 0;
+ get_timeout_itco = 0;
+
+ rv = ioctl(dev_fd, WDIOC_GETTIMEOUT, &get_timeout_itco);
+ if (rv < 0) {
+ log_error("open_watchdog gettimeout error %d", errno);
+ close_watchdog();
+ return -1;
+ }
+
+ get_timeout_real = get_timeout_itco * 2;
+
+ if (get_timeout_real == h->fire_timeout) {
+ /* success, requested value matches the default value */
+ fire_timeout = get_timeout_real;
+ _init_test_interval();
+ log_error("%s open with timeout %d", watchdog_path, get_timeout_real);
+ pet_watchdog();
+ test_loop_enable = 1;
+ return 0;
+ }
+
+ set_timeout_real = h->fire_timeout;
+ set_timeout_itco = set_timeout_real / 2;
+
+ rv = ioctl(dev_fd, WDIOC_SETTIMEOUT, &set_timeout_itco);
+ if (rv < 0) {
+ log_error("open_watchdog settimeout %d error %d", set_timeout_real, errno);
+ close_watchdog();
+ return -1;
+ }
+
+ get_timeout_real = 0;
+ get_timeout_itco = 0;
+
+ rv = ioctl(dev_fd, WDIOC_GETTIMEOUT, &get_timeout_itco);
+ if (rv < 0) {
+ log_error("open_watchdog gettimeout check error %d", errno);
+ close_watchdog();
+ return -1;
+ }
+
+ get_timeout_real = get_timeout_itco * 2;
+
+ if (get_timeout_real == set_timeout_real) {
+ /* success setting a custom timeout */
+ fire_timeout = get_timeout_real;
+ _init_test_interval();
+ log_error("%s open with timeout %d", watchdog_path, get_timeout_real);
+ pet_watchdog();
+ test_loop_enable = 1;
+ return 0;
+ }
+
+ /* failed to set a custom timeout */
+ log_error("open_watchdog gettimeout value real %d itco %d expect real %d",
+ get_timeout_real, get_timeout_itco, set_timeout_real);
+ close_watchdog();
+ return -1;
+}
+
static int _open_watchdog(struct wdmd_header *h)
{
int get_timeout, set_timeout;
int rv;
+ if (itco)
+ return _open_watchdog_itco(h);
+
/* Don't check dev_fd for -1 because dev_fd will be closed
and set to -1 prior to timeout in close_watchdog_unclean(). */
@@ -1112,6 +1211,49 @@ static int test_scripts(void)
return fail_count;
}
+static int setup_identity(char *wdpath)
+{
+ char sysfs_path[PATH_MAX] = { 0 };
+ char *base, *p;
+ int fd, rv;
+
+ /*
+ * This function will be called multiple times when probing
+ * different watchdog paths for one that works.
+ */
+ itco = 0;
+ memset(watchdog_identity, 0, sizeof(watchdog_identity));
+
+ /*
+ * $ cat /sys/class/watchdog/watchdog0/identity
+ * iTCO_wdt
+ */
+ if (!(base = basename(wdpath)))
+ return -1;
+
+ snprintf(sysfs_path, PATH_MAX-1, "/sys/class/watchdog/%s/identity", base);
+
+ if ((fd = open(sysfs_path, O_RDONLY)) < 0)
+ return -1;
+
+ rv = read(fd, watchdog_identity, WD_ID_SIZE-1);
+
+ close(fd);
+
+ if (rv <= 0)
+ return -1;
+
+ if ((p = strchr(watchdog_identity, '\n')))
+ *p = '\0';
+
+ log_debug("%s %s %s", wdpath, sysfs_path, watchdog_identity);
+
+ if (!strcmp(watchdog_identity, "iTCO_wdt"))
+ itco = 1;
+
+ return 0;
+}
+
static int _setup_watchdog(char *path)
{
struct stat buf;
@@ -1148,6 +1290,7 @@ static int _setup_watchdog(char *path)
}
/*
+ * Success: returns 0 with watchdog_path set.
* Order of preference:
* . saved path (path used before daemon restart)
* . command line option (-w)
@@ -1208,11 +1351,133 @@ static int setup_watchdog(void)
}
+/*
+ * iTCO_wdt actual firing timeout is double the value used in get/set!
+ * https://bugzilla.kernel.org/show_bug.cgi?id=213809
+ */
+static int _try_timeout_itco(const char *path)
+{
+ struct stat buf;
+ int try_timeout_real, try_timeout_itco, get_timeout_real, get_timeout_itco,
set_timeout_real, set_timeout_itco;
+ int unused, fd, err, rv, rv2;
+
+ rv = stat(path, &buf);
+ if (rv < 0) {
+ fprintf(stderr, "%s stat error %d\n", path, errno);
+ return -1;
+ }
+
+ fd = open(path, O_WRONLY | O_CLOEXEC);
+ if (fd < 0) {
+ fprintf(stderr, "%s open error %d\n", path, errno);
+ return fd;
+ }
+
+ printf("%s %s open fd %d\n", time_str(), path, fd);
+
+ get_timeout_real = 0;
+ get_timeout_itco = 0;
+
+ rv = ioctl(fd, WDIOC_GETTIMEOUT, &get_timeout_itco);
+ if (rv < 0) {
+ fprintf(stderr, "%s gettimeout error %d\n", path, errno);
+ rv = -1;
+ goto out;
+ }
+
+ get_timeout_real = get_timeout_itco * 2;
+
+ printf("%s %s gettimeout real %d itco %d\n", time_str(), path,
get_timeout_real, get_timeout_itco);
+
+ if (get_timeout_real == try_timeout)
+ goto keepalive;
+
+ try_timeout_real = try_timeout;
+ try_timeout_itco = try_timeout_real / 2;
+ set_timeout_real = try_timeout;
+ set_timeout_itco = set_timeout_real / 2;
+
+ rv = ioctl(fd, WDIOC_SETTIMEOUT, &set_timeout_itco);
+ if (rv < 0) {
+ fprintf(stderr, "%s settimeout real %d itco %d error %d\n", path,
set_timeout_real, set_timeout_itco, errno);
+ rv = -1;
+ goto out;
+ }
+
+ set_timeout_real = set_timeout_itco * 2;
+
+ printf("%s %s settimeout real %d itco %d result real %d itco %d\n",
time_str(), path,
+ try_timeout_real, try_timeout_itco, set_timeout_real, set_timeout_itco);
+
+ if (set_timeout_itco != try_timeout_itco) {
+ fprintf(stderr, "%s settimeout real %d itco %d failed\n", path,
try_timeout_real, try_timeout_itco);
+ rv = -1;
+ goto out;
+ }
+
+ get_timeout_real = 0;
+ get_timeout_itco = 0;
+
+ rv = ioctl(fd, WDIOC_GETTIMEOUT, &get_timeout_itco);
+ if (rv < 0) {
+ fprintf(stderr, "%s gettimeout error %d\n", path, errno);
+ rv = -1;
+ goto out;
+ }
+
+ get_timeout_real = get_timeout_itco * 2;
+
+ printf("%s %s gettimeout real %d itco %d\n", time_str(), path,
get_timeout_real, get_timeout_itco);
+
+ keepalive:
+
+ rv = ioctl(fd, WDIOC_KEEPALIVE, &unused);
+ if (rv < 0) {
+ fprintf(stderr, "%s keepalive error %d\n", path, errno);
+ rv = -1;
+ goto out;
+ }
+
+ printf("%s %s keepalive fd %d result %d\n", time_str(), path, fd, rv);
+
+ if (forcefire) {
+ int sleep_sec = 0;
+ int i;
+ setbuf(stdout, NULL);
+ printf("%s waiting for watchdog to reset machine:\n", time_str());
+ for (i = 1; i < get_timeout_real + 5; i++) {
+ sleep(1);
+ sleep_sec++;
+ if (sleep_sec >= get_timeout_real+1)
+ printf("%s %d %s failed to fire after timeout %d seconds\n", time_str(), i,
path, get_timeout_real);
+ else
+ printf("%s %d\n", time_str(), i);
+ }
+ }
+
+ rv = 0;
+ out:
+ err = write(fd, "V", 1);
+ if (err < 0) {
+ fprintf(stderr, "trytimeout failed to disarm %s error %d %d\n", path, err,
errno);
+ openlog("wdmd", LOG_CONS | LOG_PID, LOG_DAEMON);
+ syslog(LOG_ERR, "trytimeout failed to disarm %s error %d %d\n", path, err,
errno);
+ }
+
+ printf("%s %s disarm write V fd %d result %d\n", time_str(), path, fd, rv);
+
+ rv2 = close(fd);
+
+ printf("%s %s close fd %d result %d\n", time_str(), path, fd, rv2);
+
+ return rv;
+}
+
static int _try_timeout(const char *path)
{
struct stat buf;
int get_timeout, set_timeout;
- int unused, fd, err, rv;
+ int unused, fd, err, rv, rv2;
rv = stat(path, &buf);
if (rv < 0) {
@@ -1226,6 +1491,8 @@ static int _try_timeout(const char *path)
return fd;
}
+ printf("%s %s open fd %d\n", time_str(), path, fd);
+
get_timeout = 0;
rv = ioctl(fd, WDIOC_GETTIMEOUT, &get_timeout);
@@ -1235,7 +1502,10 @@ static int _try_timeout(const char *path)
goto out;
}
- printf("%s gettimeout %d\n", path, get_timeout);
+ printf("%s %s gettimeout %d\n", time_str(), path, get_timeout);
+
+ if (get_timeout == try_timeout)
+ goto keepalive;
set_timeout = try_timeout;
@@ -1246,7 +1516,7 @@ static int _try_timeout(const char *path)
goto out;
}
- printf("%s settimeout %d result %d\n", path, try_timeout, set_timeout);
+ printf("%s %s settimeout %d result %d\n", time_str(), path, try_timeout,
set_timeout);
if (set_timeout != try_timeout) {
fprintf(stderr, "%s settimeout %d failed\n", path, try_timeout);
@@ -1263,7 +1533,9 @@ static int _try_timeout(const char *path)
goto out;
}
- printf("%s gettimeout %d\n", path, get_timeout);
+ printf("%s %s gettimeout %d\n", time_str(), path, get_timeout);
+
+ keepalive:
rv = ioctl(fd, WDIOC_KEEPALIVE, &unused);
if (rv < 0) {
@@ -1272,22 +1544,20 @@ static int _try_timeout(const char *path)
goto out;
}
+ printf("%s %s keepalive fd %d result %d\n", time_str(), path, fd, rv);
+
if (forcefire) {
int sleep_sec = 0;
int i;
setbuf(stdout, NULL);
- printf("waiting for watchdog to reset machine:\n");
+ printf("%s waiting for watchdog to reset machine:\n", time_str());
for (i = 1; i < get_timeout + 5; i++) {
sleep(1);
sleep_sec++;
- if (sleep_sec == get_timeout+1) {
- printf("\n");
- printf("%d %s failed to fire after timeout %d seconds\n", i, path,
get_timeout);
- } else if (sleep_sec > get_timeout+1) {
- printf("%d %s failed to fire after timeout %d seconds\n", i, path,
get_timeout);
- } else {
- printf("%d ", i);
- }
+ if (sleep_sec >= get_timeout+1)
+ printf("%s %d %s failed to fire after timeout %d seconds\n", time_str(), i,
path, get_timeout);
+ else
+ printf("%s %d\n", time_str(), i);
}
}
@@ -1300,6 +1570,79 @@ static int _try_timeout(const char *path)
syslog(LOG_ERR, "trytimeout failed to disarm %s error %d %d\n", path, err,
errno);
}
+ printf("%s %s disarm write V fd %d result %d\n", time_str(), path, fd, rv);
+
+ rv2 = close(fd);
+
+ printf("%s %s close fd %d result %d\n", time_str(), path, fd, rv2);
+
+ return rv;
+}
+
+static int _probe_dev_itco(const char *path)
+{
+ struct stat buf;
+ int fd, err, rv, timeout_real, timeout_itco;
+
+ rv = stat(path, &buf);
+ if (rv < 0) {
+ fprintf(stderr, "error %d stat %s\n", errno, path);
+ return -1;
+ }
+
+ fd = open(path, O_WRONLY | O_CLOEXEC);
+ if (fd < 0) {
+ fprintf(stderr, "error %d open %s\n", errno, path);
+ return fd;
+ }
+
+ timeout_real = 0;
+ timeout_itco = 0;
+
+ rv = ioctl(fd, WDIOC_GETTIMEOUT, &timeout_itco);
+ if (rv < 0) {
+ fprintf(stderr, "error %d ioctl gettimeout %s\n", errno, path);
+ rv = -1;
+ goto out;
+ }
+
+ timeout_real = timeout_itco * 2;
+
+ if (timeout_real == fire_timeout) {
+ printf("%s\n", path);
+ rv = 0;
+ goto out;
+ }
+
+ timeout_real = fire_timeout;
+ timeout_itco = timeout_real / 2;
+
+ rv = ioctl(fd, WDIOC_SETTIMEOUT, &timeout_itco);
+ if (rv < 0) {
+ fprintf(stderr, "error %d ioctl settimeout %s\n", errno, path);
+ rv = -1;
+ goto out;
+ }
+
+ timeout_real = timeout_itco * 2;
+
+ if (timeout_real != fire_timeout) {
+ fprintf(stderr, "error %d invalid timeout %s\n", errno, path);
+ rv = -1;
+ goto out;
+ }
+
+ printf("%s\n", path);
+ rv = 0;
+
+ out:
+ err = write(fd, "V", 1);
+ if (err < 0) {
+ fprintf(stderr, "probe failed to disarm %s error %d %d\n", path, err,
errno);
+ openlog("wdmd", LOG_CONS | LOG_PID, LOG_DAEMON);
+ syslog(LOG_ERR, "probe failed to disarm %s error %d %d\n", path, err,
errno);
+ }
+
close(fd);
return rv;
}
@@ -1366,14 +1709,37 @@ static int _probe_dev(const char *path)
return rv;
}
-static int probe_dev(const char *path)
+static int probe_dev(const char *wdpath)
{
- if (try_timeout)
- return _try_timeout(path);
- else
- return _probe_dev(path);
+ char *path = (char *)wdpath;
+
+ setup_identity(path); /* sets itco=1 if iTCO_wdt */
+
+ if (try_timeout) {
+ /*
+ * Used to test support for a given timeout with: wdmd -t <secs>
+ * or to test firing for a given timeout with: wdmd -F -t <secs>
+ */
+ if (itco)
+ return _try_timeout_itco(path);
+ else
+ return _try_timeout(path);
+ } else {
+ /*
+ * Used to print on stdout just the path of the watchdog device
+ * that wdmd would use with: wdmd -p
+ */
+ if (itco)
+ return _probe_dev_itco(path);
+ else
+ return _probe_dev(path);
+ }
}
+/*
+ * Confusingly, this is the top level function for both
+ * wdmd -t (test timeout) and wdmd -p (print functional watchdog device).
+ */
static int probe_watchdog(void)
{
int rv;
@@ -1861,6 +2227,11 @@ int main(int argc, char *argv[])
}
}
+ if (forcefire && !do_probe) {
+ fprintf(stderr, "Use force fire (-F) with a timeout (-t).\n");
+ exit(EXIT_FAILURE);
+ }
+
if (do_probe) {
rv = setup_shm();
if (rv < 0) {
@@ -1891,9 +2262,6 @@ int main(int argc, char *argv[])
openlog("wdmd", LOG_CONS | LOG_PID, LOG_DAEMON);
- log_error("wdmd started S%d H%d G%d", allow_scripts, high_priority,
- socket_gid);
-
setup_priority();
rv = lockfile();
@@ -1920,10 +2288,17 @@ int main(int argc, char *argv[])
if (rv < 0)
goto out_files;
+ /* Sets watchdog_path */
rv = setup_watchdog();
if (rv < 0)
goto out_clients;
+ /* Sets watchdog_identity and itco */
+ setup_identity(watchdog_path);
+
+ log_error("wdmd started S%d H%d G%d using %s \"%s\"", allow_scripts,
high_priority,
+ socket_gid, watchdog_path, watchdog_identity[0] ? watchdog_identity :
"unknown");
+
rv = test_loop();
close_watchdog();
--
To stop receiving notification emails like this one, please contact
the administrator of this repository.