This is an automated email from the git hooks/post-receive script.
teigland pushed a commit to branch master
in repository sanlock.
The following commit(s) were added to refs/heads/master by this push:
new 1d8cda2 sanlock: fix time calculations for dead host
1d8cda2 is described below
commit 1d8cda209d6431dbec7a60192e5cc50e24eeb0b8
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Fri Jul 26 14:08:48 2024 -0500
sanlock: fix time calculations for dead host
In two places, incorrect time values were used in the
dead host calculation:
- When checking if a shared lease holder was dead.
- When reporting host state flags for info queries.
In both cases, the current local time was compared to
the local time of the last succesful renewal, with the
difference compared to other_host_dead_seconds.
Using the current local time is incorrect, because it
doesn't necessarily correspond to the time when the
other lease was checked, e.g. if local renewals were
delayed. It must compare the local times of two
consecutive, successful reads of the other lease.
If the other lease is unchanged in this interval,
then the difference can be correctly compared to
other_host_dead_seconds to determine a host is dead.
(The host dead calculation in paxos_lease_acquire,
the most important check, was correct.)
New debug messages are added for other host_id lease
renewals, which can be controlled with debug_hosts
in sanlock.conf.
---
src/cmd.c | 2 +
src/lockspace.c | 154 +++++++++++++++++++++++++++++++++---------
src/main.c | 13 ++++
src/resource.c | 180 ++++++++++++++++++++++++++++++++++++++++---------
src/sanlock.8 | 9 +++
src/sanlock.conf | 3 +
src/sanlock_internal.h | 3 +
7 files changed, 301 insertions(+), 63 deletions(-)
diff --git a/src/cmd.c b/src/cmd.c
index dda9ab8..561e8d5 100644
--- a/src/cmd.c
+++ b/src/cmd.c
@@ -2291,6 +2291,7 @@ static int print_state_daemon(char *str)
"mlock_level=%d "
"quiet_fail=%d "
"debug_renew=%d "
+ "debug_hosts=%d "
"debug_clients=%d "
"debug_cmds=0x%llx "
"renewal_history_size=%d "
@@ -2321,6 +2322,7 @@ static int print_state_daemon(char *str)
com.mlock_level,
com.quiet_fail,
com.debug_renew,
+ com.debug_hosts,
com.debug_clients,
(unsigned long long)com.debug_cmds,
com.renewal_history_size,
diff --git a/src/lockspace.c b/src/lockspace.c
index a984754..ef3336e 100644
--- a/src/lockspace.c
+++ b/src/lockspace.c
@@ -384,6 +384,10 @@ void check_other_leases(struct space *sp, char *buf)
struct sanlk_host_event he;
char *bitmap;
uint64_t now;
+ uint64_t last;
+ uint32_t other_io_timeout;
+ int other_host_fail_seconds;
+ int other_host_dead_seconds;
int i, new;
now = monotime();
@@ -401,6 +405,9 @@ void check_other_leases(struct space *sp, char *buf)
leader_record_in(leader_end, &leader_in);
leader = &leader_in;
+ if (!hs->owner_id && !leader->owner_id)
+ continue;
+
/*
* If this lease has invalid fields, log an error. Limit the logging
* frequency to avoid blowing up logs if some lease is left in a bad
@@ -450,14 +457,95 @@ void check_other_leases(struct space *sp, char *buf)
(unsigned long long)leader->timestamp,
leader->resource_name);
memcpy(hs->owner_name, leader->resource_name, NAME_ID_SIZE);
+
+ /* log warning for each generation of a host_id */
+ hs->warned_fail = 0;
+ hs->warned_dead = 0;
}
+ /*
+ * Other host's timestamp has not changed since we last checked it.
+ */
if (hs->owner_id == leader->owner_id &&
hs->owner_generation == leader->owner_generation &&
hs->timestamp == leader->timestamp) {
+
+ /*
+ * No host state to update since the lease is unchanged.
+ * If host check debugging is enabled, we may want to
+ * log info about another host lease fail/dead state.
+ */
+ if (!com.debug_hosts)
+ continue;
+
+ /*
+ * Our own renewal info is logged elsewhere, this
+ * is about other hosts lease renewals.
+ */
+ if (sp->host_id == hs->owner_id)
+ continue;
+
+ /*
+ * Don't bother logging info about other host leases if
+ * we are quitting/failing ourself.
+ */
+ if (sp->renew_fail || sp->space_dead || sp->killing_pids)
+ continue;
+
+ other_io_timeout = hs->io_timeout;
+ other_host_fail_seconds = calc_id_renewal_fail_seconds(other_io_timeout);
+ other_host_dead_seconds = calc_host_dead_seconds(other_io_timeout);
+ last = hs->last_live ? hs->last_live : hs->first_check;
+
+ /*
+ * Informational/debug only, these fail/dead
+ * determinations are not used anywhere.
+ */
+
+ if (!hs->warned_fail &&
+ (com.debug_hosts == 2) &&
+ (hs->last_check - last > other_host_fail_seconds)) {
+ log_space(sp, "host %llu %llu %llu is failing",
+ (unsigned long long)hs->owner_id,
+ (unsigned long long)hs->owner_generation,
+ (unsigned long long)hs->timestamp);
+ hs->warned_fail = 1;
+ }
+
+ if (!hs->warned_dead &&
+ (hs->last_check - last > other_host_dead_seconds)) {
+ log_warns(sp, "host %llu %llu %llu is dead",
+ (unsigned long long)hs->owner_id,
+ (unsigned long long)hs->owner_generation,
+ (unsigned long long)hs->timestamp);
+ hs->warned_dead = 1;
+ }
+
continue;
}
+ /*
+ * Log debug messages for each renewal seen from another host.
+ */
+ if ((com.debug_hosts == 2) && hs->owner_id) { /* hs->owner_id: skip first time set */
+ if ((hs->owner_id != leader->owner_id) || (hs->owner_generation != leader->owner_generation)) {
+ log_space(sp, "host %llu %llu %llu updated to %llu %llu %llu",
+ (unsigned long long)hs->owner_id,
+ (unsigned long long)hs->owner_generation,
+ (unsigned long long)hs->timestamp,
+ (unsigned long long)leader->owner_id,
+ (unsigned long long)leader->owner_generation,
+ (unsigned long long)leader->timestamp);
+ } else if (hs->timestamp != leader->timestamp) {
+ /* Normal renewal, or release if new timestamp is 0. */
+ log_space(sp, "host %llu %llu %llu new timestamp %llu",
+ (unsigned long long)hs->owner_id,
+ (unsigned long long)hs->owner_generation,
+ (unsigned long long)hs->timestamp,
+ (unsigned long long)leader->timestamp);
+ }
+ }
+
/*
* Replacing good values with potentially bad values
* would have no purpose, and would confuse things, so
@@ -1494,52 +1582,54 @@ int get_lockspaces(char *buf, int *len, int *count, int maxlen)
* After 80 seconds, we'd return FAIL. After 140 seconds we'd return DEAD.
*/
-/* Also see host_live() */
-
static uint32_t get_host_flag(struct space *sp, struct host_status *hs)
{
- uint64_t now, last;
- uint32_t flags;
uint32_t other_io_timeout;
int other_host_fail_seconds, other_host_dead_seconds;
- now = monotime();
- other_io_timeout = hs->io_timeout;
- other_host_fail_seconds = calc_id_renewal_fail_seconds(other_io_timeout);
- other_host_dead_seconds = calc_host_dead_seconds(other_io_timeout);
+ /* we know we are alive */
+ if (sp->host_id == hs->owner_id)
+ return SANLK_HOST_LIVE;
- flags = 0;
+ /* See sh_holder_live() for same/similar checks */
- if (!hs->timestamp) {
- flags = SANLK_HOST_FREE;
- goto out;
- }
+ if (!hs->first_check)
+ return SANLK_HOST_UNKNOWN;
- if (!hs->last_live)
- last = hs->first_check;
- else
- last = hs->last_live;
+ if (!hs->last_check)
+ return SANLK_HOST_UNKNOWN;
- if (sp->host_id == hs->owner_id) {
- /* we are alive */
- flags = SANLK_HOST_LIVE;
+ if (!hs->timestamp)
+ return SANLK_HOST_FREE;
- } else if ((now - last <= other_host_fail_seconds) &&
- (hs->first_check == hs->last_live)) {
- /* we haven't seen the timestamp change yet */
- flags = SANLK_HOST_UNKNOWN;
+ if (hs->first_check == hs->last_check)
+ return SANLK_HOST_UNKNOWN;
- } else if (now - last <= other_host_fail_seconds) {
- flags = SANLK_HOST_LIVE;
+ if (hs->last_check == hs->last_live)
+ return SANLK_HOST_LIVE;
- } else if (now - last > other_host_dead_seconds) {
- flags = SANLK_HOST_DEAD;
+ other_io_timeout = hs->io_timeout;
+ other_host_fail_seconds = calc_id_renewal_fail_seconds(other_io_timeout);
+ other_host_dead_seconds = calc_host_dead_seconds(other_io_timeout);
- } else if (now - last > other_host_fail_seconds) {
- flags = SANLK_HOST_FAIL;
+ if (!hs->last_live) {
+ if (hs->last_check - hs->first_check <= other_host_fail_seconds)
+ return SANLK_HOST_UNKNOWN;
+ if (hs->last_check - hs->first_check > other_host_dead_seconds)
+ return SANLK_HOST_DEAD;
+ if (hs->last_check - hs->first_check > other_host_fail_seconds)
+ return SANLK_HOST_FAIL;
+ /* shouldn't be possible */
+ return SANLK_HOST_UNKNOWN;
+ } else {
+ if (hs->last_check - hs->last_live > other_host_dead_seconds)
+ return SANLK_HOST_DEAD;
+ if (hs->last_check - hs->last_live > other_host_fail_seconds)
+ return SANLK_HOST_FAIL;
+ /* last_check == last_live is the common case, covered above */
+ /* last_check > last_live, but not yet fail_seconds, is live */
+ return SANLK_HOST_LIVE;
}
-out:
- return flags;
}
int get_hosts(struct sanlk_lockspace *ls, char *buf, int *len, int *count, int maxlen)
diff --git a/src/main.c b/src/main.c
index 124848b..f9ebe96 100644
--- a/src/main.c
+++ b/src/main.c
@@ -886,6 +886,14 @@ static int main_loop(void)
if (check_buf)
memset(check_buf, 0, check_buf_len);
+ /*
+ * check_our_lease will return check_all=1 if
+ * all ls leases have been successfully read
+ * since it was last called. They are copied
+ * into check_buf for check_other_leases.
+ * So, check_other_leases will be called once
+ * after each successful read of other leases.
+ */
check_all = 0;
rv = check_our_lease(sp, &check_all, check_buf);
@@ -2912,6 +2920,10 @@ static void read_config_file(void)
get_val_int(line, &val);
com.debug_renew = val;
+ } else if (!strcmp(str, "debug_hosts")) {
+ get_val_int(line, &val);
+ com.debug_hosts = val;
+
} else if (!strcmp(str, "use_aio")) {
get_val_int(line, &val);
com.aio_arg = val;
@@ -4085,6 +4097,7 @@ int main(int argc, char *argv[])
com.max_sectors_kb_align = DEFAULT_MAX_SECTORS_KB_ALIGN;
com.max_sectors_kb_num = DEFAULT_MAX_SECTORS_KB_NUM;
com.debug_cmds = ~0LL;
+ com.debug_hosts = 1;
/* By default disable cmds that often cause too much logging. */
clear_cmd_debug(SM_CMD_INQ_LOCKSPACE);
diff --git a/src/resource.c b/src/resource.c
index f5b2f98..388e737 100644
--- a/src/resource.c
+++ b/src/resource.c
@@ -328,70 +328,188 @@ int read_resource_owners(struct task *task, struct token *token,
return rv;
}
-/* return 1 (is alive) to force a failure if we don't have enough
+/* return 1 (other host is alive) to force a failure if we don't have enough
knowledge to know it's really not alive. Later we could have this sit and
wait (like paxos_lease_acquire) until we have waited long enough or have
enough knowledge to say it's safely dead (unless of course we find it is
alive while waiting) */
-static int host_live(char *lockspace_name, uint32_t space_id, uint64_t host_id, uint64_t gen)
+/* check_other_leases is called once each time other leases are successfully
+ read, and sets the host_info fields retrieved here. */
+
+static int sh_holder_live(char *lockspace_name, uint32_t space_id, uint64_t host_id, uint64_t mb_gen)
{
struct host_status hs;
- uint64_t now;
int other_io_timeout, other_host_dead_seconds;
int rv;
+ /*
+ * host_info() returns what our own renewal thread has observed about
+ * other host_id leases in the lockspace. Our renewal thread could be
+ * delayed renewing our lease, or get io errors reading the lockspace
+ * leases, so we need to be cautious when making judgements about other
+ * host liveness based on this data.
+ *
+ * hs.first_check
+ * Local time when we first looked at the host_id lease.
+ * Will only be zero for a short period before our first call
+ * to check_other_leases().
+ * While zero, just assume the other host is alive.
+ *
+ * hs.last_check
+ * Local time that we last looked at the host_id lease.
+ * Will only be zero for a short period before our first call
+ * to check_other_leases()
+ * While zero, just assume the other host is alive.
+ *
+ * hs.last_live
+ * Local time that we last looked at the host_id lease, and
+ * saw an updated timestamp (vs the prev check.)
+ * Will be zero until three steps occur:
+ * 1. we renew our lease, and read other lease, read timestamp=X
+ * 2. other host updates its lease, writes timestamp=Y
+ * 3. we renew our lease, and read other lease, read timestamp=Y
+ * because X and Y differ, set last_live to local time.
+ * If a new timestamp is never seen, last_live will remain zero,
+ * in which case the difference between first_check and last_check
+ * can be used to determine the host is dead.
+ *
+ * The local time difference between now and last_live cannot be used
+ * to determine that another host is dead, because this time difference
+ * can become long if we fail to read other leases, i.e. the other
+ * lease may have a newer timestamp and we just haven't read it.
+ *
+ * If we read another host's lease, record the local time we read it
+ * (time 'A'), then reread that same lease later (time 'B'). If the
+ * lease is unchanged (no updated timestamp) between the two reads,
+ * then the time difference between them (B-A) can be compared to
+ * other_host_dead_seconds to determine if that host is dead.
+ * It doesn't matter when our own times A and B occur, or if they
+ * are extended because of our own failure to renew our lease.
+ */
+
rv = host_info(lockspace_name, host_id, &hs);
if (rv) {
- log_sid(space_id, "host_live %llu %llu yes host_info %d",
- (unsigned long long)host_id, (unsigned long long)gen, rv);
+ log_sid(space_id, "sh_holder %llu mb_gen %llu LIVE - host_info error %d",
+ (unsigned long long)host_id, (unsigned long long)mb_gen, rv);
+ return 1;
+ }
+
+ if (!hs.first_check) {
+ log_sid(space_id, "sh_holder %llu mb_gen %llu LIVE - host_info not yet checked",
+ (unsigned long long)host_id, (unsigned long long)mb_gen);
return 1;
}
if (!hs.last_check) {
- log_sid(space_id, "host_live %llu %llu yes unchecked",
- (unsigned long long)host_id, (unsigned long long)gen);
+ /* not possible */
+ log_sid(space_id, "sh_holder %llu %llu %llu mb_gen %llu LIVE - zero last_check should not happen",
+ (unsigned long long)host_id,
+ (unsigned long long)hs.owner_generation,
+ (unsigned long long)hs.timestamp,
+ (unsigned long long)mb_gen);
return 1;
}
/* the host_id lease is free, not being used */
if (!hs.timestamp) {
- log_sid(space_id, "host_live %llu %llu no lease free",
- (unsigned long long)host_id, (unsigned long long)gen);
+ log_sid(space_id, "sh_holder %llu %llu %llu mb_gen %llu DEAD - lease free",
+ (unsigned long long)host_id,
+ (unsigned long long)hs.owner_generation,
+ (unsigned long long)hs.timestamp,
+ (unsigned long long)mb_gen);
return 0;
}
- if (hs.owner_generation > gen) {
- log_sid(space_id, "host_live %llu %llu no old gen %llu",
- (unsigned long long)host_id, (unsigned long long)gen,
- (unsigned long long)hs.owner_generation);
+ if (hs.owner_generation > mb_gen) {
+ log_sid(space_id, "sh_holder %llu %llu %llu mb_gen %llu DEAD - old mb_gen",
+ (unsigned long long)host_id,
+ (unsigned long long)hs.owner_generation,
+ (unsigned long long)hs.timestamp,
+ (unsigned long long)mb_gen);
return 0;
}
- now = monotime();
+ if (hs.first_check == hs.last_check) {
+ /* not enough checks to know, assume live */
+ log_sid(space_id, "sh_holder %llu %llu %llu mb_gen %llu LIVE - first_check and last_check %llu",
+ (unsigned long long)host_id,
+ (unsigned long long)hs.owner_generation,
+ (unsigned long long)hs.timestamp,
+ (unsigned long long)mb_gen,
+ (unsigned long long)hs.last_live);
+ return 1;
+ }
+
+ if (hs.last_check == hs.last_live) {
+ log_sid(space_id, "sh_holder %llu %llu %llu mb_gen %llu LIVE - last_check and last_live %llu",
+ (unsigned long long)host_id,
+ (unsigned long long)hs.owner_generation,
+ (unsigned long long)hs.timestamp,
+ (unsigned long long)mb_gen,
+ (unsigned long long)hs.last_live);
+ return 1;
+ }
other_io_timeout = hs.io_timeout;
other_host_dead_seconds = calc_host_dead_seconds(other_io_timeout);
- if (!hs.last_live && (now - hs.first_check > other_host_dead_seconds)) {
- log_sid(space_id, "host_live %llu %llu no first_check %llu",
- (unsigned long long)host_id, (unsigned long long)gen,
- (unsigned long long)hs.first_check);
- return 0;
+ /*
+ * last_live remains 0 until we see the host write a new timestamp,
+ * which it may never do if it's dead.
+ */
+ if (!hs.last_live) {
+ if (hs.last_check - hs.first_check > other_host_dead_seconds) {
+ log_sid(space_id, "sh_holder %llu %llu %llu mb_gen %llu DEAD - last_live 0 last_check %llu first_check %llu",
+ (unsigned long long)host_id,
+ (unsigned long long)hs.owner_generation,
+ (unsigned long long)hs.timestamp,
+ (unsigned long long)mb_gen,
+ (unsigned long long)hs.last_check,
+ (unsigned long long)hs.first_check);
+ return 0;
+ } else {
+ log_sid(space_id, "sh_holder %llu %llu %llu mb_gen %llu LIVE - last_live 0 last_check %llu first_check %llu",
+ (unsigned long long)host_id,
+ (unsigned long long)hs.owner_generation,
+ (unsigned long long)hs.timestamp,
+ (unsigned long long)mb_gen,
+ (unsigned long long)hs.last_check,
+ (unsigned long long)hs.first_check);
+ return 1;
+ }
}
- if (hs.last_live && (now - hs.last_live > other_host_dead_seconds)) {
- log_sid(space_id, "host_live %llu %llu no last_live %llu",
- (unsigned long long)host_id, (unsigned long long)gen,
- (unsigned long long)hs.last_live);
+ /*
+ * last_live is the local time when we last saw a new timestamp.
+ * last_check is the local time when we last read the timestamp.
+ * The host is dead if it has not updated its timestamp in
+ * last_check-last_live.
+ *
+ * Note that 'now' could be much later than last_check if we
+ * have failed to read the lockspace leases (and renew our own
+ * lease.) So, now-last_live is not a correct way to measure
+ * if the other host is dead.
+ */
+ if (hs.last_check - hs.last_live > other_host_dead_seconds) {
+ log_sid(space_id, "sh_holder %llu %llu %llu mb_gen %llu DEAD - last_check %llu last_live %llu other_host_dead_seconds %u",
+ (unsigned long long)host_id,
+ (unsigned long long)hs.owner_generation,
+ (unsigned long long)hs.timestamp,
+ (unsigned long long)mb_gen,
+ (unsigned long long)hs.last_check,
+ (unsigned long long)hs.last_live,
+ other_host_dead_seconds);
return 0;
}
- log_sid(space_id, "host_live %llu %llu yes recent first_check %llu last_live %llu",
- (unsigned long long)host_id, (unsigned long long)gen,
- (unsigned long long)hs.first_check,
+ log_sid(space_id, "sh_holder %llu %llu %llu mb_gen %llu LIVE - last_check %llu last_live %llu",
+ (unsigned long long)host_id,
+ (unsigned long long)hs.owner_generation,
+ (unsigned long long)hs.timestamp,
+ (unsigned long long)mb_gen,
+ (unsigned long long)hs.last_check,
(unsigned long long)hs.last_live);
-
return 1;
}
@@ -660,7 +778,7 @@ static int clear_dead_shared(struct task *task, struct token *token,
continue;
}
- if (host_live(token->r.lockspace_name, token->space_id, host_id, mb.generation)) {
+ if (sh_holder_live(token->r.lockspace_name, token->space_id, host_id, mb.generation)) {
log_token(token, "clear_dead_shared host_id %llu gen %llu alive",
(unsigned long long)host_id, (unsigned long long)mb.generation);
live++;
@@ -675,10 +793,10 @@ static int clear_dead_shared(struct task *task, struct token *token,
}
/*
- * not an error, just useful to have a record of when we clear a shared
- * lock that was left by a failed host.
+ * may be interesting to have a record of when we clear a
+ * shared lock that was left by a failed host.
*/
- log_errot(token, "cleared shared lease for dead host_id %llu gen %llu",
+ log_warnt(token, "cleared shared lease for dead host %llu %llu",
(unsigned long long)host_id, (unsigned long long)mb.generation);
}
diff --git a/src/sanlock.8 b/src/sanlock.8
index 9b71059..d94e6e5 100644
--- a/src/sanlock.8
+++ b/src/sanlock.8
@@ -1406,6 +1406,15 @@ for that command. Special values +all and -all can be used to
enable or disable all commands, and can be used before or after other
debug_cmd lines.
+.IP \[bu] 2
+debug_hosts = 1
+.br
+Log information about other host_id lease renewals. When set to 1
+(the default), messages are logged when a host_id lease is observed
+reaching the failed and dead states. When set to 2, messages are
+logged when any update (e.g. renewal) is observed for another host_id
+lease. When set to 0, neither are logged.
+
.IP \[bu] 2
write_init_io_timeout = <seconds>
.br
diff --git a/src/sanlock.conf b/src/sanlock.conf
index 89899f1..d55c280 100644
--- a/src/sanlock.conf
+++ b/src/sanlock.conf
@@ -75,3 +75,6 @@
#
# watchdog_fire_timeout = 60
# command line: n/a
+#
+# debug_hosts = 1
+# command line: n/a
diff --git a/src/sanlock_internal.h b/src/sanlock_internal.h
index dda7d25..cb41538 100644
--- a/src/sanlock_internal.h
+++ b/src/sanlock_internal.h
@@ -180,6 +180,8 @@ struct host_status {
uint64_t set_bit_time;
uint16_t io_timeout;
uint16_t lease_bad;
+ uint8_t warned_fail;
+ uint8_t warned_dead;
char owner_name[NAME_ID_SIZE] __attribute__ ((nonstring));
};
@@ -342,6 +344,7 @@ struct command_line {
int action; /* ACT_ */
int debug;
int debug_renew;
+ int debug_hosts;
int debug_clients;
int debug_io_submit;
int debug_io_complete;
--
To stop receiving notification emails like this one, please contact
the administrator of this repository.