Gitweb:
http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=...
Commit: fe9a89972834d0459c312bede9e4a32df52e445a
Parent: 1efff01c28dadf735cab90433c20a7dcbe3c81ef
Author: Eduardo Damato <edamato(a)redhat.com>
AuthorDate: Tue Sep 29 09:58:18 2009 -0400
Committer: Lon Hohberger <lhh(a)redhat.com>
CommitterDate: Fri Oct 30 16:30:23 2009 -0400
qdisk: Implement I/O timeouts in qdiskd
This allows administrators to make qdiskd reboot the
system if it cannot write its status out for interval*tko
seconds.
Resolves: rhbz#511113
Part 1/4
Signed-off-by: Eduardo Damato <edamato(a)redhat.com>
Signed-off-by: Lon Hohberger <lhh(a)redhat.com>
---
cman/man/qdisk.5 | 7 +++++++
cman/qdisk/disk.h | 3 ++-
cman/qdisk/main.c | 34 ++++++++++++++++++++++++++++++++--
3 files changed, 41 insertions(+), 3 deletions(-)
diff --git a/cman/man/qdisk.5 b/cman/man/qdisk.5
index 65b9956..513d56b 100644
--- a/cman/man/qdisk.5
+++ b/cman/man/qdisk.5
@@ -291,6 +291,13 @@ if it takes more than (interval * tko) seconds to complete a quorum disk
pass. The default for this value is 0 (off).
.in 9
+\fIio_timeout\fP\fB="\fP0\fB"\fP
+.in 12
+If set to 1 (on), qdiskd will watch internal timers and reboot the node
+if qdiskd has been unable to write to the quorum disk for (interval * tko) seconds.
+The default for this value is 0 (off).
+
+.in 9
\fIscheduler\fP\fB="\fPrr\fB"\fP
.in 12
Valid values are 'rr', 'fifo', and 'other'. Selects the scheduling queue
diff --git a/cman/qdisk/disk.h b/cman/qdisk/disk.h
index 3509339..0b652b2 100644
--- a/cman/qdisk/disk.h
+++ b/cman/qdisk/disk.h
@@ -73,7 +73,8 @@ typedef enum {
RF_PARANOID = 0x8,
RF_ALLOW_KILL = 0x10,
RF_UPTIME = 0x20,
- RF_CMAN_LABEL = 0x40
+ RF_CMAN_LABEL = 0x40,
+ RF_IOTIMEOUT = 0x80
} run_flag_t;
diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c
index 81999a0..c86759e 100644
--- a/cman/qdisk/main.c
+++ b/cman/qdisk/main.c
@@ -867,7 +867,7 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
int low_id, bid_pending = 0, score, score_max, score_req,
upgrade = 0, count, errors, error_cycles = 0;
memb_mask_t mask, master_mask;
- struct timeval maxtime, oldtime, newtime, diff, sleeptime, interval;
+ struct timeval maxtime, oldtime, newtime, diff, sleeptime, interval, lastok;
ctx->qc_status = S_NONE;
@@ -877,6 +877,9 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
interval.tv_usec = 0;
interval.tv_sec = ctx->qc_interval;
+ lastok.tv_usec = 0;
+ lastok.tv_sec = 0;
+
get_my_score(&score, &score_max);
if (score_max < ctx->qc_scoremin) {
clulog(LOG_WARNING, "Minimum score (%d) is impossible to "
@@ -1065,6 +1068,8 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
clulog(LOG_ERR, "Error writing to quorum disk\n");
errors++; /* this value isn't really used
at this point */
+ } else {
+ get_time(&lastok, ctx->qc_flags&RF_UPTIME);
}
/* write out our local status */
@@ -1073,11 +1078,27 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
/* Cycle. We could time the loop and sleep
usleep(interval-looptime), but this is fine for now.*/
get_time(&newtime, ctx->qc_flags&RF_UPTIME);
- _diff_tv(&diff, &oldtime, &newtime);
+ /*
+ * Reboot if the last successful heartbeat was longer ago than interval*TKO_COUNT
+ */
+ _diff_tv(&diff, &lastok, &newtime);
+ if (_cmp_tv(&maxtime, &diff) == 1 &&
+ ctx->qc_flags & RF_IOTIMEOUT) {
+ clulog(LOG_EMERG, "Failed to send a heartbeat within "
+ "%d second%s (%d.%06d) - REBOOTING\n",
+ (int)maxtime.tv_sec,
+ maxtime.tv_sec==1?"":"s",
+ (int)diff.tv_sec,
+ (int)diff.tv_usec);
+ if (!(ctx->qc_flags & RF_DEBUG))
+ reboot(RB_AUTOBOOT);
+ }
+
/*
* Reboot if we didn't send a heartbeat in interval*TKO_COUNT
*/
+ _diff_tv(&diff, &oldtime, &newtime);
if (_cmp_tv(&maxtime, &diff) == 1 &&
ctx->qc_flags & RF_PARANOID) {
clulog(LOG_EMERG, "Failed to complete a cycle within "
@@ -1347,6 +1368,15 @@ get_config_data(char *cluster_name, qd_ctx *ctx, struct h_data *h, int maxh,
free(val);
}
+ /* default = off, so, 1 to turn on */
+ snprintf(query, sizeof(query), "/cluster/quorumd/@io_timeout");
+ if (ccs_get(ccsfd, query, &val) == 0) {
+ if (!atoi(val))
+ ctx->qc_flags &= ~RF_IOTIMEOUT;
+ else
+ ctx->qc_flags |= RF_IOTIMEOUT;
+ free(val);
+ }
/*
* Get flag to see if we're supposed to reboot if we can't complete