Gitweb:
http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=76741bb2a94...
Commit: 76741bb2a94ae94e493c609d50f570d02e2f3029
Parent: 3290dff9b88992e913b28dd57c6b64a3c2b75c8a
Author: Lon Hohberger <lhh(a)redhat.com>
AuthorDate: Fri May 6 10:14:04 2011 -0400
Committer: Fabio M. Di Nitto <fdinitto(a)redhat.com>
CommitterDate: Mon Jul 23 14:54:07 2012 +0200
qdiskd: Make multipath issues go away
Qdiskd historically has required significant tuning to work around
delays which occur during multipath failover, overloaded I/O, and LUN
trespasses in both device-mapper-multipath and EMC PowerPath
environments.
This patch goes a very long way towards eliminating false evictions
when these conditions occur by making qdiskd whine to the other
cluster members when it detects hung system calls. When a cluster
member whines, it indicates the source of the problem (which system
call is hung), and the act of receiving a whine from a host indicates
that qdiskd is operational, but that I/O is hung. Hung I/O is different
from losing storage entirely (where you get I/O errors).
Possible problems:
- Receive queue getting very full, causing messages to become blocked on
a node where I/O is hung. 1) that would take a very long time, and 2)
node should get evicted at that point anyway.
Resolves: rhbz#782900
this version of the patch is a backport of:
e2937eb33f224f86904fead08499a6178868ca6a
34d2872fb7e60be1594158acaaeb8acd74f78d22
There is a minor change versus the original patch, based on how qdiskd
in RHEL5 handles the cman connection. We add an extra call to cman_alive
in the main quorum_loop to make sure data is not stalled on the
cman port and that the data callback (qdisk_whine) gets executed.
Signed-off-by: Lon Hohberger <lhh(a)redhat.com>
Signed-off-by: Fabio M. Di Nitto <fdinitto(a)redhat.com>
---
cman/daemon/cnxman-socket.h | 1 +
cman/qdisk/Makefile | 2 +-
cman/qdisk/disk.h | 6 ++++
cman/qdisk/iostate.c | 17 +++++++++++--
cman/qdisk/iostate.h | 4 ++-
cman/qdisk/main.c | 54 +++++++++++++++++++++++++++++++++++++++----
6 files changed, 74 insertions(+), 10 deletions(-)
diff --git a/cman/daemon/cnxman-socket.h b/cman/daemon/cnxman-socket.h
index 351c97c..1d01b44 100644
--- a/cman/daemon/cnxman-socket.h
+++ b/cman/daemon/cnxman-socket.h
@@ -79,6 +79,7 @@
#define CLUSTER_PORT_SERVICES 2
#define CLUSTER_PORT_SYSMAN 10 /* Remote execution daemon */
#define CLUSTER_PORT_CLVMD 11 /* Cluster LVM daemon */
+#define CLUSTER_PORT_QDISKD 178 /* Quorum disk daemon */
/* Port numbers above this will be blocked when the cluster is inquorate or in
* transition */
diff --git a/cman/qdisk/Makefile b/cman/qdisk/Makefile
index f58806b..9bfc486 100644
--- a/cman/qdisk/Makefile
+++ b/cman/qdisk/Makefile
@@ -32,7 +32,7 @@ qdiskd: disk.o crc32.o disk_util.o main.o score.o bitmap.o clulog.o \
gcc -o $@ $^ -lpthread -L../lib -L${ccslibdir} -lccs -lrt
mkqdisk: disk.o crc32.o disk_util.o iostate.o \
- proc.o mkqdisk.o scandisk.o clulog.o gettid.o
+ proc.o mkqdisk.o scandisk.o clulog.o gettid.o ../lib/libcman.a
gcc -o $@ $^ -lrt
%.o: %.c
diff --git a/cman/qdisk/disk.h b/cman/qdisk/disk.h
index b784220..d491de1 100644
--- a/cman/qdisk/disk.h
+++ b/cman/qdisk/disk.h
@@ -290,6 +290,12 @@ typedef struct {
status_block_t ni_status;
} node_info_t;
+typedef struct { /* private data handed to cman_init()/cman_admin_init() and read back by the qdisk_whine() data callback */
+ qd_ctx *ctx; /* qdiskd context; qdisk_whine() reads qc_my_id from it to skip messages from the local node */
+ node_info_t *ni; /* first element of the per-node info array; indexed by (nodeid - 1) in qdisk_whine() */
+ size_t ni_len; /* number of entries in ni; main() sets it to MAX_NODES_DISK */
+} qd_priv_t;
+
int qd_write_status(qd_ctx *ctx, int nid, disk_node_state_t state,
disk_msg_t *msg, memb_mask_t mask, memb_mask_t master);
int qd_read_print_status(target_info_t *disk, int nid);
diff --git a/cman/qdisk/iostate.c b/cman/qdisk/iostate.c
index 65b4d50..eb74ad2 100644
--- a/cman/qdisk/iostate.c
+++ b/cman/qdisk/iostate.c
@@ -1,10 +1,14 @@
#include <pthread.h>
+#include <libcman.h>
#include <iostate.h>
#include <unistd.h>
#include <time.h>
#include <sys/time.h>
#include <clulog.h>
+#include <stdint.h>
+#include "platform.h"
#include "iostate.h"
+#include "../daemon/cnxman-socket.h"
static iostate_t main_state = 0;
static int main_incarnation = 0;
@@ -26,7 +30,7 @@ static struct state_table io_state_table[] = {
{ STATE_LSEEK, "seek" },
{ -1, NULL } };
-static const char *
+const char *
state_to_string(iostate_t state)
{
static const char *ret = "unknown";
@@ -65,6 +69,8 @@ io_nanny_thread(void *arg)
iostate_t last_main_state = 0, current_main_state = 0;
int last_main_incarnation = 0, current_main_incarnation = 0;
int logged_incarnation = 0;
+ cman_handle_t ch = (cman_handle_t)arg;
+ int32_t whine_state;
/* Start with wherever we're at now */
pthread_mutex_lock(&state_mutex);
@@ -96,6 +102,11 @@ io_nanny_thread(void *arg)
continue;
}
+ /* Whine on CMAN api */
+ whine_state = (int32_t)current_main_state;
+ swab32(whine_state);
+ cman_send_data(ch, &whine_state, sizeof(int32_t), 0, CLUSTER_PORT_QDISKD, 0);
+
/* Don't log things twice */
if (logged_incarnation == current_main_incarnation)
continue;
@@ -114,7 +125,7 @@ io_nanny_thread(void *arg)
int
-io_nanny_start(int timeout)
+io_nanny_start(cman_handle_t ch, int timeout)
{
int ret;
@@ -124,7 +135,7 @@ io_nanny_start(int timeout)
qdisk_timeout = timeout;
thread_active = 1;
- ret = pthread_create(&io_nanny_tid, NULL, io_nanny_thread, NULL);
+ ret = pthread_create(&io_nanny_tid, NULL, io_nanny_thread, ch);
pthread_mutex_unlock(&state_mutex);
return ret;
diff --git a/cman/qdisk/iostate.h b/cman/qdisk/iostate.h
index 7dd7bf6..a65b1d4 100644
--- a/cman/qdisk/iostate.h
+++ b/cman/qdisk/iostate.h
@@ -11,7 +11,9 @@ typedef enum {
void io_state(iostate_t state);
-int io_nanny_start(int timeout);
+int io_nanny_start(cman_handle_t ch, int timeout);
int io_nanny_stop(void);
+const char * state_to_string(iostate_t state);
+
#endif
diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c
index 0d7bb3d..90d00ab 100644
--- a/cman/qdisk/main.c
+++ b/cman/qdisk/main.c
@@ -48,6 +48,7 @@
(defined(LIBCMAN_VERSION) && LIBCMAN_VERSION < 2))
#include <cluster/cnxman-socket.h>
#endif
+#include "../daemon/cnxman-socket.h"
int daemon_init(char *);
int check_process_running(char *, pid_t *);
@@ -892,6 +893,11 @@ quorum_loop(qd_ctx *ctx, node_info_t *ni, int max)
_running = 1;
while (_running) {
+ /* perform a forceful cman dispatch */
+ if (cman_alive(ctx->qc_ch) < 0) {
+ clulog(LOG_ERR, "cman: %s\n", strerror(errno));
+ }
+
/* XXX this was getuptime() in clumanager */
get_time(&oldtime, (ctx->qc_flags&RF_UPTIME));
@@ -1514,6 +1520,31 @@ check_stop_cman(qd_ctx *ctx)
}
}
+/*
+ * CMAN data callback registered on CLUSTER_PORT_QDISKD.
+ *
+ * The I/O nanny thread of a qdiskd instance sends a single int32_t (the
+ * I/O state it believes is hung, passed through swab32()) whenever it
+ * detects a stuck system call.  Receiving such a "whine" from another
+ * node means that node's qdiskd is still running even though its I/O is
+ * hung, so the node's miss counter is reset.
+ *
+ * h        - cman handle the message arrived on (unused)
+ * privdata - qd_priv_t supplied when the cman handle was created
+ *            (presumably passed through unchanged by libcman)
+ * buf/len  - message payload; anything other than one int32_t is ignored
+ * port     - port the message arrived on (unused)
+ * nodeid   - id of the sending node
+ */
+static void
+qdisk_whine(cman_handle_t h, void *privdata, char *buf, int len,
+	    uint8_t port, int nodeid)
+{
+	int32_t dstate;
+	qd_priv_t *qp = (qd_priv_t *)privdata;
+	node_info_t *ni = qp->ni;
+
+	if (len != sizeof(dstate)) {
+		return;
+	}
+
+	/* Our own whines carry no new information */
+	if (nodeid == (qp->ctx->qc_my_id))
+		return;
+
+	/* ni[] is indexed by nodeid - 1; never touch memory outside it */
+	if (nodeid < 1 || (size_t)nodeid > qp->ni_len)
+		return;
+
+	/* buf has no alignment guarantee, so copy instead of casting */
+	memcpy(&dstate, buf, sizeof(dstate));
+	swab32(dstate);
+
+	if (dstate) {
+		/* Format needs both the node id (%d) and the state name (%s) */
+		clulog(LOG_CRIT, "qdiskd on node %d reports hung %s()\n",
+		       nodeid, state_to_string(dstate));
+		ni[nodeid-1].ni_misses = 0;
+	}
+}
int
main(int argc, char **argv)
@@ -1528,6 +1559,7 @@ main(int argc, char **argv)
char device[128];
pid_t pid;
quorum_header_t qh;
+ qd_priv_t qp;
if (check_process_running(argv[0], &pid) && pid !=getpid()) {
printf("QDisk services already running\n");
@@ -1559,10 +1591,16 @@ main(int argc, char **argv)
}
}
+ /* For cman notifications we need two sockets - one for events,
+ one for config change callbacks */
+ qp.ctx = &ctx;
+ qp.ni = &ni[0];
+ qp.ni_len = MAX_NODES_DISK;
+
#if (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION >= 2)
- ch = cman_admin_init(NULL);
+ ch = cman_admin_init(&qp);
#else
- ch = cman_init(NULL);
+ ch = cman_init(&qp);
#endif
if (!ch) {
if (!foreground && !forked) {
@@ -1577,13 +1615,19 @@ main(int argc, char **argv)
do {
sleep(5);
#if (defined(LIBCMAN_VERSION) && LIBCMAN_VERSION >= 2)
- ch = cman_admin_init(NULL);
+ ch = cman_admin_init(&qp);
#else
- ch = cman_init(NULL);
+ ch = cman_init(&qp);
#endif
} while (!ch);
}
+ if (cman_start_recv_data(ch, qdisk_whine, CLUSTER_PORT_QDISKD) != 0) {
+ clulog_and_print(LOG_CRIT, "Could not register with CMAN: %s\n",
+ strerror(errno));
+ goto out;
+ }
+
memset(&me, 0, sizeof(me));
while (cman_get_node(ch, CMAN_NODEID_US, &me) < 0) {
if (!foreground && !forked) {
@@ -1696,7 +1740,7 @@ main(int argc, char **argv)
}
}
- io_nanny_start(ctx.qc_tko * ctx.qc_interval);
+ io_nanny_start(ch, ctx.qc_tko * ctx.qc_interval);
if (quorum_loop(&ctx, ni, MAX_NODES_DISK) == 0)
cman_unregister_quorum_device(ctx.qc_ch);