Gitweb:
http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=04d4a74883ae3e33ceafd72c03f09eb5e75a33a5
Commit: 04d4a74883ae3e33ceafd72c03f09eb5e75a33a5
Parent: 870ba3e84c99dfa1b36a35ca22dcc37d41b3f501
Author: Christine Caulfield <ccaulfie(a)redhat.com>
AuthorDate: Fri Nov 20 14:21:55 2015 +0000
Committer: Christine Caulfield <ccaulfie(a)redhat.com>
CommitterDate: Fri Nov 20 14:21:55 2015 +0000
fenced: Delay kill due to stateful merge until after fencing
If the cluster merges with state (e.g. due to a fast disconnect/reconnect),
then delay sending node kill messages until fencing has been attempted.
This stops the 'wrong' node being killed if a two_node cluster splits
and rejoins quickly.
Signed-off-by: Christine Caulfield <ccaulfie(a)redhat.com>
---
fence/fenced/cpg.c | 41 ++++++++++++++++++++++++++++++++++++++---
fence/fenced/fd.h | 1 +
fence/fenced/member_cman.c | 11 -----------
3 files changed, 39 insertions(+), 14 deletions(-)
diff --git a/fence/fenced/cpg.c b/fence/fenced/cpg.c
index e99e95a..28d8e1d 100644
--- a/fence/fenced/cpg.c
+++ b/fence/fenced/cpg.c
@@ -331,6 +331,20 @@ static struct node *get_node_victim(struct fd *fd, int nodeid)
return NULL;
}
+static struct node_history *get_node_history_nodeid(int nodeid)
+{
+ struct fd *fd;
+ struct node_history *node;
+
+ list_for_each_entry(fd, &domains, list) {
+ list_for_each_entry(node, &fd->node_history, list) {
+ if (node->nodeid == nodeid)
+ return node;
+ }
+ }
+ return NULL;
+}
+
static struct node_history *get_node_history(struct fd *fd, int nodeid)
{
struct node_history *node;
@@ -668,6 +682,14 @@ static void receive_victim_done(struct fd *fd, struct fd_header *hd,
int len)
nodeh->fence_time_local = time(NULL);
if (hd->nodeid == our_nodeid) {
+ /* We received our own victim_done message. */
+
+ if (nodeh && nodeh->kill_after_fencing) {
+ log_error("kick fence victim %d from cluster", nodeh->nodeid);
+ nodeh->kill_after_fencing = 0;
+ kick_node_from_cluster(nodeh->nodeid);
+ }
+
/* sanity check, I don't think this should happen;
see comment in fence_victims() */
if (node) {
@@ -2068,9 +2090,22 @@ static void receive_protocol(struct fd_header *hd, int len)
quorate_time < node->left_time) {
log_debug("daemon node %d kill due to stateful merge",
hd->nodeid);
- if (!node->killed)
- kick_node_from_cluster(hd->nodeid);
- node->killed = 1;
+ if (!node->killed) {
+ struct node_history *nodeh;
+ nodeh = get_node_history_nodeid(hd->nodeid);
+ if (!nodeh) {
+ log_error("cannot kick node, no node history %d", hd->nodeid);
+ } else if (cluster_quorate && node->left_time &&
+ quorate_time < node->left_time &&
+ nodeh->fence_time >= nodeh->fail_time) {
+ log_error("node %d kill due to stateful merge", hd->nodeid);
+ kick_node_from_cluster(hd->nodeid);
+ } else {
+ log_debug("delay kicking node %d until fencing is done", nodeh->nodeid);
+ nodeh->kill_after_fencing = 1;
+ }
+ node->killed = 1;
+ }
}
/* don't save p->proto into node->proto; we need to come
diff --git a/fence/fenced/fd.h b/fence/fenced/fd.h
index d678bfa..9e8c77e 100644
--- a/fence/fenced/fd.h
+++ b/fence/fenced/fd.h
@@ -141,6 +141,7 @@ struct node_history {
struct list_head list;
int nodeid;
int check_quorum;
+ int kill_after_fencing;
uint64_t add_time;
uint64_t left_time;
uint64_t fail_time;
diff --git a/fence/fenced/member_cman.c b/fence/fenced/member_cman.c
index ced4272..a7f4341 100644
--- a/fence/fenced/member_cman.c
+++ b/fence/fenced/member_cman.c
@@ -32,17 +32,6 @@ void kick_node_from_cluster(int nodeid)
log_error("telling cman to shut down cluster locally");
cman_shutdown(ch_admin, CMAN_SHUTDOWN_ANYWAY);
} else {
-
- /* in a two_node cluster where both nodes maintain quorum
- * by themselves during a partition+merge, both will kick
- * the other, which can leave both dead and unfenced.
- * this delay should help */
-
- if (two_node_mode && our_nodeid > nodeid) {
- log_debug("kick_node_from_cluster %d delay", nodeid);
- sleep(5);
- }
-
log_error("telling cman to remove nodeid %d from cluster",
nodeid);
cman_kill_node(ch_admin, nodeid);