dlm: master - dlm_controld: partition merge handling - cluster-commits - Fedora Mailing-Lists

Wednesday, 6 March 2013

Gitweb:        http://git.fedorahosted.org/git/?p=dlm.git;a=commitdiff;h=ee11f54d5448447505239fdff4a68b3922d97f4e
Commit:        ee11f54d5448447505239fdff4a68b3922d97f4e
Parent:        16ddbe2d61b4edbed4d911c523a82bcf5f325b70
Author:        David Teigland <teigland(a)redhat.com>
AuthorDate:    Thu Feb 28 15:12:33 2013 -0600
Committer:     David Teigland <teigland(a)redhat.com>
CommitterDate: Thu Feb 28 15:12:33 2013 -0600

dlm_controld: partition merge handling

Fix/improve the handling of partition merge cases.
This includes fencing and killing merged nodes.

Signed-off-by: David Teigland <teigland(a)redhat.com>
---
 dlm_controld/daemon_cpg.c |  159 +++++++++++++++++++++++++++++++++++----------
 dlm_controld/dlm_daemon.h |    1 +
 dlm_controld/member.c     |   15 ++++
 3 files changed, 141 insertions(+), 34 deletions(-)

diff --git a/dlm_controld/daemon_cpg.c b/dlm_controld/daemon_cpg.c
index 641b189..b3c543d 100644
--- a/dlm_controld/daemon_cpg.c
+++ b/dlm_controld/daemon_cpg.c
@@ -369,6 +369,18 @@ static int nodes_need_fencing(void)
 	return 0;
 }
 
+static int nodeid_needs_fencing(int nodeid)
+{
+	struct node_daemon *node;
+
+	node = get_node_daemon(nodeid);
+	if (!node) {
+		log_error("nodeid_needs_fencing %d not found", nodeid);
+		return 0;
+	}
+	return node->need_fencing;
+}
+
 static int all_daemon_members_fipu(void)
 {
 	struct node_daemon *node;
@@ -839,7 +851,7 @@ static void daemon_fence_work(void)
 			/*
 			 * node has rejoined in clean state
 			 */
-			log_debug("fence request %d skip member", node->nodeid);
+			log_debug("fence request %d skip for is_clean_daemon_member", node->nodeid);
 
 			node->need_fencing = 0;
 			node->delay_fencing = 0;
@@ -858,6 +870,10 @@ static void daemon_fence_work(void)
 			continue;
 		}
 
+		/* use post_join_delay to avoid fencing a node in the short
+		   time between it joining the cluster (giving cluster quorum)
+		   and joining the daemon cpg, which allows it to bypass fencing */
+
 		if (monotime() - cluster_last_join_monotime < opt(post_join_delay_ind)) {
 			log_debug("fence request %d delay %d from %llu",
 				  node->nodeid, opt(post_join_delay_ind),
@@ -945,7 +961,7 @@ static void daemon_fence_work(void)
 			 * will see and do this, so we don't need to send
 			 * a fence result.
 			 */
-			log_debug("fence wait %d pid %d skip member", nodeid, pid);
+			log_debug("fence wait %d pid %d skip for is_clean_daemon_member", nodeid, pid);
 
 			node->need_fencing = 0;
 			node->delay_fencing = 0;
@@ -1262,7 +1278,8 @@ static void receive_fence_result(struct dlm_header *hd, int len)
 	}
 
 	if (!node->need_fencing) {
-		/* should never happen */
+		/* should never happen ... will happen if a manual fence_ack is
+		   done for a node that doesn't need it */
 		log_error("receive_fence_result %d from %d result %d no need_fencing",
 		  	  fr->nodeid, hd->nodeid, fr->result);
 		return;
@@ -1275,20 +1292,23 @@ static void receive_fence_result(struct dlm_header *hd, int len)
 		/* should we ignore and return here? */
 	}
 
-	if (!fr->result && node->daemon_member) {
+	if (node->daemon_member &&
+	    (!fr->result || (fr->result == -ECANCELED))) {
 
 		/*
-		 * the only time I think this can happen is if there is a
-		 * manual dlm_tool fence_ack for a node that is a member,
-		 * e.g. partition, merge, fence_ack while it's a merged member.
-		 * Ideally it would be killed after merging with state, but
-		 * not necessarily, i.e. it's start message can't be sent or
-		 * received.
+		 * The node was successfully fenced, but is still a member.
+		 * This will happen when there is a partition, storage fencing
+		 * is started, a merge causes the node to become a member
+		 * again, and storage fencing completes successfully.  If we
+		 * received a proto message from the node after the merge, then
+		 * we will have detected a stateful merge, and we may have
+		 * already killed it.
 		 */
 
-		log_error("receive_fence_result %d from %d result %d node not dead",
+		log_error("receive_fence_result %d from %d result %d node is daemon_member",
 			  fr->nodeid, hd->nodeid, fr->result);
-		return;
+
+		kick_node_from_cluster(fr->nodeid);
 	}
 
 	if ((hd->nodeid == our_nodeid) && (fr->result != -ECANCELED))
@@ -1580,18 +1600,62 @@ static void receive_protocol(struct dlm_header *hd, int len)
 	    (p->dr_ver.flags & PV_STATEFUL) &&
 	    (our_protocol.dr_ver.flags & PV_STATEFUL)) {
 
-		log_debug("daemon node %d stateful merge", hd->nodeid);
-		log_debug("daemon node %d join %llu left %llu local quorum %llu",
+		log_error("daemon node %d stateful merge", hd->nodeid);
+		log_debug("daemon node %d join %llu left %llu local quorum %llu killed %d",
 			  hd->nodeid,
 			  (unsigned long long)node->daemon_add_time,
 			  (unsigned long long)node->daemon_rem_time,
-			  (unsigned long long)cluster_quorate_monotime);
+			  (unsigned long long)cluster_quorate_monotime,
+			  node->killed);
 
 		if (cluster_quorate && node->daemon_rem_time &&
 		    cluster_quorate_monotime < node->daemon_rem_time) {
-			log_debug("daemon node %d kill due to stateful merge", hd->nodeid);
-			if (!node->killed)
-				kick_node_from_cluster(hd->nodeid);
+			if (!node->killed) {
+				if (cluster_two_node) {
+					/*
+					 * When there are two nodes and two_node mode
+					 * is used, both will have quorum throughout
+					 * the partition and subsequent stateful merge.
+					 *
+					 * - both will race to fence each other in
+					 *   response to the partition
+					 *
+					 * - both can attempt to kill the cluster
+					 *   on the other in response to the stateful
+					 *   merge here
+					 *
+					 * - we don't want both nodes to kill the cluster
+					 *   on the other, which can happen if the merge
+					 *   occurs before power fencing is successful,
+					 *   or can happen before/during/after storage
+					 *   fencing
+					 *
+					 * - if nodeA successfully fences nodeB (due
+					 *   to the partition), we want nodeA to kill
+					 *   the cluster on nodeB in response to the
+					 *   merge (we don't want nodeB to kill nodeA
+					 *   in response to the merge).
+					 *
+					 * So, a node that has successfully fenced the
+					 * other will kill the cluster on it. If fencing
+					 * is still running, we wait until it's
+					 * successfull to kill the cluster on the node
+					 * being fenced.
+					 */
+					if (nodeid_needs_fencing(hd->nodeid)) {
+						/* when fencing completes successfully,
+						   we'll see the node is a daemon member
+						   and kill it */
+						log_debug("daemon node %d delay kill for stateful merge", hd->nodeid);
+					} else {
+						log_error("daemon node %d kill due to stateful merge", hd->nodeid);
+						kick_node_from_cluster(hd->nodeid);
+					}
+				} else {
+					log_error("daemon node %d kill due to stateful merge", hd->nodeid);
+					kick_node_from_cluster(hd->nodeid);
+				}
+			}
 			node->killed = 1;
 		}
 
@@ -1808,6 +1872,7 @@ static void confchg_cb_daemon(cpg_handle_t handle,
 	struct node_daemon *node;
 	uint64_t now, now_wall;
 	int nodedown = 0, procdown = 0, leave = 0;
+	int check_joined_count = 0, check_remove_count = 0, check_member_count = 0;
 	int we_joined = 0;
 	int i, reason, low;
 
@@ -1851,7 +1916,7 @@ static void confchg_cb_daemon(cpg_handle_t handle,
 	}
 
 	if (nodedown || procdown || leave)
-		log_debug("%s left nodedown %d procdown %d leave %d",
+		log_debug("%s left reason nodedown %d procdown %d leave %d",
 			  group_name->value, nodedown, procdown, leave);
 
 	if (nodedown)
@@ -1865,6 +1930,8 @@ static void confchg_cb_daemon(cpg_handle_t handle,
 			if (node->daemon_member)
 				continue;
 
+			check_joined_count++;
+
 			/* node joined daemon cpg */
 			node->daemon_member = 1;
 			node->daemon_add_time = now;
@@ -1878,31 +1945,56 @@ static void confchg_cb_daemon(cpg_handle_t handle,
 
 			if (node->need_fencing) {
 				/* need_fencing will be cleared if we accept a
-				   valid proto from it */
-				log_error("daemon new nodeid %d needs fencing",
-					  node->nodeid);
+				   valid proto from it (is_clean_daemon_member) */
+				log_error("daemon joined %d needs fencing", node->nodeid);
+			} else {
+				log_debug("daemon joined %d");
 			}
-
 		} else {
 			if (!node->daemon_member)
 				continue;
 
+			check_remove_count++;
+
 			/* node left daemon cpg */
 			node->daemon_member = 0;
-			node->killed = 0;
-			memset(&node->proto, 0, sizeof(struct protocol));
 			node->daemon_rem_time = now;
+			node->killed = 0;
 
-			/* tell loop below to look at this node */
-			node->recover_setup = 1;
+			/* If we never accepted a valid proto from this node,
+			   then it never fully joined and there's no need to
+			   recover it.  Similary, node_history_lockspace_fail
+			   only sets need_fencing in the lockspace if
+			   node->start_time was non-zero. */
+
+			if (node->proto.daemon_max[0]) {
+				/* tell loop below to look at this node */
+				node->recover_setup = 1;
+			} else {
+				log_debug("daemon remove %d no proto skip recover", node->nodeid);
+			}
+
+			memset(&node->proto, 0, sizeof(struct protocol));
 		}
 	}
 
-	/* set up recovery work for nodes that just failed */
+	list_for_each_entry(node, &daemon_nodes, list) {
+		if (node->daemon_member)
+			check_member_count++;
+	}
+
+	/* when we join, all previous members look like they are joining */
+	if (!we_joined &&
+	    (daemon_joined_count != check_joined_count ||
+	     daemon_remove_count != check_remove_count ||
+	     daemon_member_count != check_member_count)) {
+		log_error("daemon counts joined %d check %d remove %d check %d member %d check %d",
+			  daemon_joined_count, check_joined_count,
+			  daemon_remove_count, check_remove_count,
+			  daemon_member_count, check_member_count);
+	}
 
-	/* TODO: limit to nodes with a valid proto?
-	 * node_history_lockspace_fail() would only set
-	 * need_fencing if node->start_time was non-zero. */
+	/* set up recovery work for nodes that just failed (recover_setup set above) */
 
 	list_for_each_entry(node, &daemon_nodes, list) {
 		if (!node->recover_setup)
@@ -1916,8 +2008,7 @@ static void confchg_cb_daemon(cpg_handle_t handle,
 			continue;
 
 		if (node->need_fencing) {
-			log_error("daemon remove nodeid %d already needs fencing",
-				  node->nodeid);
+			log_error("daemon remove %d already needs fencing", node->nodeid);
 			continue;
 		}
 
@@ -1931,7 +2022,7 @@ static void confchg_cb_daemon(cpg_handle_t handle,
 		if (reason == CPG_REASON_NODEDOWN || reason == CPG_REASON_PROCDOWN) {
 			if (node->fence_pid_wait || node->fence_pid) {
 				/* sanity check, should never happen */
-				log_error("daemon remove nodeid %d pid_wait %d pid %d",
+				log_error("daemon remove %d pid_wait %d pid %d",
 					  node->nodeid, node->fence_pid_wait, node->fence_pid);
 			}
 
diff --git a/dlm_controld/dlm_daemon.h b/dlm_controld/dlm_daemon.h
index 60d3d46..11a4777 100644
--- a/dlm_controld/dlm_daemon.h
+++ b/dlm_controld/dlm_daemon.h
@@ -169,6 +169,7 @@ EXTERN int plock_fd;
 EXTERN int plock_ci;
 EXTERN struct list_head lockspaces;
 EXTERN int cluster_quorate;
+EXTERN int cluster_two_node;
 EXTERN uint64_t cluster_last_join_monotime;
 EXTERN uint64_t cluster_quorate_monotime;
 EXTERN uint64_t cluster_joined_monotime;
diff --git a/dlm_controld/member.c b/dlm_controld/member.c
index d195ab1..fca3248 100644
--- a/dlm_controld/member.c
+++ b/dlm_controld/member.c
@@ -348,6 +348,8 @@ int setup_node_config(void)
 	cmap_handle_t h;
 	cs_error_t err;
 	uint32_t nodeid;
+	uint8_t two_node = 0;
+	int node_count = 0;
 	int i;
 
 	err = cmap_initialize(&h);
@@ -365,10 +367,23 @@ int setup_node_config(void)
 
 		log_debug("node_config %d", nodeid);
 
+		node_count++;
+
 		if (opt(enable_fencing_ind) && opt(enable_startup_fencing_ind))
 			add_startup_node(nodeid);
 	}
 
+	memset(key, 0, sizeof(key));
+	snprintf(key, CMAP_KEYNAME_MAXLEN, "quorum.two_node");
+
+	err = cmap_get_uint8(h, key, &two_node);
+	if (err != CS_OK)
+		goto out;
+
+	if (node_count == 2 && two_node)
+		cluster_two_node = 1;
+
+ out:
 	cmap_finalize(h);
 	return 0;
 }

    

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

2009

dlm: master - dlm_controld: partition merge handling