dlm: master - dlm_controld: repeat fencing attempts after all actors fail
by David Teigland
Gitweb: http://git.fedorahosted.org/git/?p=dlm.git;a=commitdiff;h=d5d7b8dd15562dbdfdbfa1bfa7f8318c43a512d3
Commit: d5d7b8dd15562dbdfdbfa1bfa7f8318c43a512d3
Parent: 2eefc91150a3292c2be82f4139207e06b4c4bba4
Author: John Ruemker <jruemker(a)redhat.com>
AuthorDate: Thu Sep 29 15:37:15 2016 -0500
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Thu Sep 29 15:37:15 2016 -0500
dlm_controld: repeat fencing attempts after all actors fail
After fencing fails against a node, restore the original
set of nodes that should try to fence it, causing the
entire fencing process to be repeated. Continue this
until fencing is successful against the failed node.
---
dlm_controld/daemon_cpg.c | 17 ++++++++++++++---
dlm_controld/dlm.conf.5 | 2 ++
dlm_controld/dlm_controld.8 | 4 ++++
dlm_controld/dlm_daemon.h | 1 +
dlm_controld/main.c | 5 +++++
5 files changed, 26 insertions(+), 3 deletions(-)
diff --git a/dlm_controld/daemon_cpg.c b/dlm_controld/daemon_cpg.c
index 0d55027..2009d4f 100644
--- a/dlm_controld/daemon_cpg.c
+++ b/dlm_controld/daemon_cpg.c
@@ -79,6 +79,7 @@ struct node_daemon {
int fence_actor_done; /* for status/debug */
int fence_actor_last; /* for status/debug */
int fence_actors[MAX_NODES];
+ int fence_actors_orig[MAX_NODES];
struct protocol proto;
struct fence_config fence_config;
@@ -537,6 +538,7 @@ static int set_fence_actors(struct node_daemon *node, int all_memb)
int i, nodeid, count = 0, low = 0;
memset(node->fence_actors, 0, sizeof(node->fence_actors));
+ memset(node->fence_actors_orig, 0, sizeof(node->fence_actors_orig));
for (i = 0; i < daemon_member_count; i++) {
nodeid = daemon_member[i].nodeid;
@@ -550,6 +552,9 @@ static int set_fence_actors(struct node_daemon *node, int all_memb)
low = nodeid;
}
+ /* keep a copy of the original set so they can be retried if all fail */
+ memcpy(node->fence_actors_orig, node->fence_actors, sizeof(node->fence_actors));
+
log_debug("set_fence_actors for %d low %d count %d",
node->nodeid, low, count);
return low;
@@ -592,6 +597,7 @@ static int get_fence_actor(struct node_daemon *node)
static void clear_fence_actor(int nodeid, int actor)
{
struct node_daemon *node;
+ int remaining = 0;
int i;
node = get_node_daemon(nodeid);
@@ -599,10 +605,15 @@ static void clear_fence_actor(int nodeid, int actor)
return;
for (i = 0; i < MAX_NODES; i++) {
- if (node->fence_actors[i] == actor) {
+ if (node->fence_actors[i] == actor)
node->fence_actors[i] = 0;
- return;
- }
+ else if (node->fence_actors[i])
+ remaining++;
+ }
+
+ if (!remaining && opt(repeat_failed_fencing_ind)) {
+ log_debug("clear_fence_actor %d restoring original actors to retry", actor);
+ memcpy(node->fence_actors, node->fence_actors_orig, sizeof(node->fence_actors));
}
}
diff --git a/dlm_controld/dlm.conf.5 b/dlm_controld/dlm.conf.5
index 007e4de..ca4209a 100644
--- a/dlm_controld/dlm.conf.5
+++ b/dlm_controld/dlm.conf.5
@@ -68,6 +68,8 @@ enable_quorum_fencing
.br
enable_quorum_lockspace
.br
+repeat_failed_fencing
+.br
.SH Fencing
diff --git a/dlm_controld/dlm_controld.8 b/dlm_controld/dlm_controld.8
index c9011fd..b13cbf3 100644
--- a/dlm_controld/dlm_controld.8
+++ b/dlm_controld/dlm_controld.8
@@ -87,6 +87,10 @@ For default settings, see dlm_controld -h.
0|1
enable/disable quorum requirement for lockspace operations
+.B --repeat_failed_fencing
+0|1
+ enable/disable retrying after fencing fails
+
.B --fence_all
.I str
fence all nodes with this agent
diff --git a/dlm_controld/dlm_daemon.h b/dlm_controld/dlm_daemon.h
index 62508ea..f8068cf 100644
--- a/dlm_controld/dlm_daemon.h
+++ b/dlm_controld/dlm_daemon.h
@@ -106,6 +106,7 @@ enum {
enable_fencing_ind,
enable_concurrent_fencing_ind,
enable_startup_fencing_ind,
+ repeat_failed_fencing_ind,
enable_quorum_fencing_ind,
enable_quorum_lockspace_ind,
help_ind,
diff --git a/dlm_controld/main.c b/dlm_controld/main.c
index 13b3834..02f4737 100644
--- a/dlm_controld/main.c
+++ b/dlm_controld/main.c
@@ -1346,6 +1346,11 @@ static void set_opt_defaults(void)
1, NULL,
"enable/disable startup fencing");
+ set_opt_default(repeat_failed_fencing_ind,
+ "repeat_failed_fencing", '\0', req_arg_bool,
+ 1, NULL,
+ "enable/disable retrying after fencing fails");
+
set_opt_default(enable_quorum_fencing_ind,
"enable_quorum_fencing", 'q', req_arg_bool,
1, NULL,
7 years, 6 months
cluster: RHEL6 - mkfs.gfs2: Open the target device with O_EXCL
by Andrew Price
Gitweb: http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=24e5e65be1c9b8b433edc1723b4e13feba0db8ba
Commit: 24e5e65be1c9b8b433edc1723b4e13feba0db8ba
Parent: 80b9ae57170c1c6042375ea5119aea1ec79e3ab4
Author: Andrew Price <anprice(a)redhat.com>
AuthorDate: Tue Apr 5 14:38:03 2016 +0100
Committer: Andrew Price <anprice(a)redhat.com>
CommitterDate: Tue Aug 16 17:34:15 2016 +0100
mkfs.gfs2: Open the target device with O_EXCL
O_EXCL will let local mounters know that the device is busy while
mkfs.gfs2 is running so that they don't try to access it.
Before:
# mount /dev/vdc /mnt/test
mount: /dev/vdc: can't read superblock
With O_EXCL:
# mount /dev/vdc /mnt/test
mount: /dev/vdc is already mounted or /mnt/test busy
Resolves: rhbz#1291944
Signed-off-by: Andrew Price <anprice(a)redhat.com>
---
gfs2/mkfs/main_mkfs.c | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/gfs2/mkfs/main_mkfs.c b/gfs2/mkfs/main_mkfs.c
index b132af8..03f078e 100644
--- a/gfs2/mkfs/main_mkfs.c
+++ b/gfs2/mkfs/main_mkfs.c
@@ -558,7 +558,7 @@ void main_mkfs(int argc, char *argv[])
verify_arguments(sdp);
- sdp->device_fd = open(sdp->device_name, O_RDWR | O_CLOEXEC);
+ sdp->device_fd = open(sdp->device_name, O_RDWR | O_CLOEXEC | O_EXCL);
if (sdp->device_fd < 0)
die( _("can't open device %s: %s\n"),
sdp->device_name, strerror(errno));
7 years, 6 months