cluster: STABLE32 - cman init: increase default shutdown timeouts
by Fabio M. Di Nitto
Gitweb: http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=05b6c764abb...
Commit: 05b6c764abb3e8e6f406bd8f33df8000206bd986
Parent: 0eae00792877a7a7b50b3323119296682e4f9097
Author: Fabio M. Di Nitto <fdinitto(a)redhat.com>
AuthorDate: Fri Sep 14 14:06:34 2012 +0200
Committer: Fabio M. Di Nitto <fdinitto(a)redhat.com>
CommitterDate: Fri Sep 14 14:06:34 2012 +0200
cman init: increase default shutdown timeouts
In some conditions, especially when all nodes are shut down
at the same time, corosync takes a lot longer than 10 seconds
to stabilize membership. That means that daemons will not quit fast
enough, and cman init will declare a shutdown error.
Increase the default shutdown timeouts from 10 to 30 seconds.
Resolves: rhbz#854032
Signed-off-by: Fabio M. Di Nitto <fdinitto(a)redhat.com>
---
cman/init.d/cman.in | 6 +++---
1 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/cman/init.d/cman.in b/cman/init.d/cman.in
index 1917abd..a88f52f 100644
--- a/cman/init.d/cman.in
+++ b/cman/init.d/cman.in
@@ -305,7 +305,7 @@ stop_daemon()
shift
retryforsec=$1
- [ -z "$retryforsec" ] && retryforsec=1
+ [ -z "$retryforsec" ] && retryforsec=30
retries=0
if check_sleep; then
@@ -661,7 +661,7 @@ start_qdiskd()
stop_qdiskd()
{
- stop_daemon qdiskd 5
+ stop_daemon qdiskd
}
start_groupd()
@@ -770,7 +770,7 @@ join_fence_domain()
leave_fence_domain()
{
if status fenced > /dev/null 2>&1; then
- errmsg=$( fence_tool leave -w 10 2>&1 )
+ errmsg=$( fence_tool leave -w 30 2>&1 )
return $?
fi
}
11 years, 7 months
cluster: RHEL59 - cman init: allow dlm tcp port to be configurable via cman init script
by Fabio M. Di Nitto
Gitweb: http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=6342b59f089...
Commit: 6342b59f089475e679bbbd440ef8ae6277d173d8
Parent: b1f42a7a30d0ea55c5d152774acde97f5704928b
Author: Fabio M. Di Nitto <fdinitto(a)redhat.com>
AuthorDate: Fri Sep 14 09:51:14 2012 +0200
Committer: Fabio M. Di Nitto <fdinitto(a)redhat.com>
CommitterDate: Fri Sep 14 09:51:14 2012 +0200
cman init: allow dlm tcp port to be configurable via cman init script
Resolves: rhbz#856954
Signed-off-by: Fabio M. Di Nitto <fdinitto(a)redhat.com>
---
cman/init.d/cman | 6 +++++-
1 files changed, 5 insertions(+), 1 deletions(-)
diff --git a/cman/init.d/cman b/cman/init.d/cman
index 2101eb7..e254d33 100755
--- a/cman/init.d/cman
+++ b/cman/init.d/cman
@@ -304,7 +304,8 @@ tune_dlm_hash_sizes()
if [ -z "$DLM_LKBTBL_SIZE" ] && \
[ -z "$DLM_RSBTBL_SIZE" ] && \
- [ -z "$DLM_DIRTBL_SIZE" ]; then
+ [ -z "$DLM_DIRTBL_SIZE" ] && \
+ [ -z "$DLM_TCP_PORT" ]; then
return 0
fi
for i in $(seq 1 $maxloop); do
@@ -325,6 +326,9 @@ tune_dlm_hash_sizes()
if [ -n "$DLM_DIRTBL_SIZE" ]; then
echo $DLM_DIRTBL_SIZE > $dlmdir/dirtbl_size
fi
+ if [ -n "$DLM_TCP_PORT" ]; then
+ echo $DLM_TCP_PORT > $dlmdir/tcp_port
+ fi
return 0
}
11 years, 7 months
cluster: RHEL6 - cman init: allow dlm tcp port to be configurable via cman init script
by Fabio M. Di Nitto
Gitweb: http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=33c58d437ad...
Commit: 33c58d437ad7f8c919ce7c56ef6826579a97fca7
Parent: d9e0108ac67cb23c2b93e13d3630033398fb1b86
Author: Fabio M. Di Nitto <fdinitto(a)redhat.com>
AuthorDate: Fri Sep 14 09:29:19 2012 +0200
Committer: Fabio M. Di Nitto <fdinitto(a)redhat.com>
CommitterDate: Fri Sep 14 09:31:29 2012 +0200
cman init: allow dlm tcp port to be configurable via cman init script
Resolves: rhbz#857299
Signed-off-by: Fabio M. Di Nitto <fdinitto(a)redhat.com>
---
cman/init.d/cman.in | 13 ++++++++-----
cman/init.d/cman.init.defaults.in | 4 ++--
2 files changed, 10 insertions(+), 7 deletions(-)
diff --git a/cman/init.d/cman.in b/cman/init.d/cman.in
index 9de349d..4f8db7a 100644
--- a/cman/init.d/cman.in
+++ b/cman/init.d/cman.in
@@ -110,12 +110,12 @@ fi
# DLM_CONTROLD_OPTS -- allow extra options to be passed to dlm_controld daemon.
[ -z "$DLM_CONTROLD_OPTS" ] && DLM_CONTROLD_OPTS=""
-# DLM_LKBTBL_SIZE - DLM_RSBTBL_SIZE - DLM_DIRTBL_SIZE
-# Allow tuning of DLM kernel hash table sizes.
+# Allow tuning of DLM kernel config.
# do NOT change unless instructed to do so.
[ -z "$DLM_LKBTBL_SIZE" ] && DLM_LKBTBL_SIZE=""
[ -z "$DLM_RSBTBL_SIZE" ] && DLM_RSBTBL_SIZE=""
[ -z "$DLM_DIRTBL_SIZE" ] && DLM_DIRTBL_SIZE=""
+[ -z "$DLM_TCP_PORT" ] && DLM_TCP_PORT=""
# FENCE_JOIN_TIMEOUT -- seconds to wait for fence domain join to
# complete. If the join hasn't completed in this time, fence_tool join
@@ -713,7 +713,7 @@ leave_fence_domain()
fi
}
-tune_dlm_hash_sizes()
+tune_dlm_config()
{
dlmdir="/sys/kernel/config/dlm/cluster"
@@ -726,6 +726,9 @@ tune_dlm_hash_sizes()
[ -n "$DLM_DIRTBL_SIZE" ] && [ -f $dlmdir/dirtbl_size ] && \
echo $DLM_DIRTBL_SIZE > $dlmdir/dirtbl_size
+ [ -n "$DLM_TCP_PORT" ] && [ -f $dlmdir/tcp_port ] && \
+ echo $DLM_TCP_PORT > $dlmdir/tcp_port
+
return 0
}
@@ -797,9 +800,9 @@ start()
none \
"Starting dlm_controld"
- runwrap tune_dlm_hash_sizes \
+ runwrap tune_dlm_config \
none \
- "Tuning DLM kernel hash tables"
+ "Tuning DLM kernel config"
runwrap start_gfs_controld \
none \
diff --git a/cman/init.d/cman.init.defaults.in b/cman/init.d/cman.init.defaults.in
index bbaa049..835b44f 100644
--- a/cman/init.d/cman.init.defaults.in
+++ b/cman/init.d/cman.init.defaults.in
@@ -34,12 +34,12 @@
# DLM_CONTROLD_OPTS -- allow extra options to be passed to dlm_controld daemon.
#DLM_CONTROLD_OPTS=""
-# DLM_LKBTBL_SIZE - DLM_RSBTBL_SIZE - DLM_DIRTBL_SIZE
-# Allow tuning of DLM kernel hash table sizes.
+# Allow tuning of DLM kernel config.
# do NOT change unless instructed to do so.
#DLM_LKBTBL_SIZE=""
#DLM_RSBTBL_SIZE=""
#DLM_DIRTBL_SIZE=""
+#DLM_TCP_PORT=""
# FENCE_JOIN_TIMEOUT -- seconds to wait for fence domain join to
# complete. If the join hasn't completed in this time, fence_tool join
11 years, 7 months
cluster: STABLE32 - cman init: allow dlm tcp port to be configurable via cman init script
by Fabio M. Di Nitto
Gitweb: http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=0eae0079287...
Commit: 0eae00792877a7a7b50b3323119296682e4f9097
Parent: 6d380d530eb6f777ec6ab92a30de67a22772dc8c
Author: Fabio M. Di Nitto <fdinitto(a)redhat.com>
AuthorDate: Fri Sep 14 09:29:19 2012 +0200
Committer: Fabio M. Di Nitto <fdinitto(a)redhat.com>
CommitterDate: Fri Sep 14 09:29:19 2012 +0200
cman init: allow dlm tcp port to be configurable via cman init script
Resolves: rhbz#857299
Signed-off-by: Fabio M. Di Nitto <fdinitto(a)redhat.com>
---
cman/init.d/cman.in | 13 ++++++++-----
cman/init.d/cman.init.defaults.in | 4 ++--
2 files changed, 10 insertions(+), 7 deletions(-)
diff --git a/cman/init.d/cman.in b/cman/init.d/cman.in
index e15144b..1917abd 100644
--- a/cman/init.d/cman.in
+++ b/cman/init.d/cman.in
@@ -119,12 +119,12 @@ fi
# DLM_CONTROLD_OPTS -- allow extra options to be passed to dlm_controld daemon.
[ -z "$DLM_CONTROLD_OPTS" ] && DLM_CONTROLD_OPTS=""
-# DLM_LKBTBL_SIZE - DLM_RSBTBL_SIZE - DLM_DIRTBL_SIZE
-# Allow tuning of DLM kernel hash table sizes.
+# Allow tuning of DLM kernel config.
# do NOT change unless instructed to do so.
[ -z "$DLM_LKBTBL_SIZE" ] && DLM_LKBTBL_SIZE=""
[ -z "$DLM_RSBTBL_SIZE" ] && DLM_RSBTBL_SIZE=""
[ -z "$DLM_DIRTBL_SIZE" ] && DLM_DIRTBL_SIZE=""
+[ -z "$DLM_TCP_PORT" ] && DLM_TCP_PORT=""
# FENCE_JOIN_TIMEOUT -- seconds to wait for fence domain join to
# complete. If the join hasn't completed in this time, fence_tool join
@@ -775,7 +775,7 @@ leave_fence_domain()
fi
}
-tune_dlm_hash_sizes()
+tune_dlm_config()
{
dlmdir="/sys/kernel/config/dlm/cluster"
@@ -788,6 +788,9 @@ tune_dlm_hash_sizes()
[ -n "$DLM_DIRTBL_SIZE" ] && [ -f $dlmdir/dirtbl_size ] && \
echo $DLM_DIRTBL_SIZE > $dlmdir/dirtbl_size
+ [ -n "$DLM_TCP_PORT" ] && [ -f $dlmdir/tcp_port ] && \
+ echo $DLM_TCP_PORT > $dlmdir/tcp_port
+
return 0
}
@@ -866,9 +869,9 @@ start()
dlm_controld_enabled \
"Starting dlm_controld"
- runwrap tune_dlm_hash_sizes \
+ runwrap tune_dlm_config \
none \
- "Tuning DLM kernel hash tables"
+ "Tuning DLM kernel config"
gfs_controld_enabled && cd @INITDDIR@ && ./gfs2-cluster start
diff --git a/cman/init.d/cman.init.defaults.in b/cman/init.d/cman.init.defaults.in
index 4525d70..e94f676 100644
--- a/cman/init.d/cman.init.defaults.in
+++ b/cman/init.d/cman.init.defaults.in
@@ -42,12 +42,12 @@
# DLM_CONTROLD_OPTS -- allow extra options to be passed to dlm_controld daemon.
#DLM_CONTROLD_OPTS=""
-# DLM_LKBTBL_SIZE - DLM_RSBTBL_SIZE - DLM_DIRTBL_SIZE
-# Allow tuning of DLM kernel hash table sizes.
+# Allow tuning of DLM kernel config.
# do NOT change unless instructed to do so.
#DLM_LKBTBL_SIZE=""
#DLM_RSBTBL_SIZE=""
#DLM_DIRTBL_SIZE=""
+#DLM_TCP_PORT=""
# FENCE_JOIN_TIMEOUT -- seconds to wait for fence domain join to
# complete. If the join hasn't completed in this time, fence_tool join
11 years, 7 months
cluster: RHEL6 - fence_check: add script and man page
by Fabio M. Di Nitto
Gitweb: http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=d9e0108ac67...
Commit: d9e0108ac67cb23c2b93e13d3630033398fb1b86
Parent: d7eb8359ad414e836079735de4065ee19dcad26a
Author: Fabio M. Di Nitto <fdinitto(a)redhat.com>
AuthorDate: Thu Sep 6 19:47:27 2012 +0200
Committer: Fabio M. Di Nitto <fdinitto(a)redhat.com>
CommitterDate: Tue Sep 11 20:07:41 2012 +0200
fence_check: add script and man page
Resolves: rhbz#797952
This commit depends on 3c9226bbc66fde9e9703a4685ed46e6e17d9085a and
0cfe92b84b8a4ce62e896919a93210d6b7694f6b
Signed-off-by: Fabio M. Di Nitto <fdinitto(a)redhat.com>
---
fence/Makefile | 2 +-
fence/fence_check/Makefile | 20 +++
fence/fence_check/fence_check.in | 241 ++++++++++++++++++++++++++++++++++++++
fence/man/Makefile | 2 +-
fence/man/fence_check.8 | 65 ++++++++++
5 files changed, 328 insertions(+), 2 deletions(-)
diff --git a/fence/Makefile b/fence/Makefile
index b3f9a61..a8783bb 100644
--- a/fence/Makefile
+++ b/fence/Makefile
@@ -1,4 +1,4 @@
include ../make/defines.mk
include $(OBJDIR)/make/passthrough.mk
-SUBDIRS=libfence libfenced fenced fence_node fence_tool man
+SUBDIRS=libfence libfenced fenced fence_node fence_tool fence_check man
diff --git a/fence/fence_check/Makefile b/fence/fence_check/Makefile
new file mode 100644
index 0000000..3788f26
--- /dev/null
+++ b/fence/fence_check/Makefile
@@ -0,0 +1,20 @@
+include ../../make/defines.mk
+
+TARGET1 = fence_check
+
+SBINDIRT = $(TARGET1)
+
+all: $(TARGET1)
+
+include $(OBJDIR)/make/clean.mk
+include $(OBJDIR)/make/install.mk
+include $(OBJDIR)/make/uninstall.mk
+
+${TARGET1}: $(S)/${TARGET1}.in
+ cat $(S)/$(TARGET1).in | sed \
+ -e 's#@SBINDIR@#${sbindir}#g' \
+ -e 's#@LOGDIR@#${logdir}#g' \
+ -e 's#@VERSION@#${RELEASE_VERSION}#g' \
+ > $(TARGET1)
+
+clean: generalclean
diff --git a/fence/fence_check/fence_check.in b/fence/fence_check/fence_check.in
new file mode 100644
index 0000000..e194daf
--- /dev/null
+++ b/fence/fence_check/fence_check.in
@@ -0,0 +1,241 @@
+#!/bin/bash
+
+set +e
+export LC_ALL=C
+export PATH="/bin:/usr/bin:/sbin:/usr/sbin:@SBINDIR@"
+
+logfile=@LOGDIR@/fence_check.log
+verbose=""
+vardir=/var/run
+
+print_usage() {
+ echo "Usage:"
+ echo ""
+ echo "fence_check [options]"
+ echo ""
+ echo "Options:"
+ echo " -h Print this help, then exit"
+ echo " -V Print program version information, then exit"
+ echo " -d Disable output to logfile ($logfile)"
+ echo " -v Produce verbose output"
+ echo " -e Produce extra verbose output"
+ echo " ATTENTION: IT MIGHT SHOW FENCE PASSWORDS IN LOG FILES!!!"
+ echo " -f Override checks and force execution"
+ echo " DO NOT USE ON PRODUCTION CLUSTERS!!!"
+}
+
+check_opts() {
+ while [ "$1" != "--" ]; do
+ case $1 in
+ -h)
+ print_usage
+ exit 0
+ ;;
+ -V)
+ echo "fence_check version @VERSION@"
+ exit 0
+ ;;
+ -v)
+ verbose=1
+ ;;
+ -e)
+ fencenodeopts="-vv"
+ ;;
+ -d)
+ logfile=""
+ ;;
+ -f)
+ override="1"
+ ;;
+ esac
+ shift
+ done
+}
+
+opts=$(getopt hdefvV $@)
+if [ "$?" != 0 ]; then
+ print_usage >&2
+ exit 1
+fi
+check_opts $opts
+
+cleanup() {
+ vecho "cleanup: $@"
+ rm -f $vardir/fence_check.pid
+ exit $1
+}
+
+trap "cleanup 1 ABRT" ABRT
+trap "cleanup 1 QUIT" QUIT
+trap "cleanup 1 TERM" TERM
+trap "cleanup 1 INT" INT
+
+lecho() {
+ [ -n "$logfile" ] && echo "$@" | tee -a $logfile
+ [ -z "$logfile" ] && echo "$@"
+ return 0
+}
+
+vecho() {
+ [ -z "$verbose" ] && return 0
+ lecho "$@"
+}
+
+error_report()
+{
+ lecho "Unable to perform fence_check: $@"
+}
+
+cman_running()
+{
+ vecho -n "Checking if cman is running: "
+ thisnodeid="$(cman_tool status 2>&1 | grep "Node ID:" | awk '{print $NF}')"
+ [ -z "$thisnodeid" ] && {
+ vecho "not running"
+ return 1
+ }
+ vecho "running"
+}
+
+cman_has_quorum()
+{
+ vecho -n "Checking if node is quorate: "
+ cman_tool -t 1 -q wait > /dev/null 2>&1 || {
+ vecho "not quorate"
+ return 1
+ }
+ vecho "quorate"
+}
+
+fence_domain()
+{
+ vecho -n "Checking if node is in fence domain: "
+ fencels="$(fence_tool ls 2>&1)" || {
+ vecho "not part of fence domain"
+ return 1
+ }
+ vecho "yes"
+}
+
+fence_in_progress()
+{
+ vecho -n "Checking if real fencing is in progress: "
+ victim="$(echo "$fencels" | grep "victim count" | awk '{print $NF}')"
+ [ "$victim" != "0" ] && {
+ vecho "real fencing in progress"
+ return 1
+ }
+ vecho "no fencing in progress"
+}
+
+fence_master()
+{
+ vecho -n "Checking if node is fence master: "
+ master="$(echo "$fencels" | grep "master nodeid" | awk '{print $NF}')"
+ [ "$master" != "$thisnodeid" ] && {
+ vecho "node is not fence master"
+ return 1
+ }
+ vecho "this node is fence master"
+}
+
+can_check()
+{
+ cman_running || {
+ error_report "cman is not running"
+ return 2
+ }
+
+ [ "$override" = "1" ] && return 0
+
+ cman_has_quorum || {
+ error_report "node is not quorate"
+ return 3
+ }
+
+ fence_domain || {
+ error_report "node is not part of the fence domain"
+ return 3
+ }
+
+ fence_master || {
+ error_report "node is not fence master"
+ return 3
+ }
+
+ fence_in_progress || {
+ error_report "real fencing operation in progress"
+ return 3
+ }
+
+ return 0
+}
+
+execute_check()
+{
+ can_check || return $?
+
+ vecho -n "Get node list: "
+ nodelist="$(cman_tool nodes -F id,name |grep -v '^0' | awk '{print $2}')"
+ vecho $nodelist
+
+ ret=0
+
+ for node in $nodelist; do
+ vecho "Testing $node fencing"
+
+ can_check
+ canret=$?
+
+ if [ "$canret" != 0 ]; then
+ if [ "$ret" != "5" ]; then
+ return $canret
+ else
+ return $ret
+ fi
+ fi
+
+ vecho "Checking how many fencing methods are configured for node $node"
+ for i in $(seq 1 8); do
+ ccs_tool query \
+ /cluster/clusternodes/clusternode[@name=\"$node\"]/fence/method[$i]/@name >/dev/null 2>&1 || break
+ done
+ nummethods=$((i - 1))
+ vecho "Found $nummethods method(s) to test for node $node"
+
+ for method in $(seq 1 $nummethods); do
+ vecho "Testing $node method $method status"
+ fenceres="$(fence_node $fencenodeopts -S $node -m $method 2>&1)"
+ if [ "$?" != 0 ]; then
+ ret=5
+ lecho "Testing $node method $method: FAILED"
+ if [ -z "$fencenodeopts" ]; then
+ fenceres="$(echo "$fenceres" | tail -n 2 | head -n 1)"
+ else
+ fenceargs="$(echo "$fenceres" | tail -n 2 | head -n 1)"
+ fenceres="$(echo "$fenceres" | tail -n 3 | head -n 1)"
+ fi
+ lecho "$fenceres"
+ [ -n "$fenceargs" ] && lecho "$fenceargs"
+ else
+ lecho "Testing $node method $method: success"
+ fi
+ done
+ done
+ return $ret
+}
+
+(
+ lecho "fence_check run at $(date) pid: $BASHPID"
+
+ flock --nonblock --exclusive 200 || {
+ lecho "Another process ($(cat $vardir/fence_check.pid)) is holding the lock"
+ exit 4
+ }
+
+ echo $BASHPID > $vardir/fence_check.pid
+
+ execute_check
+ cleanup $?
+
+) 200>>$vardir/fence_check.pid
diff --git a/fence/man/Makefile b/fence/man/Makefile
index a4a9cf1..3c9aa44 100644
--- a/fence/man/Makefile
+++ b/fence/man/Makefile
@@ -1,6 +1,6 @@
include ../../make/defines.mk
-MANTARGET = fenced.8 fence_node.8 fence_tool.8
+MANTARGET = fenced.8 fence_node.8 fence_tool.8 fence_check.8
include $(OBJDIR)/make/install.mk
include $(OBJDIR)/make/uninstall.mk
diff --git a/fence/man/fence_check.8 b/fence/man/fence_check.8
new file mode 100644
index 0000000..7fc221c
--- /dev/null
+++ b/fence/man/fence_check.8
@@ -0,0 +1,65 @@
+.TH "fence_check" "8" "September 2012" "" "fence configuration check"
+.SH "NAME"
+fence_check \- fence configuration check utility
+.SH "SYNOPSIS"
+\fBfence_check\fP [\-h] [\-V] [\-d] [\-v] [\-e] [\-f]
+.SH "DESCRIPTION"
+.PP
+The \fBfence_check\fP utility can be used to actively test the fence
+configuration for each node in the cluster, by issuing status commands
+instead of "reboot" "on" "off" commands on the configured fence
+methods/devices in cluster.conf.
+
+\fBfence_check\fP can also be very useful when executed via a cron job
+for regular monitoring of fence devices and to detect issues after
+cluster.conf changes.
+
+.SH "OPTIONS"
+.IP "\-h"
+Print help message, then exit.
+.IP "\-V"
+Print program version information, then exit.
+.IP "\-d"
+Disable output to logfile (default /var/log/cluster/fence_check.log).
+Useful in combination with \-e that could record sensitive data from
+cluster.conf.
+.IP "\-v"
+Enable verbose output of all actions taken during execution.
+Useful to debug issues with fence_check execution.
+Use of \-v does not include \-e.
+.IP "\-e"
+Produce detailed output, in case of failure, of the command used to test
+the fence device. Use of \-e does not include \-v.
+ATTENTION: IT MIGHT SHOW FENCE PASSWORDS IN LOG FILES! USE WITH CARE!
+.IP "\-f"
+Override checks and force execution. DO NOT USE ON PRODUCTION CLUSTERS!
+
+.SH "NOTES"
+\fBfence_check\fP can only be executed when the following conditions are met:
+
+\- cman is running on the node
+
+\- the node is quorate
+
+\- the node has joined the fence domain
+
+\- the node is in charge of fencing for the whole cluster
+
+\- no real fencing action is in progress
+
+\- no other \fBfence_check\fP operations are in progress.
+
+\fBfence_check\fP will perform all those checks prior to starting a cluster wide
+fence status check. When used in combination with \-f, cman must be running
+on the node and no other processes must be performing a check at the same time.
+
+By default every run of \fBfence_check\fP is logged to logfile and to stdout.
+
+fence_check returns:
+
+0 - if execution completes
+1 - on generic execution errors (fatal)
+2 - if cman is not running (fatal)
+3 - node is not quorate/node is not part of fence domain/if node is not in charge of fencing/a real fencing operation is in progress (can be overridden)
+4 - if another \fBfence_check\fP is in progress (fatal)
+5 - if any of the fence status tests failed
11 years, 7 months
cluster: RHEL6 - fenced: fence_check delay
by Fabio M. Di Nitto
Gitweb: http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=d7eb8359ad4...
Commit: d7eb8359ad414e836079735de4065ee19dcad26a
Parent: ea6bcc00c3246381060e882ba40dccd7238b205a
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Tue Sep 4 11:44:28 2012 -0500
Committer: Fabio M. Di Nitto <fdinitto(a)redhat.com>
CommitterDate: Tue Sep 11 20:07:30 2012 +0200
fenced: fence_check delay
Delay fencing if the fence_check script is busy checking
fencing, which might cause our fencing to fail.
Configure delay seconds, default 5, 0 to disable, as
<fence_daemon fence_check_delay="5"/>
after which fenced sends SIGTERM to fence_check pid and
continues with normal fencing.
Resolves: rhbz#797952
Signed-off-by: David Teigland <teigland(a)redhat.com>
Signed-off-by: Fabio M. Di Nitto <fdinitto(a)redhat.com>
---
fence/fenced/config.c | 5 ++++
fence/fenced/config.h | 4 +++
fence/fenced/fd.h | 1 +
fence/fenced/recover.c | 60 ++++++++++++++++++++++++++++++++++++++++++++---
4 files changed, 66 insertions(+), 4 deletions(-)
diff --git a/fence/fenced/config.c b/fence/fenced/config.c
index 0517c2a..66610ef 100644
--- a/fence/fenced/config.c
+++ b/fence/fenced/config.c
@@ -13,6 +13,7 @@ int optd_disable_dbus;
int optd_skip_undefined;
int optd_post_join_delay;
int optd_post_fail_delay;
+int optd_fence_check_delay;
int optd_override_time;
int optd_override_path;
@@ -25,6 +26,7 @@ int cfgd_disable_dbus = DEFAULT_DISABLE_DBUS;
int cfgd_skip_undefined = DEFAULT_SKIP_UNDEFINED;
int cfgd_post_join_delay = DEFAULT_POST_JOIN_DELAY;
int cfgd_post_fail_delay = DEFAULT_POST_FAIL_DELAY;
+int cfgd_fence_check_delay = DEFAULT_FENCE_CHECK_DELAY;
int cfgd_override_time = DEFAULT_OVERRIDE_TIME;
const char *cfgd_override_path = DEFAULT_OVERRIDE_PATH;
@@ -89,6 +91,7 @@ void read_ccs_int(const char *path, int *config_val)
#define CLEAN_START_PATH "/cluster/fence_daemon/@clean_start"
#define POST_JOIN_DELAY_PATH "/cluster/fence_daemon/@post_join_delay"
#define POST_FAIL_DELAY_PATH "/cluster/fence_daemon/@post_fail_delay"
+#define FENCE_CHECK_DELAY_PATH "/cluster/fence_daemon/@fence_check_delay"
#define OVERRIDE_PATH_PATH "/cluster/fence_daemon/@override_path"
#define OVERRIDE_TIME_PATH "/cluster/fence_daemon/@override_time"
#define METHOD_NAME_PATH "/cluster/clusternodes/clusternode[@name=\"%s\"]/fence/method[%d]/@name"
@@ -118,6 +121,8 @@ void reread_ccs(void)
read_ccs_int(POST_JOIN_DELAY_PATH, &cfgd_post_join_delay);
if (!optd_post_fail_delay)
read_ccs_int(POST_FAIL_DELAY_PATH, &cfgd_post_fail_delay);
+ if (!optd_fence_check_delay)
+ read_ccs_int(FENCE_CHECK_DELAY_PATH, &cfgd_fence_check_delay);
if (!optd_override_time)
read_ccs_int(OVERRIDE_TIME_PATH, &cfgd_override_time);
}
diff --git a/fence/fenced/config.h b/fence/fenced/config.h
index d17ed1a..5f42dea 100644
--- a/fence/fenced/config.h
+++ b/fence/fenced/config.h
@@ -8,8 +8,10 @@
#define DEFAULT_SKIP_UNDEFINED 0
#define DEFAULT_POST_JOIN_DELAY 6
#define DEFAULT_POST_FAIL_DELAY 0
+#define DEFAULT_FENCE_CHECK_DELAY 5
#define DEFAULT_OVERRIDE_TIME 3
#define DEFAULT_OVERRIDE_PATH "/var/run/cluster/fenced_override"
+#define DEFAULT_FENCE_CHECK_PID_PATH "/var/run/fence_check.pid"
extern int optd_groupd_compat;
extern int optd_debug_logfile;
@@ -18,6 +20,7 @@ extern int optd_disable_dbus;
extern int optd_skip_undefined;
extern int optd_post_join_delay;
extern int optd_post_fail_delay;
+extern int optd_fence_check_delay;
extern int optd_override_time;
extern int optd_override_path;
@@ -28,6 +31,7 @@ extern int cfgd_disable_dbus;
extern int cfgd_skip_undefined;
extern int cfgd_post_join_delay;
extern int cfgd_post_fail_delay;
+extern int cfgd_fence_check_delay;
extern int cfgd_override_time;
extern const char *cfgd_override_path;
diff --git a/fence/fenced/fd.h b/fence/fenced/fd.h
index 21855b2..0be3332 100644
--- a/fence/fenced/fd.h
+++ b/fence/fenced/fd.h
@@ -22,6 +22,7 @@
#include <sys/poll.h>
#include <sys/select.h>
#include <sys/time.h>
+#include <sys/file.h>
#include <openais/saAis.h>
#include <corosync/cpg.h>
diff --git a/fence/fenced/recover.c b/fence/fenced/recover.c
index 13014c8..0b5e2b2 100644
--- a/fence/fenced/recover.c
+++ b/fence/fenced/recover.c
@@ -165,6 +165,37 @@ static int check_override(int ofd, char *nodename, int timeout)
return rv;
}
+static int fence_check_pid(void)
+{
+ char buf[16];
+ int fd, rv, pid = 0;
+
+ fd = open(DEFAULT_FENCE_CHECK_PID_PATH, O_RDONLY);
+ if (fd < 0)
+ return 0;
+
+ rv = flock(fd, LOCK_EX | LOCK_NB);
+ if (!rv) {
+ flock(fd, LOCK_UN);
+ goto out;
+ }
+
+ /* fence_check script is running, return its pid */
+
+ memset(buf, 0, sizeof(buf));
+
+ rv = read(fd, buf, sizeof(buf));
+ if (rv <= 0)
+ goto out;
+
+ pid = atoi(buf);
+ if (pid <= 0)
+ pid = 0;
+ out:
+ close(fd);
+ return pid;
+}
+
/* If there are victims after a node has joined, it's a good indication that
they may be joining the cluster shortly. If we delay a bit they might
become members and we can avoid fencing them. This is only really an issue
@@ -174,13 +205,37 @@ static int check_override(int ofd, char *nodename, int timeout)
void delay_fencing(struct fd *fd, int node_join)
{
struct timeval first, last, start, now;
- int victim_count, last_count = 0, delay = 0;
+ int victim_count, last_count = 0, delay = 0, pid;
struct node *node;
const char *delay_type;
if (list_empty(&fd->victims))
return;
+ gettimeofday(&first, NULL);
+ gettimeofday(&start, NULL);
+
+ if (cfgd_fence_check_delay) {
+ for (;;) {
+ pid = fence_check_pid();
+ if (!pid)
+ break;
+
+ gettimeofday(&now, NULL);
+ if (now.tv_sec - start.tv_sec >= cfgd_fence_check_delay)
+ break;
+
+ log_debug("delay fencing for fence_check_pid %d", pid);
+ sleep(1);
+ }
+
+ if (pid) {
+ kill(pid, SIGTERM);
+ log_error("kill fence_check_pid %d delay %d",
+ pid, cfgd_fence_check_delay);
+ }
+ }
+
if (node_join || cluster_quorate_from_last_update) {
delay = cfgd_post_join_delay;
delay_type = "post_join_delay";
@@ -195,9 +250,6 @@ void delay_fencing(struct fd *fd, int node_join)
if (delay == 0)
goto out;
- gettimeofday(&first, NULL);
- gettimeofday(&start, NULL);
-
for (;;) {
query_unlock();
sleep(1);
11 years, 7 months
cluster: RHEL6 - fence_node/libfence: status
by Fabio M. Di Nitto
Gitweb: http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=ea6bcc00c32...
Commit: ea6bcc00c3246381060e882ba40dccd7238b205a
Parent: 6b7602b0f65268e2f09c87a314cda3947d839b35
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Mon Aug 27 14:13:10 2012 -0500
Committer: Fabio M. Di Nitto <fdinitto(a)redhat.com>
CommitterDate: Tue Sep 11 20:07:18 2012 +0200
fence_node/libfence: status
fence_node -S to run status on a node, via
libfence fence_node_status().
Resolves: rhbz#797952
Signed-off-by: David Teigland <teigland(a)redhat.com>
Signed-off-by: Fabio M. Di Nitto <fdinitto(a)redhat.com>
---
fence/fence_node/fence_node.c | 71 ++++++-
fence/libfence/agent.c | 461 +++++++++++++++++++++++++++++++++++++++++
fence/libfence/libfence.h | 26 +++
3 files changed, 550 insertions(+), 8 deletions(-)
diff --git a/fence/fence_node/fence_node.c b/fence/fence_node/fence_node.c
index a98962a..f926825 100644
--- a/fence/fence_node/fence_node.c
+++ b/fence/fence_node/fence_node.c
@@ -14,13 +14,16 @@ static char *prog_name;
static char our_name[CMAN_MAX_NODENAME_LEN+1];
static int verbose;
static int unfence;
+static int status;
+static int call_fenced = 1;
+static int use_method_num;
#define FL_SIZE 32
static struct fence_log flog[FL_SIZE];
static int flog_count;
static const char *action = "fence";
-#define OPTION_STRING "UvhV"
+#define OPTION_STRING "UvhVSe:m:"
#define die(fmt, args...) \
do \
@@ -39,10 +42,13 @@ static void print_usage(void)
printf("\n");
printf("Options:\n");
printf("\n");
- printf(" -U Unfence the node, default local node name\n");
- printf(" -v Show fence agent results, -vv for agent args\n");
- printf(" -h Print this help, then exit\n");
- printf(" -V Print program version information, then exit\n");
+ printf(" -U Unfence the node, default local node name\n");
+ printf(" -S Run status on node name\n");
+ printf(" -v Show fence agent results, -vv for agent args\n");
+ printf(" -h Print this help, then exit\n");
+ printf(" -V Print program version information, then exit\n");
+ printf(" -e 0|1 Enable/disable fenced_external notification\n");
+ printf(" -m <num> Method number, starting from 1\n");
printf("\n");
}
@@ -104,6 +110,14 @@ static const char *fe_str(int r)
return "error config method";
case FE_READ_DEVICE:
return "error config device";
+ case FE_NUM_METHOD:
+ return "error method number";
+ case FE_AGENT_STATUS_ON:
+ return "status on";
+ case FE_AGENT_STATUS_OFF:
+ return "status off";
+ case FE_AGENT_STATUS_ERROR:
+ return "status error";
default:
return "error unknown";
}
@@ -126,6 +140,19 @@ int main(int argc, char *argv[])
action = "unfence";
break;
+ case 'S':
+ status = 1;
+ action = "status";
+ break;
+
+ case 'e':
+ call_fenced = atoi(optarg);
+ break;
+
+ case 'm':
+ use_method_num = atoi(optarg);
+ break;
+
case 'v':
verbose++;
break;
@@ -178,11 +205,17 @@ int main(int argc, char *argv[])
memset(&flog, 0, sizeof(flog));
flog_count = 0;
- if (unfence)
+ if (status)
+ error = fence_node_status(victim, flog, FL_SIZE, &flog_count,
+ use_method_num);
+ else if (unfence)
error = unfence_node(victim, flog, FL_SIZE, &flog_count);
else
error = fence_node(victim, flog, FL_SIZE, &flog_count);
+ if (status && !verbose && error < 0)
+ verbose = 1;
+
if (!verbose)
goto skip;
@@ -211,7 +244,28 @@ int main(int argc, char *argv[])
logt_init("fence_node", LOG_MODE_OUTPUT_SYSLOG, SYSLOGFACILITY,
SYSLOGLEVEL, 0, NULL);
- if (unfence) {
+ if (status) {
+ if (error == -2) {
+ fprintf(stderr, "status %s undefined\n", victim);
+ rv = 2;
+ } else if (error < 0) {
+ fprintf(stderr, "status %s failed %d\n", victim, error);
+ logt_print(LOG_ERR, "status %s failed %d\n", victim, error);
+ rv = EXIT_FAILURE;
+ } else if (error == 2) {
+ fprintf(stderr, "status %s success off\n", victim);
+ logt_print(LOG_ERR, "status %s success off\n", victim);
+ rv = EXIT_SUCCESS;
+ } else if (!error) {
+ fprintf(stderr, "status %s success on\n", victim);
+ logt_print(LOG_ERR, "status %s success on\n", victim);
+ rv = EXIT_SUCCESS;
+ } else {
+ fprintf(stderr, "status %s failed invalid %d\n", victim, error);
+ logt_print(LOG_ERR, "status %s failed invalid %d\n", victim, error);
+ rv = EXIT_FAILURE;
+ }
+ } else if (unfence) {
if (error == -2) {
fprintf(stderr, "unfence %s undefined\n", victim);
rv = 2;
@@ -241,7 +295,8 @@ int main(int argc, char *argv[])
/* Tell fenced what we've done so that it can avoid
fencing this node again if the fence_node() rebooted
it. */
- fenced_external(victim);
+ if (call_fenced)
+ fenced_external(victim);
}
}
diff --git a/fence/libfence/agent.c b/fence/libfence/agent.c
index 5577fda..fac6c92 100644
--- a/fence/libfence/agent.c
+++ b/fence/libfence/agent.c
@@ -679,3 +679,464 @@ int unfence_node(char *victim, struct fence_log *log, int log_size,
return error;
}
+/*
+ * Run one fence agent and report how it exited.
+ *
+ * The agent is started with fork() + execlp(); "args" is written to a pipe
+ * that becomes the agent's stdin, and the agent's stdout/stderr are
+ * redirected to /dev/null.
+ *
+ * *agent_result is set to FE_AGENT_FORK when fork() fails, otherwise to
+ * FE_AGENT_STATUS_ON (exit 0), FE_AGENT_STATUS_OFF (exit 2) or
+ * FE_AGENT_STATUS_ERROR (any other exit code, or abnormal termination)
+ * once the agent has been waited for.
+ *
+ * A child that cannot set up its descriptors or exec the agent terminates
+ * with _exit(EXIT_FAILURE); it must never return into the caller, which
+ * would leave two processes running the caller's code.
+ *
+ * Returns:
+ * < 0: internal error
+ * 0: agent exited with 0
+ * 1: agent exited with 1
+ * 2: agent exited with 2
+ * (any other agent exit code is returned unchanged)
+ */
+
+static int run_agent_status(char *agent, char *args, int *agent_result)
+{
+	int pid, status, len, rv;
+	int ret, done = 0, write_failed = 0;
+	int pw_fd = -1; /* parent write file descriptor */
+	int cr_fd = -1; /* child read file descriptor */
+	int pfd[2];
+
+	if (args == NULL || agent == NULL) {
+		rv = -1;
+		goto fail;
+	}
+	len = strlen(args);
+
+	if (pipe(pfd)) {
+		rv = -errno;
+		goto fail;
+	}
+	cr_fd = pfd[0];
+	pw_fd = pfd[1];
+
+	pid = fork();
+	if (pid < 0) {
+		rv = -errno;
+		*agent_result = FE_AGENT_FORK;
+		goto fail;
+	}
+
+	if (!pid) {
+		/* child: every failure below ends the child process */
+
+		/* redirect agent stdout/stderr to /dev/null */
+		close(1);
+		if (open("/dev/null", O_WRONLY) < 0)
+			_exit(EXIT_FAILURE);
+		close(2);
+		if (open("/dev/null", O_WRONLY) < 0)
+			_exit(EXIT_FAILURE);
+
+		/* redirect agent stdin from parent */
+		close(0);
+		if (dup(cr_fd) < 0)
+			_exit(EXIT_FAILURE);
+
+		close(cr_fd);
+		close(pw_fd);
+
+		execlp(agent, agent, NULL);
+		_exit(EXIT_FAILURE);
+	}
+
+	/* parent: hand the args to the agent, continuing after short
+	   writes and retrying when interrupted by a signal */
+
+	while (done < len) {
+		ret = write(pw_fd, args + done, len - done);
+		if (ret < 0 && errno == EINTR)
+			continue;
+		if (ret <= 0) {
+			write_failed = 1;
+			break;
+		}
+		done += ret;
+	}
+
+	/* closing the write end gives the agent EOF on stdin */
+	close(cr_fd);
+	close(pw_fd);
+
+	/* always reap the child, even when the write failed, so that no
+	   zombie is left behind */
+	do {
+		rv = waitpid(pid, &status, 0);
+	} while (rv < 0 && errno == EINTR);
+
+	if (rv < 0) {
+		/* shouldn't happen */
+		rv = -errno;
+		goto out;
+	}
+
+	if (rv != pid) {
+		/* shouldn't happen */
+		rv = -1;
+		goto out;
+	}
+
+	if (write_failed) {
+		/* the agent did not receive its complete args */
+		rv = -1;
+		goto out;
+	}
+
+	if (WIFEXITED(status)) {
+		/* pid exited properly with an exit code */
+		rv = WEXITSTATUS(status);
+
+		if (rv == 0)
+			*agent_result = FE_AGENT_STATUS_ON;
+		else if (rv == 2)
+			*agent_result = FE_AGENT_STATUS_OFF;
+		else
+			*agent_result = FE_AGENT_STATUS_ERROR;
+	} else {
+		/* pid terminated due to a signal, or something else
+		   happened; either way the status is unknown */
+		rv = -1;
+		*agent_result = FE_AGENT_STATUS_ERROR;
+	}
+	goto out;
+
+ fail:
+	/* only reached before a child exists; close(-1) is harmless */
+	close(cr_fd);
+	close(pw_fd);
+ out:
+	return rv;
+}
+
+/*
+ * Build the stdin argument string for a status run of one fence device.
+ *
+ * The string is assembled, one "key=value" line at a time, from:
+ *   - the node-specific args of device d+1 of the given method, skipping
+ *     any name= and action= entries,
+ *   - nodename=<victim>, unless a nodename= entry is already present,
+ *   - action=status,
+ *   - the device-specific args, skipping any name= entry.
+ *
+ * On success returns 0 and stores a malloc'ed string in *args_out, which
+ * the caller frees.  On failure returns non-zero (-E2BIG when the args do
+ * not fit in FENCE_AGENT_ARGS_MAX) and stores NULL in *args_out; when the
+ * buffer itself cannot be allocated, -ENOMEM is returned and *args_out is
+ * left untouched.
+ */
+static int make_args_status(int cd, char *victim, char *method, int d,
+			    char *device, char **args_out)
+{
+	char path[PATH_MAX];
+	char *buf, *item;
+	int rc, n, found = 0;
+	size_t room, used = 0;
+
+	buf = calloc(1, FENCE_AGENT_ARGS_MAX);
+	if (!buf)
+		return -ENOMEM;
+
+	/* the last byte of the buffer is never handed to snprintf */
+	room = FENCE_AGENT_ARGS_MAX - 1;
+
+	/* node-specific args for victim */
+
+	memset(path, 0, PATH_MAX);
+	sprintf(path, NODE_FENCE_ARGS_PATH, victim, method, d+1);
+
+	while (!(rc = ccs_get_list(cd, path, &item)) && item) {
+		found++;
+
+		/* the device name and any configured action are not
+		   passed on; action=status is added further down */
+		if (!strncmp(item, "name=", 5) ||
+		    !strncmp(item, "action=", 7)) {
+			free(item);
+			continue;
+		}
+
+		n = snprintf(buf + used, room - used, "%s\n", item);
+		free(item);
+
+		if (n >= room - used) {
+			rc = -E2BIG;
+			goto out;
+		}
+		used += n;
+	}
+
+	/* add nodename of victim to args */
+
+	if (strstr(buf, "nodename=") == NULL) {
+		n = snprintf(buf + used, room - used, "nodename=%s\n", victim);
+		if (n >= room - used) {
+			rc = -E2BIG;
+			goto out;
+		}
+		used += n;
+	}
+
+	/* add action=status to args */
+
+	n = snprintf(buf + used, room - used, "action=status\n");
+	if (n >= room - used) {
+		rc = -E2BIG;
+		goto out;
+	}
+	used += n;
+
+	/* device-specific args */
+
+	memset(path, 0, PATH_MAX);
+	sprintf(path, FENCE_DEVICE_ARGS_PATH, device);
+
+	while (!(rc = ccs_get_list(cd, path, &item)) && item) {
+		found++;
+
+		if (!strncmp(item, "name=", 5)) {
+			free(item);
+			continue;
+		}
+
+		n = snprintf(buf + used, room - used, "%s\n", item);
+		free(item);
+
+		if (n >= room - used) {
+			rc = -E2BIG;
+			goto out;
+		}
+		used += n;
+	}
+
+	/* reading at least one entry counts as success, whatever the
+	   final ccs_get_list() call returned */
+	if (found)
+		rc = 0;
+ out:
+	if (rc) {
+		free(buf);
+		buf = NULL;
+	}
+
+	*args_out = buf;
+	return rc;
+}
+
+/*
+ * Run a status check against one fence device of one method.
+ *
+ * Looks up the agent name for "device", builds the agent args with
+ * make_args_status(), records both in the log entry "lp", and runs the
+ * agent with run_agent_status().
+ *
+ * lp->error is set to FE_READ_AGENT or FE_READ_ARGS when the configuration
+ * cannot be read; otherwise it is set by run_agent_status().
+ *
+ * Returns the value of the failing step, or the result of
+ * run_agent_status() (< 0 internal error, otherwise the agent exit code).
+ */
+static int use_device_status(int cd, char *victim, char *method, int d,
+			     char *device, struct fence_log *lp)
+{
+	char path[PATH_MAX], *agent, *args = NULL;
+	int error;
+
+	memset(path, 0, PATH_MAX);
+	sprintf(path, AGENT_NAME_PATH, device);
+
+	error = ccs_get(cd, path, &agent);
+	if (error) {
+		lp->error = FE_READ_AGENT;
+		goto out;
+	}
+
+	/* strncpy does not terminate the copy when the source fills the
+	   buffer, and lp may not have been zeroed by the caller */
+	strncpy(lp->agent_name, agent, FENCE_AGENT_NAME_MAX-1);
+	lp->agent_name[FENCE_AGENT_NAME_MAX-1] = '\0';
+
+	error = make_args_status(cd, victim, method, d, device, &args);
+	if (error) {
+		lp->error = FE_READ_ARGS;
+		goto out_agent;
+	}
+
+	strncpy(lp->agent_args, args, FENCE_AGENT_ARGS_MAX-1);
+	lp->agent_args[FENCE_AGENT_ARGS_MAX-1] = '\0';
+
+	error = run_agent_status(agent, args, &lp->error);
+
+	free(args);
+ out_agent:
+	free(agent);
+ out:
+	return error;
+}
+
+/* We want to run status on each device in each method, and we need all
+   to succeed in order for status as a whole to succeed. Agent success
+   for status is being either "on" (exit 0) or "off" (exit 2). Agent
+   failure for status is when the on/off state is unknown (exit 1),
+   i.e. the agent failed to run or ran and cannot connect, or cannot get
+   the on/off state for some reason.
+
+   As soon as any one device in any method fails, we can quit and report
+   failure (rv < 0) for status as a whole. If the status of all devices is
+   "on", then status as a whole returns 0. If the status of all devices is
+   "off", then status as a whole returns 2. If the devices are a mix of
+   on and off, then status as a whole returns 0.
+
+   use_method_num == 0 checks every method; use_method_num > 0 checks only
+   that method (the first method is number 1). -2 is returned when the node
+   has no fence methods or the requested method number does not exist.
+
+   Up to log_size entries of "log" are filled in, one per step attempted;
+   *log_count receives the number of steps attempted, which can be larger
+   than log_size. */
+
+int fence_node_status(char *victim, struct fence_log *log, int log_size,
+		      int *log_count, int use_method_num)
+{
+	struct fence_log stub;
+	struct fence_log *lp = log;
+	char *method = NULL, *device = NULL;
+	char *victim_nodename = NULL;
+	int num_methods, num_devices, m, d, cd, rv;
+	int on_count = 0, off_count = 0;
+	int left = log_size;
+	int error = -1;
+	int count = 0;
+
+	cd = ccs_connect();
+	if (cd < 0) {
+		if (lp && left) {
+			lp->error = FE_NO_CONFIG;
+			lp++;
+			left--;
+		}
+		count++;
+		error = -1;
+		goto ret;
+	}
+
+	if (ccs_lookup_nodename(cd, victim, &victim_nodename) == 0)
+		victim = victim_nodename;
+
+	num_methods = count_methods(cd, victim);
+	if (!num_methods) {
+		if (lp && left) {
+			lp->error = FE_NO_METHOD;
+			lp++;
+			left--;
+		}
+		count++;
+		error = -2; /* No fencing */
+		goto out;
+	}
+
+	if (use_method_num && (use_method_num > num_methods)) {
+		if (lp && left) {
+			lp->error = FE_NUM_METHOD;
+			lp++;
+			left--;
+		}
+		count++;
+		error = -2; /* No fencing */
+		goto out;
+	}
+
+	for (m = 0; m < num_methods; m++) {
+
+		if (use_method_num && (m + 1 != use_method_num))
+			continue;
+
+		rv = get_method(cd, victim, m, &method);
+		if (rv) {
+			if (lp && left) {
+				lp->error = FE_READ_METHOD;
+				lp->method_num = m;
+				lp++;
+				left--;
+			}
+			count++;
+			error = -1;
+			break;
+		}
+
+		num_devices = count_devices(cd, victim, method);
+		if (!num_devices) {
+			if (lp && left) {
+				lp->error = FE_NO_DEVICE;
+				lp->method_num = m;
+				lp++;
+				left--;
+			}
+			count++;
+
+			/* this path skips the free(method) at the bottom
+			   of the loop, so release the name here */
+			free(method);
+			method = NULL;
+			continue;
+		}
+
+		for (d = 0; d < num_devices; d++) {
+			rv = get_device(cd, victim, method, d, &device);
+			if (rv) {
+				if (lp && left) {
+					lp->error = FE_READ_DEVICE;
+					lp->method_num = m;
+					lp->device_num = d;
+					lp++;
+					left--;
+				}
+				count++;
+				error = -1;
+				break;
+			}
+
+			/* every call to use_device_status generates a log
+			   entry, whether success or fail; once the caller's
+			   log is full the entry goes to a throwaway stub */
+
+			error = use_device_status(cd, victim, method, d, device,
+						  (lp && left) ? lp : &stub);
+			count++;
+			if (lp && left) {
+				/* error, name, args already set */
+				lp->method_num = m;
+				lp->device_num = d;
+				lp++;
+				left--;
+			}
+
+			/*
+			 * error values:
+			 * < 0: internal error from use_device_status,
+			 *      internal error from run_agent_status,
+			 *      run_agent_status failed to fork agent
+			 *   0: agent exited with 0 (success, status is on)
+			 *   2: agent exited with 2 (success, status is off)
+			 *   1: agent exited with 1 (error, status is unknown)
+			 */
+
+			/* internal error: status fail */
+			if (error < 0)
+				break;
+
+			/* agent error: status fail */
+			if (error == 1) {
+				error = -1;
+				break;
+			}
+
+			if (!error) {
+				/* agent success "on": status success */
+				on_count++;
+			} else if (error == 2) {
+				/* agent success "off": status success */
+				error = 0;
+				off_count++;
+			} else {
+				/* some other error */
+				error = -1;
+				break;
+			}
+
+			free(device);
+			device = NULL;
+		}
+
+		/* a device is still held here only when the loop above
+		   was left early through one of its breaks */
+		if (device)
+			free(device);
+
+		free(method);
+		method = NULL;
+
+		/* if any device failed in this method, return failure
+		   for the status */
+
+		if (error)
+			break;
+	}
+
+	if (error < 0)
+		goto out;
+
+	/* All devices are either on or off, none are unknown/inaccessible,
+	   so status as a whole is a success. Decide which of the two
+	   success values to return: 2 if all devices are off, or 0 if
+	   all devices are on, 0 if mixed on/off. */
+
+	if (!on_count)
+		error = 2;
+	else
+		error = 0;
+
+ out:
+	if (victim_nodename)
+		free(victim_nodename);
+
+	ccs_disconnect(cd);
+ ret:
+	if (log_count)
+		*log_count = count;
+	return error;
+}
+
diff --git a/fence/libfence/libfence.h b/fence/libfence/libfence.h
index 33f493a..10d00f0 100644
--- a/fence/libfence/libfence.h
+++ b/fence/libfence/libfence.h
@@ -15,6 +15,10 @@ extern "C" {
#define FE_READ_ARGS 8 /* read (ccs) error on node/dev args */
#define FE_READ_METHOD 9 /* read (ccs) error on method */
#define FE_READ_DEVICE 10 /* read (ccs) error on method/device */
+#define FE_NUM_METHOD 11 /* method number does not exist */
+#define FE_AGENT_STATUS_ON 12
+#define FE_AGENT_STATUS_OFF 13
+#define FE_AGENT_STATUS_ERROR 14
#define FENCE_AGENT_NAME_MAX 256 /* including terminating \0 */
#define FENCE_AGENT_ARGS_MAX 4096 /* including terminating \0 */
@@ -32,6 +36,28 @@ int fence_node(char *name, struct fence_log *log, int log_size, int *log_count);
int unfence_node(char *name, struct fence_log *log, int log_size,
int *log_count);
+/*
+ * use_method_num == 0: run status on all devices of all methods
+ * use_method_num > 0: run status on all devices of given method number,
+ * where first method is use_method_num = 1
+ *
+ * Returns 0 on success: status is successful on all devices of all methods
+ * (or all devices of specified method). All devices are in the "on" state,
+ * or some devices are on and some are off.
+ *
+ * Returns 2 on success: status is successful on all devices of all methods
+ * (or all devices of a specified method). All devices are in the "off" state.
+ *
+ * Returns -2 if no fencing methods are defined for the node, or if
+ * use_method_num was specified and the specified method number does
+ * not exist.
+ *
+ * Returns -EXXX for other failures.
+ */
+
+int fence_node_status(char *victim, struct fence_log *log, int log_size,
+ int *log_count, int use_method_num);
+
#ifdef __cplusplus
}
#endif
11 years, 7 months
cluster: STABLE32 - fence_check: add script and man page
by Fabio M. Di Nitto
Gitweb: http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=6d380d530eb...
Commit: 6d380d530eb6f777ec6ab92a30de67a22772dc8c
Parent: 75ec3a1793b58b2664c59a7c50697daaf0cbb4d0
Author: Fabio M. Di Nitto <fdinitto(a)redhat.com>
AuthorDate: Thu Sep 6 19:47:27 2012 +0200
Committer: Fabio M. Di Nitto <fdinitto(a)redhat.com>
CommitterDate: Thu Sep 6 19:47:27 2012 +0200
fence_check: add script and man page
Resolves: rhbz#797952
This commit depends on 82ff946b955f9c6b7856b5831d5603f70aaf974f
and 75ec3a1793b58b2664c59a7c50697daaf0cbb4d0
Signed-off-by: Fabio M. Di Nitto <fdinitto(a)redhat.com>
---
.gitignore | 1 +
fence/Makefile | 2 +-
fence/fence_check/Makefile | 20 +++
fence/fence_check/fence_check.in | 241 ++++++++++++++++++++++++++++++++++++++
fence/man/Makefile | 2 +-
fence/man/fence_check.8 | 65 ++++++++++
6 files changed, 329 insertions(+), 2 deletions(-)
diff --git a/.gitignore b/.gitignore
index e171466..ec352a8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,6 +34,7 @@ dlm/tool/dlm_tool
doc/cluster
fence/fence_node/fence_node
fence/fence_tool/fence_tool
+fence/fence_check/fence_check
fence/fenced/fenced
group/daemon/groupd
group/dlm_controld/dlm_controld
diff --git a/fence/Makefile b/fence/Makefile
index b3f9a61..a8783bb 100644
--- a/fence/Makefile
+++ b/fence/Makefile
@@ -1,4 +1,4 @@
include ../make/defines.mk
include $(OBJDIR)/make/passthrough.mk
-SUBDIRS=libfence libfenced fenced fence_node fence_tool man
+SUBDIRS=libfence libfenced fenced fence_node fence_tool fence_check man
diff --git a/fence/fence_check/Makefile b/fence/fence_check/Makefile
new file mode 100644
index 0000000..3788f26
--- /dev/null
+++ b/fence/fence_check/Makefile
@@ -0,0 +1,20 @@
+include ../../make/defines.mk
+
+TARGET1 = fence_check
+
+SBINDIRT = $(TARGET1)
+
+all: $(TARGET1)
+
+include $(OBJDIR)/make/clean.mk
+include $(OBJDIR)/make/install.mk
+include $(OBJDIR)/make/uninstall.mk
+
+${TARGET1}: $(S)/${TARGET1}.in
+ cat $(S)/$(TARGET1).in | sed \
+ -e 's#@SBINDIR@#${sbindir}#g' \
+ -e 's#@LOGDIR@#${logdir}#g' \
+ -e 's#@VERSION@#${RELEASE_VERSION}#g' \
+ > $(TARGET1)
+
+clean: generalclean
diff --git a/fence/fence_check/fence_check.in b/fence/fence_check/fence_check.in
new file mode 100644
index 0000000..e194daf
--- /dev/null
+++ b/fence/fence_check/fence_check.in
@@ -0,0 +1,241 @@
+#!/bin/bash
+
+set +e
+export LC_ALL=C
+export PATH="/bin:/usr/bin:/sbin:/usr/sbin:@SBINDIR@"
+
+logfile=@LOGDIR@/fence_check.log
+verbose=""
+vardir=/var/run
+
+# Print the command line help, one line per printf argument.
+print_usage() {
+	printf '%s\n' \
+		"Usage:" \
+		"" \
+		"fence_check [options]" \
+		"" \
+		"Options:" \
+		" -h Print this help, then exit" \
+		" -V Print program version information, then exit" \
+		" -d Disable output to logfile ($logfile)" \
+		" -v Produce verbose output" \
+		" -e Produce extra verbose output" \
+		" ATTENTION: IT MIGHT SHOW FENCE PASSWORDS IN LOG FILES!!!" \
+		" -f Override checks and force execution" \
+		" DO NOT USE ON PRODUCTION CLUSTERS!!!"
+}
+
+check_opts() {
+ while [ "$1" != "--" ]; do
+ case $1 in
+ -h)
+ print_usage
+ exit 0
+ ;;
+ -V)
+ echo "fence_check version @VERSION@"
+ exit 0
+ ;;
+ -v)
+ verbose=1
+ ;;
+ -e)
+ fencenodeopts="-vv"
+ ;;
+ -d)
+ logfile=""
+ ;;
+ -f)
+ override="1"
+ ;;
+ esac
+ shift
+ done
+}
+
+opts=$(getopt hdefvV $@)
+if [ "$?" != 0 ]; then
+ print_usage >&2
+ exit 1
+fi
+check_opts $opts
+
+cleanup() {
+ vecho "cleanup: $@"
+ rm -f $vardir/fence_check.pid
+ exit $1
+}
+
+trap "cleanup 1 ABRT" ABRT
+trap "cleanup 1 QUIT" QUIT
+trap "cleanup 1 TERM" TERM
+trap "cleanup 1 INT" INT
+
+lecho() {
+ [ -n "$logfile" ] && echo "$@" | tee -a $logfile
+ [ -z "$logfile" ] && echo "$@"
+ return 0
+}
+
+vecho() {
+ [ -z "$verbose" ] && return 0
+ lecho "$@"
+}
+
+# Report why fence_check cannot run; the reason is given as arguments.
+error_report()
+{
+	lecho "Unable to perform fence_check: $*"
+}
+
+# Succeeds when "cman_tool status" reports a node ID. The ID is kept in
+# the global thisnodeid, which fence_master compares against later.
+cman_running()
+{
+	vecho -n "Checking if cman is running: "
+	thisnodeid="$(cman_tool status 2>&1 | awk '/Node ID:/ {print $NF}')"
+	if [ -z "$thisnodeid" ]; then
+		vecho "not running"
+		return 1
+	fi
+	vecho "running"
+}
+
+# Succeeds when "cman_tool -t 1 -q wait" exits with status 0.
+cman_has_quorum()
+{
+	vecho -n "Checking if node is quorate: "
+	if ! cman_tool -t 1 -q wait > /dev/null 2>&1; then
+		vecho "not quorate"
+		return 1
+	fi
+	vecho "quorate"
+}
+
+# Succeeds when "fence_tool ls" works. Its output is kept in the global
+# fencels, which fence_master and fence_in_progress parse.
+fence_domain()
+{
+	vecho -n "Checking if node is in fence domain: "
+	if ! fencels="$(fence_tool ls 2>&1)"; then
+		vecho "not part of fence domain"
+		return 1
+	fi
+	vecho "yes"
+}
+
+# Succeeds when NO real fencing is in progress, i.e. when the
+# "victim count" line of the saved "fence_tool ls" output ends in 0.
+# Returns 1 for any other value, including an empty one.
+fence_in_progress()
+{
+	vecho -n "Checking if real fencing is in progress: "
+	victim="$(echo "$fencels" | awk '/victim count/ {print $NF}')"
+	if [ "$victim" != "0" ]; then
+		vecho "real fencing in progress"
+		return 1
+	fi
+	vecho "no fencing in progress"
+}
+
+fence_master()
+{
+ vecho -n "Checking if node is fence master: "
+ master="$(echo "$fencels" | grep "master nodeid" | awk '{print $NF}')"
+ [ "$master" != "$thisnodeid" ] && {
+ vecho "node is not fence master"
+ return 1
+ }
+ vecho "this node is fence master"
+}
+
+can_check()
+{
+ cman_running || {
+ error_report "cman is not running"
+ return 2
+ }
+
+ [ "$override" = "1" ] && return 0
+
+ cman_has_quorum || {
+ error_report "node is not quorate"
+ return 3
+ }
+
+ fence_domain || {
+ error_report "node is not part of the fence domain"
+ return 3
+ }
+
+ fence_master || {
+ error_report "node is not fence master"
+ return 3
+ }
+
+ fence_in_progress || {
+ error_report "real fencing operation in progress"
+ return 3
+ }
+
+ return 0
+}
+
+execute_check()
+{
+ can_check || return $?
+
+ vecho -n "Get node list: "
+ nodelist="$(cman_tool nodes -F id,name |grep -v '^0' | awk '{print $2}')"
+ vecho $nodelist
+
+ ret=0
+
+ for node in $nodelist; do
+ vecho "Testing $node fencing"
+
+ can_check
+ canret=$?
+
+ if [ "$canret" != 0 ]; then
+ if [ "$ret" != "5" ]; then
+ return $canret
+ else
+ return $ret
+ fi
+ fi
+
+ vecho "Checking how many fencing methods are configured for node $node"
+ for i in $(seq 1 8); do
+ ccs_tool query \
+ /cluster/clusternodes/clusternode[@name=\"$node\"]/fence/method[$i]/@name >/dev/null 2>&1 || break
+ done
+ nummethods=$((i - 1))
+ vecho "Found $nummethods method(s) to test for node $node"
+
+ for method in $(seq 1 $nummethods); do
+ vecho "Testing $node method $method status"
+ fenceres="$(fence_node $fencenodeopts -S $node -m $method 2>&1)"
+ if [ "$?" != 0 ]; then
+ ret=5
+ lecho "Testing $node method $method: FAILED"
+ if [ -z "$fencenodeopts" ]; then
+ fenceres="$(echo "$fenceres" | tail -n 2 | head -n 1)"
+ else
+ fenceargs="$(echo "$fenceres" | tail -n 2 | head -n 1)"
+ fenceres="$(echo "$fenceres" | tail -n 3 | head -n 1)"
+ fi
+ lecho "$fenceres"
+ [ -n "$fenceargs" ] && lecho "$fenceargs"
+ else
+ lecho "Testing $node method $method: success"
+ fi
+ done
+ done
+ return $ret
+}
+
+(
+ lecho "fence_check run at $(date) pid: $BASHPID"
+
+ flock --nonblock --exclusive 200 || {
+ lecho "Another process ($(cat $vardir/fence_check.pid)) is holding the lock"
+ exit 4
+ }
+
+ echo $BASHPID > $vardir/fence_check.pid
+
+ execute_check
+ cleanup $?
+
+) 200>>$vardir/fence_check.pid
diff --git a/fence/man/Makefile b/fence/man/Makefile
index a4a9cf1..3c9aa44 100644
--- a/fence/man/Makefile
+++ b/fence/man/Makefile
@@ -1,6 +1,6 @@
include ../../make/defines.mk
-MANTARGET = fenced.8 fence_node.8 fence_tool.8
+MANTARGET = fenced.8 fence_node.8 fence_tool.8 fence_check.8
include $(OBJDIR)/make/install.mk
include $(OBJDIR)/make/uninstall.mk
diff --git a/fence/man/fence_check.8 b/fence/man/fence_check.8
new file mode 100644
index 0000000..7fc221c
--- /dev/null
+++ b/fence/man/fence_check.8
@@ -0,0 +1,65 @@
+.TH "fence_check" "8" "September 2012" "" "fence configuration check"
+.SH "NAME"
+fence_check \- fence configuration check utility
+.SH "SYNOPSIS"
+\fBfence_check\fP [\-h] [\-V] [\-d] [\-v] [\-e] [\-f]
+.SH "DESCRIPTION"
+.PP
+The \fBfence_check\fP utility can be used to actively test the fence
+configuration for each node in the cluster, by issuing status commands
+instead of "reboot" "on" "off" commands on the configured fence
+methods/devices in cluster.conf.
+
+\fBfence_check\fP can also be very useful when executed via a cron job
+for regular monitoring of fence devices and to detect issues after
+cluster.conf changes.
+
+.SH "OPTIONS"
+.IP "\-h"
+Print help message, then exit.
+.IP "\-V"
+Print program version information, then exit.
+.IP "\-d"
+Disable output to logfile (default /var/log/cluster/fence_check.log).
+Useful in combination with \-e that could record sensitive data from
+cluster.conf.
+.IP "\-v"
+Enable verbose output of all actions taken during execution.
+Useful to debug issues with fence_check execution.
+Use of \-v does not include \-e.
+.IP "\-e"
+Produce detailed output, in case of failure, of the command used to test
+the fence device. Use of \-e does not include \-v.
+ATTENTION: IT MIGHT SHOW FENCE PASSWORDS IN LOG FILES! USE WITH CARE!
+.IP "\-f"
+Override checks and force execution. DO NOT USE ON PRODUCTION CLUSTERS!
+
+.SH "NOTES"
+\fBfence_check\fP can only be executed when the following conditions are met:
+
+\- cman is running on the node
+
+\- the node is quorate
+
+\- the node has joined the fence domain
+
+\- the node is in charge of fencing for the whole cluster
+
+\- no real fencing action is in progress
+
+\- no other \fBfence_check\fP operations are in progress.
+
+\fBfence_check\fP will perform all those checks prior to starting a cluster wide
+fence status check. When used in combination with \-f, cman must be running
+on the node and no other processes must be performing a check at the same time.
+
+By default every run of \fBfence_check\fP is logged to logfile and to stdout.
+
+fence_check returns:
+
+0 - if execution completes
+1 - on generic execution errors (fatal)
+2 - if cman is not running (fatal)
+3 - node is not quorate/node is not part of fence domain/if node is not in charge of fencing/a real fencing operation is in progress (can be overridden)
+4 - if another \fBfence_check\fP is in progress (fatal)
+5 - if any of the fence status tests failed
11 years, 8 months
cluster: STABLE32 - fenced: fence_check delay
by David Teigland
Gitweb: http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=75ec3a1793b...
Commit: 75ec3a1793b58b2664c59a7c50697daaf0cbb4d0
Parent: 82ff946b955f9c6b7856b5831d5603f70aaf974f
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Tue Sep 4 11:44:28 2012 -0500
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Thu Sep 6 09:37:22 2012 -0500
fenced: fence_check delay
Delay fencing if the fence_check script is busy checking
fencing, which might cause our fencing to fail.
Configure delay seconds, default 5, 0 to disable, as
<fence_daemon fence_check_delay="5"/>
after which fenced sends SIGTERM to fence_check pid and
continues with normal fencing.
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
fence/fenced/config.c | 5 ++++
fence/fenced/config.h | 4 +++
fence/fenced/fd.h | 1 +
fence/fenced/recover.c | 60 ++++++++++++++++++++++++++++++++++++++++++++---
4 files changed, 66 insertions(+), 4 deletions(-)
diff --git a/fence/fenced/config.c b/fence/fenced/config.c
index 0517c2a..66610ef 100644
--- a/fence/fenced/config.c
+++ b/fence/fenced/config.c
@@ -13,6 +13,7 @@ int optd_disable_dbus;
int optd_skip_undefined;
int optd_post_join_delay;
int optd_post_fail_delay;
+int optd_fence_check_delay;
int optd_override_time;
int optd_override_path;
@@ -25,6 +26,7 @@ int cfgd_disable_dbus = DEFAULT_DISABLE_DBUS;
int cfgd_skip_undefined = DEFAULT_SKIP_UNDEFINED;
int cfgd_post_join_delay = DEFAULT_POST_JOIN_DELAY;
int cfgd_post_fail_delay = DEFAULT_POST_FAIL_DELAY;
+int cfgd_fence_check_delay = DEFAULT_FENCE_CHECK_DELAY;
int cfgd_override_time = DEFAULT_OVERRIDE_TIME;
const char *cfgd_override_path = DEFAULT_OVERRIDE_PATH;
@@ -89,6 +91,7 @@ void read_ccs_int(const char *path, int *config_val)
#define CLEAN_START_PATH "/cluster/fence_daemon/@clean_start"
#define POST_JOIN_DELAY_PATH "/cluster/fence_daemon/@post_join_delay"
#define POST_FAIL_DELAY_PATH "/cluster/fence_daemon/@post_fail_delay"
+#define FENCE_CHECK_DELAY_PATH "/cluster/fence_daemon/@fence_check_delay"
#define OVERRIDE_PATH_PATH "/cluster/fence_daemon/@override_path"
#define OVERRIDE_TIME_PATH "/cluster/fence_daemon/@override_time"
#define METHOD_NAME_PATH "/cluster/clusternodes/clusternode[@name=\"%s\"]/fence/method[%d]/@name"
@@ -118,6 +121,8 @@ void reread_ccs(void)
read_ccs_int(POST_JOIN_DELAY_PATH, &cfgd_post_join_delay);
if (!optd_post_fail_delay)
read_ccs_int(POST_FAIL_DELAY_PATH, &cfgd_post_fail_delay);
+ if (!optd_fence_check_delay)
+ read_ccs_int(FENCE_CHECK_DELAY_PATH, &cfgd_fence_check_delay);
if (!optd_override_time)
read_ccs_int(OVERRIDE_TIME_PATH, &cfgd_override_time);
}
diff --git a/fence/fenced/config.h b/fence/fenced/config.h
index d17ed1a..5f42dea 100644
--- a/fence/fenced/config.h
+++ b/fence/fenced/config.h
@@ -8,8 +8,10 @@
#define DEFAULT_SKIP_UNDEFINED 0
#define DEFAULT_POST_JOIN_DELAY 6
#define DEFAULT_POST_FAIL_DELAY 0
+#define DEFAULT_FENCE_CHECK_DELAY 5
#define DEFAULT_OVERRIDE_TIME 3
#define DEFAULT_OVERRIDE_PATH "/var/run/cluster/fenced_override"
+#define DEFAULT_FENCE_CHECK_PID_PATH "/var/run/fence_check.pid"
extern int optd_groupd_compat;
extern int optd_debug_logfile;
@@ -18,6 +20,7 @@ extern int optd_disable_dbus;
extern int optd_skip_undefined;
extern int optd_post_join_delay;
extern int optd_post_fail_delay;
+extern int optd_fence_check_delay;
extern int optd_override_time;
extern int optd_override_path;
@@ -28,6 +31,7 @@ extern int cfgd_disable_dbus;
extern int cfgd_skip_undefined;
extern int cfgd_post_join_delay;
extern int cfgd_post_fail_delay;
+extern int cfgd_fence_check_delay;
extern int cfgd_override_time;
extern const char *cfgd_override_path;
diff --git a/fence/fenced/fd.h b/fence/fenced/fd.h
index 21855b2..0be3332 100644
--- a/fence/fenced/fd.h
+++ b/fence/fenced/fd.h
@@ -22,6 +22,7 @@
#include <sys/poll.h>
#include <sys/select.h>
#include <sys/time.h>
+#include <sys/file.h>
#include <openais/saAis.h>
#include <corosync/cpg.h>
diff --git a/fence/fenced/recover.c b/fence/fenced/recover.c
index 13014c8..0b5e2b2 100644
--- a/fence/fenced/recover.c
+++ b/fence/fenced/recover.c
@@ -165,6 +165,37 @@ static int check_override(int ofd, char *nodename, int timeout)
return rv;
}
+/*
+ * Find out whether the fence_check script is currently running.
+ *
+ * The script holds an exclusive flock on its pid file while it runs. If
+ * we can take that lock ourselves, nobody holds it: release it again and
+ * report "not running". Otherwise read the pid stored in the file.
+ *
+ * Returns the pid of the running fence_check, or 0 when the pid file
+ * cannot be opened, is not locked, or does not contain a positive pid.
+ */
+static int fence_check_pid(void)
+{
+	char buf[16];
+	int fd, rv, pid = 0;
+
+	fd = open(DEFAULT_FENCE_CHECK_PID_PATH, O_RDONLY);
+	if (fd < 0)
+		return 0;
+
+	rv = flock(fd, LOCK_EX | LOCK_NB);
+	if (!rv) {
+		flock(fd, LOCK_UN);
+		goto out;
+	}
+
+	/* fence_check script is running, return its pid */
+
+	memset(buf, 0, sizeof(buf));
+
+	/* read at most sizeof(buf) - 1 bytes so the zeroed last byte
+	   always terminates the string handed to atoi() */
+	rv = read(fd, buf, sizeof(buf) - 1);
+	if (rv <= 0)
+		goto out;
+
+	pid = atoi(buf);
+	if (pid <= 0)
+		pid = 0;
+ out:
+	close(fd);
+	return pid;
+}
+
/* If there are victims after a node has joined, it's a good indication that
they may be joining the cluster shortly. If we delay a bit they might
become members and we can avoid fencing them. This is only really an issue
@@ -174,13 +205,37 @@ static int check_override(int ofd, char *nodename, int timeout)
void delay_fencing(struct fd *fd, int node_join)
{
struct timeval first, last, start, now;
- int victim_count, last_count = 0, delay = 0;
+ int victim_count, last_count = 0, delay = 0, pid;
struct node *node;
const char *delay_type;
if (list_empty(&fd->victims))
return;
+ gettimeofday(&first, NULL);
+ gettimeofday(&start, NULL);
+
+ if (cfgd_fence_check_delay) {
+ for (;;) {
+ pid = fence_check_pid();
+ if (!pid)
+ break;
+
+ gettimeofday(&now, NULL);
+ if (now.tv_sec - start.tv_sec >= cfgd_fence_check_delay)
+ break;
+
+ log_debug("delay fencing for fence_check_pid %d", pid);
+ sleep(1);
+ }
+
+ if (pid) {
+ kill(pid, SIGTERM);
+ log_error("kill fence_check_pid %d delay %d",
+ pid, cfgd_fence_check_delay);
+ }
+ }
+
if (node_join || cluster_quorate_from_last_update) {
delay = cfgd_post_join_delay;
delay_type = "post_join_delay";
@@ -195,9 +250,6 @@ void delay_fencing(struct fd *fd, int node_join)
if (delay == 0)
goto out;
- gettimeofday(&first, NULL);
- gettimeofday(&start, NULL);
-
for (;;) {
query_unlock();
sleep(1);
11 years, 8 months
cluster: STABLE32 - fence_node/libfence: status
by David Teigland
Gitweb: http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=82ff946b955...
Commit: 82ff946b955f9c6b7856b5831d5603f70aaf974f
Parent: 80b7f2cc7bc5eaa52cdeb89e53a0c5be1ae9a1e9
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Mon Aug 27 14:13:10 2012 -0500
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Thu Sep 6 09:36:58 2012 -0500
fence_node/libfence: status
fence_node -S to run status on a node, via
libfence fence_node_status().
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
fence/fence_node/fence_node.c | 71 ++++++-
fence/libfence/agent.c | 461 +++++++++++++++++++++++++++++++++++++++++
fence/libfence/libfence.h | 26 +++
3 files changed, 550 insertions(+), 8 deletions(-)
diff --git a/fence/fence_node/fence_node.c b/fence/fence_node/fence_node.c
index a98962a..f926825 100644
--- a/fence/fence_node/fence_node.c
+++ b/fence/fence_node/fence_node.c
@@ -14,13 +14,16 @@ static char *prog_name;
static char our_name[CMAN_MAX_NODENAME_LEN+1];
static int verbose;
static int unfence;
+static int status;
+static int call_fenced = 1;
+static int use_method_num;
#define FL_SIZE 32
static struct fence_log flog[FL_SIZE];
static int flog_count;
static const char *action = "fence";
-#define OPTION_STRING "UvhV"
+#define OPTION_STRING "UvhVSe:m:"
#define die(fmt, args...) \
do \
@@ -39,10 +42,13 @@ static void print_usage(void)
printf("\n");
printf("Options:\n");
printf("\n");
- printf(" -U Unfence the node, default local node name\n");
- printf(" -v Show fence agent results, -vv for agent args\n");
- printf(" -h Print this help, then exit\n");
- printf(" -V Print program version information, then exit\n");
+ printf(" -U Unfence the node, default local node name\n");
+ printf(" -S Run status on node name\n");
+ printf(" -v Show fence agent results, -vv for agent args\n");
+ printf(" -h Print this help, then exit\n");
+ printf(" -V Print program version information, then exit\n");
+ printf(" -e 0|1 Enable/disable fenced_external notification\n");
+ printf(" -m <num> Method number, starting from 1\n");
printf("\n");
}
@@ -104,6 +110,14 @@ static const char *fe_str(int r)
return "error config method";
case FE_READ_DEVICE:
return "error config device";
+ case FE_NUM_METHOD:
+ return "error method number";
+ case FE_AGENT_STATUS_ON:
+ return "status on";
+ case FE_AGENT_STATUS_OFF:
+ return "status off";
+ case FE_AGENT_STATUS_ERROR:
+ return "status error";
default:
return "error unknown";
}
@@ -126,6 +140,19 @@ int main(int argc, char *argv[])
action = "unfence";
break;
+ case 'S':
+ status = 1;
+ action = "status";
+ break;
+
+ case 'e':
+ call_fenced = atoi(optarg);
+ break;
+
+ case 'm':
+ use_method_num = atoi(optarg);
+ break;
+
case 'v':
verbose++;
break;
@@ -178,11 +205,17 @@ int main(int argc, char *argv[])
memset(&flog, 0, sizeof(flog));
flog_count = 0;
- if (unfence)
+ if (status)
+ error = fence_node_status(victim, flog, FL_SIZE, &flog_count,
+ use_method_num);
+ else if (unfence)
error = unfence_node(victim, flog, FL_SIZE, &flog_count);
else
error = fence_node(victim, flog, FL_SIZE, &flog_count);
+ if (status && !verbose && error < 0)
+ verbose = 1;
+
if (!verbose)
goto skip;
@@ -211,7 +244,28 @@ int main(int argc, char *argv[])
logt_init("fence_node", LOG_MODE_OUTPUT_SYSLOG, SYSLOGFACILITY,
SYSLOGLEVEL, 0, NULL);
- if (unfence) {
+ if (status) {
+ if (error == -2) {
+ fprintf(stderr, "status %s undefined\n", victim);
+ rv = 2;
+ } else if (error < 0) {
+ fprintf(stderr, "status %s failed %d\n", victim, error);
+ logt_print(LOG_ERR, "status %s failed %d\n", victim, error);
+ rv = EXIT_FAILURE;
+ } else if (error == 2) {
+ fprintf(stderr, "status %s success off\n", victim);
+ logt_print(LOG_ERR, "status %s success off\n", victim);
+ rv = EXIT_SUCCESS;
+ } else if (!error) {
+ fprintf(stderr, "status %s success on\n", victim);
+ logt_print(LOG_ERR, "status %s success on\n", victim);
+ rv = EXIT_SUCCESS;
+ } else {
+ fprintf(stderr, "status %s failed invalid %d\n", victim, error);
+ logt_print(LOG_ERR, "status %s failed invalid %d\n", victim, error);
+ rv = EXIT_FAILURE;
+ }
+ } else if (unfence) {
if (error == -2) {
fprintf(stderr, "unfence %s undefined\n", victim);
rv = 2;
@@ -241,7 +295,8 @@ int main(int argc, char *argv[])
/* Tell fenced what we've done so that it can avoid
fencing this node again if the fence_node() rebooted
it. */
- fenced_external(victim);
+ if (call_fenced)
+ fenced_external(victim);
}
}
diff --git a/fence/libfence/agent.c b/fence/libfence/agent.c
index 5577fda..fac6c92 100644
--- a/fence/libfence/agent.c
+++ b/fence/libfence/agent.c
@@ -679,3 +679,464 @@ int unfence_node(char *victim, struct fence_log *log, int log_size,
return error;
}
+/*
+ * Run one fence agent with the given argument string and classify how
+ * it exited. The agent is started with no command-line options; args is
+ * written to a pipe that becomes the agent's stdin, and the agent's
+ * stdout/stderr are sent to /dev/null.
+ *
+ * *agent_result is set to FE_AGENT_FORK if fork() fails, otherwise to
+ * FE_AGENT_STATUS_ON (exit 0), FE_AGENT_STATUS_OFF (exit 2) or
+ * FE_AGENT_STATUS_ERROR (any other exit code, or abnormal termination).
+ * It is not written on the remaining failure paths (NULL argument,
+ * pipe(), write() or waitpid() problems).
+ *
+ * Returns:
+ * < 0: internal error
+ * 0: agent exited with 0
+ * 1: agent exited with 1
+ * 2: agent exited with 2
+ * (any other agent exit code is likewise returned unchanged)
+ */
+
+static int run_agent_status(char *agent, char *args, int *agent_result)
+{
+ int pid, status, len, rv;
+ int pw_fd = -1; /* parent write file descriptor */
+ int cr_fd = -1; /* child read file descriptor */
+ int pfd[2];
+
+ if (args == NULL || agent == NULL) {
+ rv = -1;
+ goto fail;
+ }
+ len = strlen(args);
+
+ if (pipe(pfd)) {
+ rv = -errno;
+ goto fail;
+ }
+ cr_fd = pfd[0];
+ pw_fd = pfd[1];
+
+ pid = fork();
+ if (pid < 0) {
+ rv = -errno;
+ *agent_result = FE_AGENT_FORK;
+ goto fail;
+ }
+
+ if (pid) {
+ /* parent: send the argument string to the agent through the pipe,
+    close both pipe ends, then wait for the agent and translate
+    its exit status */
+ int ret;
+
+ do {
+ ret = write(pw_fd, args, len);
+ } while (ret < 0 && errno == EINTR); /* retry only when interrupted */
+
+ if (ret != len) { /* failed or short write; NOTE(review): the child
+    is not waited for on this path -- confirm that is intended */
+ rv = -1;
+ goto fail;
+ }
+
+ close(cr_fd);
+ close(pw_fd); /* closing the write end gives the agent EOF on stdin */
+
+ rv = waitpid(pid, &status, 0);
+
+ if (rv < 0) {
+ /* shouldn't happen */
+ rv = -errno;
+ goto out;
+ }
+
+ if (rv != pid) {
+ /* shouldn't happen */
+ rv = -1;
+ goto out;
+ }
+
+ if (WIFEXITED(status)) {
+ /* pid exited properly with an exit code; that exit code
+    becomes the return value */
+ rv = WEXITSTATUS(status);
+
+ if (rv == 0)
+ *agent_result = FE_AGENT_STATUS_ON;
+ else if (rv == 2)
+ *agent_result = FE_AGENT_STATUS_OFF;
+ else
+ *agent_result = FE_AGENT_STATUS_ERROR;
+ } else if (WIFSIGNALED(status)) {
+ /* pid terminated due to a signal */
+ rv = -1;
+ *agent_result = FE_AGENT_STATUS_ERROR;
+ } else {
+ /* something else happened, not sure what */
+ rv = -1;
+ *agent_result = FE_AGENT_STATUS_ERROR;
+ }
+ goto out; /* both pipe ends were already closed above */
+
+ } else {
+ /* child: set up stdio and become the agent via execlp().
+    NOTE(review): the "goto fail" paths below return from this
+    function inside the child process instead of exiting, so the
+    child would continue running the caller's code -- confirm. */
+ int c_stdout, c_stderr;
+
+ /* redirect agent stdout/stderr to /dev/null; each open() is
+    expected to reuse the descriptor number closed just before it */
+ close(1);
+ c_stdout = open("/dev/null", O_WRONLY);
+ if (c_stdout < 0) {
+ rv = -1;
+ goto fail;
+ }
+ close(2);
+ c_stderr = open("/dev/null", O_WRONLY);
+ if (c_stderr < 0) {
+ rv = -1;
+ goto fail;
+ }
+
+ /* redirect agent stdin from parent: after close(0), dup() hands
+    back the lowest free descriptor, i.e. 0 */
+ close(0);
+ if (dup(cr_fd) < 0) {
+ rv = -errno;
+ goto fail;
+ }
+
+ close(cr_fd);
+ close(pw_fd);
+
+ execlp(agent, agent, NULL); /* no argv options; arguments arrive on stdin */
+ exit(EXIT_FAILURE); /* only reached if execlp() failed */
+ }
+ fail:
+ close(cr_fd); /* may still be -1 if we got here before pipe() succeeded */
+ close(pw_fd);
+ out:
+ return rv;
+}
+/*
+ * Build the argument string handed to a fence agent for a status query.
+ * The result is a heap buffer of FENCE_AGENT_ARGS_MAX bytes holding
+ * newline-terminated entries, in this order:
+ *   1. the node-specific entries read from ccs for victim/method/device
+ *      index d+1, skipping "name=" and "action=" entries,
+ *   2. "nodename=<victim>", unless the text so far already contains
+ *      "nodename=",
+ *   3. "action=status",
+ *   4. the device-specific entries read from ccs for device, skipping
+ *      "name=" entries.
+ *
+ * On success returns 0 and stores the buffer in *args_out; the caller
+ * frees it. Returns -ENOMEM if the buffer cannot be allocated (in that
+ * case *args_out is not written) and -E2BIG if the entries do not fit;
+ * on -E2BIG and on any other non-zero return *args_out is set to NULL.
+ * If ccs supplied no entries at all, the value of the last ccs_get_list()
+ * call is returned as-is.
+ */
+static int make_args_status(int cd, char *victim, char *method, int d,
+ char *device, char **args_out)
+{
+ char path[PATH_MAX];
+ char *args, *str;
+ int error, ret, cnt = 0; /* cnt: number of entries ccs returned, kept or skipped */
+ size_t len, pos;
+
+ args = malloc(FENCE_AGENT_ARGS_MAX);
+ if (!args)
+ return -ENOMEM;
+ memset(args, 0, FENCE_AGENT_ARGS_MAX);
+
+ len = FENCE_AGENT_ARGS_MAX - 1; /* usable bytes; last byte stays 0 */
+ pos = 0; /* write offset of the next entry in args */
+
+ /* node-specific args for victim */
+
+ memset(path, 0, PATH_MAX);
+ sprintf(path, NODE_FENCE_ARGS_PATH, victim, method, d+1); /* NOTE(review):
+    unbounded sprintf into path[PATH_MAX] -- presumably the names are
+    short enough; confirm */
+
+ for (;;) {
+ error = ccs_get_list(cd, path, &str);
+ if (error || !str) /* error or no further entry ends the list */
+ break;
+ ++cnt;
+
+ if (!strncmp(str, "name=", 5)) {
+ free(str);
+ continue;
+ }
+
+ if (!strncmp(str, "action=", 7)) { /* dropped: action=status is added below */
+ free(str);
+ continue;
+ }
+
+ ret = snprintf(args + pos, len - pos, "%s\n", str);
+
+ free(str);
+
+ if (ret >= len - pos) { /* entry was truncated: buffer too small */
+ error = -E2BIG;
+ goto out;
+ }
+ pos += ret;
+ }
+
+ /* add nodename of victim to args */
+
+ if (!strstr(args, "nodename=")) {
+ ret = snprintf(args + pos, len - pos, "nodename=%s\n", victim);
+ if (ret >= len - pos) {
+ error = -E2BIG;
+ goto out;
+ }
+ pos += ret;
+ }
+
+ /* add action=status to args */
+
+ ret = snprintf(args + pos, len - pos, "action=status\n");
+ if (ret >= len - pos) {
+ error = -E2BIG;
+ goto out;
+ }
+ pos += ret;
+
+ /* device-specific args */
+
+ memset(path, 0, PATH_MAX);
+ sprintf(path, FENCE_DEVICE_ARGS_PATH, device);
+
+ for (;;) {
+ error = ccs_get_list(cd, path, &str);
+ if (error || !str)
+ break;
+ ++cnt;
+
+ if (!strncmp(str, "name=", 5)) {
+ free(str);
+ continue;
+ }
+
+ ret = snprintf(args + pos, len - pos, "%s\n", str);
+
+ free(str);
+
+ if (ret >= len - pos) {
+ error = -E2BIG;
+ goto out;
+ }
+ pos += ret;
+ }
+
+ if (cnt) /* at least one entry was read, so the list-ending error is ignored */
+ error = 0;
+ out:
+ if (error) {
+ free(args);
+ args = NULL;
+ }
+
+ *args_out = args;
+ return error;
+}
+/*
+ * Query the status of one fence device for victim: look up the device's
+ * agent name in ccs, build the agent argument string with
+ * make_args_status(), and run the agent with run_agent_status().
+ *
+ * The log entry lp receives the agent name and argument string (when they
+ * could be read) and an error code: FE_READ_AGENT if the agent name lookup
+ * fails, FE_READ_ARGS if the arguments cannot be built, otherwise whatever
+ * run_agent_status() stores.
+ *
+ * Returns the non-zero ccs_get()/make_args_status() error on lookup
+ * failure, otherwise the return value of run_agent_status().
+ */
+static int use_device_status(int cd, char *victim, char *method, int d,
+ char *device, struct fence_log *lp)
+{
+ char path[PATH_MAX], *agent, *args = NULL;
+ int error;
+
+ memset(path, 0, PATH_MAX);
+ sprintf(path, AGENT_NAME_PATH, device);
+
+ error = ccs_get(cd, path, &agent);
+ if (error) {
+ lp->error = FE_READ_AGENT;
+ goto out;
+ }
+
+ strncpy(lp->agent_name, agent, FENCE_AGENT_NAME_MAX-1); /* NOTE(review):
+    the last byte is never written here, so termination relies on the
+    entry having been zeroed by the caller; the stub entry used by
+    fence_node_status() is not zeroed -- confirm it is never read */
+
+ error = make_args_status(cd, victim, method, d, device, &args);
+ if (error) {
+ lp->error = FE_READ_ARGS;
+ goto out_agent;
+ }
+
+ strncpy(lp->agent_args, args, FENCE_AGENT_ARGS_MAX-1);
+
+ error = run_agent_status(agent, args, &lp->error);
+
+ free(args);
+ out_agent:
+ free(agent);
+ out:
+ return error;
+}
+
+/* We want to run status on each device in each method, and we need all
+ to succeed in order for status as a whole to succeed. Agent success
+ for status is being either "on" (exit 0) or "off" (exit 2). Agent
+ failure for status is when the on/off state is unknown (exit 1),
+ i.e. the agent failed to run or ran and cannot connect, or cannot get
+ the on/off state for some reason.
+
+ As soon as any one device in any method fails, we can quit and report
+ failure (rv < 0) for status as a whole. If the status of all devices is
+ "on", then status as a whole returns 0. If the status of all devices is
+ "off", then status as a whole returns 2. If the devices are a mix of
+ on and off, then status as a whole returns 0.
+
+ Parameters:
+ victim: node name; replaced by the name ccs_lookup_nodename()
+ returns when that lookup succeeds.
+ log, log_size: optional array of log entries to fill; once it is full
+ (or if log is NULL) further results go to a local stub.
+ log_count: if non-NULL, receives the number of entries generated,
+ which keeps counting after the log is full and so may
+ exceed log_size.
+ use_method_num: 0 to query every method, otherwise the 1-based number
+ of the only method to query.
+
+ Other return values: -2 when the node has no fence methods or
+ use_method_num is larger than the number of methods; another negative
+ value for any other failure. */
+
+int fence_node_status(char *victim, struct fence_log *log, int log_size,
+ int *log_count, int use_method_num)
+{
+ struct fence_log stub; /* scratch entry used when log is NULL or full */
+ struct fence_log *lp = log;
+ char *method = NULL, *device = NULL;
+ char *victim_nodename = NULL;
+ int num_methods, num_devices, m, d, cd, rv;
+ int on_count = 0, off_count = 0;
+ int left = log_size; /* unused log entries remaining */
+ int error = -1; /* stays -1 if no device is ever queried */
+ int count = 0;
+
+ cd = ccs_connect();
+ if (cd < 0) {
+ if (lp && left) {
+ lp->error = FE_NO_CONFIG;
+ lp++;
+ left--;
+ }
+ count++;
+ error = -1;
+ goto ret; /* nothing to disconnect or free yet */
+ }
+
+ if (ccs_lookup_nodename(cd, victim, &victim_nodename) == 0)
+ victim = victim_nodename;
+
+ num_methods = count_methods(cd, victim);
+ if (!num_methods) {
+ if (lp && left) {
+ lp->error = FE_NO_METHOD;
+ lp++;
+ left--;
+ }
+ count++;
+ error = -2; /* No fencing */
+ goto out;
+ }
+
+ if (use_method_num && (use_method_num > num_methods)) {
+ if (lp && left) {
+ lp->error = FE_NUM_METHOD;
+ lp++;
+ left--;
+ }
+ count++;
+ error = -2; /* No fencing */
+ goto out;
+ }
+
+ for (m = 0; m < num_methods; m++) {
+
+ if (use_method_num && (m + 1 != use_method_num)) /* use_method_num is 1-based */
+ continue;
+
+ rv = get_method(cd, victim, m, &method);
+ if (rv) {
+ if (lp && left) {
+ lp->error = FE_READ_METHOD;
+ lp->method_num = m;
+ lp++;
+ left--;
+ }
+ count++;
+ error = -1;
+ break;
+ }
+
+ num_devices = count_devices(cd, victim, method);
+ if (!num_devices) {
+ if (lp && left) {
+ lp->error = FE_NO_DEVICE;
+ lp->method_num = m;
+ lp++;
+ left--;
+ }
+ count++;
+ continue; /* NOTE(review): skips the free(method) at the end of
+    this loop body -- looks like a leak if get_method() allocates;
+    confirm */
+ }
+
+ for (d = 0; d < num_devices; d++) {
+ rv = get_device(cd, victim, method, d, &device);
+ if (rv) {
+ if (lp && left) {
+ lp->error = FE_READ_DEVICE;
+ lp->method_num = m;
+ lp->device_num = d;
+ lp++;
+ left--;
+ }
+ count++;
+ error = -1;
+ break;
+ }
+
+ /* every call to use_device generates a log entry,
+ whether success or fail; when the caller's log is full or
+ absent the entry is written to the local stub instead */
+
+ error = use_device_status(cd, victim, method, d, device,
+ (lp && left) ? lp : &stub);
+ count++;
+ if (lp && left) {
+ /* error, name, args already set */
+ lp->method_num = m;
+ lp->device_num = d;
+ lp++;
+ left--;
+ }
+
+ /*
+ * error values:
+ * < 0: internal error from use_device_status,
+ * internal error from run_agent_status,
+ * run_agent_status failed to fork agent
+ * 0: agent exited with 0 (success, status is on)
+ * 2: agent exited with 2 (success, status is off)
+ * 1: agent exited with 1 (error, status is unknown)
+ */
+
+ /* internal error: status fail */
+ if (error < 0)
+ break;
+
+ /* agent error: status fail */
+ if (error == 1) {
+ error = -1;
+ break;
+ }
+
+ if (!error) {
+ /* agent success "on": status success */
+ on_count++;
+ } else if (error == 2) {
+ /* agent success "off": status success; reset error so the
+    method loop below does not treat it as a failure */
+ error = 0;
+ off_count++;
+ } else {
+ /* some other error */
+ error = -1;
+ break;
+ }
+
+ free(device);
+ device = NULL;
+ }
+
+ if (device) /* can only be non-NULL if the device loop was left via break */
+ free(device);
+
+ free(method);
+
+ /* if any device failed in this method, return failure
+ for the status */
+
+ if (error)
+ break;
+ }
+
+ if (error < 0)
+ goto out;
+
+ /* All devices are either on or off, none are unknown/inaccessible,
+ so status as a whole is a success. Decide which of the two
+ success values to return: 2 if all devices are off, or 0 if
+ all devices are on, 0 if mixed on/off. */
+
+ if (!on_count)
+ error = 2;
+ else
+ error = 0;
+
+ out:
+ if (victim_nodename)
+ free(victim_nodename);
+
+ ccs_disconnect(cd);
+ ret:
+ if (log_count)
+ *log_count = count;
+ return error;
+}
+
diff --git a/fence/libfence/libfence.h b/fence/libfence/libfence.h
index 33f493a..10d00f0 100644
--- a/fence/libfence/libfence.h
+++ b/fence/libfence/libfence.h
@@ -15,6 +15,10 @@ extern "C" {
#define FE_READ_ARGS 8 /* read (ccs) error on node/dev args */
#define FE_READ_METHOD 9 /* read (ccs) error on method */
#define FE_READ_DEVICE 10 /* read (ccs) error on method/device */
+#define FE_NUM_METHOD 11 /* method number does not exist */
+#define FE_AGENT_STATUS_ON 12 /* status agent exited with 0: device is on */
+#define FE_AGENT_STATUS_OFF 13 /* status agent exited with 2: device is off */
+#define FE_AGENT_STATUS_ERROR 14 /* status agent exited with another code or did not exit normally */
#define FENCE_AGENT_NAME_MAX 256 /* including terminating \0 */
#define FENCE_AGENT_ARGS_MAX 4096 /* including terminating \0 */
@@ -32,6 +36,28 @@ int fence_node(char *name, struct fence_log *log, int log_size, int *log_count);
int unfence_node(char *name, struct fence_log *log, int log_size,
int *log_count);
+/*
+ * Query the on/off status reported by the fence devices of a node.
+ *
+ * use_method_num == 0: run status on all devices of all methods
+ * use_method_num > 0: run status on all devices of given method number,
+ * where first method is use_method_num = 1
+ *
+ * log/log_size describe an optional array of fence_log entries that is
+ * filled with one entry per step attempted; *log_count (if log_count is
+ * not NULL) receives the number of entries generated, which may be larger
+ * than log_size when the array was too small.
+ *
+ * Returns 0 on success: status is successful on all devices of all methods
+ * (or all devices of specified method). All devices are in the "on" state,
+ * or some devices are on and some are off.
+ *
+ * Returns 2 on success: status is successful on all devices of all methods
+ * (or all devices of a specified method). All devices are in the "off" state.
+ *
+ * Returns -2 if no fencing methods are defined for the node, or if
+ * use_method_num was specified and the specified method number does
+ * not exist.
+ *
+ * Returns another negative value for other failures.
+ */
+
+int fence_node_status(char *victim, struct fence_log *log, int log_size,
+ int *log_count, int use_method_num);
+
#ifdef __cplusplus
}
#endif
11 years, 8 months