Gitweb:        http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=75e57e30597ba7...
Commit:        75e57e30597ba73186bb5b9f93fea2fda78048de
Parent:        ba5cc65fe172c34c78c884f2bef96a6e9a45d20b
Author:        Ryan McCabe <rmccabe@redhat.com>
AuthorDate:    Tue Oct 8 14:34:03 2013 -0400
Committer:     Ryan McCabe <rmccabe@redhat.com>
CommitterDate: Tue Oct 8 14:34:03 2013 -0400
rgmanager: sync HA LVM agents with upstream
Sync the HA LVM resource agents with upstream.
Resolves: rhbz#962376
Signed-off-by: Ryan McCabe <rmccabe@redhat.com>
---
 rgmanager/src/resources/lvm.sh       |   10 +-
 rgmanager/src/resources/lvm_by_lv.sh |  223 +++++++++++++++++++++++----------
 rgmanager/src/resources/lvm_by_vg.sh |  233 +++++++++++++++++++++++----------
 3 files changed, 324 insertions(+), 142 deletions(-)
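Many of the one-line changes below simply quote shell variables inside test expressions. A minimal sketch of why that matters (illustrative only, not part of the patch; the values are made up):

    # Illustrative sketch -- not part of the commit.
    # An empty or multi-word value breaks an unquoted test.
    owner=""
    my_name="node1"

    # Unquoted: expands to `[ != node1 ]` and bash reports
    # "[: !=: unary operator expected"
    #[ $owner != $my_name ] && echo "different owner"

    # Quoted: evaluates correctly even when $owner is empty
    [ "$owner" != "$my_name" ] && echo "different owner"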
diff --git a/rgmanager/src/resources/lvm.sh b/rgmanager/src/resources/lvm.sh
index cb2f5ec..28cd822 100755
--- a/rgmanager/src/resources/lvm.sh
+++ b/rgmanager/src/resources/lvm.sh
@@ -55,7 +55,7 @@ function ha_lvm_proper_setup_check
     # Are we using the "tagging" or "CLVM" variant?
     # The CLVM variant will have the cluster attribute set
     ##
-    if [[ $(vgs -o attr --noheadings --config 'global{locking_type=0}' $OCF_RESKEY_vg_name 2>/dev/null) =~ .....c ]]; then
+    if [[ "$(vgs -o attr --noheadings --config 'global{locking_type=0}' $OCF_RESKEY_vg_name 2>/dev/null)" =~ .....c ]]; then
         # Is clvmd running?
         if ! ps -C clvmd >& /dev/null; then
             ocf_log err "HA LVM: $OCF_RESKEY_vg_name has the cluster attribute set, but 'clvmd' is not running"
@@ -127,7 +127,7 @@ case $1 in
 start)
     ha_lvm_proper_setup_check || exit 1

-    if [ -z $OCF_RESKEY_lv_name ]; then
+    if [ -z "$OCF_RESKEY_lv_name" ]; then
         vg_start || exit 1
     else
         lv_start || exit 1
@@ -137,7 +137,7 @@ start)
 status|monitor)
     ocf_log notice "Getting status"

-    if [ -z $OCF_RESKEY_lv_name ]; then
+    if [ -z "$OCF_RESKEY_lv_name" ]; then
         vg_status || exit 1
     else
         lv_status || exit 1
@@ -147,7 +147,7 @@ status|monitor)
 stop)
     ha_lvm_proper_setup_check

-    if [ -z $OCF_RESKEY_lv_name ]; then
+    if [ -z "$OCF_RESKEY_lv_name" ]; then
         vg_stop || exit 1
     else
         lv_stop || exit 1
@@ -164,7 +164,7 @@ meta-data)
     ;;

 validate-all|verify-all)
-    if [ -z $OCF_RESKEY_lv_name ]; then
+    if [ -z "$OCF_RESKEY_lv_name" ]; then
         vg_verify || exit 1
     else
         lv_verify || exit 1
diff --git a/rgmanager/src/resources/lvm_by_lv.sh b/rgmanager/src/resources/lvm_by_lv.sh
index 7140076..af2059e 100644
--- a/rgmanager/src/resources/lvm_by_lv.sh
+++ b/rgmanager/src/resources/lvm_by_lv.sh
@@ -29,6 +29,81 @@ lv_verify()
     return $OCF_SUCCESS
 }
+# lv_owner
+#
+# Returns:
+#    1 == We are the owner
+#    2 == We can claim it
+#    0 == Owned by someone else
+function lv_owner
+{
+    local my_name=$1
+    local owner=$2
+
+    if [ -z "$my_name" ]; then
+        ocf_log err "Unable to determine cluster node name"
+        return 0
+    fi
+
+    if [ -z "$owner" ]; then
+        # No-one owns this LV yet, so we can claim it
+        return 2
+    fi
+
+    if [ $owner != $my_name ]; then
+        if is_node_member_clustat $owner ; then
+            return 0
+        fi
+        return 2
+    fi
+
+    return 1
+}
+
+steal_tag()
+{
+    local owner=$1
+    local lv_path=$2
+
+    ocf_log notice "Owner of $lv_path is not in the cluster"
+    ocf_log notice "Stealing $lv_path"
+
+    lvchange --deltag $owner $lv_path
+    if [ $? -ne 0 ]; then
+        ocf_log err "Failed to steal $lv_path from $owner"
+        return $OCF_ERR_GENERIC
+    fi
+
+    # Warning --deltag doesn't always result in failure
+    if [ ! -z `lvs -o tags --noheadings $lv_path` ]; then
+        ocf_log err "Failed to steal $lv_path from $owner."
+        return $OCF_ERR_GENERIC
+    fi
+
+    return $OCF_SUCCESS
+}
+
+restore_transient_failed_pvs()
+{
+    local a=0
+    local -a results
+
+    results=(`pvs -o name,vg_name,attr --noheadings | grep $OCF_RESKEY_vg_name | grep -v 'unknown device'`)
+    while [ ! -z "${results[$a]}" ] ; do
+        if [[ ${results[$(($a + 2))]} =~ ..m ]] &&
+           [ $OCF_RESKEY_vg_name == ${results[$(($a + 1))]} ]; then
+            ocf_log notice "Attempting to restore missing PV, ${results[$a]} in $OCF_RESKEY_vg_name"
+            vgextend --restoremissing $OCF_RESKEY_vg_name ${results[$a]}
+            if [ $? -ne 0 ]; then
+                ocf_log notice "Failed to restore ${results[$a]}"
+            else
+                ocf_log notice " ${results[$a]} restored"
+            fi
+        fi
+        a=$(($a + 3))
+    done
+}
+
 # lv_exec_resilient
 #
 # Sometimes, devices can come back. Their metadata will conflict
@@ -43,7 +118,7 @@ lv_exec_resilient()
ocf_log notice "Making resilient : $command"
- if [ -z $command ]; then + if [ -z "$command" ]; then ocf_log err "lv_exec_resilient: Arguments not supplied" return $OCF_ERR_ARGS fi @@ -84,13 +159,18 @@ lv_activate_resilient() declare lv_path=$2 declare op="-ay"
- if [ -z $action ] || [ -z $lv_path ]; then + if [ -z "$action" ] || [ -z "$lv_path" ]; then ocf_log err "lv_activate_resilient: Arguments not supplied" return $OCF_ERR_ARGS fi
if [ $action != "start" ]; then op="-an" + elif [[ "$(lvs -o attr --noheadings $lv_path)" =~ r.......p ]] || + [[ "$(lvs -o attr --noheadings $lv_path)" =~ R.......p ]]; then + # We can activate partial RAID LVs and run just fine. + ocf_log notice "Attempting activation of partial RAID LV, $lv_path" + op="-ay --partial" fi
if ! lv_exec_resilient "lvchange $op $lv_path" ; then @@ -106,7 +186,7 @@ lv_status_clustered() # # Check if device is active # - if [[ ! $(lvs -o attr --noheadings $lv_path) =~ ....a. ]]; then + if [[ ! "$(lvs -o attr --noheadings $lv_path)" =~ ....a. ]]; then return $OCF_ERR_GENERIC fi
@@ -127,11 +207,11 @@ lv_status_single() # # Check if device is active # - if [[ ! $(lvs -o attr --noheadings $lv_path) =~ ....a. ]]; then + if [[ ! "$(lvs -o attr --noheadings $lv_path)" =~ ....a. ]]; then return $OCF_ERR_GENERIC fi
- if [[ $(vgs -o attr --noheadings $OCF_RESKEY_vg_name) =~ .....c ]]; then + if [[ "$(vgs -o attr --noheadings $OCF_RESKEY_vg_name)" =~ .....c ]]; then ocf_log notice "$OCF_RESKEY_vg_name is a cluster volume. Ignoring..." return $OCF_SUCCESS fi @@ -160,14 +240,14 @@ lv_status_single() # owner=`lvs -o tags --noheadings $lv_path` my_name=$(local_node_name) - if [ -z $my_name ]; then + if [ -z "$my_name" ]; then ocf_log err "Unable to determine local machine name"
# FIXME: I don't really want to fail on 1st offense return $OCF_SUCCESS fi
- if [ -z $owner ] || [ $my_name != $owner ]; then + if [ -z "$owner" ] || [ "$my_name" != "$owner" ]; then ocf_log err "WARNING: $lv_path should not be active" ocf_log err "WARNING: $my_name does not own $lv_path" ocf_log err "WARNING: Attempting shutdown of $lv_path" @@ -203,12 +283,12 @@ lv_activate_and_tag() *) self_fence="" ;; esac
-    if [ -z $action ] || [ -z $tag ] || [ -z $lv_path ]; then
+    if [ -z "$action" ] || [ -z "$tag" ] || [ -z "$lv_path" ]; then
         ocf_log err "Supplied args: 1) $action, 2) $tag, 3) $lv_path"
         return $OCF_ERR_ARGS
     fi

-    if [ $action == "start" ]; then
+    if [ "$action" == "start" ]; then
         ocf_log notice "Activating $lv_path"
         lvchange --addtag $tag $lv_path
         if [ $? -ne 0 ]; then
@@ -259,7 +339,7 @@ lv_activate_and_tag()
             fi
         fi

-        if [ `lvs --noheadings -o lv_tags $lv_path` == $tag ]; then
+        if [ "`lvs --noheadings -o lv_tags $lv_path`" == $tag ]; then
             ocf_log notice "Removing ownership tag ($tag) from $lv_path"
             lvchange --deltag $tag $lv_path
             if [ $? -ne 0 ]; then
@@ -292,35 +372,24 @@ lv_activate()
     declare lv_path="$OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name"
     declare owner=`lvs -o tags --noheadings $lv_path`
     declare my_name=$(local_node_name)
+    local owned

-    if [ -z $my_name ]; then
-        ocf_log err "Unable to determine cluster node name"
-        return $OCF_ERR_GENERIC
+    lv_owner $my_name $owner
+    owned=$?
+    if [ $owned -eq 0 ]; then
+        ocf_log info "Someone else owns this logical volume"
+        return $OCF_ERR_GENERIC
     fi

-    #
-    # FIXME: This code block is repeated below... might be
-    # nice to put it in a function
-    #
-    if [ ! -z $owner ] && [ $owner != $my_name ]; then
-        if is_node_member_clustat $owner ; then
-            ocf_log err "$owner owns $lv_path unable to $1"
-            return $OCF_ERR_GENERIC
-        fi
-        ocf_log notice "Owner of $lv_path is not in the cluster"
-        ocf_log notice "Stealing $lv_path"
+    # If this is a partial VG, attempt to
+    # restore any transiently failed PVs
+    if [[ $(vgs -o attr --noheadings $OCF_RESKEY_vg_name) =~ ...p ]]; then
+        ocf_log err "Volume group "$OCF_RESKEY_vg_name" has PVs marked as missing"
+        restore_transient_failed_pvs
+    fi

-        lvchange --deltag $owner $lv_path
-        if [ $? -ne 0 ]; then
-            ocf_log err "Failed to steal $lv_path from $owner"
-            return $OCF_ERR_GENERIC
-        fi
-
-        # Warning --deltag doesn't always result in failure
-        if [ ! -z `lvs -o tags --noheadings $lv_path` ]; then
-            ocf_log err "Failed to steal $lv_path from $owner."
-            return $OCF_ERR_GENERIC
-        fi
+    if [ ! -z "$owner" ] && [ $owned -eq 2 ]; then
+        steal_tag $owner $lv_path
     fi

     if ! lv_activate_and_tag $1 $my_name $lv_path; then
@@ -333,25 +402,17 @@ lv_activate()
             $OCF_RESKEY_vg_name; then
             ocf_log notice "$OCF_RESKEY_vg_name now consistent"
             owner=`lvs -o tags --noheadings $lv_path`
-            if [ ! -z $owner ] && [ $owner != $my_name ]; then
-                if is_node_member_clustat $owner ; then
-                    ocf_log err "$owner owns $lv_path unable to $1"
-                    return $OCF_ERR_GENERIC
-                fi
-                ocf_log notice "Owner of $lv_path is not in the cluster"
-                ocf_log notice "Stealing $lv_path"
-
-                lvchange --deltag $owner $lv_path
-                if [ $? -ne 0 ]; then
-                    ocf_log err "Failed to steal $lv_path from $owner"
-                    return $OCF_ERR_GENERIC
-                fi
-
-                # Warning --deltag doesn't always result in failure
-                if [ ! -z `lvs -o tags --noheadings $lv_path` ]; then
-                    ocf_log err "Failed to steal $lv_path from $owner."
-                    return $OCF_ERR_GENERIC
-                fi
+            lv_owner $my_name $owner
+            owned=$?
+            if [ ! -z "$owner" ] && [ $owned -eq 2 ]; then
+                steal_tag $owner $lv_path
+                ret=$?
+                if [ $ret -ne $OCF_SUCCESS ]; then
+                    return $ret
+                fi
+            elif [ $owned -eq 0 ]; then
+                ocf_log info "Someone else owns this logical volume"
+                return $OCF_ERR_GENERIC
             fi

             if ! lv_activate_and_tag $1 $my_name $lv_path; then
@@ -371,23 +432,53 @@ lv_activate()
 function lv_start_clustered
 {
-    if ! lvchange -aey $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name; then
-        ocf_log err "Failed to activate logical volume, $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name"
-        ocf_log notice "Attempting cleanup of $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name"
+    if lvchange -aey $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name; then
+        return $OCF_SUCCESS
+    fi

-        if ! lvconvert --repair --use-policies $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name; then
-            ocf_log err "Failed to cleanup $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name"
+    # FAILED exclusive activation:
+    # This can be caused by an LV being active remotely.
+    # Before attempting a repair effort, we should attempt
+    # to deactivate the LV cluster-wide; but only if the LV
+    # is not open. Otherwise, it is senseless to attempt.
+    if ! [[ "$(lvs -o attr --noheadings $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name)" =~ ....ao ]]; then
+        # We'll wait a small amount of time for some settling before
+        # attempting to deactivate. Then the deactivate will be
+        # immediately followed by another exclusive activation attempt.
+        sleep 5
+        if ! lvchange -an $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name; then
+            # Someone could have the device open.
+            # We can't do anything about that.
+            ocf_log err "Unable to perform required deactivation of $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name before starting"
             return $OCF_ERR_GENERIC
         fi

-        if ! lvchange -aey $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name; then
-            ocf_log err "Failed second attempt to activate $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name"
-            return $OCF_ERR_GENERIC
+        if lvchange -aey $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name; then
+            # Second attempt after deactivation was successful, we now
+            # have the lock exclusively
             return $OCF_SUCCESS
         fi
+    fi

-    ocf_log notice "Second attempt to activate $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name successful"
-    return $OCF_SUCCESS
+    # Failed to activate:
+    # This could be due to a device failure (or another machine could
+    # have snuck in between the deactivation/activation). We don't yet
+    # have a mechanism to check for remote activation, so we will proceed
+    # with repair action.
+    ocf_log err "Failed to activate logical volume, $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name"
+    ocf_log notice "Attempting cleanup of $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name"
+
+    if ! lvconvert --repair --use-policies $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name; then
+        ocf_log err "Failed to cleanup $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name"
+        return $OCF_ERR_GENERIC
+    fi
+
+    if ! lvchange -aey $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name; then
+        ocf_log err "Failed second attempt to activate $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name"
+        return $OCF_ERR_GENERIC
     fi
+
+    ocf_log notice "Second attempt to activate $OCF_RESKEY_vg_name/$OCF_RESKEY_lv_name successful"
     return $OCF_SUCCESS
 }

@@ -415,7 +506,7 @@ function lv_start_single
 function lv_start
 {
     # We pass in the VG name to see of the logical volume is clustered
-    if [[ $(vgs -o attr --noheadings $OCF_RESKEY_vg_name) =~ .....c ]]; then
+    if [[ "$(vgs -o attr --noheadings $OCF_RESKEY_vg_name)" =~ .....c ]]; then
         lv_start_clustered
     else
         lv_start_single
@@ -439,7 +530,7 @@ function lv_stop_single
 function lv_stop
 {
     # We pass in the VG name to see of the logical volume is clustered
-    if [[ $(vgs -o attr --noheadings $OCF_RESKEY_vg_name) =~ .....c ]]; then
+    if [[ "$(vgs -o attr --noheadings $OCF_RESKEY_vg_name)" =~ .....c ]]; then
         lv_stop_clustered
     else
         lv_stop_single
diff --git a/rgmanager/src/resources/lvm_by_vg.sh b/rgmanager/src/resources/lvm_by_vg.sh
index 819d0b8..cb933c5 100755
--- a/rgmanager/src/resources/lvm_by_vg.sh
+++ b/rgmanager/src/resources/lvm_by_vg.sh
@@ -30,18 +30,19 @@ function vg_owner
     local owner=`vgs -o tags --noheadings $OCF_RESKEY_vg_name`
     local my_name=$(local_node_name)
-    if [ -z $my_name ]; then
+    if [ -z "$my_name" ]; then
         ocf_log err "Unable to determine cluster node name"
         return 0
     fi

-    if [ -z $owner ]; then
+    if [ -z "$owner" ]; then
         # No-one owns this VG yet, so we can claim it
         return 2
     fi

     if [ $owner != $my_name ]; then
         if is_node_member_clustat $owner ; then
+            ocf_log err " $owner owns $OCF_RESKEY_vg_name and is still a cluster member"
             return 0
         fi
         return 2
@@ -50,12 +51,35 @@ function vg_owner
     return 1
 }

-function _strip_tags
+restore_transient_failed_pvs()
+{
+    local a=0
+    local -a results
+
+    results=(`pvs -o name,vg_name,attr --noheadings | grep $OCF_RESKEY_vg_name | grep -v 'unknown device'`)
+    while [ ! -z "${results[$a]}" ] ; do
+        if [[ ${results[$(($a + 2))]} =~ ..m ]] &&
+           [ $OCF_RESKEY_vg_name == ${results[$(($a + 1))]} ]; then
+            ocf_log notice "Attempting to restore missing PV, ${results[$a]} in $OCF_RESKEY_vg_name"
+            vgextend --restoremissing $OCF_RESKEY_vg_name ${results[$a]}
+            if [ $? -ne 0 ]; then
+                ocf_log notice "Failed to restore ${results[$a]}"
+            else
+                ocf_log notice " ${results[$a]} restored"
+            fi
+        fi
+        a=$(($a + 3))
+    done
+}
+
+function strip_tags
 {
     local i

     for i in `vgs --noheadings -o tags $OCF_RESKEY_vg_name | sed s/","/" "/g`; do
         ocf_log info "Stripping tag, $i"
+
+        # LVM version 2.02.98 allows changing tags if PARTIAL
         vgchange --deltag $i $OCF_RESKEY_vg_name
     done

@@ -67,29 +91,6 @@ function _strip_tags
     return $OCF_SUCCESS
 }

-function strip_tags
-{
-    if ! _strip_tags; then
-        ocf_log notice "Attempting cleanup of $OCF_RESKEY_vg_name"
-
-        if ! vgreduce --removemissing --force --config \
-            "activation { volume_list = "$OCF_RESKEY_vg_name" }" \
-            $OCF_RESKEY_vg_name; then
-
-            ocf_log err "Failed to make $OCF_RESKEY_vg_name consistent"
-            return $OCF_ERR_GENERIC
-        fi
-
-        ocf_log notice "Cleanup of $OCF_RESKEY_vg_name successful"
-    fi
-    if ! _strip_tags; then
-        ocf_log err "Failed 2nd attempt to remove tags from, $OCF_RESKEY_vg_name"
-        return $OCF_ERR_GENERIC
-    fi
-
-    return $OCF_SUCCESS
-}
-
 function strip_and_add_tag
 {
     if ! strip_tags; then
@@ -176,7 +177,7 @@ function vg_status_single
 ##
 function vg_status
 {
-    if [[ $(vgs -o attr --noheadings $OCF_RESKEY_vg_name) =~ .....c ]]; then
+    if [[ "$(vgs -o attr --noheadings $OCF_RESKEY_vg_name)" =~ .....c ]]; then
         vg_status_clustered
     else
         vg_status_single
@@ -195,10 +196,37 @@ function vg_start_clustered
     local results
     local all_pvs
     local resilience
+    local try_again=false

     ocf_log info "Starting volume group, $OCF_RESKEY_vg_name"
     if ! vgchange -aey $OCF_RESKEY_vg_name; then
+        try_again=true
+
+        # Failure to activate:
+        # This could be caused by a remotely active LV. Before
+        # attempting any repair of the VG, we will first attempt
+        # to deactivate the VG cluster-wide.
+        # We must check for open LVs though, since these cannot
+        # be deactivated. We have no choice but to go one-by-one.
+
+        # Allow for some settling
+        sleep 5
+
+        results=(`lvs -o name,attr --noheadings $OCF_RESKEY_vg_name 2> /dev/null`)
+        a=0
+        while [ ! -z "${results[$a]}" ]; do
+            if [[ ! ${results[$(($a + 1))]} =~ ....ao ]]; then
+                if ! lvchange -an $OCF_RESKEY_vg_name/${results[$a]}; then
+                    ocf_log err "Unable to perform required deactivation of $OCF_RESKEY_vg_name before starting"
+                    return $OCF_ERR_GENERIC
+                fi
+            fi
+            a=$(($a + 2))
+        done
+    fi
+
+    if $try_again && ! vgchange -aey $OCF_RESKEY_vg_name; then
         ocf_log err "Failed to activate volume group, $OCF_RESKEY_vg_name"
         ocf_log notice "Attempting cleanup of $OCF_RESKEY_vg_name"

@@ -219,7 +247,7 @@ function vg_start_clustered
     # Make sure all the logical volumes are active
     results=(`lvs -o name,attr --noheadings 2> /dev/null $OCF_RESKEY_vg_name`)
     a=0
-    while [ ! -z ${results[$a]} ]; do
+    while [ ! -z "${results[$a]}" ]; do
         if [[ ! ${results[$(($a + 1))]} =~ ....a. ]]; then
             all_pvs=(`pvs --noheadings -o name 2> /dev/null`)
             resilience=" --config devices{filter=["
@@ -235,7 +263,7 @@ function vg_start_clustered
     done

     # We need to check the LVs again if we made the command resilient
-    if [ ! -z $resilience ]; then
+    if [ ! -z "$resilience" ]; then
         results=(`lvs -o name,attr --noheadings $OCF_RESKEY_vg_name $resilience 2> /dev/null`)
         a=0
         while [ ! -z ${results[$a]} ]; do
@@ -275,28 +303,50 @@ function vg_start_single
     ;;
     esac
-    if ! strip_and_add_tag ||
-       ! vgchange -ay $OCF_RESKEY_vg_name; then
-        ocf_log err "Failed to activate volume group, $OCF_RESKEY_vg_name"
-        ocf_log notice "Attempting cleanup of $OCF_RESKEY_vg_name"
-
-        if ! vgreduce --removemissing --force --config \
-            "activation { volume_list = "$OCF_RESKEY_vg_name" }" \
-            $OCF_RESKEY_vg_name; then
-
-            ocf_log err "Failed to make $OCF_RESKEY_vg_name consistent"
-            return $OCF_ERR_GENERIC
-        fi
+    if ! strip_and_add_tag; then
+        # Errors printed by sub-function
+        return $OCF_ERR_GENERIC
+    fi

-        ocf_log notice "Cleanup of $OCF_RESKEY_vg_name successful"
+    if ! vgchange -ay $OCF_RESKEY_vg_name; then
+        ocf_log err "Failed to activate volume group, $OCF_RESKEY_vg_name"
+        ocf_log err "Attempting activation of logical volumes one-by-one."

-        if ! strip_and_add_tag ||
-           ! vgchange -ay $OCF_RESKEY_vg_name; then
-            ocf_log err "Failed second attempt to activate $OCF_RESKEY_vg_name"
-            return $OCF_ERR_GENERIC
-        fi
+        results=(`lvs -o name,attr --noheadings $OCF_RESKEY_vg_name 2> /dev/null`)
+        a=0
+        while [ ! -z ${results[$a]} ]; do
+            if [[ ${results[$(($a + 1))]} =~ r.......p ]] ||
+               [[ ${results[$(($a + 1))]} =~ R.......p ]]; then
+                # Attempt "partial" activation of any RAID LVs
+                ocf_log err "Attempting partial activation of ${OCF_RESKEY_vg_name}/${results[$a]}"
+                if ! lvchange -ay --partial ${OCF_RESKEY_vg_name}/${results[$a]}; then
+                    ocf_log err "Failed attempt to activate ${OCF_RESKEY_vg_name}/${results[$a]} in partial mode"
+                    return $OCF_ERR_GENERIC
+                fi
+                ocf_log notice "Activation of ${OCF_RESKEY_vg_name}/${results[$a]} in partial mode succeeded"
+            elif [[ ${results[$(($a + 1))]} =~ m.......p ]] ||
+                 [[ ${results[$(($a + 1))]} =~ M.......p ]]; then
+                ocf_log err "Attempting repair and activation of ${OCF_RESKEY_vg_name}/${results[$a]}"
+                if ! lvconvert --repair --use-policies ${OCF_RESKEY_vg_name}/${results[$a]}; then
+                    ocf_log err "Failed to repair ${OCF_RESKEY_vg_name}/${results[$a]}"
+                    return $OCF_ERR_GENERIC
+                fi
+                if ! lvchange -ay ${OCF_RESKEY_vg_name}/${results[$a]}; then
+                    ocf_log err "Failed to activate ${OCF_RESKEY_vg_name}/${results[$a]}"
+                    return $OCF_ERR_GENERIC
+                fi
+                ocf_log notice "Repair and activation of ${OCF_RESKEY_vg_name}/${results[$a]} succeeded"
+            else
+                ocf_log err "Attempting activation of non-redundant LV ${OCF_RESKEY_vg_name}/${results[$a]}"
+                if ! lvchange -ay ${OCF_RESKEY_vg_name}/${results[$a]}; then
+                    ocf_log err "Failed to activate ${OCF_RESKEY_vg_name}/${results[$a]}"
+                    return $OCF_ERR_GENERIC
+                fi
+                ocf_log notice "Successfully activated non-redundant LV ${OCF_RESKEY_vg_name}/${results[$a]}"
+            fi
+            a=$(($a + 2))
+        done

-        ocf_log notice "Second attempt to activate $OCF_RESKEY_vg_name successful"
         return $OCF_SUCCESS
     else
         # The activation commands succeeded, but did they do anything?
@@ -319,7 +369,7 @@ function vg_start_single
     done

     # We need to check the LVs again if we made the command resilient
-    if [ ! -z $resilience ]; then
+    if [ ! -z "$resilience" ]; then
         results=(`lvs -o name,attr --noheadings $OCF_RESKEY_vg_name $resilience 2> /dev/null`)
         a=0
         while [ ! -z ${results[$a]} ]; do
@@ -341,7 +391,15 @@ function vg_start_single
 ##
 function vg_start
 {
-    if [[ $(vgs -o attr --noheadings $OCF_RESKEY_vg_name) =~ .....c ]]; then
+    local a=0
+    local results
+
+    if [[ $(vgs -o attr --noheadings $OCF_RESKEY_vg_name) =~ ...p ]]; then
+        ocf_log err "Volume group "$OCF_RESKEY_vg_name" has PVs marked as missing"
+        restore_transient_failed_pvs
+    fi
+
+    if [[ "$(vgs -o attr --noheadings $OCF_RESKEY_vg_name)" =~ .....c ]]; then
         vg_start_clustered
     else
         vg_start_single
@@ -362,25 +420,46 @@ function vg_stop_clustered
     # Shut down the volume group
     # Do we need to make this resilient?
-    vgchange -aln $OCF_RESKEY_vg_name
+    a=0
+    while ! vgchange -aln $OCF_RESKEY_vg_name; do
+        a=$(($a + 1))
+        if [ $a -gt 10 ]; then
+            break;
+        fi
+        ocf_log err "Unable to deactivate $OCF_RESKEY_vg_name, retrying($a)"
+        sleep 1
+        which udevadm >& /dev/null && udevadm settle
+    done

     # Make sure all the logical volumes are inactive
+    active=0
     results=(`lvs -o name,attr --noheadings $OCF_RESKEY_vg_name 2> /dev/null`)
     a=0
     while [ ! -z ${results[$a]} ]; do
         if [[ ${results[$(($a + 1))]} =~ ....a. ]]; then
-            if [ "$self_fence" ]; then
-                ocf_log err "Unable to deactivate $lv_path REBOOT"
-                sync
-                reboot -fn
-            else
-                ocf_log err "Logical volume $OCF_RESKEY_vg_name/${results[$a]} failed to shutdown"
-            fi
-            return $OCF_ERR_GENERIC
+            active=1
+            break
         fi
-        a=$(($a + 2))
+        a=$(($a + 2))
     done

+    # lvs may not show active volumes if all PVs in VG are gone
+    dmsetup table | grep -q "^${OCF_RESKEY_vg_name//-/--}-[^-]"
+    if [ $? -eq 0 ]; then
+        active=1
+    fi
+
+    if [ $active -ne 0 ]; then
+        if [ "$self_fence" ]; then
+            ocf_log err "Unable to deactivate $lv_path REBOOT"
+            sync
+            reboot -fn
+        else
+            ocf_log err "Logical volume $OCF_RESKEY_vg_name/${results[$a]} failed to shutdown"
+        fi
+        return $OCF_ERR_GENERIC
+    fi
+
     return $OCF_SUCCESS
 }

@@ -398,25 +477,37 @@ function vg_stop_single

     # Shut down the volume group
     # Do we need to make this resilient?
-    vgchange -aln $OCF_RESKEY_vg_name
+    vgchange -an $OCF_RESKEY_vg_name

     # Make sure all the logical volumes are inactive
+    active=0
     results=(`lvs -o name,attr --noheadings $OCF_RESKEY_vg_name 2> /dev/null`)
     a=0
     while [ ! -z ${results[$a]} ]; do
         if [[ ${results[$(($a + 1))]} =~ ....a. ]]; then
-            if [ "$self_fence" ]; then
-                ocf_log err "Unable to deactivate $lv_path REBOOT"
-                sync
-                reboot -fn
-            else
-                ocf_log err "Logical volume $OCF_RESKEY_vg_name/${results[$a]} failed to shutdown"
-            fi
-            return $OCF_ERR_GENERIC
+            active=1
+            break
         fi
-        a=$(($a + 2))
+        a=$(($a + 2))
     done

+    # lvs may not show active volumes if all PVs in VG are gone
+    dmsetup table | grep -q "^${OCF_RESKEY_vg_name//-/--}-[^-]"
+    if [ $? -eq 0 ]; then
+        active=1
+    fi
+
+    if [ $active -ne 0 ]; then
+        if [ "$self_fence" ]; then
+            ocf_log err "Unable to deactivate $lv_path REBOOT"
+            sync
+            reboot -fn
+        else
+            ocf_log err "Logical volume $OCF_RESKEY_vg_name/${results[$a]} failed to shutdown"
+        fi
+        return $OCF_ERR_GENERIC
+    fi
+
     # Make sure we are the owner before we strip the tags
     vg_owner
     if [ $? -ne 0 ]; then
@@ -431,7 +522,7 @@ function vg_stop_single
 ##
 function vg_stop
 {
-    if [[ $(vgs -o attr --noheadings $OCF_RESKEY_vg_name) =~ .....c ]]; then
+    if [[ "$(vgs -o attr --noheadings $OCF_RESKEY_vg_name)" =~ .....c ]]; then
         vg_stop_clustered
     else
         vg_stop_single
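For context, the recurring `vgs -o attr ... =~ .....c` test above is how these agents choose between the CLVM variant (exclusive activation with `vgchange`/`lvchange -aey`) and the tagging variant (ownership tags added and removed with `--addtag`/`--deltag`): the sixth character of the VG attribute string is 'c' for a clustered volume group. A standalone sketch of that check, assuming only that $OCF_RESKEY_vg_name is set (not part of the patch):

    # Sketch of the clustered-VG check used throughout the agents.
    # locking_type=0 lets vgs run even when clvmd is unavailable.
    vg_attr=$(vgs -o attr --noheadings --config 'global{locking_type=0}' "$OCF_RESKEY_vg_name" 2>/dev/null)
    if [[ "$vg_attr" =~ .....c ]]; then
        echo "clustered VG: CLVM variant, exclusive activation"
    else
        echo "local VG: tagging variant, ownership tags"
    fi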