cluster: RHEL510 - HA LVM should only remove missing PVs on stop when they belong to mirrors
by Ryan McCabe
Gitweb: http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=d45a4fe2fa2...
Commit: d45a4fe2fa265a19a4130161caa4ce92367f9072
Parent: 1d7a0335016cae0246f455a707f78bfc5d1ff880
Author: John Ruemker <jruemker(a)redhat.com>
AuthorDate: Tue May 14 15:19:19 2013 -0500
Committer: Ryan McCabe <rmccabe(a)redhat.com>
CommitterDate: Mon May 20 23:46:49 2013 -0400
HA LVM should only remove missing PVs on stop when they belong to mirrors
This adds --mirrorsonly to the 3 'vgreduce --removemissing' calls in the
LVM agents.
You'll also notice that it adds another self_fence check after we fail to
remove tags. In my previous comment, I pointed out that in the case of
single-host by_lv, after we vgreduce we then can't deactivate the logical
volume again because it doesn't exist. This results in us executing
self_fence, which may have just been a happy accident. But when we avoid
making metadata changes by adding --mirrorsonly, the subsequent deactivation
is still successful, and thus we miss the self_fence logic. So, I added
another check so we still catch the failure and fence ourselves in this
situation.
Resolves: rhbz#962376
Signed-off-by: John Ruemker <jruemker(a)redhat.com>
Signed-off-by: Jonathan Brassow <jbrassow(a)redhat.com>
Signed-off-by: Ryan McCabe <rmccabe(a)redhat.com>
---
rgmanager/src/resources/lvm_by_lv.sh | 14 ++++++++++----
rgmanager/src/resources/lvm_by_vg.sh | 4 ++--
2 files changed, 12 insertions(+), 6 deletions(-)
diff --git a/rgmanager/src/resources/lvm_by_lv.sh b/rgmanager/src/resources/lvm_by_lv.sh
index 4971173..7140076 100644
--- a/rgmanager/src/resources/lvm_by_lv.sh
+++ b/rgmanager/src/resources/lvm_by_lv.sh
@@ -243,13 +243,19 @@ lv_activate_and_tag()
# be removed from the VG via a separate call before
# the tag can be removed.
ocf_log err "Attempting volume group clean-up and retry"
- vgreduce --removemissing --force $OCF_RESKEY_vg_name
+ vgreduce --removemissing --mirrorsonly --force $OCF_RESKEY_vg_name
# Retry tag deletion
lvchange --deltag $tag $lv_path
if [ $? -ne 0 ]; then
- ocf_log err "Failed to delete tag from $lv_path"
- return $OCF_ERR_GENERIC
+ if [ "$self_fence" ]; then
+ ocf_log err "Failed to delete tag from $lv_path: REBOOTING"
+ sync
+ reboot -fn
+ else
+ ocf_log err "Failed to delete tag from $lv_path"
+ fi
+ return $OCF_ERR_GENERIC
fi
fi
@@ -322,7 +328,7 @@ lv_activate()
ocf_log notice "Attempting cleanup of $OCF_RESKEY_vg_name"
- if vgreduce --removemissing --force --config \
+ if vgreduce --removemissing --mirrorsonly --force --config \
"activation { volume_list = \"$OCF_RESKEY_vg_name\" }" \
$OCF_RESKEY_vg_name; then
ocf_log notice "$OCF_RESKEY_vg_name now consistent"
diff --git a/rgmanager/src/resources/lvm_by_vg.sh b/rgmanager/src/resources/lvm_by_vg.sh
index 0dd2aaa..819d0b8 100755
--- a/rgmanager/src/resources/lvm_by_vg.sh
+++ b/rgmanager/src/resources/lvm_by_vg.sh
@@ -202,7 +202,7 @@ function vg_start_clustered
ocf_log err "Failed to activate volume group, $OCF_RESKEY_vg_name"
ocf_log notice "Attempting cleanup of $OCF_RESKEY_vg_name"
- if ! vgreduce --removemissing --force $OCF_RESKEY_vg_name; then
+ if ! vgreduce --removemissing --mirrorsonly --force $OCF_RESKEY_vg_name; then
ocf_log err "Failed to make $OCF_RESKEY_vg_name consistent"
return $OCF_ERR_GENERIC
fi
@@ -398,7 +398,7 @@ function vg_stop_single
# Shut down the volume group
# Do we need to make this resilient?
- vgchange -an $OCF_RESKEY_vg_name
+ vgchange -aln $OCF_RESKEY_vg_name
# Make sure all the logical volumes are inactive
results=(`lvs -o name,attr --noheadings $OCF_RESKEY_vg_name 2> /dev/null`)
10 years, 12 months
cluster: RHEL510 - Med: Don't preserve SELinux context when copying files to /var/lib/nfs/sm
by Ryan McCabe
Gitweb: http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=0c3782fd3fa...
Commit: 0c3782fd3fa61e9ceea938cc4f61e8572c6916c8
Parent: d45a4fe2fa265a19a4130161caa4ce92367f9072
Author: Ryan McCabe <rmccabe(a)redhat.com>
AuthorDate: Mon May 20 15:16:02 2013 -0400
Committer: Ryan McCabe <rmccabe(a)redhat.com>
CommitterDate: Mon May 20 23:46:49 2013 -0400
Med: Don't preserve SELinux context when copying files to /var/lib/nfs/sm
Pass the flags -Rdpf instead of -af to cp when copying files to
/var/lib/nfs/sm so that the SELinux context is inherited from the
target directory and not preserved from the files being copied.
Resolves: rhbz#907898
Signed-off-by: Ryan McCabe <rmccabe(a)redhat.com>
---
rgmanager/src/resources/svclib_nfslock | 8 ++++----
1 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/rgmanager/src/resources/svclib_nfslock b/rgmanager/src/resources/svclib_nfslock
index 2ec7bdd..fbfa94e 100644
--- a/rgmanager/src/resources/svclib_nfslock
+++ b/rgmanager/src/resources/svclib_nfslock
@@ -228,7 +228,7 @@ notify_list_store()
fi
owner=$(ls -dl /var/lib/nfs/statd/sm | awk '{print $3"."$4}')
- cp -af /var/lib/nfs/statd/sm/* $nl_dir/sm
+ cp -Rdpf /var/lib/nfs/statd/sm/* $nl_dir/sm
chown -R $owner $nl_dir
return 0
elif [ -d "/var/lib/nfs/sm" ]; then
@@ -238,7 +238,7 @@ notify_list_store()
fi
owner=$(ls -dl /var/lib/nfs/sm | awk '{print $3"."$4}')
- cp -af /var/lib/nfs/sm/* $nl_dir/sm
+ cp -Rdpf /var/lib/nfs/sm/* $nl_dir/sm
chown -R $owner $nl_dir
return 0
fi
@@ -265,12 +265,12 @@ notify_list_merge()
if [ -d "/var/lib/nfs/statd/sm" ]; then
owner=$(ls -dl /var/lib/nfs/statd/sm | awk '{print $3"."$4}')
- cp -af $nl_dir/sm/* /var/lib/nfs/statd/sm
+ cp -Rdpf $nl_dir/sm/* /var/lib/nfs/statd/sm
chown -R $owner $nl_dir
return 0
elif [ -d "/var/lib/nfs/sm" ]; then
owner=$(ls -dl /var/lib/nfs/sm | awk '{print $3"."$4}')
- cp -af $nl_dir/sm/* /var/lib/nfs/sm
+ cp -Rdpf $nl_dir/sm/* /var/lib/nfs/sm
chown -R $owner $nl_dir
return 0
fi
10 years, 12 months
cluster: RHEL510 - HA LVM should only remove missing PVs on stop when they belong to mirrors
by Ryan McCabe
Gitweb: http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=d45a4fe2fa2...
Commit: d45a4fe2fa265a19a4130161caa4ce92367f9072
Parent: 1d7a0335016cae0246f455a707f78bfc5d1ff880
Author: John Ruemker <jruemker(a)redhat.com>
AuthorDate: Tue May 14 15:19:19 2013 -0500
Committer: Ryan McCabe <rmccabe(a)redhat.com>
CommitterDate: Mon May 20 23:46:49 2013 -0400
HA LVM should only remove missing PVs on stop when they belong to mirrors
This adds --mirrorsonly to the 3 'vgreduce --removemissing' calls in the
LVM agents.
You'll also notice that it adds another self_fence check after we fail to
remove tags. In my previous comment, I pointed out that in the case of
single-host by_lv, after we vgreduce we then can't deactivate the logical
volume again because it doesn't exist. This results in us executing
self_fence, which may have just been a happy accident. But when we avoid
making metadata changes by adding --mirrorsonly, the subsequent deactivation
is still successful, and thus we miss the self_fence logic. So, I added
another check so we still catch the failure and fence ourselves in this
situation.
Resolves: rhbz#962376
Signed-off-by: John Ruemker <jruemker(a)redhat.com>
Signed-off-by: Jonathan Brassow <jbrassow(a)redhat.com>
Signed-off-by: Ryan McCabe <rmccabe(a)redhat.com>
---
rgmanager/src/resources/lvm_by_lv.sh | 14 ++++++++++----
rgmanager/src/resources/lvm_by_vg.sh | 4 ++--
2 files changed, 12 insertions(+), 6 deletions(-)
diff --git a/rgmanager/src/resources/lvm_by_lv.sh b/rgmanager/src/resources/lvm_by_lv.sh
index 4971173..7140076 100644
--- a/rgmanager/src/resources/lvm_by_lv.sh
+++ b/rgmanager/src/resources/lvm_by_lv.sh
@@ -243,13 +243,19 @@ lv_activate_and_tag()
# be removed from the VG via a separate call before
# the tag can be removed.
ocf_log err "Attempting volume group clean-up and retry"
- vgreduce --removemissing --force $OCF_RESKEY_vg_name
+ vgreduce --removemissing --mirrorsonly --force $OCF_RESKEY_vg_name
# Retry tag deletion
lvchange --deltag $tag $lv_path
if [ $? -ne 0 ]; then
- ocf_log err "Failed to delete tag from $lv_path"
- return $OCF_ERR_GENERIC
+ if [ "$self_fence" ]; then
+ ocf_log err "Failed to delete tag from $lv_path: REBOOTING"
+ sync
+ reboot -fn
+ else
+ ocf_log err "Failed to delete tag from $lv_path"
+ fi
+ return $OCF_ERR_GENERIC
fi
fi
@@ -322,7 +328,7 @@ lv_activate()
ocf_log notice "Attempting cleanup of $OCF_RESKEY_vg_name"
- if vgreduce --removemissing --force --config \
+ if vgreduce --removemissing --mirrorsonly --force --config \
"activation { volume_list = \"$OCF_RESKEY_vg_name\" }" \
$OCF_RESKEY_vg_name; then
ocf_log notice "$OCF_RESKEY_vg_name now consistent"
diff --git a/rgmanager/src/resources/lvm_by_vg.sh b/rgmanager/src/resources/lvm_by_vg.sh
index 0dd2aaa..819d0b8 100755
--- a/rgmanager/src/resources/lvm_by_vg.sh
+++ b/rgmanager/src/resources/lvm_by_vg.sh
@@ -202,7 +202,7 @@ function vg_start_clustered
ocf_log err "Failed to activate volume group, $OCF_RESKEY_vg_name"
ocf_log notice "Attempting cleanup of $OCF_RESKEY_vg_name"
- if ! vgreduce --removemissing --force $OCF_RESKEY_vg_name; then
+ if ! vgreduce --removemissing --mirrorsonly --force $OCF_RESKEY_vg_name; then
ocf_log err "Failed to make $OCF_RESKEY_vg_name consistent"
return $OCF_ERR_GENERIC
fi
@@ -398,7 +398,7 @@ function vg_stop_single
# Shut down the volume group
# Do we need to make this resilient?
- vgchange -an $OCF_RESKEY_vg_name
+ vgchange -aln $OCF_RESKEY_vg_name
# Make sure all the logical volumes are inactive
results=(`lvs -o name,attr --noheadings $OCF_RESKEY_vg_name 2> /dev/null`)
10 years, 12 months
cluster: RHEL510 - cman: enable and document logging of CMAN subsystem (additional fix)
by Jan Pokorný
Gitweb: http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=1d7a0335016...
Commit: 1d7a0335016cae0246f455a707f78bfc5d1ff880
Parent: 7473b8cf0199b7f0e819cd78bd53f215ed371095
Author: Jan Pokorný <jpokorny(a)redhat.com>
AuthorDate: Mon May 20 20:51:42 2013 +0200
Committer: Jan Pokorný <jpokorny(a)redhat.com>
CommitterDate: Mon May 20 20:51:42 2013 +0200
cman: enable and document logging of CMAN subsystem (additional fix)
Related: rhbz#963251
Signed-off-by: Jan Pokorný <jpokorny(a)redhat.com>
---
cman/daemon/logging.c | 1 +
1 files changed, 1 insertions(+), 0 deletions(-)
diff --git a/cman/daemon/logging.c b/cman/daemon/logging.c
index 4f6e731..44c3d0f 100644
--- a/cman/daemon/logging.c
+++ b/cman/daemon/logging.c
@@ -81,6 +81,7 @@ void log_debug(int subsys, int stamp, const char *fmt, ...)
strcpy(newfmt, "ais: ");
break;
default:
+ newfmt[0] = '\0';
break;
}
}
10 years, 12 months
cluster: RHEL59 - rgmanager: Update the Oracle resource agents to support Oracle 11g
by Ryan McCabe
Gitweb: http://git.fedorahosted.org/git/?p=cluster.git;a=commitdiff;h=3ac6fa12877...
Commit: 3ac6fa12877780c28b07c9d2402687f878ba374e
Parent: c830187908b759b5b5203efc0fa441fc401b7f1b
Author: Ryan McCabe <rmccabe(a)redhat.com>
AuthorDate: Thu May 16 16:51:55 2013 -0400
Committer: Ryan McCabe <rmccabe(a)redhat.com>
CommitterDate: Mon May 20 14:21:13 2013 -0400
rgmanager: Update the Oracle resource agents to support Oracle 11g
Add Oracle 11g support to the orainstance, oralistener, and oracledb
resource agents.
Resolves: rhbz#964991
Signed-off-by: Ryan McCabe <rmccabe(a)redhat.com>
---
rgmanager/src/resources/oracledb.sh | 428 ++++++++++++++++++++------------
rgmanager/src/resources/orainstance.sh | 298 +++++++++++++---------
rgmanager/src/resources/oralistener.sh | 224 ++++++++++-------
3 files changed, 578 insertions(+), 372 deletions(-)
diff --git a/rgmanager/src/resources/oracledb.sh b/rgmanager/src/resources/oracledb.sh
index bcb367f..2bbb807 100755
--- a/rgmanager/src/resources/oracledb.sh
+++ b/rgmanager/src/resources/oracledb.sh
@@ -1,22 +1,27 @@
#!/bin/bash
#
-# Copyright 2003-2004, 2006-2008 Red Hat, Inc.
+# Copyright (C) 1997-2003 Sistina Software, Inc. All rights reserved.
+# Copyright (C) 2004-2013 Red Hat, Inc. All rights reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#
# Author(s):
# Hardy Merrill <hmerrill at redhat.com>
# Lon Hohberger <lhh at redhat.com>
# Michael Moon <Michael dot Moon at oracle.com>
-#
-# This program is Open Source software. You may modify and/or redistribute
-# it persuant to the terms of the Open Software License version 2.1, which
-# is available from the following URL and is included herein by reference:
-#
-# http://opensource.org/licenses/osl-2.1.php
-#
-# chkconfig: 345 99 01
-# description: Service script for starting/stopping \
-# Oracle(R) Database 10g on \
-# Red Hat Enterprise Linux 5
+# Ryan McCabe <rmccabe at redhat.com>
#
# NOTES:
#
@@ -35,15 +40,16 @@
# Oracle is a registered trademark of Oracle Corporation.
# Oracle9i is a trademark of Oracle Corporation.
# Oracle10g is a trademark of Oracle Corporation.
+# Oracle11g is a trademark of Oracle Corporation.
# All other trademarks are property of their respective owners.
#
-. /etc/init.d/functions
+. $(dirname $0)/ocf-shellfuncs
+. $(dirname $0)/utils/config-utils.sh
+. $(dirname $0)/utils/messages.sh
+. $(dirname $0)/utils/ra-skelet.sh
-#
-# Source stuff from /etc/sysconfig, but this may be overridden if
-# this is being called as a cluster resource agent instead.
-#. /etc/sysconfig/oracledb
+. /etc/init.d/functions
declare SCRIPT="`basename $0`"
declare SCRIPTDIR="`dirname $0`"
@@ -122,6 +128,15 @@ declare -r DB_PROCNAMES="pmon"
declare -r LSNR_PROCNAME="tnslsnr"
#declare -r LSNR_PROCNAME="tnslsnrXX" # testing
+# clulog will not log messages when run by the oracle user.
+# This is a hack to work around that.
+if [ "`id -u`" = "`id -u $ORACLE_USER`" ]; then
+ ocf_log() {
+ prio=$1
+ shift
+ logger -i -p daemon."$prio" -- "$*"
+ }
+fi
##########################################################
# (Hopefully) No user-serviceable parts below this line. #
@@ -134,10 +149,10 @@ meta_data()
<version>1.0</version>
<longdesc lang="en">
- Oracle 10g Failover Instance
+ Oracle 10g/11g Failover Instance
</longdesc>
<shortdesc lang="en">
- Oracle 10g Failover Instance
+ Oracle 10g/11g Failover Instance
</shortdesc>
<parameters>
@@ -164,7 +179,6 @@ meta_data()
<content type="string"/>
</parameter>
-
<parameter name="user" required="1">
<longdesc lang="en">
Oracle user name. This is the user name of the Oracle
@@ -191,8 +205,10 @@ meta_data()
<longdesc lang="en">
This is the Oracle installation type:
base - Database Instance and Listener only
+ base-11g - Oracle11g Database Instance and Listener Only
base-em (or 10g) - Database, Listener, Enterprise Manager,
and iSQL*Plus
+ base-em-11g - Database, Listener, Enterprise Manager dbconsole
ias (or 10g-ias) - Internet Application Server (Infrastructure)
</longdesc>
<shortdesc lang="en">
@@ -230,7 +246,7 @@ meta_data()
<action name="monitor" depth="10" timeout="30" interval="30"/>
<action name="meta-data" timeout="5"/>
- <action name="verify-all" timeout="5"/>
+ <action name="validate-all" timeout="5"/>
</actions>
<special tag="rgmanager">
@@ -240,84 +256,55 @@ meta_data()
EOT
}
-
-#
-# "action"-like macro supporting functions
-#
-faction()
-{
- echo -n "$1"
- shift
- $*
- if [ $? -eq 0 ]; then
- echo_success
- echo
- return 0
- fi
-
- echo_failure
- echo
- return 1
-}
-
-
#
-# Start Oracle9i (database portion)
+# Start Oracle9i/10g/11g (database portion)
#
start_db()
{
- declare tmpfile
- declare logfile
declare -i rv
+ declare startup_cmd
+ declare startup_stdout
- tmpfile=$(mktemp /tmp/$SCRIPT-start.tmp.XXXXXX)
- logfile=$(mktemp /tmp/$SCRIPT-start.log.XXXXXX)
+ ocf_log info "Starting Oracle DB $ORACLE_SID"
- #
# Set up our sqlplus script. Basically, we're trying to
# capture output in the hopes that it's useful in the case
# that something doesn't work properly.
- #
- echo "startup" > $tmpfile
- echo "quit" >> $tmpfile
-
- sqlplus "/ as sysdba" < $tmpfile &> $logfile
+ startup_cmd="set heading off;\nstartup;\nquit;\n"
+ startup_stdout=$(echo -e "$startup_cmd" | sqlplus -S "/ as sysdba")
rv=$?
- # Dump logfile to /var/log/messages
- initlog -q -c "cat $logfile"
-
+ # Dump output to syslog for debugging
+ ocf_log debug "[$ORACLE_SID] [$rv] sent $startup_cmd"
+ ocf_log debug "[$ORACLE_SID] [$rv] got $startup_stdout"
+
if [ $rv -ne 0 ]; then
- echo "ORACLE_HOME Incorrectly set?"
- echo "See $logfile for more information."
- return 1
+ ocf_log error "Starting Oracle DB $ORACLE_SID failed, sqlplus returned $rv"
+ return 1
fi
- #
# If we see:
# ORA-.....: failure, we failed
- #
-
- rm -f $tmpfile
- grep -q "^ORA-" $logfile
- if [ $? -eq 0 ]; then
- rm -f $tmpfile
- echo "ORACLE_SID Incorrectly set?"
- echo "See $logfile for more information."
- return 1
+ # Troubleshooting:
+ # ORA-00845 - Try rm -f /dev/shm/ora_*
+ # ORA-01081 - Try echo -e 'shutdown abort;\nquit;'|sqlplus "/ as sysdba"
+ if [[ "$startup_stdout" =~ "ORA-" ]] || [[ "$startup_stdout" =~ "failure" ]]; then
+ ocf_log error "Starting Oracle DB $ORACLE_SID failed, found errors in stdout"
+ return 1
fi
+ ocf_log info "Started Oracle DB $ORACLE_SID successfully"
return 0
}
#
-# Stop Oracle9i (database portion)
+# Stop Oracle (database portion)
#
stop_db()
{
- declare tmpfile
- declare logfile
+ declare stop_cmd
+ declare stop_stdout
declare -i rv
declare how_shutdown="$1"
@@ -325,38 +312,30 @@ stop_db()
how_shutdown="immediate"
fi
- tmpfile=$(mktemp /tmp/$SCRIPT-stop.tmp.XXXXXX)
- logfile=$(mktemp /tmp/$SCRIPT-stop.log.XXXXXX)
+ ocf_log info "Stopping Oracle DB $ORACLE_SID $how_shutdown"
# Setup for Stop ...
- echo "shutdown $how_shutdown" > $tmpfile
- echo "quit" >> $tmpfile
-
- sqlplus "/ as sysdba" < $tmpfile &> $logfile
+ stop_cmd="set heading off;\nshutdown $how_shutdown;\nquit;\n"
+ stop_stdout=$(echo -e "$stop_cmd" | sqlplus -S "/ as sysdba")
rv=$?
- # Dump logfile to /var/log/messages
- initlog -q -c "cat $logfile"
-
+ # Log stdout of the stop command
+ ocf_log debug "[$ORACLE_SID] sent stop command $stop_cmd"
+ ocf_log debug "[$ORACLE_SID] got $stop_stdout"
+
+ # sqlplus returned failure. We'll return failed to rhcs
if [ $rv -ne 0 ]; then
- echo "ORACLE_HOME Incorrectly set?"
- echo "See $logfile for more information."
- return 1
+ ocf_log error "Stopping Oracle DB $ORACLE_SID failed, sqlplus returned $rv"
+ return 1
fi
- #
- # If we see 'failure' in the log, we're done.
- #
- rm -f $tmpfile
- grep -q "^ORA-" $logfile
- if [ $? -eq 0 ]; then
- echo_failure
- echo
- echo "Possible reason: ORACLE_SID Incorrectly set."
- echo "See $logfile for more information."
- return 1
+ # If the shutdown output ($stop_stdout) contains 'ORA-' or 'failure', the stop failed.
+ if [[ "$stop_stdout" =~ "ORA-" ]] || [[ "$stop_stdout" =~ "failure" ]]; then
+ ocf_log error "Stopping Oracle DB $ORACLE_SID failed, errors in stdout"
+ return 1
+ fi
+ ocf_log info "Stopped Oracle DB $ORACLE_SID successfully"
return 0
}
@@ -372,12 +351,15 @@ force_cleanup()
# Patch from Shane Bradley to fix 471266
pids=`ps ax | grep $ORACLE_HOME | grep "ora_.*_${ORACLE_SID}" | grep -v grep | awk '{print $1}'`
- initlog -n $SCRIPT -s "<err> Not all Oracle processes exited cleanly, killing"
-
+ ocf_log error "Not all Oracle processes for $ORACLE_SID exited cleanly, killing"
+
for pid in $pids; do
kill -9 $pid
- if [ $? -eq 0 ]; then
- initlog -n $SCRIPT -s "Killed $pid"
+ rv=$?
+ if [ $rv -eq 0 ]; then
+ ocf_log info "Cleanup $ORACLE_SID Killed PID $pid"
+ else
+ ocf_log error "Cleanup $ORACLE_SID Kill PID $pid failed: $rv"
fi
done
@@ -392,14 +374,19 @@ force_cleanup()
exit_idle()
{
declare -i n=0
+
+ ocf_log debug "Waiting for Oracle processes for $ORACLE_SID to terminate..."
while ps ax | grep $ORACLE_HOME | grep -q -v grep; do
if [ $n -ge 90 ]; then
+ ocf_log debug "Timed out while waiting for Oracle processes for $ORACLE_SID to terminate"
force_cleanup
return 0
fi
sleep 1
((n++))
done
+
+ ocf_log debug "All Oracle processes for $ORACLE_SID have terminated"
return 0
}
@@ -438,24 +425,27 @@ get_db_status()
for (( i=$RESTART_RETRIES ; i; i-- )) ; do
# this db process is down - stop and
# (re)start all ora_XXXX_$ORACLE_SID processes
- initlog -q -n $SCRIPT -s "Restarting Oracle Database..."
+ ocf_log info "Restarting Oracle Database $ORACLE_SID"
stop_db immediate
- if [ $? != 0 ] ; then
+ if [ $? -ne 0 ] ; then
# stop failed - return 1
+ ocf_log error "Error stopping Oracle Database $ORACLE_SID"
return 1
fi
start_db
- if [ $? == 0 ] ; then
+ if [ $? -eq 0 ] ; then
# ora_XXXX_$ORACLE_SID processes started
# successfully, so break out of the
# stop/start # 'for' loop
+ ocf_log info "Restarted Oracle Database $ORACLE_SID successfully"
break
fi
done
if [ $i -eq 0 ]; then
# stop/start's failed - return 1 (failure)
+ ocf_log error "Failed to restart Oracle Database $ORACLE_SID after $RESTART_RETRIES tries"
return 1
fi
done
@@ -471,42 +461,46 @@ get_lsnr_status()
declare -i subsys_lock=$1
declare -i rv
- status $LSNR_PROCNAME
+ ocf_log debug "Checking status for listener $ORACLE_LISTENER"
+ lsnrctl status "$ORACLE_LISTENER" >& /dev/null
rv=$?
- if [ $rv == 0 ] ; then
+ if [ $rv -eq 0 ] ; then
+ ocf_log debug "Listener $ORACLE_LISTENER is up"
return 0 # Listener is running fine
fi
- #
# We're not supposed to be running, and we are,
# in fact, not running. Return 3
- #
if [ $subsys_lock -ne 0 ]; then
+ ocf_log debug "Listener $ORACLE_LISTENER is stopped as expected"
return 3
fi
- #
# Listener is NOT running (but should be) - try to restart
- #
for (( i=$RESTART_RETRIES ; i; i-- )) ; do
-
- action "Restarting Oracle listener:" lsnrctl start \
- $ORACLE_LISTENER
- lsnrctl status $ORACLE_LISTENER >& /dev/null
- if [ $? == 0 ] ; then
+ ocf_log info "Listener $ORACLE_LISTENER is down, attempting to restart"
+ lsnrctl start "$ORACLE_LISTENER" >& /dev/null
+ lsnrctl status "$ORACLE_LISTENER" >& /dev/null
+ if [ $? -eq 0 ] ; then
+ ocf_log info "Listener $ORACLE_LISTENER was restarted successfully"
break # Listener was (re)started and is running fine
fi
done
if [ $i -eq 0 ]; then
# stop/start's failed - return 1 (failure)
+ ocf_log error "Failed to restart listener $ORACLE_LISTENER after $RESTART_RETRIES tries"
return 1
fi
- status $LSNR_PROCNAME
- if [ $? != 0 ] ; then
+ lsnrctl_stdout=$(lsnrctl status "$ORACLE_LISTENER")
+ rv=$?
+ if [ $rv -ne 0 ] ; then
+ ocf_log error "Starting listener $ORACLE_LISTENER failed: $rv output $lsnrctl_stdout"
return 1 # Problem restarting the Listener
fi
+
+ ocf_log info "Listener $ORACLE_LISTENER started successfully"
return 0 # Success restarting the Listener
}
@@ -541,12 +535,12 @@ get_opmn_proc_status()
_status=`echo $_status | cut -f2 -d' '`
if [ "${_status}" == "Alive" ] || [ "${_status}" == "Init" ]; then
if [ $i -lt $RESTART_RETRIES ] ; then
- echo " $comp$type_pretty restarted"
+ ocf_log info "$comp$type_pretty restarted"
fi
- echo " $comp$type_pretty (pid $_pid) is running..."
+ ocf_log info "$comp$type_pretty (pid $_pid) is running..."
break
else
- echo " $comp$type_pretty is stopped"
+ ocf_log info "$comp$type_pretty is stopped"
#
# Try to restart it, but don't worry if we fail. OPMN
@@ -564,6 +558,7 @@ get_opmn_proc_status()
if [ $i -eq 0 ]; then
# restarts failed - return 1 (failure)
+ ocf_log error "Failed to restart OPMN process $comp"
return 1
fi
@@ -584,7 +579,7 @@ get_opmn_status()
#
# OPMN not running??
#
- echo "opmn is stopped"
+ ocf_log info "OPMN is stopped"
if [ $subsys_lock -eq 0 ]; then
#
@@ -600,8 +595,8 @@ get_opmn_status()
#
# Print out the PIDs for everyone.
#
- echo "opmn is running..."
- echo "opmn components:"
+ ocf_log info "OPMN is running..."
+ ocf_log info "opmn components:"
#
# Check the OPMN-managed processes
@@ -615,6 +610,7 @@ get_opmn_status()
# restarted.
#
if [ $ct_errors -ne 0 ]; then
+ ocf_log error "$ct_errors errors occurred while restarting OPMN-managed processes"
return 1
fi
return 0
@@ -659,10 +655,7 @@ update_status()
#
oops()
{
- echo "Please configure this script ($0) to"
- echo "match your installation."
- echo
- echo " $1 failed validation checks."
+ ocf_log error "$ORACLE_SID: Fatal: $1 failed validation checks"
exit 1
}
@@ -673,6 +666,8 @@ oops()
#
validation_checks()
{
+ ocf_log debug "Validating configuration for $ORACLE_SID"
+
#
# If the oracle user doesn't exist, we're done.
#
@@ -701,8 +696,12 @@ validation_checks()
ORACLE_TYPE="base-em"
elif [ "$ORACLE_TYPE" = "10g-ias" ] || [ "$ORACLE_TYPE" = "ias" ]; then
ORACLE_TYPE="ias"
+ elif [ "$ORACLE_TYPE" = "11g" ] || [ "$ORACLE_TYPE" = "base-em-11g" ]; then
+ ORACLE_TYPE="base-em-11g"
+ elif [ "$ORACLE_TYPE" = "base-11g" ]; then
+ ORACLE_TYPE="base-11g"
else
- oops ORACLE_TYPE
+ oops "ORACLE_TYPE $ORACLE_TYPE"
fi
#
@@ -715,7 +714,7 @@ validation_checks()
# Oracle needs to be run as the Oracle user, not root!
#
if [ "`id -u`" = "0" ]; then
- echo "Restarting $0 as $ORACLE_USER."
+ #echo "Restarting $0 as $ORACLE_USER."
#
# Breaks on RHEL5
# exec sudo -u $ORACLE_USER $0 $*
@@ -727,75 +726,191 @@ validation_checks()
#
# If we're not root and not the Oracle user, we're done.
#
- [ "`id -u`" = "`id -u $ORACLE_USER`" ] || exit 1
- [ "`id -g`" = "`id -g $ORACLE_USER`" ] || exit 1
+ [ "`id -u`" = "`id -u $ORACLE_USER`" ] || oops "not ORACLE_USER after su"
+ [ "`id -g`" = "`id -g $ORACLE_USER`" ] || oops "not ORACLE_GROUP after su"
#
# Go home.
#
- cd $ORACLE_HOME
+ cd "$ORACLE_HOME"
+ ocf_log debug "Validation checks for $ORACLE_SID succeeded"
return 0
}
#
-# Start Oracle9i Application Server Infrastructure
+# Start Oracle 9i/10g/11g Application Server Infrastructure
#
start_oracle()
{
- faction "Starting Oracle Database:" start_db || return 1
- action "Starting Oracle Listener:" lsnrctl start $ORACLE_LISTENER || return 1
+ ocf_log info "Starting service $ORACLE_SID"
+
+ start_db
+ rv=$?
+ if [ $rv -ne 0 ]; then
+ ocf_log error "Starting service $ORACLE_SID failed"
+ return 1
+ fi
+
+ ocf_log info "Starting listener $ORACLE_LISTENER"
+ lsnrctl_stdout=$(lsnrctl start "$ORACLE_LISTENER")
+ rv=$?
+ if [ $rv -ne 0 ]; then
+ ocf_log debug "[$ORACLE_SID] Listener $ORACLE_LISTENER start returned $rv output $lsnrctl_stdout"
+ ocf_log error "Starting service $ORACLE_SID failed"
+ return 1
+ fi
if [ "$ORACLE_TYPE" = "base-em" ]; then
- action "Starting iSQL*Plus:" isqlplusctl start || return 1
- action "Starting Oracle EM DB Console:" emctl start dbconsole || return 1
+ ocf_log info "Starting iSQL*Plus for $ORACLE_SID"
+ isqlplusctl start
+ if [ $? -ne 0 ]; then
+ ocf_log error "iSQL*Plus startup for $ORACLE_SID failed"
+ ocf_log error "Starting service $ORACLE_SID failed"
+ return 1
+ else
+ ocf_log info "iSQL*Plus startup for $ORACLE_SID succeeded"
+ fi
+
+ ocf_log info "Starting Oracle EM DB Console for $ORACLE_SID"
+ emctl start dbconsole
+ if [ $? -ne 0 ]; then
+ ocf_log error "Oracle EM DB Console startup for $ORACLE_SID failed"
+ ocf_log error "Starting service $ORACLE_SID failed"
+ return 1
+ else
+ ocf_log info "Oracle EM DB Console startup for $ORACLE_SID succeeded"
+ fi
elif [ "$ORACLE_TYPE" = "ias" ]; then
- action "Starting Oracle EM:" emctl start em || return 1
- action "Starting iAS Infrastructure:" opmnctl startall || return 1
+ ocf_log info "Starting Oracle EM for $ORACLE_SID"
+ emctl start em
+ if [ $? -ne 0 ]; then
+ ocf_log error "Oracle EM startup for $ORACLE_SID failed"
+ ocf_log error "Starting service $ORACLE_SID failed"
+ return 1
+ else
+ ocf_log info "Oracle EM startup for $ORACLE_SID succeeded"
+ fi
+
+ ocf_log info "Starting iAS Infrastructure for $ORACLE_SID"
+ opmnctl startall
+ if [ $? -ne 0 ]; then
+ ocf_log error "iAS Infrastructure startup for $ORACLE_SID failed"
+ ocf_log error "Starting service $ORACLE_SID failed"
+ return 1
+ else
+ ocf_log info "iAS Infrastructure startup for $ORACLE_SID succeeded"
+ fi
+ elif [ "$ORACLE_TYPE" = "base-em-11g" ]; then
+ ocf_log info "Starting Oracle EM DB Console for $ORACLE_SID"
+ emctl start dbconsole
+ if [ $? -ne 0 ]; then
+ ocf_log error "Oracle EM DB Console startup for $ORACLE_SID failed"
+ ocf_log error "Starting service $ORACLE_SID failed"
+ return 1
+ else
+ ocf_log info "Oracle EM DB Console startup for $ORACLE_SID succeeded"
+ fi
fi
if [ -n "$LOCKFILE" ]; then
- touch $LOCKFILE
+ touch "$LOCKFILE"
fi
+
+ ocf_log info "Starting service $ORACLE_SID completed successfully"
return 0
}
#
-# Stop Oracle9i Application Server Infrastructure
+# Stop Oracle 9i/10g/11g Application Server Infrastructure
#
stop_oracle()
{
+ ocf_log info "Stopping service $ORACLE_SID"
+
if ! [ -e "$ORACLE_HOME/bin/lsnrctl" ]; then
- echo "Oracle Listener Control is not available"
- echo " ($ORACLE_HOME not mounted?)"
+ ocf_log error "Oracle Listener Control is not available ($ORACLE_HOME not mounted?)"
return 0
fi
if [ "$ORACLE_TYPE" = "base-em" ]; then
- action "Stopping Oracle EM DB Console:" emctl stop dbconsole || return 1
- action "Stopping iSQL*Plus:" isqlplusctl stop || return 1
+ ocf_log info "Stopping Oracle EM DB Console for $ORACLE_SID"
+ emctl stop dbconsole
+ if [ $? -ne 0 ]; then
+ ocf_log error "Stopping Oracle EM DB Console for $ORACLE_SID failed"
+ ocf_log error "Stopping service $ORACLE_SID failed"
+ return 1
+ else
+ ocf_log info "Stopping Oracle EM DB Console for $ORACLE_SID succeeded"
+ fi
+
+ ocf_log info "Stopping iSQL*Plus for $ORACLE_SID"
+ isqlplusctl stop
+ if [ $? -ne 0 ]; then
+ ocf_log error "Stopping iSQL*Plus for $ORACLE_SID failed"
+ ocf_log error "Stopping service $ORACLE_SID failed"
+ return 1
+ else
+ ocf_log info "Stopping iSQL*Plus for $ORACLE_SID succeeded"
+ fi
elif [ "$ORACLE_TYPE" = "ias" ]; then
- action "Stopping iAS Infrastructure:" opmnctl stopall || return 1
- action "Stopping Oracle EM:" emctl stop em || return 1
+ ocf_log info "Stopping iAS Infrastructure for $ORACLE_SID"
+ opmnctl stopall
+ if [ $? -ne 0 ]; then
+ ocf_log error "Stopping iAS Infrastructure for $ORACLE_SID failed"
+ ocf_log error "Stopping service $ORACLE_SID failed"
+ return 1
+ else
+ ocf_log info "Stopping iAS Infrastructure for $ORACLE_SID succeeded"
+ fi
+
+ ocf_log info "Stopping Oracle EM for $ORACLE_SID"
+ emctl stop em
+ if [ $? -ne 0 ]; then
+ ocf_log error "Stopping Oracle EM for $ORACLE_SID failed"
+ ocf_log error "Stopping service $ORACLE_SID failed"
+ return 1
+ else
+ ocf_log info "Stopping Oracle EM for $ORACLE_SID succeeded"
+ fi
+ elif [ "$ORACLE_TYPE" = "base-em-11g" ]; then
+ ocf_log info "Stopping Oracle EM DB Console for $ORACLE_SID"
+ emctl stop dbconsole
+ if [ $? -ne 0 ]; then
+ ocf_log error "Stopping Oracle EM DB Console for $ORACLE_SID failed"
+ ocf_log error "Stopping service $ORACLE_SID failed"
+ return 1
+ else
+ ocf_log info "Stopping Oracle EM DB Console for $ORACLE_SID succeeded"
+ fi
fi
- faction "Stopping Oracle Database:" stop_db immediate
+ stop_db immediate || stop_db abort
if [ $? -ne 0 ]; then
- faction "Stopping Oracle Database (hard):" stop_db abort || return 1
+ ocf_log error "Stopping service $ORACLE_SID failed"
+ return 1
fi
- action "Stopping Oracle Listener:" lsnrctl stop $ORACLE_LISTENER
- faction "Waiting for all Oracle processes to exit:" exit_idle
+ ocf_log info "Stopping listener $ORACLE_LISTENER for $ORACLE_SID"
+ lsnrctl_stdout=$(lsnrctl stop "$ORACLE_LISTENER")
+ rv=$? # save lsnrctl's exit status; the assignment itself resets $? to 0
+ if [ $rv -ne 0 ]; then
+ ocf_log error "Listener $ORACLE_LISTENER stop failed for $ORACLE_SID: $rv output $lsnrctl_stdout"
+ # XXX - failure? (currently only logged; the stop sequence continues)
+ fi
+ exit_idle
if [ $? -ne 0 ]; then
- echo "WARNING: Not all Oracle processes exited cleanly"
+ ocf_log warning "WARNING: Not all Oracle processes exited cleanly for $ORACLE_SID"
fi
if [ -n "$LOCKFILE" ]; then
- rm -f $LOCKFILE
+ rm -f "$LOCKFILE"
fi
+
+ ocf_log info "Stopping service $ORACLE_SID succeeded"
return 0
}
@@ -823,10 +938,12 @@ status_oracle()
declare -i subsys_lock=1
declare -i last
+ ocf_log debug "Checking status for $ORACLE_SID depth $depth"
+
#
# Check for lock file. Crude and rudimentary, but it works
#
- if [ -z "$LOCKFILE" ] || [ -f $LOCKFILE ]; then
+ if [ -z "$LOCKFILE" ] || [ -f "$LOCKFILE" ]; then
subsys_lock=0
fi
@@ -840,9 +957,9 @@ status_oracle()
update_status $? $last
last=$?
- if [ "$ORACLE_TYPE" = "base-em" ]; then
+ if [ "$ORACLE_TYPE" = "base-em" ] || [ "$ORACLE_TYPE" = "base-em-11g" ]; then
# XXX Add isqlplus status check?!
- emctl status dbconsole 2>&1 | grep "is running"
+ emctl status dbconsole >&/dev/null
update_status $? $last
last=$?
elif [ "$ORACLE_TYPE" = "ias" ]; then
@@ -857,9 +974,10 @@ status_oracle()
# file back. XXX - this kosher?
#
if [ $last -eq 0 ] && [ $subsys_lock -ne 0 ]; then
- touch $LOCKFILE
+ touch "$LOCKFILE"
fi
+ ocf_log debug "Status returning $last for $ORACLE_SID"
return $last
}
diff --git a/rgmanager/src/resources/orainstance.sh b/rgmanager/src/resources/orainstance.sh
index a9f690d..ac71a92 100755
--- a/rgmanager/src/resources/orainstance.sh
+++ b/rgmanager/src/resources/orainstance.sh
@@ -1,11 +1,12 @@
#!/bin/bash
#
-# Copyright 2003-2004, 2006-2007 Red Hat, Inc.
+# Copyright 2003-2004, 2006-2013 Red Hat, Inc.
#
# Author(s):
# Hardy Merrill <hmerrill at redhat.com>
# Lon Hohberger <lhh at redhat.com>
# Michael Moon <Michael dot Moon at oracle.com>
+# Ryan McCabe <rmccabe at redhat.com>
#
# This program is Open Source software. You may modify and/or redistribute
# it persuant to the terms of the Open Software License version 2.1, which
@@ -13,11 +14,6 @@
#
# http://opensource.org/licenses/osl-2.1.php
#
-# chkconfig: 345 99 01
-# description: Service script for starting/stopping \
-# Oracle(R) Database 10g on \
-# Red Hat Enterprise Linux 5
-#
# NOTES:
#
# (1) You can comment out the LOCKFILE declaration below. This will prevent
@@ -35,6 +31,7 @@
# Oracle is a registered trademark of Oracle Corporation.
# Oracle9i is a trademark of Oracle Corporation.
# Oracle10g is a trademark of Oracle Corporation.
+# Oracle11g is a trademark of Oracle Corporation.
# All other trademarks are property of their respective owners.
#
#
@@ -48,6 +45,12 @@
# - SysV init support removed. Only usable with rgmanager
#
+# Grab the global RHCS helper functions
+. $(dirname $0)/ocf-shellfuncs
+. $(dirname $0)/utils/config-utils.sh
+. $(dirname $0)/utils/messages.sh
+. $(dirname $0)/utils/ra-skelet.sh
+
. /etc/init.d/functions
declare SCRIPT="`basename $0`"
@@ -60,60 +63,64 @@ ORACLE_SID=$OCF_RESKEY_name
# Optional parameters with default values
LISTENERS=$OCF_RESKEY_listeners
-LOCKFILE="/tmp/.oracle10g-${ORACLE_SID}.lock"
+LOCKFILE="$ORACLE_HOME/.orainstance-${ORACLE_SID}.lock"
[ -n "$OCF_RESKEY_lockfile" ] && LOCKFILE=$OCF_RESKEY_lockfile
export LISTENERS ORACLE_USER ORACLE_HOME ORACLE_SID LOCKFILE
export LD_LIBRARY_PATH=$ORACLE_HOME/lib
-export PATH=$ORACLE_HOME/bin:$PATH
+export PATH=$ORACLE_HOME/bin:/bin:/sbin:/usr/bin:/usr/sbin
declare -i RESTART_RETRIES=3
declare -r DB_PROCNAMES="pmon"
declare -r LSNR_PROCNAME="tnslsnr"
+# clulog will not log messages when run by the oracle user.
+# This is a hack to work around that.
+if [ "`id -u`" = "`id -u $ORACLE_USER`" ]; then
+ ocf_log() {
+ prio=$1
+ shift
+ logger -i -p daemon."$prio" -- "$*"
+ }
+fi
#
# Start Oracle (database portion)
#
start_db() {
- declare tmpfile
- declare logfile
declare -i rv
+ declare startup_cmd
+ declare startup_stdout
- tmpfile=/tmp/$SCRIPT-start.$$
- logfile=/tmp/$SCRIPT-start.log.$$
+ ocf_log info "Starting Oracle DB $ORACLE_SID"
# Set up our sqlplus script. Basically, we're trying to
# capture output in the hopes that it's useful in the case
# that something doesn't work properly.
- echo "startup" > $tmpfile
- echo "quit" >> $tmpfile
-
- sqlplus "/ as sysdba" < $tmpfile > $logfile
+ startup_cmd="set heading off;\nstartup;\nquit;\n"
+ startup_stdout=$(echo -e "$startup_cmd" | sqlplus -S "/ as sysdba")
rv=$?
- rm -f $tmpfile
-
- # Dump logfile to /var/log/messages
- initlog -q -c "cat $logfile"
+ # Dump output to syslog for debugging
+ ocf_log debug "[$ORACLE_SID] [$rv] sent $startup_cmd"
+ ocf_log debug "[$ORACLE_SID] [$rv] got $startup_stdout"
if [ $rv -ne 0 ]; then
- rm -f $logfile
- initlog -n $SCRIPT -q -s "sqlplus returned 1, failed"
+ ocf_log error "Starting Oracle DB $ORACLE_SID failed, sqlplus returned $rv"
return 1
fi
# If we see:
# ORA-.....: failure, we failed
- grep -q "^ORA-" $logfile
- rv=$?
-
- rm -f $logfile
- if [ $rv -eq 0 ]; then
- initlog -n $SCRIPT -q -s "found failure in stdout, returning 1"
+ # Troubleshooting:
+ # ORA-00845 - Try rm -f /dev/shm/ora_*
+ # ORA-01081 - Try echo -e 'shutdown abort;\nquit;'|sqlplus "/ as sysdba"
+ if [[ "$startup_stdout" =~ "ORA-" ]] || [[ "$startup_stdout" =~ "failure" ]]; then
+ ocf_log error "Starting Oracle DB $ORACLE_SID failed, found errors in stdout"
return 1
fi
+ ocf_log info "Started Oracle DB $ORACLE_SID successfully"
return 0
}
@@ -122,49 +129,47 @@ start_db() {
# Stop Oracle (database portion)
#
stop_db() {
- declare tmpfile
- declare logfile
+ declare stop_cmd
+ declare stop_stdout
declare -i rv
+ declare how_shutdown="$1"
- tmpfile=/tmp/$SCRIPT-stop.$$
- logfile=/tmp/$SCRIPT-stop.log.$$
+ if [ -z "$1" ]; then
+ how_shutdown="immediate"
+ fi
- ora_procname="ora_${DB_PROCNAMES}_${ORACLE_SID}"
- status $ora_procname
- if [ $? -ne 0 ]; then
- # No pmon process found, db already down
- return 0
- fi
+ ocf_log info "Stopping Oracle DB $ORACLE_SID $how_shutdown"
- # Setup for Stop ...
- echo "shutdown immediate" > $tmpfile
- echo "quit" >> $tmpfile
+ ora_procname="ora_${DB_PROCNAMES}_${ORACLE_SID}"
+ status $ora_procname
+ if [ $? -ne 0 ]; then
+ ocf_log debug "no pmon process -- DB $ORACLE_SID already stopped"
+ # No pmon process found, db already down
+ return 0
+ fi
- sqlplus "/ as sysdba" < $tmpfile > $logfile
+ # Setup for Stop ...
+ stop_cmd="set heading off;\nshutdown $how_shutdown;\nquit;\n"
+ stop_stdout=$(echo -e "$stop_cmd" | sqlplus -S "/ as sysdba")
rv=$?
- rm -f $tmpfile
-
- # Dump logfile to /var/log/messages
- initlog -q -c "cat $logfile"
+ # Log stdout of the stop command
+ ocf_log debug "[$ORACLE_SID] sent stop command $stop_cmd"
+ ocf_log debug "[$ORACLE_SID] got $stop_stdout"
- # sqlplus returned failure. We'll return failed to rhcs
+ # sqlplus returned failure. We'll return failed to rhcs
if [ $rv -ne 0 ]; then
- rm -f $logfile
- initlog -n $SCRIPT -q -s "sqlplus returned 1, failed"
+ ocf_log error "Stopping Oracle DB $ORACLE_SID failed, sqlplus returned $rv"
return 1
fi
- grep -q "^ORA-" $logfile
- rv=$?
- rm -f $logfile
-
- # If we see 'failure' in the log, we're done.
- if [ $rv -eq 0 ]; then
- initlog -n $SCRIPT -q -s "found failure in stdout, returning 1"
+ # If the shutdown output contains 'ORA-' or 'failure', the stop failed.
+ if [[ "$stop_stdout" =~ "ORA-" ]] || [[ "$stop_stdout" =~ "failure" ]]; then
+ ocf_log error "Stopping Oracle DB $ORACLE_SID failed, errors in stdout"
return 1
fi
+ ocf_log info "Stopped Oracle DB $ORACLE_SID successfully"
return 0
}
@@ -176,14 +181,17 @@ force_cleanup() {
declare pids
declare pid
- pids=`ps ax | grep $ORACLE_SID | grep -v grep | awk '{print $1}'`
-
- initlog -n $SCRIPT -s "<err> Not all Oracle processes exited cleanly, killing"
+ ocf_log error "Not all Oracle processes for $ORACLE_SID exited cleanly, killing"
+ pids=`ps ax | grep "ora_.*_${ORACLE_SID}" | grep -v grep | awk '{print $1}'`
+
for pid in $pids; do
kill -9 $pid
- if [ $? -eq 0 ]; then
- initlog -n $SCRIPT -s "Killed $pid"
+ rv=$?
+ if [ $rv -eq 0 ]; then
+ ocf_log info "Cleanup $ORACLE_SID Killed PID $pid"
+ else
+ ocf_log error "Cleanup $ORACLE_SID Kill PID $pid failed: $rv"
fi
done
@@ -197,14 +205,18 @@ force_cleanup() {
exit_idle() {
declare -i n=0
+ ocf_log debug "Waiting for Oracle processes for $ORACLE_SID to terminate..."
while ps ax | grep $ORACLE_SID | grep -q -v $LSNR_PROCNAME | grep -q -v grep; do
if [ $n -ge 90 ]; then
+ ocf_log debug "Timed out while waiting for Oracle processes for $ORACLE_SID to terminate"
force_cleanup
return 0
fi
sleep 1
((n++))
done
+
+ ocf_log debug "All Oracle processes for $ORACLE_SID have terminated"
return 0
}
@@ -219,6 +231,8 @@ get_db_status() {
declare -i rv=0
declare ora_procname
+ ocf_log debug "Checking status of DB $ORACLE_SID"
+
for procname in $DB_PROCNAMES ; do
ora_procname="ora_${procname}_${ORACLE_SID}"
@@ -231,30 +245,34 @@ get_db_status() {
# We're not supposed to be running, and we are,
# in fact, not running...
if [ $subsys_lock -ne 0 ]; then
+ ocf_log debug "DB $ORACLE_SID is already stopped"
return 3
fi
for (( i=$RESTART_RETRIES ; i; i-- )) ; do
# this db process is down - stop and
# (re)start all ora_XXXX_$ORACLE_SID processes
- initlog -q -n $SCRIPT -s "Restarting Oracle Database..."
+ ocf_log info "Restarting Oracle Database $ORACLE_SID"
stop_db
start_db
- if [ $? == 0 ] ; then
+ if [ $? -eq 0 ] ; then
# ora_XXXX_$ORACLE_SID processes started
# successfully, so break out of the
# stop/start # 'for' loop
+ ocf_log info "Restarted Oracle DB $ORACLE_SID successfully"
break
fi
done
if [ $i -eq 0 ]; then
# stop/start's failed - return 1 (failure)
- initlog -q -n $SCRIPT -s "Restart failed, retuning 1"
+ ocf_log error "Failed to restart Oracle DB $ORACLE_SID after $RESTART_RETRIES tries"
return 1
fi
done
+
+ ocf_log debug "Checking status of DB $ORACLE_SID success"
return 0
}
@@ -265,41 +283,48 @@ get_db_status() {
get_lsnr_status() {
declare -i subsys_lock=$1
declare -i rv
- declare -r LISTENER=$3
+ declare -r LISTENER=$3
- lsnrctl status $LISTENER >& /dev/null
+ ocf_log debug "Checking status for listener $LISTENER"
+ lsnrctl status "$LISTENER" >& /dev/null
rv=$?
- if [ $rv == 0 ] ; then
+ if [ $rv -eq 0 ] ; then
+ ocf_log debug "Listener $LISTENER is up"
return 0 # Listener is running fine
fi
# We're not supposed to be running, and we are,
# in fact, not running. Return 3
if [ $subsys_lock -ne 0 ]; then
+ ocf_log debug "Listener $LISTENER is stopped as expected"
return 3
fi
# Listener is NOT running (but should be) - try to restart
for (( i=$RESTART_RETRIES ; i; i-- )) ; do
- initlog -n $SCRIPT -q -s "Restarting Oracle listener ($LISTENER)"
- lsnrctl start $LISTENER
- lsnrctl status $LISTENER >& /dev/null
- if [ $? == 0 ] ; then
+ ocf_log info "Listener $LISTENER is down, attempting to restart"
+ lsnrctl start "$LISTENER" >& /dev/null
+ lsnrctl status "$LISTENER" >& /dev/null
+ if [ $? -eq 0 ]; then
+ ocf_log info "Listener $LISTENER was restarted successfully"
break # Listener was (re)started and is running fine
fi
done
if [ $i -eq 0 ]; then
# stop/start's failed - return 1 (failure)
- initlog -n $SCRIPT -q -s "Listener restart failed, retuning 1"
+ ocf_log error "Failed to restart listener $LISTENER after $RESTART_RETRIES tries"
return 1
fi
- lsnrctl status $LISTENER >& /dev/null
- if [ $? != 0 ] ; then
- initlog -n $SCRIPT -q -s "Listener status failed, retuning 1"
+ lsnrctl_stdout=$(lsnrctl status "$LISTENER")
+ rv=$?
+ if [ $rv -ne 0 ] ; then
+ ocf_log error "Starting listener $LISTENER failed: $rv output $lsnrctl_stdout"
return 1 # Problem restarting the Listener
fi
+
+ ocf_log info "Listener $LISTENER started successfully"
return 0 # Success restarting the Listener
}
@@ -329,7 +354,7 @@ update_status() {
fi
if [ $old_status -ne $new_status ]; then
- initlog -n $SCRIPT -q -s "$old_status vs $new_status - returning 1"
+ ocf_log error "Error: $old_status vs $new_status for $ORACLE_SID - returning 1"
return 1
fi
@@ -341,11 +366,7 @@ update_status() {
# Print an error message to the user and exit.
#
oops() {
- #echo "Please configure this script ($0) to"
- #echo "match your installation."
- #echo
- #echo " $1 failed validation checks."
- initlog -n $SCRIPT -q -s "$1 failed validation checks"
+ ocf_log error "$ORACLE_SID: Fatal: $1 failed validation checks"
exit 1
}
@@ -355,16 +376,18 @@ oops() {
# script.
#
validation_checks() {
+ ocf_log debug "Validating configuration for $ORACLE_SID"
+
# If the oracle user doesn't exist, we're done.
[ -n "$ORACLE_USER" ] || oops "ORACLE_USER"
id -u $ORACLE_USER > /dev/null || oops "ORACLE_USER"
- id -g $ORACLE_USER > /dev/null || oops "ORACLE_USER"
+ id -g $ORACLE_USER > /dev/null || oops "ORACLE_GROUP"
# If the oracle home isn't a directory, we're done
- [ -n "$ORACLE_HOME" ] || oops ORACLE_HOME
+ [ -n "$ORACLE_HOME" ] || oops "ORACLE_HOME"
# If the oracle SID is NULL, we're done
- [ -n "$ORACLE_SID" ] || oops ORACLE_SID
+ [ -n "$ORACLE_SID" ] || oops "ORACLE_SID"
# Super user? Automatically change UID and exec as oracle user.
# Oracle needs to be run as the Oracle user, not root!
@@ -374,12 +397,13 @@ validation_checks() {
fi
# If we're not root and not the Oracle user, we're done.
- [ "`id -u`" = "`id -u $ORACLE_USER`" ] || exit 1
- [ "`id -g`" = "`id -g $ORACLE_USER`" ] || exit 1
+ [ "`id -u`" = "`id -u $ORACLE_USER`" ] || oops "not ORACLE_USER after su"
+ [ "`id -g`" = "`id -g $ORACLE_USER`" ] || oops "not ORACLE_GROUP after su"
# Go home.
- cd $ORACLE_HOME
+ cd "$ORACLE_HOME"
+ ocf_log debug "Validation checks for $ORACLE_SID succeeded"
return 0
}
@@ -388,20 +412,31 @@ validation_checks() {
# Start Oracle
#
start_oracle() {
- initlog -n $SCRIPT -q -s "Starting Oracle Database"
- start_db || return 1
-
- for LISTENER in ${LISTENERS}; do
- logfile=/tmp/$SCRIPT-lsn-$$.log
- initlog -n $SCRIPT -q -s "Starting Oracle Listener $LISTENER"
- lsnrctl start $LISTENER > $logfile
- initlog -q -c "cat $logfile"
- rm -f $logfile
- done
+ ocf_log info "Starting service $ORACLE_SID"
+
+ start_db
+ rv=$?
+ if [ $rv -ne 0 ]; then
+ ocf_log error "Starting service $ORACLE_SID failed"
+ return 1
+ fi
+
+ for LISTENER in ${LISTENERS}; do
+ ocf_log info "Starting listener $LISTENER"
+ lsnrctl_stdout=$(lsnrctl start "$LISTENER")
+ rv=$?
+ if [ $rv -ne 0 ]; then
+ ocf_log debug "[$ORACLE_SID] Listener $LISTENER start returned $rv output $lsnrctl_stdout"
+ ocf_log error "Starting service $ORACLE_SID failed"
+ return 1
+ fi
+ done
if [ -n "$LOCKFILE" ]; then
- touch $LOCKFILE
+ touch "$LOCKFILE"
fi
+
+ ocf_log info "Starting service $ORACLE_SID completed successfully"
return 0
}
@@ -410,30 +445,42 @@ start_oracle() {
# Stop Oracle
#
stop_oracle() {
+ ocf_log info "Stopping service $ORACLE_SID"
+
if ! [ -e "$ORACLE_HOME/bin/lsnrctl" ]; then
- initlog -n $SCRIPT -q -s "Oracle Listener Control is not available ($ORACLE_HOME not mounted?)"
+ ocf_log error "Oracle Listener Control is not available ($ORACLE_HOME not mounted?)"
+ # XXX should this return 1?
return 0
fi
- initlog -n $SCRIPT -q -s "Stopping Oracle Database"
- stop_db || return 1
+ stop_db || stop_db abort
+ if [ $? -ne 0 ]; then
+ ocf_log error "Unable to stop DB for $ORACLE_SID"
+ return 1
+ fi
-
- for LISTENER in ${LISTENERS}; do
- initlog -n $SCRIPT -q -s "Stopping Oracle Listener $LISTENER"
- lsnrctl stop $LISTENER
- done
+ for LISTENER in ${LISTENERS}; do
+ ocf_log info "Stopping listener $LISTENER for $ORACLE_SID"
+ lsnrctl_stdout=$(lsnrctl stop "$LISTENER")
+ rv=$?
+ if [ $rv -ne 0 ]; then
+ ocf_log error "Listener $LISTENER stop failed for $ORACLE_SID: $rv output $lsnrctl_stdout"
+ # XXX - only logged; a failed listener stop does not fail the service stop
+ fi
+ done
- initlog -n $SCRIPT -q -s "Waiting for all Oracle processes to exit"
- exit_idle
+ exit_idle
if [ $? -ne 0 ]; then
- initlog -n $SCRIPT -q -s "WARNING: Not all Oracle processes exited cleanly"
+ ocf_log error "WARNING: Not all Oracle processes exited cleanly for $ORACLE_SID"
+ # XXX - failure?
fi
if [ -n "$LOCKFILE" ]; then
- rm -f $LOCKFILE
+ rm -f "$LOCKFILE"
fi
+
+ ocf_log info "Stopping service $ORACLE_SID succeeded"
return 0
}
@@ -461,8 +508,10 @@ status_oracle() {
declare -i last
declare -i depth=$1
+ ocf_log debug "Checking status for $ORACLE_SID depth $depth"
+
# Check for lock file. Crude and rudimentary, but it works
- if [ -z "$LOCKFILE" ] || [ -f $LOCKFILE ]; then
+ if [ -z "$LOCKFILE" ] || [ -f "$LOCKFILE" ]; then
subsys_lock=0
fi
@@ -472,18 +521,19 @@ status_oracle() {
last=$?
# Check & report listener status
- for LISTENER in ${LISTENERS}; do
- get_lsnr_status $subsys_lock $depth $LISTENER
- update_status $? $last
- last=$?
- done
+ for LISTENER in ${LISTENERS}; do
+ get_lsnr_status $subsys_lock $depth "$LISTENER"
+ update_status $? $last
+ last=$?
+ done
# No lock file, but everything's running. Put the lock
# file back. XXX - this kosher?
if [ $last -eq 0 ] && [ $subsys_lock -ne 0 ]; then
- touch $LOCKFILE
+ touch "$LOCKFILE"
fi
+ ocf_log debug "Status returning $last for $ORACLE_SID"
return $last
}
@@ -493,22 +543,22 @@ status_oracle() {
########################
case $1 in
- meta-data)
- cat `echo $0 | sed 's/^\(.*\)\.sh$/\1.metadata/'`
- exit 0
- ;;
+ meta-data)
+ cat `echo $0 | sed 's/^\(.*\)\.sh$/\1.metadata/'`
+ exit 0
+ ;;
start)
- validation_checks $*
+ validation_checks $*
start_oracle
exit $?
;;
stop)
- validation_checks $*
+ validation_checks $*
stop_oracle
exit $?
;;
status|monitor)
- validation_checks $*
+ validation_checks $*
status_oracle $OCF_CHECK_LEVEL
exit $?
;;
diff --git a/rgmanager/src/resources/oralistener.sh b/rgmanager/src/resources/oralistener.sh
index 3d6b839..30b8e06 100755
--- a/rgmanager/src/resources/oralistener.sh
+++ b/rgmanager/src/resources/oralistener.sh
@@ -1,7 +1,5 @@
#!/bin/bash
#
-# $Id: oralistener.sh 127 2009-08-21 09:17:52Z hevirtan $
-#
# Red Hat Cluster Suite resource agent for controlling Oracle 10g
# listener instances. This script will start, stop and monitor running
# listeners.
@@ -12,6 +10,24 @@
#
# monitor: Will check that the listener is OK by calling lsnrctl status
#
+#
+# Copyright (C) 1997-2003 Sistina Software, Inc. All rights reserved.
+# Copyright (C) 2004-2013 Red Hat, Inc. All rights reserved.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation; either version 2
+# of the License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
# Grab the global RHCS helper functions
. $(dirname $0)/ocf-shellfuncs
@@ -27,126 +43,148 @@ LISTENER=$OCF_RESKEY_name
LC_ALL=C
LANG=C
-PATH=/bin:/sbin:/usr/bin:/usr/sbin:$ORACLE_HOME/bin
-export LC_ALL LANG PATH ORACLE_HOME
+PATH=$ORACLE_HOME/bin:/bin:/sbin:/usr/bin:/usr/sbin
+export LC_ALL LANG PATH ORACLE_USER ORACLE_HOME
+
+# clulog will not log messages when run by the oracle user.
+# This is a hack to work around that.
+if [ "`id -u`" = "`id -u $ORACLE_USER`" ]; then
+ ocf_log() {
+ prio=$1
+ shift
+ logger -i -p daemon."$prio" -- "$*"
+ }
+fi
verify_all() {
- clog_service_verify $CLOG_INIT
-
- if [ -z "$OCF_RESKEY_name" ]; then
- clog_service_verify $CLOG_FAILED "Invalid name of service (listener name)"
- return $OCF_ERR_ARGS
- fi
-
- if [ -z "$OCF_RESKEY_home" ]; then
- clog_service_verify $CLOG_FAILED "No Oracle home specified."
- return $OCF_ERR_ARGS
- fi
-
- if [ -z "$OCF_RESKEY_user" ]; then
- clog_service_verify $CLOG_FAILED "No Oracle username specified."
- return $OCF_ERR_ARGS
- fi
-
- # Make sure the lsnrctl binary is in our $PATH
- if [ ! -x $(which lsnrctl) ]; then
- clog_service_verify $CLOG_FAILED "oralistener:${OCF_RESKEY_home}: Unable to locate lsnrctl command from path! ($PATH)"
- return $OCF_ERR_GENERIC
- fi
-
- clog_service_verify $CLOG_SUCCEED
- return 0
-}
+ ocf_log debug "Validating configuration for $LISTENER"
-start () {
- clog_service_start $CLOG_INIT
-
- logfile="/tmp/oracle_lsn.$$"
- su -p - $ORACLE_USER -c "lsnrctl start $LISTENER > $logfile"
+ if [ -z "$OCF_RESKEY_name" ]; then
+ ocf_log error "Validation for $LISTENER failed: Invalid name of service (listener name)"
+ return $OCF_ERR_ARGS
+ fi
- initlog -q -c "cat $logfile"
- rm -f $logfile
+ if [ -z "$OCF_RESKEY_home" ]; then
+ ocf_log error "Validation for $LISTENER failed: No Oracle home specified."
+ return $OCF_ERR_ARGS
+ fi
- clog_service_start $CLOG_SUCCEED
- return 0
+ if [ -z "$OCF_RESKEY_user" ]; then
+ ocf_log error "Validation for $LISTENER failed: No Oracle username specified."
+ return $OCF_ERR_ARGS
+ fi
+
+ # Super user? Automatically change UID and exec as oracle user.
+ # Oracle needs to be run as the Oracle user, not root!
+ if [ "`id -u`" = "0" ]; then
+ su $OCF_RESKEY_user -c "$0 $*"
+ exit $?
+ fi
+
+ # Make sure the lsnrctl binary is in our $PATH (quoted so an empty result fails -x)
+ if [ ! -x "$(which lsnrctl 2>/dev/null)" ]; then
+ ocf_log error "Validation for $LISTENER failed: Unable to locate lsnrctl command from path! ($PATH)"
+ return $OCF_ERR_GENERIC
+ fi
+
+ ocf_log debug "Validation checks for $LISTENER succeeded"
+ return 0
+}
+
+start() {
+ ocf_log info "Starting listener $LISTENER"
+ lsnrctl_stdout=$(lsnrctl start "$LISTENER")
+ if [ $? -ne 0 ]; then
+ ocf_log error "start listener $LISTENER failed $lsnrctl_stdout"
+ return $OCF_ERR_GENERIC
+ fi
+
+ ocf_log info "Listener $LISTENER started successfully"
+ return 0
}
-stop () {
- clog_service_stop $CLOG_INIT
-
- logfile="/tmp/oracle_lsn.$$"
- su -p - $ORACLE_USER -c "lsnrctl stop $LISTENER > $logfile"
+stop() {
+ ocf_log info "Stopping listener $LISTENER"
- initlog -q -c "cat $logfile"
- rm -f $logfile
+ lsnrctl_stdout=$(lsnrctl stop "$LISTENER")
+ if [ $? -ne 0 ]; then
+ ocf_log debug "stop listener $LISTENER failed $lsnrctl_stdout"
+ return $OCF_ERR_GENERIC
+ fi
- clog_service_stop $CLOG_SUCCEED
- return 0
+ ocf_log info "Listener $LISTENER stopped successfully"
+ return 0
}
-monitor () {
- clog_service_status $CLOG_INIT
-
- su -p - $ORACLE_USER -c "lsnrctl status $LISTENER"
- rv=$?
- if [ $rv == 0 ]; then
- clog_service_status $CLOG_SUCCEED
- return 0 # Listener is running fine
- else
- clog_service_status $CLOG_FAILED
- return $OCF_ERR_GENERIC
- fi
+monitor() {
+ declare -i depth=$1
+
+ ocf_log debug "Checking status for listener $LISTENER depth $depth"
+ lsnrctl status "$LISTENER" >& /dev/null
+ if [ $? -ne 0 ]; then
+ ocf_log error "Listener $LISTENER not running"
+ return $OCF_ERR_GENERIC
+ fi
+
+ ocf_log debug "Listener $LISTENER is up"
+ return 0 # Listener is running fine
}
recover() {
+ ocf_log debug "Recovering listener $LISTENER"
+
for (( i=$RESTART_RETRIES ; i; i-- )); do
start
- if [ $? == 0 ] ; then
- break
+ if [ $? -eq 0 ] ; then
+ ocf_log debug "Restarted listener $LISTENER successfully"
+ break
fi
done
if [ $i -eq 0 ]; then
# stop/start's failed - return 1 (failure)
+ ocf_log debug "Failed to restart listener $LISTENER after $RESTART_RETRIES tries"
return 1
fi
- status
- if [ $? != 0 ] ; then
+ status
+ if [ $? -ne 0 ] ; then
+ ocf_log debug "Failed to restart listener $LISTENER"
return 1 # Problem restarting the Listener
fi
+ ocf_log debug "Restarted listener $LISTENER successfully"
return 0 # Success restarting the Listener
}
case $1 in
- meta-data)
- cat `echo $0 | sed 's/^\(.*\)\.sh$/\1.metadata/'`
- exit 0
- ;;
- verify-all)
- verify_all
- exit $?
- ;;
- start)
- verify_all && start
- exit $?
- ;;
- stop)
- verify_all && stop
- exit $?
- ;;
- recover)
- verify_all && recover
- exit $?
- ;;
- status|monitor)
- verify_all
- monitor
- exit $?
- ;;
- *)
- echo "Usage: $0 {start|stop|recover|monitor|status|meta-data|verify-all}"
- exit $OCF_ERR_GENERIC
- ;;
+ meta-data)
+ cat `echo $0 | sed 's/^\(.*\)\.sh$/\1.metadata/'`
+ exit 0
+ ;;
+ verify-all)
+ verify_all $*
+ exit $?
+ ;;
+ start)
+ verify_all $* && start
+ exit $?
+ ;;
+ stop)
+ verify_all $* && stop
+ exit $?
+ ;;
+ recover)
+ verify_all $* && recover
+ exit $?
+ ;;
+ status|monitor)
+ verify_all $*
+ monitor $OCF_CHECK_LEVEL
+ exit $?
+ ;;
+ *)
+ echo "Usage: $0 {start|stop|recover|monitor|status|meta-data|verify-all}"
+ exit $OCF_ERR_GENERIC
+ ;;
esac
10 years, 12 months
gfs2-utils: master - fsck.gfs2: delete all duplicates from unrecoverable damaged dinodes
by Bob Peterson
Gitweb: http://git.fedorahosted.org/git/?p=gfs2-utils.git;a=commitdiff;h=98297970...
Commit: 982979702a7f23b069e31f171cc38fc4c87a018a
Parent: e8d58780c43e0befeacab299c6d099e196bc83b9
Author: Bob Peterson <rpeterso(a)redhat.com>
AuthorDate: Tue May 14 09:51:28 2013 -0500
Committer: Bob Peterson <rpeterso(a)redhat.com>
CommitterDate: Mon May 20 11:12:47 2013 -0500
fsck.gfs2: delete all duplicates from unrecoverable damaged dinodes
When pass1 encounters a dinode with unrecoverable damage, it tries
to "undo" the metadata and data block designations it marked in the
blockmap prior to finding the damage. That's all fine, but if the
damaged dinode has a duplicate reference, we also need to delete that
from the duplicate reference list. Otherwise pass1b may try to
resolve the duplicate reference and reinstate the damaged dinode.
---
gfs2/fsck/metawalk.c | 5 ++++
gfs2/fsck/pass1b.c | 60 --------------------------------------------------
gfs2/fsck/util.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++++
gfs2/fsck/util.h | 1 +
4 files changed, 65 insertions(+), 60 deletions(-)
diff --git a/gfs2/fsck/metawalk.c b/gfs2/fsck/metawalk.c
index 44b5c66..22f3334 100644
--- a/gfs2/fsck/metawalk.c
+++ b/gfs2/fsck/metawalk.c
@@ -1530,6 +1530,11 @@ undo_metalist:
brelse(bh);
}
}
+ /* There may be leftover duplicate records, so we need to delete them.
+ For example, if a metadata block was found to be a duplicate, we
+ may not have added it to the metalist, which means it's not there
+ to undo. */
+ delete_all_dups(ip);
/* Set the dinode as "bad" so it gets deleted */
fsck_blockmap_set(ip, ip->i_di.di_num.no_addr,
_("corrupt"), gfs2_block_free);
diff --git a/gfs2/fsck/pass1b.c b/gfs2/fsck/pass1b.c
index 9a23197..0dcb306 100644
--- a/gfs2/fsck/pass1b.c
+++ b/gfs2/fsck/pass1b.c
@@ -52,66 +52,6 @@ static void log_inode_reference(struct duptree *dt, osi_list_t *tmp, int inval)
}
/* delete_all_dups - delete all duplicate records for a given inode */
-static void delete_all_dups(struct gfs2_inode *ip)
-{
- struct osi_node *n, *next;
- struct duptree *dt;
- osi_list_t *tmp, *x;
- struct inode_with_dups *id;
- int found;
-
- for (n = osi_first(&dup_blocks); n; n = next) {
- next = osi_next(n);
- dt = (struct duptree *)n;
-
- found = 0;
- id = NULL;
-
- osi_list_foreach_safe(tmp, &dt->ref_invinode_list, x) {
- id = osi_list_entry(tmp, struct inode_with_dups, list);
- if (id->block_no == ip->i_di.di_num.no_addr) {
- dup_listent_delete(dt, id);
- found = 1;
- }
- }
- osi_list_foreach_safe(tmp, &dt->ref_inode_list, x) {
- id = osi_list_entry(tmp, struct inode_with_dups, list);
- if (id->block_no == ip->i_di.di_num.no_addr) {
- dup_listent_delete(dt, id);
- found = 1;
- }
- }
- if (!found)
- continue;
-
- if (dt->refs == 0) {
- log_debug(_("This was the last reference: 0x%llx is "
- "no longer a duplicate.\n"),
- (unsigned long long)dt->block);
- dup_delete(dt); /* not duplicate now */
- } else {
- log_debug(_("%d references remain to 0x%llx\n"),
- dt->refs, (unsigned long long)dt->block);
- if (dt->refs > 1)
- continue;
-
- id = NULL;
- osi_list_foreach(tmp, &dt->ref_invinode_list)
- id = osi_list_entry(tmp,
- struct inode_with_dups,
- list);
- osi_list_foreach(tmp, &dt->ref_inode_list)
- id = osi_list_entry(tmp,
- struct inode_with_dups,
- list);
- if (id)
- log_debug("Last reference is from inode "
- "0x%llx\n",
- (unsigned long long)id->block_no);
- }
- }
-}
-
/*
* resolve_dup_references - resolve all but the last dinode that has a
* duplicate reference to a given block.
diff --git a/gfs2/fsck/util.c b/gfs2/fsck/util.c
index 9d6f163..fd1b292 100644
--- a/gfs2/fsck/util.c
+++ b/gfs2/fsck/util.c
@@ -725,3 +725,62 @@ uint64_t *get_dir_hash(struct gfs2_inode *ip)
return tbl;
}
+void delete_all_dups(struct gfs2_inode *ip)
+{
+ struct osi_node *n, *next;
+ struct duptree *dt;
+ osi_list_t *tmp, *x;
+ struct inode_with_dups *id;
+ int found;
+
+ for (n = osi_first(&dup_blocks); n; n = next) {
+ next = osi_next(n);
+ dt = (struct duptree *)n;
+
+ found = 0;
+ id = NULL;
+
+ osi_list_foreach_safe(tmp, &dt->ref_invinode_list, x) {
+ id = osi_list_entry(tmp, struct inode_with_dups, list);
+ if (id->block_no == ip->i_di.di_num.no_addr) {
+ dup_listent_delete(dt, id);
+ found = 1;
+ }
+ }
+ osi_list_foreach_safe(tmp, &dt->ref_inode_list, x) {
+ id = osi_list_entry(tmp, struct inode_with_dups, list);
+ if (id->block_no == ip->i_di.di_num.no_addr) {
+ dup_listent_delete(dt, id);
+ found = 1;
+ }
+ }
+ if (!found)
+ continue;
+
+ if (dt->refs == 0) {
+ log_debug(_("This was the last reference: 0x%llx is "
+ "no longer a duplicate.\n"),
+ (unsigned long long)dt->block);
+ dup_delete(dt); /* not duplicate now */
+ } else {
+ log_debug(_("%d references remain to 0x%llx\n"),
+ dt->refs, (unsigned long long)dt->block);
+ if (dt->refs > 1)
+ continue;
+
+ id = NULL;
+ osi_list_foreach(tmp, &dt->ref_invinode_list)
+ id = osi_list_entry(tmp,
+ struct inode_with_dups,
+ list);
+ osi_list_foreach(tmp, &dt->ref_inode_list)
+ id = osi_list_entry(tmp,
+ struct inode_with_dups,
+ list);
+ if (id)
+ log_debug("Last reference is from inode "
+ "0x%llx\n",
+ (unsigned long long)id->block_no);
+ }
+ }
+}
diff --git a/gfs2/fsck/util.h b/gfs2/fsck/util.h
index 361b1a2..580acd8 100644
--- a/gfs2/fsck/util.h
+++ b/gfs2/fsck/util.h
@@ -187,6 +187,7 @@ extern char generic_interrupt(const char *caller, const char *where,
extern char gfs2_getch(void);
extern uint64_t find_free_blk(struct gfs2_sbd *sdp);
extern uint64_t *get_dir_hash(struct gfs2_inode *ip);
+extern void delete_all_dups(struct gfs2_inode *ip);
#define stack log_debug("<backtrace> - %s()\n", __func__)
11 years
gfs2-utils: master - fsck.gfs2: take hash table start boundaries into account
by Bob Peterson
Gitweb: http://git.fedorahosted.org/git/?p=gfs2-utils.git;a=commitdiff;h=e8d58780...
Commit: e8d58780c43e0befeacab299c6d099e196bc83b9
Parent: 58a213659fb8afd5d25fa25d7ec3ec0a8d5e21dd
Author: Bob Peterson <rpeterso(a)redhat.com>
AuthorDate: Fri Apr 19 09:25:51 2013 -0700
Committer: Bob Peterson <rpeterso(a)redhat.com>
CommitterDate: Mon May 20 11:12:47 2013 -0500
fsck.gfs2: take hash table start boundaries into account
When checking the hash table in pass2, we can't just keep doubling
the length for each consecutive check because the number of pointer
copies (aka length) is also tied to the starting offset. If the
starting offset is invalid for the length, it might treat a chunk of
the hash table as bigger than it should, eventually overwriting good
entries. Along the same lines, while we're trying to determine the
length, it's not good enough to double the length and check if the
hash table entry matches. The reason is: there can be several values
overwritten with the same value, 0x00, that indicates places where
pass1 found an invalid leaf block pointer. To avoid that, we need to
check intermediate values as well, and stop if we find a gap.
---
gfs2/fsck/metawalk.c | 5 +++--
gfs2/fsck/pass2.c | 43 ++++++++++++++++++++++++++++++++++---------
2 files changed, 37 insertions(+), 11 deletions(-)
diff --git a/gfs2/fsck/metawalk.c b/gfs2/fsck/metawalk.c
index ffc3555..44b5c66 100644
--- a/gfs2/fsck/metawalk.c
+++ b/gfs2/fsck/metawalk.c
@@ -473,11 +473,12 @@ static int check_entries(struct gfs2_inode *ip, struct gfs2_buffer_head *bh,
if ((char *)dent + de.de_rec_len >= bh_end){
log_debug( _("Last entry processed for %lld->%lld "
- "(0x%llx->0x%llx).\n"),
+ "(0x%llx->0x%llx), di_blocks=%llu.\n"),
(unsigned long long)ip->i_di.di_num.no_addr,
(unsigned long long)bh->b_blocknr,
(unsigned long long)ip->i_di.di_num.no_addr,
- (unsigned long long)bh->b_blocknr);
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)ip->i_di.di_blocks);
break;
}
diff --git a/gfs2/fsck/pass2.c b/gfs2/fsck/pass2.c
index a24edbe..3d0bb49 100644
--- a/gfs2/fsck/pass2.c
+++ b/gfs2/fsck/pass2.c
@@ -370,9 +370,10 @@ static int wrong_leaf(struct gfs2_inode *ip, struct gfs2_inum *entry,
gfs2_get_leaf_nr(ip, hash_index, &real_leaf);
if (real_leaf != planned_leaf) {
log_err(_("The planned leaf was split. The new leaf "
- "is: %llu (0x%llx)"),
+ "is: %llu (0x%llx). di_blocks=%llu\n"),
(unsigned long long)real_leaf,
- (unsigned long long)real_leaf);
+ (unsigned long long)real_leaf,
+ (unsigned long long)ip->i_di.di_blocks);
fsck_blockmap_set(ip, real_leaf, _("split leaf"),
gfs2_indir_blk);
}
@@ -1032,6 +1033,7 @@ static int basic_check_dentry(struct gfs2_inode *ip, struct gfs2_dirent *dent,
log_err( _("Bad directory entry '%s' cleared.\n"), tmp_name);
return 1;
} else {
+ (*count)++;
return 0;
}
}
@@ -1150,11 +1152,13 @@ static int fix_hashtable(struct gfs2_inode *ip, uint64_t *tbl, unsigned hsize,
/* Look at the first dirent and check its hash value to see if it's
at the proper starting offset. */
hash_index = hash_table_index(dentry.de_hash, ip);
+ /* Need to use len here, not *proper_len because the leaf block may
+ be valid within the range, but starts too soon in the hash table. */
if (hash_index < lindex || hash_index > lindex + len) {
log_err(_("This leaf block has hash index %d, which is out of "
"bounds for where it appears in the hash table "
"(%d - %d)\n"),
- hash_index, lindex, lindex + len);
+ hash_index, lindex, lindex + *proper_len);
error = lost_leaf(ip, tbl, leafblk, len, lindex, lbh);
brelse(lbh);
return error;
@@ -1291,6 +1295,8 @@ static int check_hash_tbl(struct gfs2_inode *ip, uint64_t *tbl,
struct gfs2_buffer_head *lbh;
int factor;
uint32_t proper_start;
+ uint32_t next_proper_start;
+ int anomaly;
lindex = 0;
while (lindex < hsize) {
@@ -1299,10 +1305,23 @@ static int check_hash_tbl(struct gfs2_inode *ip, uint64_t *tbl,
len = 1;
factor = 0;
leafblk = be64_to_cpu(tbl[lindex]);
+ next_proper_start = lindex;
+ anomaly = 0;
while (lindex + (len << 1) - 1 < hsize) {
if (be64_to_cpu(tbl[lindex + (len << 1) - 1]) !=
leafblk)
break;
+ next_proper_start = (lindex & ~((len << 1) - 1));
+ if (lindex != next_proper_start)
+ anomaly = 1;
+ /* Check if there are other values written between
+ here and the next factor. */
+ for (i = len; !anomaly && i + lindex < hsize &&
+ i < (len << 1); i++)
+ if (be64_to_cpu(tbl[lindex + i]) != leafblk)
+ anomaly = 1;
+ if (anomaly)
+ break;
len <<= 1;
factor++;
}
@@ -1344,8 +1363,10 @@ static int check_hash_tbl(struct gfs2_inode *ip, uint64_t *tbl,
proper_start = (lindex & ~(proper_len - 1));
if (lindex != proper_start) {
log_debug(_("lindex 0x%llx is not a proper starting "
- "point for this leaf: 0x%llx\n"),
+ "point for leaf %llu (0x%llx): 0x%llx\n"),
(unsigned long long)lindex,
+ (unsigned long long)leafblk,
+ (unsigned long long)leafblk,
(unsigned long long)proper_start);
changes = fix_hashtable(ip, tbl, hsize, leafblk,
lindex, proper_start, len,
@@ -1368,9 +1389,11 @@ static int check_hash_tbl(struct gfs2_inode *ip, uint64_t *tbl,
depth, and adjust the hash table accordingly. */
if (len != proper_len) {
log_err(_("Length %d (0x%x) is not a proper length "
- "for this leaf. Valid boundary assumed to "
- "be %d (0x%x).\n"),
- len, len, proper_len, proper_len);
+ "for leaf %llu (0x%llx). Valid boundary "
+ "assumed to be %d (0x%x).\n"), len, len,
+ (unsigned long long)leafblk,
+ (unsigned long long)leafblk,
+ proper_len, proper_len);
lbh = bread(ip->i_sbd, leafblk);
gfs2_leaf_in(&leaf, lbh);
if (gfs2_check_meta(lbh, GFS2_METATYPE_LF) ||
@@ -1419,8 +1442,10 @@ static int check_hash_tbl(struct gfs2_inode *ip, uint64_t *tbl,
proper_len = 1 << (ip->i_di.di_depth - leaf.lf_depth);
if (proper_len != len) {
log_debug(_("Length 0x%x is not proper for "
- "this leaf: 0x%x"),
- len, proper_len);
+ "leaf %llu (0x%llx): 0x%x"),
+ len, (unsigned long long)leafblk,
+ (unsigned long long)leafblk,
+ proper_len);
changes = fix_hashtable(ip, tbl, hsize,
leafblk, lindex,
lindex, len,
11 years
gfs2-utils: master - fsck.gfs2: Don't allocate leaf blocks in pass1
by Bob Peterson
Gitweb: http://git.fedorahosted.org/git/?p=gfs2-utils.git;a=commitdiff;h=58a21365...
Commit: 58a213659fb8afd5d25fa25d7ec3ec0a8d5e21dd
Parent: c2a39034d9f2888dc0a9431cea86998a929c30ba
Author: Bob Peterson <rpeterso(a)redhat.com>
AuthorDate: Wed Apr 17 14:09:30 2013 -0700
Committer: Bob Peterson <rpeterso(a)redhat.com>
CommitterDate: Mon May 20 11:12:47 2013 -0500
fsck.gfs2: Don't allocate leaf blocks in pass1
Before this patch, if leaf blocks were found to be corrupt, pass1
tried to fix them by allocating new leaf blocks in place of the bad
ones. That's a bad idea, because pass1 populates the blockmap and
sets the bitmap accordingly. In other words, it's dynamically changing.
Say, for example, that you're checking a directory at dinode 0x1234, and
it has a corrupt hash table, and needs new leaf blocks inserted.
Now suppose you have a second directory that occurs later in the bitmap,
say at block 0x2345, and it references leaf block 0x2346, but for some
reason that block (0x2346) is improperly set to "free" in the bitmap.
If pass1 goes out looking for a free block in order to allocate a new
leaf for 0x1234, it will naturally find block 0x2346, because it's
marked free. It writes a new leaf at that block and adds a new
reference in the hash table of 0x1234. Later, when pass1 processes
directory 0x2345, it discovers the reference to 0x2346. Not only has
it wiped out the perfectly good leaf block, it has also created a
duplicate block reference that it needs to sort out in pass1b, which
will likely keep the replaced reference and throw away the good one we
had. Thus, we introduced corruption into the file system when we
should have kept the only good reference to 0x2346 and fixed the
bitmap.
The solution provided by this patch is to simply zero out the bad
hash table entries when pass1 comes across them. Later, when pass2
discovers the zero leaf blocks, it can safely allocate new blocks
(since pass1 synced the bitmap according to the blockmap) for the new
leaf blocks and replace the zeros with valid block references.
---
gfs2/fsck/metawalk.c | 31 ++++++++++++++++++++++++++++++-
gfs2/fsck/metawalk.h | 2 +-
gfs2/fsck/pass1.c | 9 ++-------
gfs2/fsck/pass2.c | 2 +-
4 files changed, 34 insertions(+), 10 deletions(-)
diff --git a/gfs2/fsck/metawalk.c b/gfs2/fsck/metawalk.c
index 161c183..ffc3555 100644
--- a/gfs2/fsck/metawalk.c
+++ b/gfs2/fsck/metawalk.c
@@ -1955,7 +1955,7 @@ int write_new_leaf(struct gfs2_inode *dip, int start_lindex, int num_copies,
* leaf a bit, but it's better than deleting the whole directory,
* which is what used to happen before. */
int repair_leaf(struct gfs2_inode *ip, uint64_t *leaf_no, int lindex,
- int ref_count, const char *msg)
+ int ref_count, const char *msg, int allow_alloc)
{
int new_leaf_blks = 0, error, refs;
uint64_t bn = 0;
@@ -1970,6 +1970,35 @@ int repair_leaf(struct gfs2_inode *ip, uint64_t *leaf_no, int lindex,
log_err( _("Bad leaf left in place.\n"));
goto out;
}
+ if (!allow_alloc) {
+ uint64_t *cpyptr;
+ char *padbuf;
+ int pad_size, i;
+
+ padbuf = malloc(ref_count * sizeof(uint64_t));
+ cpyptr = (uint64_t *)padbuf;
+ for (i = 0; i < ref_count; i++) {
+ *cpyptr = 0;
+ cpyptr++;
+ }
+ pad_size = ref_count * sizeof(uint64_t);
+ log_err(_("Writing zeros to the hash table of directory %lld "
+ "(0x%llx) at index: 0x%x for 0x%x pointers.\n"),
+ (unsigned long long)ip->i_di.di_num.no_addr,
+ (unsigned long long)ip->i_di.di_num.no_addr,
+ lindex, ref_count);
+ if (ip->i_sbd->gfs1)
+ gfs1_writei(ip, padbuf, lindex * sizeof(uint64_t),
+ pad_size);
+ else
+ gfs2_writei(ip, padbuf, lindex * sizeof(uint64_t),
+ pad_size);
+ free(padbuf);
+ log_err( _("Directory Inode %llu (0x%llx) patched.\n"),
+ (unsigned long long)ip->i_di.di_num.no_addr,
+ (unsigned long long)ip->i_di.di_num.no_addr);
+ goto out;
+ }
/* We can only write leafs in quantities that are factors of
two, since leaves are doubled, not added sequentially.
So if we have a hole that's not a factor of 2, we have to
diff --git a/gfs2/fsck/metawalk.h b/gfs2/fsck/metawalk.h
index aacb962..a5a51c2 100644
--- a/gfs2/fsck/metawalk.h
+++ b/gfs2/fsck/metawalk.h
@@ -61,7 +61,7 @@ extern int write_new_leaf(struct gfs2_inode *dip, int start_lindex,
int num_copies, const char *before_or_after,
uint64_t *bn);
extern int repair_leaf(struct gfs2_inode *ip, uint64_t *leaf_no, int lindex,
- int ref_count, const char *msg);
+ int ref_count, const char *msg, int allow_alloc);
#define is_duplicate(dblock) ((dupfind(dblock)) ? 1 : 0)
diff --git a/gfs2/fsck/pass1.c b/gfs2/fsck/pass1.c
index 2c1c046..df778ef 100644
--- a/gfs2/fsck/pass1.c
+++ b/gfs2/fsck/pass1.c
@@ -84,13 +84,8 @@ static int pass1_repair_leaf(struct gfs2_inode *ip, uint64_t *leaf_no,
int lindex, int ref_count, const char *msg,
void *private)
{
- struct block_count *bc = (struct block_count *)private;
- int new_leaf_blks;
-
- new_leaf_blks = repair_leaf(ip, leaf_no, lindex, ref_count, msg);
- bc->indir_count += new_leaf_blks;
-
- return new_leaf_blks;
+ repair_leaf(ip, leaf_no, lindex, ref_count, msg, 0);
+ return 0;
}
struct metawalk_fxns pass1_fxns = {
diff --git a/gfs2/fsck/pass2.c b/gfs2/fsck/pass2.c
index 5767c4d..a24edbe 100644
--- a/gfs2/fsck/pass2.c
+++ b/gfs2/fsck/pass2.c
@@ -1040,7 +1040,7 @@ static int pass2_repair_leaf(struct gfs2_inode *ip, uint64_t *leaf_no,
int lindex, int ref_count, const char *msg,
void *private)
{
- return repair_leaf(ip, leaf_no, lindex, ref_count, msg);
+ return repair_leaf(ip, leaf_no, lindex, ref_count, msg, 1);
}
/* The purpose of leafck_fxns is to provide a means for function fix_hashtable
11 years
gfs2-utils: master - fsck.gfs2: Stop "undo" process when error data block is reached
by Bob Peterson
Gitweb: http://git.fedorahosted.org/git/?p=gfs2-utils.git;a=commitdiff;h=c2a39034...
Commit: c2a39034d9f2888dc0a9431cea86998a929c30ba
Parent: fb2ef82d8dd9b4c5304d377b9d2fa1ad3da1a82c
Author: Bob Peterson <rpeterso(a)redhat.com>
AuthorDate: Fri Apr 12 08:42:34 2013 -0700
Committer: Bob Peterson <rpeterso(a)redhat.com>
CommitterDate: Mon May 20 11:12:47 2013 -0500
fsck.gfs2: Stop "undo" process when error data block is reached
When fsck.gfs2 discovers a data block in error, it flags the error
and especially in pass1, it tries to "undo" the block designations
it previously marked in the blockmap. Before this patch, the "undo"
functions didn't know when to stop. So it could "undo" designations
in the blockmap that it hadn't "done" in the first place. With this
patch, if an error is encountered while processing data blocks
(not counting duplicate references--for example, blocks marked as
'data' that are really dinodes which it hasn't gotten to yet) it
saves off the block where the error occurred. Later, during the
"undo" processing, it stops when it reaches the block that flagged
the error.
---
gfs2/fsck/metawalk.c | 36 +++++++++++++++++++++++++++---------
gfs2/fsck/pass1.c | 2 +-
2 files changed, 28 insertions(+), 10 deletions(-)
diff --git a/gfs2/fsck/metawalk.c b/gfs2/fsck/metawalk.c
index 6e9e593..161c183 100644
--- a/gfs2/fsck/metawalk.c
+++ b/gfs2/fsck/metawalk.c
@@ -1325,7 +1325,7 @@ static int build_and_check_metalist(struct gfs2_inode *ip, osi_list_t *mlp,
*/
static int check_data(struct gfs2_inode *ip, struct metawalk_fxns *pass,
struct gfs2_buffer_head *bh, int head_size,
- uint64_t *blks_checked)
+ uint64_t *blks_checked, uint64_t *error_blk)
{
int error = 0, rc = 0;
uint64_t block, *ptr;
@@ -1349,8 +1349,13 @@ static int check_data(struct gfs2_inode *ip, struct metawalk_fxns *pass,
rc = pass->check_data(ip, metablock, block, pass->private);
if (!error && rc) {
error = rc;
- log_info(_("\nUnrecoverable data block error %d on "
- "block %llu (0x%llx).\n"), rc,
+ log_info("\n");
+ if (rc < 0) {
+ *error_blk = block;
+ log_info(_("Unrecoverable "));
+ }
+ log_info(_("data block error %d on block %llu "
+ "(0x%llx).\n"), rc,
(unsigned long long)block,
(unsigned long long)block);
}
@@ -1362,7 +1367,8 @@ static int check_data(struct gfs2_inode *ip, struct metawalk_fxns *pass,
}
static int undo_check_data(struct gfs2_inode *ip, struct metawalk_fxns *pass,
- uint64_t *ptr_start, char *ptr_end)
+ uint64_t *ptr_start, char *ptr_end,
+ uint64_t error_blk)
{
int rc = 0;
uint64_t block, *ptr;
@@ -1375,6 +1381,8 @@ static int undo_check_data(struct gfs2_inode *ip, struct metawalk_fxns *pass,
if (skip_this_pass || fsck_abort)
return 1;
block = be64_to_cpu(*ptr);
+ if (block == error_blk)
+ return 1;
rc = pass->undo_check_data(ip, block, pass->private);
if (rc < 0)
return rc;
@@ -1415,6 +1423,8 @@ int check_metatree(struct gfs2_inode *ip, struct metawalk_fxns *pass)
uint64_t blks_checked = 0;
int error, rc;
int metadata_clean = 0;
+ uint64_t error_blk = 0;
+ int hit_error_blk = 0;
if (!height && !is_dir(&ip->i_di, ip->i_sbd->gfs1))
return 0;
@@ -1460,7 +1470,7 @@ int check_metatree(struct gfs2_inode *ip, struct metawalk_fxns *pass)
if (pass->check_data)
error = check_data(ip, pass, bh, head_size,
- &blks_checked);
+ &blks_checked, &error_blk);
if (pass->big_file_msg && ip->i_di.di_blocks > COMFORTABLE_BLKS)
pass->big_file_msg(ip, blks_checked);
}
@@ -1498,12 +1508,20 @@ undo_metalist:
i, pass->private);
else
rc = 0;
- if (metadata_clean && rc == 0 && i == height - 1) {
+ if (metadata_clean && rc == 0 && i == height - 1 &&
+ !hit_error_blk) {
head_size = hdr_size(bh, height);
- if (head_size)
- undo_check_data(ip, pass, (uint64_t *)
+ if (head_size) {
+ rc = undo_check_data(ip, pass,
+ (uint64_t *)
(bh->b_data + head_size),
- (bh->b_data + ip->i_sbd->bsize));
+ (bh->b_data + ip->i_sbd->bsize),
+ error_blk);
+ if (rc > 0) {
+ hit_error_blk = 1;
+ rc = 0;
+ }
+ }
}
if (bh == ip->i_bh)
osi_list_del(&bh->b_altlist);
diff --git a/gfs2/fsck/pass1.c b/gfs2/fsck/pass1.c
index ee828d8..2c1c046 100644
--- a/gfs2/fsck/pass1.c
+++ b/gfs2/fsck/pass1.c
@@ -462,7 +462,7 @@ static int check_data(struct gfs2_inode *ip, uint64_t metablock,
fsck_blockmap_set(ip, ip->i_di.di_num.no_addr,
_("bad (out of range) data"),
gfs2_bad_block);
- return 1;
+ return -1;
}
bc->data_count++; /* keep the count sane anyway */
q = block_type(block);
11 years
gfs2-utils: master - fsck.gfs2: double-check transitions from dinode to data
by Bob Peterson
Gitweb: http://git.fedorahosted.org/git/?p=gfs2-utils.git;a=commitdiff;h=fb2ef82d...
Commit: fb2ef82d8dd9b4c5304d377b9d2fa1ad3da1a82c
Parent: a3c643a9c98dd68138ff6b623fd86923a16fc626
Author: Bob Peterson <rpeterso(a)redhat.com>
AuthorDate: Thu Apr 11 07:22:33 2013 -0700
Committer: Bob Peterson <rpeterso(a)redhat.com>
CommitterDate: Mon May 20 11:12:47 2013 -0500
fsck.gfs2: double-check transitions from dinode to data
If a corrupt dinode references a bunch of blocks as data blocks,
and those blocks occur later in the bitmap (as is usually the case)
but they're really dinodes, we have a problem. Before it finds the
corruption, it can change the bitmap markings from 'dinode' to 'data'
blocks. Later, when it determines the dinode is corrupt, it tries
to "undo" all those data blocks, but since pass1 hasn't processed
them yet, it marks them as 'free' in the bitmap, and we've lost the
fact that they're dinodes. The result is that the files/dinodes
being improperly referenced are deleted by mistake.
This patch adds a check for bitmap transitions in pass1 from 'dinode'
to 'data', where the block hasn't been checked yet. We don't care about
transitions from dinode to free because that's a normal delete of a
dinode. We also don't care about transitions from dinode to
metadata, because all those checks validate that the metadata type is
the correct type of metadata, so we know we're making the right
decision. So the only issue is data blocks referencing dinodes.
What this patch does is: when the bitmap is making a transition from
'dinode' to 'data' in pass1, it basically puts up a red flag.
The block is read in and checked to see if it really looks like a
dinode. We have to be careful here, because customer data is allowed
to look like a dinode. If the block really seems to be a dinode, we
DO NOT want to treat it as a data block and assume the duplicate
reference handler in pass1b will handle it, because the dinode's
metadata blocks will not have been checked in pass1.
Instead, we want to flag it as corruption in the referencing file
dinode, not change the bitmap or blockmap, and allow pass1 to treat
it properly as a dinode when it gets there. The corrupt dinode
referencing the dinode as 'data' should be deleted and the work done
thus far should be backed out by the pass1 'undo' functions.
---
gfs2/fsck/metawalk.c | 21 +++++++++++++---
gfs2/fsck/metawalk.h | 14 +++++++----
gfs2/fsck/pass1.c | 65 +++++++++++++++++++++++++++++++++++++++++++++----
gfs2/fsck/pass1b.c | 2 +-
gfs2/fsck/pass2.c | 2 +-
gfs2/fsck/pass3.c | 4 +-
6 files changed, 89 insertions(+), 19 deletions(-)
diff --git a/gfs2/fsck/metawalk.c b/gfs2/fsck/metawalk.c
index 22b16ee..6e9e593 100644
--- a/gfs2/fsck/metawalk.c
+++ b/gfs2/fsck/metawalk.c
@@ -27,7 +27,7 @@
is used to set the latter. The two must be kept in sync, otherwise
you'll get bitmap mismatches. This function checks the status of the
bitmap whenever the blockmap changes, and fixes it accordingly. */
-int check_n_fix_bitmap(struct gfs2_sbd *sdp, uint64_t blk,
+int check_n_fix_bitmap(struct gfs2_sbd *sdp, uint64_t blk, int error_on_dinode,
enum gfs2_mark_block new_blockmap_state)
{
int old_bitmap_state, new_bitmap_state;
@@ -49,6 +49,16 @@ int check_n_fix_bitmap(struct gfs2_sbd *sdp, uint64_t blk,
/* gfs1 descriptions: */
{"free", "data", "free meta", "metadata", "reserved"}};
+ if (error_on_dinode && old_bitmap_state == GFS2_BLKST_DINODE &&
+ new_bitmap_state != GFS2_BLKST_FREE) {
+ log_debug(_("Reference as '%s' to block %llu (0x%llx) "
+ "which was marked as dinode. Needs "
+ "further investigation.\n"),
+ allocdesc[sdp->gfs1][new_bitmap_state],
+ (unsigned long long)blk,
+ (unsigned long long)blk);
+ return 1;
+ }
/* Keep these messages as short as possible, or the output
gets to be huge and unmanageable. */
log_err( _("Block %llu (0x%llx) was '%s', should be %s.\n"),
@@ -106,6 +116,7 @@ int check_n_fix_bitmap(struct gfs2_sbd *sdp, uint64_t blk,
*/
int _fsck_blockmap_set(struct gfs2_inode *ip, uint64_t bblock,
const char *btype, enum gfs2_mark_block mark,
+ int error_on_dinode,
const char *caller, int fline)
{
int error;
@@ -164,9 +175,11 @@ int _fsck_blockmap_set(struct gfs2_inode *ip, uint64_t bblock,
/* First, check the rgrp bitmap against what we think it should be.
If that fails, it's an invalid block--part of an rgrp. */
- error = check_n_fix_bitmap(ip->i_sbd, bblock, mark);
+ error = check_n_fix_bitmap(ip->i_sbd, bblock, error_on_dinode, mark);
if (error) {
- log_err( _("This block is not represented in the bitmap.\n"));
+ if (error < 0)
+ log_err( _("This block is not represented in the "
+ "bitmap.\n"));
return error;
}
@@ -517,7 +530,7 @@ int check_leaf(struct gfs2_inode *ip, int lindex, struct metawalk_fxns *pass,
if (pass->check_leaf) {
error = pass->check_leaf(ip, *leaf_no, pass->private);
- if (error) {
+ if (error == -EEXIST) {
log_info(_("Previous reference to leaf %lld (0x%llx) "
"has already checked it; skipping.\n"),
(unsigned long long)*leaf_no,
diff --git a/gfs2/fsck/metawalk.h b/gfs2/fsck/metawalk.h
index 56f57d9..aacb962 100644
--- a/gfs2/fsck/metawalk.h
+++ b/gfs2/fsck/metawalk.h
@@ -45,10 +45,12 @@ extern int delete_eattr_extentry(struct gfs2_inode *ip, uint64_t *ea_data_ptr,
void *private);
extern int _fsck_blockmap_set(struct gfs2_inode *ip, uint64_t bblock,
- const char *btype, enum gfs2_mark_block mark,
- const char *caller, int line);
+ const char *btype, enum gfs2_mark_block mark,
+ int error_on_dinode,
+ const char *caller, int line);
extern int check_n_fix_bitmap(struct gfs2_sbd *sdp, uint64_t blk,
- enum gfs2_mark_block new_blockmap_state);
+ int error_on_dinode,
+ enum gfs2_mark_block new_blockmap_state);
extern void reprocess_inode(struct gfs2_inode *ip, const char *desc);
extern struct duptree *dupfind(uint64_t block);
extern struct gfs2_inode *fsck_system_inode(struct gfs2_sbd *sdp,
@@ -63,8 +65,10 @@ extern int repair_leaf(struct gfs2_inode *ip, uint64_t *leaf_no, int lindex,
#define is_duplicate(dblock) ((dupfind(dblock)) ? 1 : 0)
-#define fsck_blockmap_set(ip, b, bt, m) _fsck_blockmap_set(ip, b, bt, m, \
- __FUNCTION__, __LINE__)
+#define fsck_blockmap_set(ip, b, bt, m) \
+ _fsck_blockmap_set(ip, b, bt, m, 0, __FUNCTION__, __LINE__)
+#define fsck_blkmap_set_noino(ip, b, bt, m) \
+ _fsck_blockmap_set(ip, b, bt, m, 1, __FUNCTION__, __LINE__)
enum meta_check_rc {
meta_error = -1,
diff --git a/gfs2/fsck/pass1.c b/gfs2/fsck/pass1.c
index ad6690b..ee828d8 100644
--- a/gfs2/fsck/pass1.c
+++ b/gfs2/fsck/pass1.c
@@ -150,7 +150,7 @@ static int resuscitate_metalist(struct gfs2_inode *ip, uint64_t block,
if (fsck_system_inode(ip->i_sbd, block))
fsck_blockmap_set(ip, block, _("system file"), gfs2_indir_blk);
else
- check_n_fix_bitmap(ip->i_sbd, block, gfs2_indir_blk);
+ check_n_fix_bitmap(ip->i_sbd, block, 0, gfs2_indir_blk);
bc->indir_count++;
return meta_is_good;
}
@@ -204,7 +204,7 @@ static int resuscitate_dentry(struct gfs2_inode *ip, struct gfs2_dirent *dent,
if (fsck_system_inode(sdp, block))
fsck_blockmap_set(ip, block, _("system file"), dinode_type);
else
- check_n_fix_bitmap(sdp, block, dinode_type);
+ check_n_fix_bitmap(sdp, block, 0, dinode_type);
/* Return the number of leaf entries so metawalk doesn't flag this
leaf as having none. */
*count = be16_to_cpu(((struct gfs2_leaf *)bh->b_data)->lf_entries);
@@ -339,6 +339,8 @@ static int undo_reference(struct gfs2_inode *ip, uint64_t block, int meta,
struct block_count *bc = (struct block_count *)private;
struct duptree *dt;
struct inode_with_dups *id;
+ int old_bitmap_state = 0;
+ struct rgrp_tree *rgd;
if (!valid_block(ip->i_sbd, block)) { /* blk outside of FS */
fsck_blockmap_set(ip, ip->i_di.di_num.no_addr,
@@ -367,6 +369,12 @@ static int undo_reference(struct gfs2_inode *ip, uint64_t block, int meta,
return 1;
}
}
+ if (!meta) {
+ rgd = gfs2_blk2rgrpd(ip->i_sbd, block);
+ old_bitmap_state = lgfs2_get_bitmap(ip->i_sbd, block, rgd);
+ if (old_bitmap_state == GFS2_BLKST_DINODE)
+ return -1;
+ }
fsck_blockmap_set(ip, block,
meta ? _("bad indirect") : _("referenced data"),
gfs2_block_free);
@@ -385,6 +393,51 @@ static int undo_check_data(struct gfs2_inode *ip, uint64_t block,
return undo_reference(ip, block, 0, private);
}
+/* blockmap_set_as_data - set block as 'data' in the blockmap, if not dinode
+ *
+ * This function tries to set a block that's referenced as data as 'data'
+ * in the fsck blockmap. But if that block is marked as 'dinode' in the
+ * rgrp bitmap, it does additional checks to see if it looks like a dinode.
+ * Note that previous checks were done for duplicate references, so this
+ * is checking for dinodes that we haven't processed yet.
+ */
+static int blockmap_set_as_data(struct gfs2_inode *ip, uint64_t block)
+{
+ int error;
+ struct gfs2_buffer_head *bh;
+ struct gfs2_dinode *di;
+
+ error = fsck_blkmap_set_noino(ip, block, _("data"), gfs2_block_used);
+ if (!error)
+ return 0;
+
+ error = 0;
+ /* The bitmap says it's a dinode, but a block reference begs to differ.
+ So which is it? */
+ bh = bread(ip->i_sbd, block);
+ if (gfs2_check_meta(bh, GFS2_METATYPE_DI) != 0)
+ goto out;
+
+ /* The meta header agrees it's a dinode. But it might be data in
+ disguise, so do some extra checks. */
+ di = (struct gfs2_dinode *)bh->b_data;
+ if (be64_to_cpu(di->di_num.no_addr) != block)
+ goto out;
+
+ log_err(_("Inode %lld (0x%llx) has a reference to block %lld (0x%llx) "
+ "as a data block, but it appears to be a dinode we "
+ "haven't checked yet.\n"),
+ (unsigned long long)ip->i_di.di_num.no_addr,
+ (unsigned long long)ip->i_di.di_num.no_addr,
+ (unsigned long long)block, (unsigned long long)block);
+ error = -1;
+out:
+ if (!error)
+ fsck_blockmap_set(ip, block, _("data"), gfs2_block_used);
+ brelse(bh);
+ return error;
+}
+
static int check_data(struct gfs2_inode *ip, uint64_t metablock,
uint64_t block, void *private)
{
@@ -469,7 +522,7 @@ static int check_data(struct gfs2_inode *ip, uint64_t metablock,
(unsigned long long)block, (unsigned long long)block);
fsck_blockmap_set(ip, block, _("jdata"), gfs2_jdata);
} else
- fsck_blockmap_set(ip, block, _("data"), gfs2_block_used);
+ return blockmap_set_as_data(ip, block);
return 0;
}
@@ -1199,7 +1252,7 @@ static int check_system_inode(struct gfs2_sbd *sdp,
(unsigned long long)iblock,
(unsigned long long)iblock);
gfs2_blockmap_set(bl, iblock, gfs2_block_free);
- check_n_fix_bitmap(sdp, iblock, gfs2_block_free);
+ check_n_fix_bitmap(sdp, iblock, 0, gfs2_block_free);
inode_put(sysinode);
}
}
@@ -1486,7 +1539,7 @@ static int pass1_process_bitmap(struct gfs2_sbd *sdp, struct rgrp_tree *rgd, uin
"%llu (0x%llx)\n"),
(unsigned long long)block,
(unsigned long long)block);
- check_n_fix_bitmap(sdp, block, gfs2_block_free);
+ check_n_fix_bitmap(sdp, block, 0, gfs2_block_free);
} else if (handle_di(sdp, bh) < 0) {
stack;
brelse(bh);
@@ -1596,7 +1649,7 @@ int pass1(struct gfs2_sbd *sdp)
}
/* rgrps and bitmaps don't have bits to represent
their blocks, so don't do this:
- check_n_fix_bitmap(sdp, rgd->ri.ri_addr + i,
+ check_n_fix_bitmap(sdp, rgd->ri.ri_addr + i, 0,
gfs2_meta_rgrp);*/
}
diff --git a/gfs2/fsck/pass1b.c b/gfs2/fsck/pass1b.c
index 9c76eda..9a23197 100644
--- a/gfs2/fsck/pass1b.c
+++ b/gfs2/fsck/pass1b.c
@@ -501,7 +501,7 @@ static int handle_dup_blk(struct gfs2_sbd *sdp, struct duptree *dt)
dup_delete(dh.dt);
/* Now fix the block type of the block in question. */
gfs2_blockmap_set(bl, dup_blk, gfs2_block_free);
- check_n_fix_bitmap(sdp, dup_blk, gfs2_block_free);
+ check_n_fix_bitmap(sdp, dup_blk, 0, gfs2_block_free);
}
}
return 0;
diff --git a/gfs2/fsck/pass2.c b/gfs2/fsck/pass2.c
index dc99869..5767c4d 100644
--- a/gfs2/fsck/pass2.c
+++ b/gfs2/fsck/pass2.c
@@ -1713,7 +1713,7 @@ int pass2(struct gfs2_sbd *sdp)
/* Can't use fsck_blockmap_set here because we don't
have an inode in memory. */
gfs2_blockmap_set(bl, dirblk, gfs2_inode_invalid);
- check_n_fix_bitmap(sdp, dirblk, gfs2_inode_invalid);
+ check_n_fix_bitmap(sdp, dirblk, 0, gfs2_inode_invalid);
}
ip = fsck_load_inode(sdp, dirblk);
if (!ds.dotdir) {
diff --git a/gfs2/fsck/pass3.c b/gfs2/fsck/pass3.c
index 53052b6..4894d8c 100644
--- a/gfs2/fsck/pass3.c
+++ b/gfs2/fsck/pass3.c
@@ -275,7 +275,7 @@ int pass3(struct gfs2_sbd *sdp)
gfs2_blockmap_set(bl, di->dinode.no_addr,
gfs2_block_free);
check_n_fix_bitmap(sdp, di->dinode.no_addr,
- gfs2_block_free);
+ 0, gfs2_block_free);
break;
} else
log_err( _("Unlinked directory with bad block remains\n"));
@@ -299,7 +299,7 @@ int pass3(struct gfs2_sbd *sdp)
because we don't have ip */
gfs2_blockmap_set(bl, di->dinode.no_addr,
gfs2_block_free);
- check_n_fix_bitmap(sdp, di->dinode.no_addr,
+ check_n_fix_bitmap(sdp, di->dinode.no_addr, 0,
gfs2_block_free);
log_err( _("The block was cleared\n"));
break;
11 years