src/cmd.c | 2 +-
src/lockspace.c | 23 ++++++++++++++++++-----
src/main.c | 2 +-
src/watchdog.c | 37 +++++++++++++++++++++++--------------
src/watchdog.h | 13 +++++++------
5 files changed, 50 insertions(+), 27 deletions(-)
New commits:
commit 9841a4c55497631c94c0ed1f063caa4e1cfb2701
Author: David Teigland <teigland(a)redhat.com>
Date: Fri Jun 12 13:59:05 2015 -0500
sanlock: add_lockspace can connect to wdmd earlier
Currently, add_lockspace finishes acquiring the
delta lease, and only then connects to wdmd and
activates the wdmd connection for the lockspace.
If wdmd is not running, the connect fails and
add_lockspace fails, but only after the lengthy
delta lease delay. If we connect to wdmd before
acquiring the delta lease, we can fail quickly,
before that delay.
diff --git a/src/cmd.c b/src/cmd.c
index 648ea7c..6bde363 100644
--- a/src/cmd.c
+++ b/src/cmd.c
@@ -1360,7 +1360,7 @@ static void cmd_inq_lockspace(struct cmd_args *ca)
*
* if (sp->killing_pids && all_pids_dead(sp) && all_tokens_released(sp)) {
* sp->thread_stop = 1;
- * unlink_watchdog_file(sp);
+ * deactivate_watchdog(sp);
* list_move(spaces_rem);
* }
*
diff --git a/src/lockspace.c b/src/lockspace.c
index 9322861..24f75d6 100644
--- a/src/lockspace.c
+++ b/src/lockspace.c
@@ -552,6 +552,7 @@ static void *lockspace_thread(void *arg_in)
int acquire_result, delta_result, read_result;
int opened = 0;
int stop = 0;
+ int wd_con;
sp = (struct space *)arg_in;
@@ -588,6 +589,15 @@ static void *lockspace_thread(void *arg_in)
goto set_status;
}
+ /* Connect first so we can fail quickly if wdmd is not running. */
+ wd_con = connect_watchdog(sp);
+ if (wd_con < 0) {
+ log_erros(sp, "connect_watchdog failed %d", wd_con);
+ acquire_result = SANLK_WD_ERROR;
+ delta_result = -1;
+ goto set_status;
+ }
+
/*
* acquire the delta lease
*/
@@ -608,11 +618,14 @@ static void *lockspace_thread(void *arg_in)
before we allow any pid's to begin running */
if (delta_result == SANLK_OK) {
- rv = create_watchdog_file(sp, last_success, id_renewal_fail_seconds);
+ rv = activate_watchdog(sp, last_success, id_renewal_fail_seconds, wd_con);
if (rv < 0) {
- log_erros(sp, "create_watchdog failed %d", rv);
+ log_erros(sp, "activate_watchdog failed %d", rv);
acquire_result = SANLK_WD_ERROR;
}
+ } else {
+ if (com.use_watchdog)
+ close(wd_con);
}
set_status:
@@ -704,7 +717,7 @@ static void *lockspace_thread(void *arg_in)
*/
if (delta_result == SANLK_OK && !sp->thread_stop)
- update_watchdog_file(sp, last_success, id_renewal_fail_seconds);
+ update_watchdog(sp, last_success, id_renewal_fail_seconds);
pthread_mutex_unlock(&sp->mutex);
@@ -728,7 +741,7 @@ static void *lockspace_thread(void *arg_in)
/* watchdog unlink was done in main_loop when thread_stop was set, to
get it done as quickly as possible in case the wd is about to fire. */
- close_watchdog_file(sp);
+ close_watchdog(sp);
out:
if (delta_result == SANLK_OK)
delta_lease_release(&task, sp, &sp->host_id_disk,
@@ -936,7 +949,7 @@ int add_lockspace_wait(struct space *sp)
pthread_mutex_lock(&sp->mutex);
sp->thread_stop = 1;
- unlink_watchdog_file(sp);
+ deactivate_watchdog(sp);
pthread_mutex_unlock(&sp->mutex);
pthread_join(sp->thread, NULL);
rv = -1;
diff --git a/src/main.c b/src/main.c
index 4f7ea84..4e203eb 100644
--- a/src/main.c
+++ b/src/main.c
@@ -791,7 +791,7 @@ static int main_loop(void)
log_space(sp, "set thread_stop");
pthread_mutex_lock(&sp->mutex);
sp->thread_stop = 1;
- unlink_watchdog_file(sp);
+ deactivate_watchdog(sp);
pthread_mutex_unlock(&sp->mutex);
list_move(&sp->list, &spaces_rem);
continue;
diff --git a/src/watchdog.c b/src/watchdog.c
index a879880..2c6c5b8 100644
--- a/src/watchdog.c
+++ b/src/watchdog.c
@@ -39,8 +39,8 @@
#include "../wdmd/wdmd.h"
-void update_watchdog_file(struct space *sp, uint64_t timestamp,
- int id_renewal_fail_seconds)
+void update_watchdog(struct space *sp, uint64_t timestamp,
+ int id_renewal_fail_seconds)
{
int rv;
@@ -53,13 +53,9 @@ void update_watchdog_file(struct space *sp, uint64_t timestamp,
(unsigned long long)timestamp, rv);
}
-int create_watchdog_file(struct space *sp, uint64_t timestamp,
- int id_renewal_fail_seconds)
+int connect_watchdog(struct space *sp)
{
- char name[WDMD_NAME_SIZE];
- int test_interval, fire_timeout;
- uint64_t last_keepalive;
- int con, rv;
+ int con;
if (!com.use_watchdog)
return 0;
@@ -67,9 +63,23 @@ int create_watchdog_file(struct space *sp, uint64_t timestamp,
con = wdmd_connect();
if (con < 0) {
log_erros(sp, "wdmd_connect failed %d", con);
- goto fail;
+ return -1;
}
+ return con;
+}
+
+int activate_watchdog(struct space *sp, uint64_t timestamp,
+ int id_renewal_fail_seconds, int con)
+{
+ char name[WDMD_NAME_SIZE];
+ int test_interval, fire_timeout;
+ uint64_t last_keepalive;
+ int rv;
+
+ if (!com.use_watchdog)
+ return 0;
+
memset(name, 0, sizeof(name));
snprintf(name, WDMD_NAME_SIZE - 1, "sanlock_%s:%llu",
@@ -114,11 +124,10 @@ int create_watchdog_file(struct space *sp, uint64_t timestamp,
wdmd_refcount_clear(con);
fail_close:
close(con);
- fail:
return -1;
}
-void unlink_watchdog_file(struct space *sp)
+void deactivate_watchdog(struct space *sp)
{
int rv;
@@ -129,7 +138,7 @@ void unlink_watchdog_file(struct space *sp)
rv = wdmd_test_live(sp->wd_fd, 0, 0);
if (rv < 0) {
- log_erros(sp, "wdmd_test_live in unlink failed %d", rv);
+ log_erros(sp, "wdmd_test_live in deactivate failed %d", rv);
/* We really want this to succeed to avoid a reset, so retry
after a short delay in case the problem was transient... */
@@ -138,13 +147,13 @@ void unlink_watchdog_file(struct space *sp)
rv = wdmd_test_live(sp->wd_fd, 0, 0);
if (rv < 0)
- log_erros(sp, "wdmd_test_live in unlink 2 failed %d", rv);
+ log_erros(sp, "wdmd_test_live in deactivate 2 failed %d", rv);
}
wdmd_refcount_clear(sp->wd_fd);
}
-void close_watchdog_file(struct space *sp)
+void close_watchdog(struct space *sp)
{
if (!com.use_watchdog)
return;
diff --git a/src/watchdog.h b/src/watchdog.h
index 90e82eb..a462559 100644
--- a/src/watchdog.h
+++ b/src/watchdog.h
@@ -9,11 +9,12 @@
#ifndef __WATCHDOG_H__
#define __WATCHDOG_H__
-void update_watchdog_file(struct space *sp, uint64_t timestamp,
- int id_renewal_fail_seconds);
-int create_watchdog_file(struct space *sp, uint64_t timestamp,
- int id_renewal_fail_seconds);
-void unlink_watchdog_file(struct space *sp);
-void close_watchdog_file(struct space *sp);
+void update_watchdog(struct space *sp, uint64_t timestamp,
+ int id_renewal_fail_seconds);
+int connect_watchdog(struct space *sp);
+int activate_watchdog(struct space *sp, uint64_t timestamp,
+ int id_renewal_fail_seconds, int con);
+void deactivate_watchdog(struct space *sp);
+void close_watchdog(struct space *sp);
#endif