Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=079... Commit: 079f56ecb56b0d53b44b1871fa8937c22981fc1e Parent: 1d56164f2dea0772188696a8578a4144c8379885 Author: Lon Hohberger lhh@redhat.com AuthorDate: Thu Apr 7 16:01:58 2011 -0400 Committer: Lon Hohberger lhh@redhat.com CommitterDate: Thu Apr 7 16:51:00 2011 -0400
rgmanager: Pause during exit if we stopped services
Unlike rgmanager 1.9.x, later versions of rgmanager rely on openais/corosync for messaging. This messaging is quite reliable and has proved useful.
However, one drawback is that if you stop rgmanager and corosync/cman in quick succession, the other nodes in the cluster cannot restart services because message traffic is interrupted for the duration of the token timeout.
There is no simple solution to this problem. Rgmanager could (in theory) find new placements for services prior to stopping, but this is a large amount of design work; it was never designed to run policies in the exit path.
A far simpler idea is to give the other nodes time to restart services.
NOTE: This solution does not and cannot work with central processing mode.
Resolves: rhbz#619468
Signed-off-by: Lon Hohberger lhh@redhat.com --- rgmanager/include/event.h | 1 + rgmanager/include/groups.h | 2 +- rgmanager/src/daemons/groups.c | 12 +++++++++++- rgmanager/src/daemons/main.c | 15 ++++++++++++--- rgmanager/src/daemons/rg_event.c | 7 +++++++ 5 files changed, 32 insertions(+), 5 deletions(-)
diff --git a/rgmanager/include/event.h b/rgmanager/include/event.h index ad3fe39..9fc9521 100644 --- a/rgmanager/include/event.h +++ b/rgmanager/include/event.h @@ -119,6 +119,7 @@ int slang_process_event(event_table_t *event_table, event_t *ev);
/* For distributed events. */ void set_transition_throttling(int nsecs); +int get_transition_throttling(void);
/* Simplified service start. */ int service_op_start(char *svcName, int *target_list, int target_list_len, diff --git a/rgmanager/include/groups.h b/rgmanager/include/groups.h index f776a31..f02f0e2 100644 --- a/rgmanager/include/groups.h +++ b/rgmanager/include/groups.h @@ -25,7 +25,7 @@ void kill_resource_groups(void);
/* do this op on all resource groups. The handler for the request will sort out whether or not it's a valid request given the state */ -void rg_doall(int request, int block, const char *debugfmt); +int rg_doall(int request, int block, const char *debugfmt); void do_status_checks(void); /* Queue status checks for locally running services */
diff --git a/rgmanager/src/daemons/groups.c b/rgmanager/src/daemons/groups.c index ec8ab63..e9413cd 100644 --- a/rgmanager/src/daemons/groups.c +++ b/rgmanager/src/daemons/groups.c @@ -1276,13 +1276,21 @@ svc_exists(const char *svcname) }
-void +/* + * Perform an operation on all resources groups. + * + * Returns the number of requests queued. This value is + * only used during shutdown, where we queue RG_STOP_EXITING + * only for services we have running locally as an optimization. + */ +int rg_doall(int request, int block, const char __attribute__ ((unused)) *debugfmt) { resource_node_t *curr; rg_state_t svcblk; char rg[64]; + int queued = 0;
pthread_rwlock_rdlock(&resource_lock); list_do(&_tree, curr) { @@ -1307,6 +1315,7 @@ rg_doall(int request, int block,
rt_enqueue_request(rg, request, NULL, 0, 0, 0, 0); + ++queued; } while (!list_done(&_tree, curr));
pthread_rwlock_unlock(&resource_lock); @@ -1316,6 +1325,7 @@ rg_doall(int request, int block, other rgmanagers to complete. */ if (block) rg_wait_threads(); + return queued; }
diff --git a/rgmanager/src/daemons/main.c b/rgmanager/src/daemons/main.c index 02cd2dc..a3ab7c4 100644 --- a/rgmanager/src/daemons/main.c +++ b/rgmanager/src/daemons/main.c @@ -29,7 +29,6 @@ void dump_thread_states(FILE *); #endif static int configure_rgmanager(int ccsfd, int debug, int *cluster_timeout); -void set_transition_throttling(int);
void flag_shutdown(int sig);
@@ -43,6 +42,7 @@ static int signalled = 0; static uint8_t ALIGNED port = RG_PORT; static char *rgmanager_lsname = (char *)"rgmanager"; /* XXX default */ static int status_poll_interval = DEFAULT_CHECK_INTERVAL; +static int stops_queued = 0;
static void segfault(int __attribute__ ((unused)) sig) @@ -930,7 +930,7 @@ static void * shutdown_thread(void __attribute__ ((unused)) *arg) { rg_lockall(L_SYS|L_SHUTDOWN); - rg_doall(RG_STOP_EXITING, 1, NULL); + stops_queued = rg_doall(RG_STOP_EXITING, 1, NULL); running = 0;
pthread_exit(NULL); @@ -1119,8 +1119,17 @@ out_ls:
out: rgm_dbus_release(); - logt_print(LOG_NOTICE, "Shutdown complete, exiting\n"); + logt_print(LOG_DEBUG, "Stopped %d services\n", stops_queued); + logt_print(LOG_NOTICE, "Disconnecting from CMAN\n"); cman_finish(clu); + + if (stops_queued && !central_events_enabled()) { + logt_print(LOG_DEBUG, "Pausing to allow services to " + "start on other node(s)\n"); + sleep(get_transition_throttling() * 3); + } + + logt_print(LOG_NOTICE, "Exiting\n"); close_logging(); /*malloc_stats();*/ diff --git a/rgmanager/src/daemons/rg_event.c b/rgmanager/src/daemons/rg_event.c index d03f4e3..7048bc6 100644 --- a/rgmanager/src/daemons/rg_event.c +++ b/rgmanager/src/daemons/rg_event.c @@ -56,6 +56,13 @@ set_transition_throttling(int nsecs) }
+int +get_transition_throttling(void) +{ + return transition_throttling; +} + + void set_central_events(int flag) {
cluster-commits@lists.fedorahosted.org