Gitweb:
http://git.fedorahosted.org/git/?p=dlm.git;a=commitdiff;h=6a030a4efa08528...
Commit: 6a030a4efa08528c75ab5d045a3b3752a4b85fb2
Parent: f15a2daed7137481c7161e1447b1796c308c7e08
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Tue Jan 22 10:41:30 2013 -0600
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Tue Jan 22 10:41:30 2013 -0600
dlm_controld: limit log debug
Avoid filling the debug buffer with log_debug
entries that occur within indefinite retries.
Limit to 60 log_debug repetitions (once a
second for a minute).
Also, after one minute, escalate these repeating
log_debugs to log_error messages. The log_error
messages repeat once an hour.
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
dlm_controld/cpg.c | 44 ++++++++++++++++++++++++++++---
dlm_controld/daemon_cpg.c | 61 +++++++++++++++++++++++++++++++++------------
dlm_controld/dlm_daemon.h | 4 ++-
dlm_controld/main.c | 2 +-
4 files changed, 88 insertions(+), 23 deletions(-)
diff --git a/dlm_controld/cpg.c b/dlm_controld/cpg.c
index 0318b98..487887c 100644
--- a/dlm_controld/cpg.c
+++ b/dlm_controld/cpg.c
@@ -16,6 +16,16 @@
} \
})
+/* retries are once a second */
+#define log_retry(ls, fmt, args...) ({ \
+ if (ls->wait_retry < 60) \
+ log_group(ls, fmt, ##args); \
+ else if (ls->wait_retry == 60) \
+ log_erros(ls, fmt, ##args); \
+ else if (!(ls->wait_retry % 3600)) \
+ log_erros(ls, fmt, ##args); \
+})
+
/* per lockspace cpg: ls->node_history */
struct node {
@@ -591,30 +601,54 @@ static void stop_kernel(struct lockspace *ls, uint32_t seq)
static int wait_conditions_done(struct lockspace *ls)
{
if (!check_ringid_done(ls)) {
- ls->wait_debug = DLMC_LS_WAIT_RINGID;
+ if (ls->wait_debug != DLMC_LS_WAIT_RINGID) {
+ ls->wait_debug = DLMC_LS_WAIT_RINGID;
+ ls->wait_retry = 0;
+ }
+ ls->wait_retry++;
+ /* the check function logs a message */
+
return 0;
}
if (opt(enable_quorum_lockspace_ind) && !cluster_quorate) {
- log_group(ls, "wait for quorum");
- ls->wait_debug = DLMC_LS_WAIT_QUORUM;
+ if (ls->wait_debug != DLMC_LS_WAIT_QUORUM) {
+ ls->wait_debug = DLMC_LS_WAIT_QUORUM;
+ ls->wait_retry = 0;
+ }
+ ls->wait_retry++;
+ log_retry(ls, "wait for quorum");
+
poll_lockspaces++;
return 0;
}
if (!check_fencing_done(ls)) {
- ls->wait_debug = DLMC_LS_WAIT_FENCING;
+ if (ls->wait_debug != DLMC_LS_WAIT_FENCING) {
+ ls->wait_debug = DLMC_LS_WAIT_FENCING;
+ ls->wait_retry = 0;
+ }
+ ls->wait_retry++;
+ log_retry(ls, "wait for fencing");
+
poll_lockspaces++;
return 0;
}
if (!check_fs_done(ls)) {
- ls->wait_debug = DLMC_LS_WAIT_FSDONE;
+ if (ls->wait_debug != DLMC_LS_WAIT_FSDONE) {
+ ls->wait_debug = DLMC_LS_WAIT_FSDONE;
+ ls->wait_retry = 0;
+ }
+ ls->wait_retry++;
+ log_retry(ls, "wait for fsdone");
+
poll_fs++;
return 0;
}
ls->wait_debug = 0;
+ ls->wait_retry = 0;
return 1;
}
diff --git a/dlm_controld/daemon_cpg.c b/dlm_controld/daemon_cpg.c
index 61d2da2..0eec5ce 100644
--- a/dlm_controld/daemon_cpg.c
+++ b/dlm_controld/daemon_cpg.c
@@ -11,6 +11,16 @@
/* protocol_version flags */
#define PV_STATEFUL 0x0001
+/* retries are once a second */
+#define log_retry(cur_count, fmt, args...) ({ \
+ if (cur_count < 60) \
+ log_debug(fmt, ##args); \
+ else if (cur_count == 60) \
+ log_error(fmt, ##args); \
+ else if (!(cur_count % 3600)) \
+ log_error(fmt, ##args); \
+})
+
struct protocol_version {
uint16_t major;
uint16_t minor;
@@ -106,6 +116,9 @@ static int fence_in_progress_unknown = 1;
static int zombie_pids[MAX_ZOMBIES];
static int zombie_count;
+static int fence_result_pid;
+static unsigned int fence_result_try;
+
static void send_fence_result(int nodeid, int result, uint32_t flags, uint64_t
walltime);
static void send_fence_clear(int nodeid, int result, uint32_t flags, uint64_t walltime);
@@ -718,32 +731,33 @@ static void fence_pid_cancel(int nodeid, int pid)
* later same as case B above
*/
-static void daemon_fence_work(void)
+static int daemon_fence_work(void)
{
struct node_daemon *node, *safe;
int rv, nodeid, pid, need, low, actor, result;
+ int retry = 0;
uint32_t flags;
if (daemon_ringid_wait) {
/* We've seen a nodedown confchg callback, but not the
corresponding ringid callback. */
- log_debug("fence work wait for cpg ringid");
- return;
+ log_retry(retry_fencing, "fence work wait for cpg ringid");
+ return retry;
}
if (cluster_ringid_seq != daemon_ringid.seq) {
/* wait for ringids to be in sync */
- log_debug("fence work wait for cluster ringid");
- return;
+ log_retry(retry_fencing, "fence work wait for cluster ringid");
+ return retry;
}
- /* poll_fencing++; */
+ /* retry = 1; */
if (opt(enable_quorum_fencing_ind) && !cluster_quorate) {
/* wait for quorum before doing any fencing, but if there
is none, send_fence_clear below can unblock new nodes */
- log_debug("fence work wait for quorum");
- poll_fencing++;
+ log_retry(retry_fencing, "fence work wait for quorum");
+ retry = 1;
goto out_fipu;
}
@@ -766,7 +780,7 @@ static void daemon_fence_work(void)
log_debug("fence startup %d delay %d from %llu",
node->nodeid, opt(post_join_delay_ind),
(unsigned long long)daemon_last_join_monotime);
- poll_fencing++;
+ retry = 1;
continue;
}
@@ -837,10 +851,10 @@ static void daemon_fence_work(void)
if (!opt(enable_concurrent_fencing_ind) && daemon_fence_pid) {
/* run one agent at a time in case they need the same switch */
- log_debug("fence request %d delay for other pid %d",
+ log_retry(retry_fencing, "fence request %d delay for other pid %d",
node->nodeid, daemon_fence_pid);
node->delay_fencing = 1;
- poll_fencing++;
+ retry = 1;
continue;
}
@@ -849,7 +863,7 @@ static void daemon_fence_work(void)
node->nodeid, opt(post_join_delay_ind),
(unsigned long long)cluster_last_join_monotime);
node->delay_fencing = 1;
- poll_fencing++;
+ retry = 1;
continue;
}
node->delay_fencing = 0;
@@ -947,12 +961,19 @@ static void daemon_fence_work(void)
continue;
}
- poll_fencing++;
+ retry = 1;
rv = fence_result(nodeid, pid, &result);
if (rv == -EAGAIN) {
/* agent pid is still running */
- log_debug("fence wait %d pid %d running", nodeid, pid);
+
+ if (fence_result_pid != pid) {
+ fence_result_try = 0;
+ fence_result_pid = pid;
+ }
+ fence_result_try++;
+
+ log_retry(fence_result_try, "fence wait %d pid %d running", nodeid, pid);
continue;
}
@@ -1083,12 +1104,20 @@ static void daemon_fence_work(void)
if (zombie_count)
clear_zombies();
+
+ return retry;
}
void process_fencing_changes(void)
{
- poll_fencing = 0;
- daemon_fence_work();
+ int retry;
+
+ retry = daemon_fence_work();
+
+ if (retry)
+ retry_fencing++;
+ else
+ retry_fencing = 0;
}
static void receive_fence_clear(struct dlm_header *hd, int len)
diff --git a/dlm_controld/dlm_daemon.h b/dlm_controld/dlm_daemon.h
index ad99e17..662cc35 100644
--- a/dlm_controld/dlm_daemon.h
+++ b/dlm_controld/dlm_daemon.h
@@ -161,7 +161,7 @@ EXTERN struct dlm_option dlm_options[dlm_options_max];
EXTERN int daemon_quit;
EXTERN int cluster_down;
EXTERN int poll_lockspaces;
-EXTERN int poll_fencing;
+EXTERN unsigned int retry_fencing;
EXTERN int poll_fs;
EXTERN int poll_ignore_plock;
EXTERN int poll_drop_plock;
@@ -190,6 +190,7 @@ void log_level(char *name_in, uint32_t level_in, const char *fmt,
...);
#define log_error(fmt, args...) log_level(NULL, LOG_ERR, fmt, ##args)
#define log_debug(fmt, args...) log_level(NULL, LOG_DEBUG, fmt, ##args)
+#define log_erros(ls, fmt, args...) log_level((ls)->name, LOG_ERR, fmt, ##args)
#define log_group(ls, fmt, args...) log_level((ls)->name, LOG_DEBUG, fmt, ##args)
#define log_plock(ls, fmt, args...) log_level((ls)->name, LOG_PLOCK|LOG_NONE, fmt,
##args)
@@ -258,6 +259,7 @@ struct lockspace {
int kernel_stopped;
int fs_registered;
int wait_debug; /* for status/debugging */
+ uint32_t wait_retry; /* for debug rate limiting */
uint32_t change_seq;
uint32_t started_count;
struct change *started_change;
diff --git a/dlm_controld/main.c b/dlm_controld/main.c
index a68e1b8..953cf1b 100644
--- a/dlm_controld/main.c
+++ b/dlm_controld/main.c
@@ -1059,7 +1059,7 @@ static void loop(void)
poll_timeout = -1;
- if (poll_fencing) {
+ if (retry_fencing) {
process_fencing_changes();
poll_timeout = 1000;
}