Gitweb:
http://git.fedorahosted.org/git/?p=dlm.git;a=commitdiff;h=6a030a4efa08528...
Commit: 6a030a4efa08528c75ab5d045a3b3752a4b85fb2
Parent: f15a2daed7137481c7161e1447b1796c308c7e08
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Tue Jan 22 10:41:30 2013 -0600
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Tue Jan 22 10:41:30 2013 -0600
dlm_controld: limit log debug
Avoid filling the debug buffer with log_debug
entries that occur within indefinite retries.
Limit to 60 log_debug repetitions (once a
second for a minute).
Also, after one minute, escalate these repeating
log_debugs to log_error messages. The log_error
messages repeat once an hour.
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
dlm_controld/cpg.c | 44 ++++++++++++++++++++++++++++---
dlm_controld/daemon_cpg.c | 61 +++++++++++++++++++++++++++++++++------------
dlm_controld/dlm_daemon.h | 4 ++-
dlm_controld/main.c | 2 +-
4 files changed, 88 insertions(+), 23 deletions(-)
diff --git a/dlm_controld/cpg.c b/dlm_controld/cpg.c
index 0318b98..487887c 100644
--- a/dlm_controld/cpg.c
+++ b/dlm_controld/cpg.c
@@ -16,6 +16,16 @@
} \
})
+/* retries are once a second */
+#define log_retry(ls, fmt, args...) ({ \
+ if (ls->wait_retry < 60) \
+ log_group(ls, fmt, ##args); \
+ else if (ls->wait_retry == 60) \
+ log_erros(ls, fmt, ##args); \
+ else if (!(ls->wait_retry % 3600)) \
+ log_erros(ls, fmt, ##args); \
+})
+
/* per lockspace cpg: ls->node_history */
struct node {
@@ -591,30 +601,54 @@ static void stop_kernel(struct lockspace *ls, uint32_t seq)
static int wait_conditions_done(struct lockspace *ls)
{
if (!check_ringid_done(ls)) {
- ls->wait_debug = DLMC_LS_WAIT_RINGID;
+ if (ls->wait_debug != DLMC_LS_WAIT_RINGID) {
+ ls->wait_debug = DLMC_LS_WAIT_RINGID;
+ ls->wait_retry = 0;
+ }
+ ls->wait_retry++;
+ /* the check function logs a message */
+
return 0;
}
if (opt(enable_quorum_lockspace_ind) && !cluster_quorate) {
- log_group(ls, "wait for quorum");
- ls->wait_debug = DLMC_LS_WAIT_QUORUM;
+ if (ls->wait_debug != DLMC_LS_WAIT_QUORUM) {
+ ls->wait_debug = DLMC_LS_WAIT_QUORUM;
+ ls->wait_retry = 0;
+ }
+ ls->wait_retry++;
+ log_retry(ls, "wait for quorum");
+
poll_lockspaces++;
return 0;
}
if (!check_fencing_done(ls)) {
- ls->wait_debug = DLMC_LS_WAIT_FENCING;
+ if (ls->wait_debug != DLMC_LS_WAIT_FENCING) {
+ ls->wait_debug = DLMC_LS_WAIT_FENCING;
+ ls->wait_retry = 0;
+ }
+ ls->wait_retry++;
+ log_retry(ls, "wait for fencing");
+
poll_lockspaces++;
return 0;
}
if (!check_fs_done(ls)) {
- ls->wait_debug = DLMC_LS_WAIT_FSDONE;
+ if (ls->wait_debug != DLMC_LS_WAIT_FSDONE) {
+ ls->wait_debug = DLMC_LS_WAIT_FSDONE;
+ ls->wait_retry = 0;
+ }
+ ls->wait_retry++;
+ log_retry(ls, "wait for fsdone");
+
poll_fs++;
return 0;
}
ls->wait_debug = 0;
+ ls->wait_retry = 0;
return 1;
}
diff --git a/dlm_controld/daemon_cpg.c b/dlm_controld/daemon_cpg.c
index 61d2da2..0eec5ce 100644
--- a/dlm_controld/daemon_cpg.c
+++ b/dlm_controld/daemon_cpg.c
@@ -11,6 +11,16 @@
/* protocol_version flags */
#define PV_STATEFUL 0x0001
+/* retries are once a second */
+#define log_retry(cur_count, fmt, args...) ({ \
+ if (cur_count < 60) \
+ log_debug(fmt, ##args); \
+ else if (cur_count == 60) \
+ log_error(fmt, ##args); \
+ else if (!(cur_count % 3600)) \
+ log_error(fmt, ##args); \
+})
+
struct protocol_version {
uint16_t major;
uint16_t minor;
@@ -106,6 +116,9 @@ static int fence_in_progress_unknown = 1;
static int zombie_pids[MAX_ZOMBIES];
static int zombie_count;
+static int fence_result_pid;
+static unsigned int fence_result_try;
+
static void send_fence_result(int nodeid, int result, uint32_t flags, uint64_t
walltime);
static void send_fence_clear(int nodeid, int result, uint32_t flags, uint64_t walltime);
@@ -718,32 +731,33 @@ static void fence_pid_cancel(int nodeid, int pid)
* later same as case B above
*/
-static void daemon_fence_work(void)
+static int daemon_fence_work(void)
{
struct node_daemon *node, *safe;
int rv, nodeid, pid, need, low, actor, result;
+ int retry = 0;
uint32_t flags;
if (daemon_ringid_wait) {
/* We've seen a nodedown confchg callback, but not the
corresponding ringid callback. */
- log_debug("fence work wait for cpg ringid");
- return;
+ log_retry(retry_fencing, "fence work wait for cpg ringid");
+ return retry;
}
if (cluster_ringid_seq != daemon_ringid.seq) {
/* wait for ringids to be in sync */
- log_debug("fence work wait for cluster ringid");
- return;
+ log_retry(retry_fencing, "fence work wait for cluster ringid");
+ return retry;
}
- /* poll_fencing++; */
+ /* retry = 1; */
if (opt(enable_quorum_fencing_ind) && !cluster_quorate) {
/* wait for quorum before doing any fencing, but if there
is none, send_fence_clear below can unblock new nodes */
- log_debug("fence work wait for quorum");
- poll_fencing++;
+ log_retry(retry_fencing, "fence work wait for quorum");
+ retry = 1;
goto out_fipu;
}
@@ -766,7 +780,7 @@ static void daemon_fence_work(void)
log_debug("fence startup %d delay %d from %llu",
node->nodeid, opt(post_join_delay_ind),
(unsigned long long)daemon_last_join_monotime);
- poll_fencing++;
+ retry = 1;
continue;
}
@@ -837,10 +851,10 @@ static void daemon_fence_work(void)
if (!opt(enable_concurrent_fencing_ind) && daemon_fence_pid) {
/* run one agent at a time in case they need the same switch */
- log_debug("fence request %d delay for other pid %d",
+ log_retry(retry_fencing, "fence request %d delay for other pid %d",
node->nodeid, daemon_fence_pid);
node->delay_fencing = 1;
- poll_fencing++;
+ retry = 1;
continue;
}
@@ -849,7 +863,7 @@ static void daemon_fence_work(void)
node->nodeid, opt(post_join_delay_ind),
(unsigned long long)cluster_last_join_monotime);
node->delay_fencing = 1;
- poll_fencing++;
+ retry = 1;
continue;
}
node->delay_fencing = 0;
@@ -947,12 +961,19 @@ static void daemon_fence_work(void)
continue;
}
- poll_fencing++;
+ retry = 1;
rv = fence_result(nodeid, pid, &result);
if (rv == -EAGAIN) {
/* agent pid is still running */
- log_debug("fence wait %d pid %d running", nodeid, pid);
+
+ if (fence_result_pid != pid) {
+ fence_result_try = 0;
+ fence_result_pid = pid;
+ }
+ fence_result_try++;
+
+ log_retry(fence_result_try, "fence wait %d pid %d running", nodeid, pid);
continue;
}
@@ -1083,12 +1104,20 @@ static void daemon_fence_work(void)
if (zombie_count)
clear_zombies();
+
+ return retry;
}
void process_fencing_changes(void)
{
- poll_fencing = 0;
- daemon_fence_work();
+ int retry;
+
+ retry = daemon_fence_work();
+
+ if (retry)
+ retry_fencing++;
+ else
+ retry_fencing = 0;
}
static void receive_fence_clear(struct dlm_header *hd, int len)
diff --git a/dlm_controld/dlm_daemon.h b/dlm_controld/dlm_daemon.h
index ad99e17..662cc35 100644
--- a/dlm_controld/dlm_daemon.h
+++ b/dlm_controld/dlm_daemon.h
@@ -161,7 +161,7 @@ EXTERN struct dlm_option dlm_options[dlm_options_max];
EXTERN int daemon_quit;
EXTERN int cluster_down;
EXTERN int poll_lockspaces;
-EXTERN int poll_fencing;
+EXTERN unsigned int retry_fencing;
EXTERN int poll_fs;
EXTERN int poll_ignore_plock;
EXTERN int poll_drop_plock;
@@ -190,6 +190,7 @@ void log_level(char *name_in, uint32_t level_in, const char *fmt,
...);
#define log_error(fmt, args...) log_level(NULL, LOG_ERR, fmt, ##args)
#define log_debug(fmt, args...) log_level(NULL, LOG_DEBUG, fmt, ##args)
+#define log_erros(ls, fmt, args...) log_level((ls)->name, LOG_ERR, fmt, ##args)
#define log_group(ls, fmt, args...) log_level((ls)->name, LOG_DEBUG, fmt, ##args)
#define log_plock(ls, fmt, args...) log_level((ls)->name, LOG_PLOCK|LOG_NONE, fmt,
##args)
@@ -258,6 +259,7 @@ struct lockspace {
int kernel_stopped;
int fs_registered;
int wait_debug; /* for status/debugging */
+ uint32_t wait_retry; /* for debug rate limiting */
uint32_t change_seq;
uint32_t started_count;
struct change *started_change;
diff --git a/dlm_controld/main.c b/dlm_controld/main.c
index a68e1b8..953cf1b 100644
--- a/dlm_controld/main.c
+++ b/dlm_controld/main.c
@@ -1059,7 +1059,7 @@ static void loop(void)
poll_timeout = -1;
- if (poll_fencing) {
+ if (retry_fencing) {
process_fencing_changes();
poll_timeout = 1000;
}