In a 4-node cluster, stopping two nodes at about the same time would result
in one of the two node leave events not being detected. This is caused by the
tdetect timer being reset on every received event prior to this patch. This
patch resets the tdetect timer only when the event is from the predecessor on
the ring.
Further, once tdetect expires, a node is unable to detect a new failure in
its predecessor. This is because tdetect is not reset in the tdetect timer
handler. With this patch, as long as there is more than one member, tdetect
is reset when it expires so that a failure of two or more nodes can still be
caught.
Signed-off-by: Steven Dake <sdake(a)redhat.com>
---
src/d1htedra.c | 14 +++++++++++++-
1 files changed, 13 insertions(+), 1 deletions(-)
diff --git a/src/d1htedra.c b/src/d1htedra.c
index 60a4e6b..68eef63 100644
--- a/src/d1htedra.c
+++ b/src/d1htedra.c
@@ -586,6 +586,9 @@ static void timer_function_tdetect (void *data)
target);
}
reset_events (instance);
+ if (instance->my_member_count > 1) {
+ reset_timer_tdetect (instance);
+ }
}
static void cancel_timer_lookup (struct d1htedra_instance *instance)
@@ -638,8 +641,17 @@ static int message_handler_event (
int i;
uint32_t my_ttl;
+ struct d1ht_ip_address *pred;
+
log_printf (LOGSYS_LEVEL_DEBUG, "message_handler_event\n");
- reset_timer_tdetect (instance);
+
+ /*
+ * Verify that event is from pred before resetting the tdetect timer
+ */
+ member_pred (instance, 1, &pred);
+ if (pred && (d1htip_equal (pred, &msg_event->source) == 1)) {
+ reset_timer_tdetect (instance);
+ }
for (i = 0; i < msg_event->stored_events_count; i++) {
if (msg_event->stored_events[i].type == EVENT_TYPE_JOIN) {
member_add (instance, &msg_event->stored_events[i].source);
--
1.6.2.5
Show replies by thread