March 2012 - cluster-commits - Fedora Mailing-Lists

by David Teigland

Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=...
Commit: dae8d813f9fbabb4fde92bd9df64968d2ea49c49
Parent: d8d8a6de00f7d0f9ca1944913a26b805984851d5
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Tue Mar 20 11:41:58 2012 -0500
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Tue Mar 20 11:41:58 2012 -0500

release 3.99.1

Signed-off-by: David Teigland <teigland(a)redhat.com>
---
 include/version.cf | 2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/include/version.cf b/include/version.cf
index 1669bf4..f0ae01e 100644
--- a/include/version.cf
+++ b/include/version.cf
@@ -1,6 +1,6 @@
 #ifndef _RELEASE_VERSION_CF_
 #define _RELEASE_VERSION_CF_
-#define RELEASE_VERSION "3.99.0"
+#define RELEASE_VERSION "3.99.1"
 #endif

12 years, 1 month

1
0
0 / 0

dlm: master - release 3.99.1

by David Teigland

Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=...
Commit: dae8d813f9fbabb4fde92bd9df64968d2ea49c49
Parent: d8d8a6de00f7d0f9ca1944913a26b805984851d5
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Tue Mar 20 11:41:58 2012 -0500
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Tue Mar 20 11:41:58 2012 -0500

release 3.99.1

Signed-off-by: David Teigland <teigland(a)redhat.com>
---
 include/version.cf | 2 +-
 1 files changed, 1 insertions(+), 1 deletions(-)

diff --git a/include/version.cf b/include/version.cf
index 1669bf4..f0ae01e 100644
--- a/include/version.cf
+++ b/include/version.cf
@@ -1,6 +1,6 @@
 #ifndef _RELEASE_VERSION_CF_
 #define _RELEASE_VERSION_CF_
-#define RELEASE_VERSION "3.99.0"
+#define RELEASE_VERSION "3.99.1"
 #endif

12 years, 1 month

1
0
0 / 0

dlm: master - dlm_controld: fencing and config

by David Teigland

Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=... Commit: d8d8a6de00f7d0f9ca1944913a26b805984851d5 Parent: eeb80cd73160856a16f73fc45d1ab48904aa55f0 Author: David Teigland <teigland(a)redhat.com> AuthorDate: Mon Feb 27 13:10:25 2012 -0600 Committer: David Teigland <teigland(a)redhat.com> CommitterDate: Tue Mar 20 11:24:17 2012 -0500 dlm_controld: fencing and config New /etc/dlm/dlm.conf with same config options as before, but using "key = val" format. Fencing config can also be specified. Without an explicit fencing config, default "dlm_stonith" agent is run to let stonith handle fencing. Signed-off-by: David Teigland <teigland(a)redhat.com> --- dlm_controld/Makefile | 5 +- dlm_controld/action.c | 4 +- dlm_controld/config.c | 123 ++- dlm_controld/cpg.c | 1082 +----------------------- dlm_controld/daemon_cpg.c | 1921 ++++++++++++++++++++++++++++++++++++++++++ dlm_controld/dlm_controld.h | 1 + dlm_controld/dlm_daemon.h | 85 ++- dlm_controld/fence.c | 145 +++- dlm_controld/fence_config.c | 403 +++++++++ dlm_controld/fence_config.h | 62 ++ dlm_controld/lib.c | 19 + dlm_controld/libdlmcontrol.h | 1 + dlm_controld/main.c | 90 ++- dlm_controld/member.c | 51 +- dlm_controld/plock.c | 10 - dlm_tool/main.c | 17 +- fence/Makefile | 56 ++ fence/stonith_helper.c | 96 +++ 18 files changed, 3007 insertions(+), 1164 deletions(-) diff --git a/dlm_controld/Makefile b/dlm_controld/Makefile index a648ab1..b60b8df 100644 --- a/dlm_controld/Makefile +++ b/dlm_controld/Makefile @@ -19,7 +19,9 @@ LIB_TARGET = $(LIB_SO).$(LIB_MAJOR).$(LIB_MINOR) BIN_SOURCE = action.c \ cpg.c \ + daemon_cpg.c \ crc.c \ + fence_config.c \ fence.c \ main.c \ plock.c \ @@ -51,13 +53,10 @@ BIN_CFLAGS += -D_GNU_SOURCE -g \ -fdiagnostics-show-option \ BIN_CFLAGS += -fPIE -DPIE -BIN_CFLAGS += `xml2-config --cflags` BIN_CFLAGS += -I../include -I../libdlm BIN_LDFLAGS += -Wl,-z,now -Wl,-z,relro -pie -BIN_LDFLAGS += `xml2-config --libs` BIN_LDFLAGS += -lpthread -lrt -lcpg -lcmap -lcfg 
-lquorum -#BIN_LDFLAGS += -lfenced LIB_CFLAGS += $(BIN_CFLAGS) LIB_LDFLAGS += -Wl,-z,relro -pie diff --git a/dlm_controld/action.c b/dlm_controld/action.c index d4bf11d..6cb34f2 100644 --- a/dlm_controld/action.c +++ b/dlm_controld/action.c @@ -34,10 +34,8 @@ static int detect_protocol(void) } rv = cmap_get_string(handle, "totem.rrp_mode", &str); - if (rv != CS_OK) { - log_error("cmap_get_string totem.rrp_mode error %d", rv); + if (rv != CS_OK) goto out; - } log_debug("cmap totem.rrp_mode = '%s'", str); diff --git a/dlm_controld/config.c b/dlm_controld/config.c index 3bf02c0..f668b86 100644 --- a/dlm_controld/config.c +++ b/dlm_controld/config.c @@ -7,7 +7,6 @@ */ #include "dlm_daemon.h" -#include <libxml/tree.h> /* TODO: <dlm> @@ -37,66 +36,100 @@ static void proto_val(char *str, int *val) } } -static void set_val(xmlNode *root, const char *name, int *opt, int *val) +static void set_val(char *line, int *val_out) { - xmlChar *str; + char key[PATH_MAX]; + char val[PATH_MAX]; + int rv; - str = xmlGetProp(root, BAD_CAST name); - if (str && !(*opt)) { - *val = atoi((char *)str); - log_debug("config %s = %d", name, *val); - } + rv = sscanf(line, "%[^=]=%s", key, val); + if (rv != 2) + return; + + *val_out = atoi(val); + + log_debug("config %s=%d", key, *val_out); +} + +static void get_val(char *line, char *val_out) +{ + char key[PATH_MAX]; + char val[PATH_MAX]; + int rv; + + rv = sscanf(line, "%[^=]=%s", key, val); + if (rv != 2) + return; + + strcpy(val_out, val); } void setup_config(int update) { - xmlDoc *doc; - xmlNode *root; - xmlChar *str; + FILE *file; + char line[PATH_MAX]; + char str[PATH_MAX]; if (!path_exists(CONF_FILE_PATH)) return; - doc = xmlParseFile(CONF_FILE_PATH); - if (!doc) { - log_error("xml parse error %d %s", errno, CONF_FILE_PATH); + file = fopen(CONF_FILE_PATH, "r"); + if (!file) return; - } - root = xmlDocGetRootElement(doc); - if (!root) { - log_error("xml root error %d %s", errno, CONF_FILE_PATH); - xmlFreeDoc(doc); - return; - } + while 
(fgets(line, PATH_MAX, file)) { + if (line[0] == '#') + continue; + if (line[0] == '\n') + continue; + + if (!optk_debug && !strncmp(line, "log_debug", strlen("log_debug"))) + set_val(line, &cfgk_debug); + + else if (!optk_timewarn && !strncmp(line, "timewarn", strlen("timewarn")) && !update) + set_val(line, &cfgk_timewarn); + + else if (!optd_enable_fencing && !strncmp(line, "enable_fencing", strlen("enable_fencing")) && !update) + set_val(line, &cfgd_enable_fencing); + + else if (!optd_enable_quorum_fencing && !strncmp(line, "enable_quorum_fencing", strlen("enable_quorum_fencing")) && !update) + set_val(line, &cfgd_enable_quorum_fencing); + + else if (!optd_enable_quorum_lockspace && !strncmp(line, "enable_quorum_lockspace", strlen("enable_quorum_lockspace")) && !update) + set_val(line, &cfgd_enable_quorum_lockspace); + + else if (!optd_enable_fscontrol && !strncmp(line, "enable_fscontrol", strlen("enable_fscontrol")) && !update) + set_val(line, &cfgd_enable_fscontrol); + + else if (!optd_enable_plock && !strncmp(line, "enable_plock", strlen("enable_plock")) && !update) + set_val(line, &cfgd_enable_plock); + + else if (!optd_plock_ownership && !strncmp(line, "plock_ownership", strlen("plock_ownership")) && !update) + set_val(line, &cfgd_plock_ownership); + + else if (!optd_plock_debug && !strncmp(line, "plock_debug", strlen("plock_debug"))) + set_val(line, &cfgd_plock_debug); + + else if (!optd_plock_rate_limit && !strncmp(line, "plock_rate_limit", strlen("plock_rate_limit"))) + set_val(line, &cfgd_plock_rate_limit); + + else if (!optd_drop_resources_time && !strncmp(line, "drop_resources_time", strlen("drop_resources_time"))) + set_val(line, &cfgd_drop_resources_time); - if (update) - goto do_update; + else if (!optd_drop_resources_count && !strncmp(line, "drop_resources_count", strlen("drop_resources_count"))) + set_val(line, &cfgd_drop_resources_count); - /* These config values are set from dlm.conf only if they haven't - already been set on the command line. 
*/ + else if (!optd_drop_resources_age && !strncmp(line, "drop_resources_age", strlen("drop_resources_age"))) + set_val(line, &cfgd_drop_resources_age); - str = xmlGetProp(root, BAD_CAST "protocol"); - if (str && !optk_protocol) { - proto_val((char *)str, &cfgk_protocol); - log_debug("config protocol = %d", cfgk_protocol); + else if (!optk_protocol && !strncmp(line, "protocol", strlen("protocol")) && !update) { + memset(str, 0, sizeof(str)); + get_val(line, str); + proto_val(str, &cfgk_protocol); + log_debug("config protocol = %d", cfgk_protocol); + } } - set_val(root, "log_debug", &optk_debug, &cfgk_debug); - set_val(root, "timewarn", &optk_timewarn, &cfgk_timewarn); - set_val(root, "enable_fencing", &optd_enable_fencing, &cfgd_enable_fencing); - set_val(root, "enable_quorum", &optd_enable_quorum, &cfgd_enable_quorum); - set_val(root, "enable_fscontrol", &optd_enable_fscontrol, &cfgd_enable_fscontrol); - set_val(root, "enable_plock", &optd_enable_plock, &cfgd_enable_plock); - set_val(root, "plock_ownership", &optd_plock_ownership, &cfgd_plock_ownership); - do_update: - /* The following can be changed while running */ - set_val(root, "plock_debug", &optd_plock_debug, &cfgd_plock_debug); - set_val(root, "plock_rate_limit", &optd_plock_rate_limit, &cfgd_plock_rate_limit); - set_val(root, "drop_resources_time", &optd_drop_resources_time, &cfgd_drop_resources_time); - set_val(root, "drop_resources_count", &optd_drop_resources_count, &cfgd_drop_resources_count); - set_val(root, "drop_resources_age", &optd_drop_resources_age, &cfgd_drop_resources_age); - - xmlFreeDoc(doc); + fclose(file); } diff --git a/dlm_controld/cpg.c b/dlm_controld/cpg.c index 1360493..1fb5372 100644 --- a/dlm_controld/cpg.c +++ b/dlm_controld/cpg.c @@ -1,5 +1,5 @@ /* - * Copyright 2004-2011 Red Hat, Inc. + * Copyright 2004-2012 Red Hat, Inc. 
* * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -8,49 +8,6 @@ #include "dlm_daemon.h" -#define PV_STATEFUL 0x0001 - -struct protocol_version { - uint16_t major; - uint16_t minor; - uint16_t patch; - uint16_t flags; -}; - -struct protocol { - union { - struct protocol_version dm_ver; - uint16_t daemon_max[4]; - }; - union { - struct protocol_version km_ver; - uint16_t kernel_max[4]; - }; - union { - struct protocol_version dr_ver; - uint16_t daemon_run[4]; - }; - union { - struct protocol_version kr_ver; - uint16_t kernel_run[4]; - }; -}; - -/* per dlm_controld cpg: daemon_nodes */ - -struct node_daemon { - struct list_head list; - int nodeid; - - uint64_t daemon_add_time; - uint64_t daemon_rem_time; - int daemon_member; - - int killed; - - struct protocol proto; -}; - /* per lockspace cpg: ls->node_history */ struct node { @@ -71,11 +28,10 @@ struct node { int check_fs; int fs_notified; - int request_fencing; - int check_fencing; - uint64_t fail_realtime; - uint64_t fence_realtime; + int need_fencing; uint32_t fence_queries; /* for debug */ + uint64_t fail_walltime; + uint64_t fail_monotime; }; /* per lockspace confchg: ls->changes */ @@ -127,93 +83,6 @@ struct id_info { int nodeid; }; -int message_flow_control_on; -static cpg_handle_t cpg_handle_daemon; -static int cpg_fd_daemon; -static struct protocol our_protocol; -static struct list_head daemon_nodes; -static struct cpg_address daemon_member[MAX_NODES]; -static int daemon_member_count; - -static void log_config(const struct cpg_name *group_name, - const struct cpg_address *member_list, - size_t member_list_entries, - const struct cpg_address *left_list, - size_t left_list_entries, - const struct cpg_address *joined_list, - size_t joined_list_entries) -{ - char m_buf[128]; - char j_buf[32]; - char l_buf[32]; - size_t i, len, pos; - int ret; - - memset(m_buf, 0, sizeof(m_buf)); - memset(j_buf, 0, sizeof(j_buf)); - 
memset(l_buf, 0, sizeof(l_buf)); - - len = sizeof(m_buf); - pos = 0; - for (i = 0; i < member_list_entries; i++) { - ret = snprintf(m_buf + pos, len - pos, " %d", - member_list[i].nodeid); - if (ret >= len - pos) - break; - pos += ret; - } - - len = sizeof(j_buf); - pos = 0; - for (i = 0; i < joined_list_entries; i++) { - ret = snprintf(j_buf + pos, len - pos, " %d", - joined_list[i].nodeid); - if (ret >= len - pos) - break; - pos += ret; - } - - len = sizeof(l_buf); - pos = 0; - for (i = 0; i < left_list_entries; i++) { - ret = snprintf(l_buf + pos, len - pos, " %d", - left_list[i].nodeid); - if (ret >= len - pos) - break; - pos += ret; - } - - log_debug("%s conf %zu %zu %zu memb%s join%s left%s", group_name->value, - member_list_entries, joined_list_entries, left_list_entries, - m_buf, j_buf, l_buf); -} - -static void log_ringid(const char *name, - struct cpg_ring_id *ringid, - const uint32_t *member_list, - size_t member_list_entries) -{ - char m_buf[128]; - size_t i, len, pos; - int ret; - - memset(m_buf, 0, sizeof(m_buf)); - - len = sizeof(m_buf); - pos = 0; - for (i = 0; i < member_list_entries; i++) { - ret = snprintf(m_buf + pos, len - pos, " %u", - member_list[i]); - if (ret >= len - pos) - break; - pos += ret; - } - - log_debug("%s ring %u:%llu %zu memb%s", - name, ringid->nodeid, (unsigned long long)ringid->seq, - member_list_entries, m_buf); -} - static void ls_info_in(struct ls_info *li) { li->ls_info_size = le32_to_cpu(li->ls_info_size); @@ -243,93 +112,6 @@ static void ids_in(struct ls_info *li, struct id_info *ids) } } -const char *msg_name(int type) -{ - switch (type) { - case DLM_MSG_PROTOCOL: - return "protocol"; - case DLM_MSG_START: - return "start"; - case DLM_MSG_PLOCK: - return "plock"; - case DLM_MSG_PLOCK_OWN: - return "plock_own"; - case DLM_MSG_PLOCK_DROP: - return "plock_drop"; - case DLM_MSG_PLOCK_SYNC_LOCK: - return "plock_sync_lock"; - case DLM_MSG_PLOCK_SYNC_WAITER: - return "plock_sync_waiter"; - case DLM_MSG_PLOCKS_DATA: - return 
"plocks_data"; - case DLM_MSG_PLOCKS_DONE: - return "plocks_done"; - case DLM_MSG_DEADLK_CYCLE_START: - return "deadlk_cycle_start"; - case DLM_MSG_DEADLK_CYCLE_END: - return "deadlk_cycle_end"; - case DLM_MSG_DEADLK_CHECKPOINT_READY: - return "deadlk_checkpoint_ready"; - case DLM_MSG_DEADLK_CANCEL_LOCK: - return "deadlk_cancel_lock"; - default: - return "unknown"; - } -} - -static int _send_message(cpg_handle_t h, void *buf, int len, int type) -{ - struct iovec iov; - cs_error_t error; - int retries = 0; - - iov.iov_base = buf; - iov.iov_len = len; - - retry: - error = cpg_mcast_joined(h, CPG_TYPE_AGREED, &iov, 1); - if (error == CS_ERR_TRY_AGAIN) { - retries++; - usleep(1000); - if (!(retries % 100)) - log_error("cpg_mcast_joined retry %d %s", - retries, msg_name(type)); - goto retry; - } - if (error != CS_OK) { - log_error("cpg_mcast_joined error %d handle %llx %s", - error, (unsigned long long)h, msg_name(type)); - return -1; - } - - if (retries) - log_debug("cpg_mcast_joined retried %d %s", - retries, msg_name(type)); - - return 0; -} - -/* header fields caller needs to set: type, to_nodeid, flags, msgdata */ - -void dlm_send_message(struct lockspace *ls, char *buf, int len) -{ - struct dlm_header *hd = (struct dlm_header *) buf; - int type = hd->type; - - hd->version[0] = cpu_to_le16(our_protocol.daemon_run[0]); - hd->version[1] = cpu_to_le16(our_protocol.daemon_run[1]); - hd->version[2] = cpu_to_le16(our_protocol.daemon_run[2]); - hd->type = cpu_to_le16(hd->type); - hd->nodeid = cpu_to_le32(our_nodeid); - hd->to_nodeid = cpu_to_le32(hd->to_nodeid); - hd->global_id = cpu_to_le32(ls->global_id); - hd->flags = cpu_to_le32(hd->flags); - hd->msgdata = cpu_to_le32(hd->msgdata); - hd->msgdata2 = cpu_to_le32(hd->msgdata2); - - _send_message(ls->cpg_handle, buf, len, type); -} - static struct member *find_memb(struct change *cg, int nodeid) { struct member *memb; @@ -424,7 +206,7 @@ static void free_ls(struct lockspace *ls) For 1: - node X fails - we see node X fail 
and X has non-zero start_time, - set check_fencing and record the fail time + set need_fencing and record the fail time - wait for X to be removed from all dlm cpg's (probably not necessary) - check that the fencing time is later than the recorded time above @@ -442,8 +224,8 @@ static void free_ls(struct lockspace *ls) when we see a node not in this list, add entry for it with zero start_time record the time we get a good start message from the node, start_time clear start_time if the node leaves - if node fails with non-zero start_time, set check_fencing - when a node is fenced, clear start_time and clear check_fencing + if node fails with non-zero start_time, set need_fencing + when a node is fenced, clear start_time and clear need_fencing if a node remerges after this, no good start message, no new start_time set if a node fails with zero start_time, it doesn't need fencing if a node remerges before it's been fenced, no good start message, no new @@ -526,9 +308,7 @@ static void node_history_lockspace_fail(struct lockspace *ls, int nodeid, } if (cfgd_enable_fencing && node->start_time) { - node->request_fencing = 1; - node->check_fencing = 1; - node->fence_realtime = 0; + node->need_fencing = 1; node->fence_queries = 0; } @@ -543,7 +323,9 @@ static void node_history_lockspace_fail(struct lockspace *ls, int nodeid, node->lockspace_fail_time = now; node->lockspace_fail_seq = node->lockspace_rem_seq; node->lockspace_fail_reason = reason; /* for queries */ - node->fail_realtime = time(NULL); + + node->fail_monotime = now; + node->fail_walltime = time(NULL); } static void node_history_start(struct lockspace *ls, int nodeid) @@ -601,9 +383,9 @@ static int check_ringid_done(struct lockspace *ls) static int check_fencing_done(struct lockspace *ls) { struct node *node; - uint64_t last_fenced_time; - int in_progress, wait_count = 0; - int rv; + uint64_t fence_monotime; + int wait_count = 0; + int rv, in_progress; if (!cfgd_enable_fencing) { log_group(ls, "check_fencing 
disabled"); @@ -611,47 +393,35 @@ static int check_fencing_done(struct lockspace *ls) } list_for_each_entry(node, &ls->node_history, list) { - if (!node->check_fencing) + if (!node->need_fencing) continue; - /* check with fenced to see if the node has been - fenced since node->start_time */ - - rv = fence_node_time(node->nodeid, &last_fenced_time); + rv = fence_node_time(node->nodeid, &fence_monotime); if (rv < 0) { log_error("fenced_node_time error %d", rv); continue; } - /* fenced gives us real time */ - - /* need >= not just > because in at least one case - we've seen fenced_time within the same second as - fail_time: with external fencing, e.g. fence_node */ - - if (last_fenced_time >= node->fail_realtime) { - log_group(ls, "check_fencing %d done " - "start %llu fail %llu last %llu", + if (fence_monotime >= node->fail_monotime) { + log_group(ls, "check_fencing %d done start %llu fail %llu fence %llu", node->nodeid, (unsigned long long)node->start_time, - (unsigned long long)node->fail_realtime, - (unsigned long long)last_fenced_time); - node->check_fencing = 0; + (unsigned long long)node->fail_monotime, + (unsigned long long)fence_monotime); + + node->need_fencing = 0; node->start_time = 0; - node->fence_realtime = last_fenced_time; + continue; } else { - if (!node->fence_queries || - node->fence_realtime != last_fenced_time) { - log_group(ls, "check_fencing %d wait " - "start %llu fail %llu last %llu", + if (!node->fence_queries) { + log_group(ls, "check_fencing %d wait start %llu fail %llu", node->nodeid, (unsigned long long)node->start_time, - (unsigned long long)node->fail_realtime, - (unsigned long long)last_fenced_time); + (unsigned long long)node->fail_monotime); node->fence_queries++; - node->fence_realtime = last_fenced_time; } wait_count++; + continue; } } @@ -679,39 +449,6 @@ static int check_fencing_done(struct lockspace *ls) return 1; } -static int need_fencing(struct lockspace *ls) -{ - struct node *node; - - list_for_each_entry(node, 
&ls->node_history, list) { - if (node->check_fencing) { - log_group(ls, "need_fencing %d", node->nodeid); - return 1; - } - } - return 0; -} - -/* we don't need to ask fenced to initiate fencing; it does - so itself when it sees a fence domain member fail. Without - fenced we'll probably need to ask another daemon to initiate - fencing, then check with it above, like we check libfenced. */ - -static void request_fencing(struct lockspace *ls) -{ - struct node *node; - int rv; - - list_for_each_entry(node, &ls->node_history, list) { - if (!node->request_fencing) - continue; - log_group(ls, "fence_request %d", node->nodeid); - rv = fence_request(node->nodeid); - if (!rv) - node->request_fencing = 0; - } -} - /* wait for local fs_controld to ack each failed node */ static int check_fs_done(struct lockspace *ls) @@ -839,43 +576,18 @@ static void stop_kernel(struct lockspace *ls, uint32_t seq) /* we know that the cluster_quorate value here is consistent with the cpg events because the ringid's are in sync per the check_ringid_done */ -/* - enable_quorum = 0 - Never wait for quorum here. - - enable_quorum = 1 - Wait for quorum before calling request_fencing() in cases where - fencing is needed. Reason for using this would be if fencing - system doesn't wait for quorum before carrying out a fencing - request, and we want to avoid having partitioned inquorate nodes - fencing quorate nodes in another partition. - - enable_quorum = 2 - Always wait for quorum as a general condition here. This - means that lockspace join/leave operations would block waiting - for quorum. There's not any reason to do this per se, unless - that's the behavior the application using the dlm wants. 
-*/ - static int wait_conditions_done(struct lockspace *ls) { if (!check_ringid_done(ls)) return 0; - if ((cfgd_enable_quorum > 1) && !cluster_quorate) { + if ((cfgd_enable_quorum_lockspace > 1) && !cluster_quorate) { log_group(ls, "wait for quorum"); return 0; } - if ((cfgd_enable_quorum > 0) && need_fencing(ls) && !cluster_quorate) { - log_group(ls, "wait for quorum before fencing"); - return 0; - } - - request_fencing(ls); - if (!check_fencing_done(ls)) { - poll_fencing++; + poll_lockspaces++; return 0; } @@ -1437,7 +1149,7 @@ static void apply_changes(struct lockspace *ls) case CGST_WAIT_MESSAGES: if (wait_messages_done(ls)) { - our_protocol.dr_ver.flags |= PV_STATEFUL; + set_protocol_stateful(); start_kernel(ls); prepare_plocks(ls); cleanup_changes(ls); @@ -1453,9 +1165,7 @@ void process_lockspace_changes(void) { struct lockspace *ls, *safe; - poll_ringid = 0; - poll_fencing = 0; - poll_quorum = 0; + poll_lockspaces = 0; poll_fs = 0; list_for_each_entry_safe(ls, safe, &lockspaces, list) { @@ -1464,24 +1174,6 @@ void process_lockspace_changes(void) } } -static const char *reason_str(int reason) -{ - switch (reason) { - case CPG_REASON_JOIN: - return "join"; - case CPG_REASON_LEAVE: - return "leave"; - case CPG_REASON_NODEDOWN: - return "nodedown"; - case CPG_REASON_NODEUP: - return "nodeup"; - case CPG_REASON_PROCDOWN: - return "procdown"; - default: - return "unknown"; - }; -} - static int add_change(struct lockspace *ls, const struct cpg_address *member_list, size_t member_list_entries, @@ -1665,20 +1357,6 @@ static void confchg_cb(cpg_handle_t handle, #endif } -static void dlm_header_in(struct dlm_header *hd) -{ - hd->version[0] = le16_to_cpu(hd->version[0]); - hd->version[1] = le16_to_cpu(hd->version[1]); - hd->version[2] = le16_to_cpu(hd->version[2]); - hd->type = le16_to_cpu(hd->type); - hd->nodeid = le32_to_cpu(hd->nodeid); - hd->to_nodeid = le32_to_cpu(hd->to_nodeid); - hd->global_id = le32_to_cpu(hd->global_id); - hd->flags = le32_to_cpu(hd->flags); - 
hd->msgdata = le32_to_cpu(hd->msgdata); - hd->msgdata2 = le32_to_cpu(hd->msgdata2); -} - /* after our join confchg, we want to ignore plock messages (see need_plocks checks below) until the point in time where the ckpt_node saves plock state (final start message received); at this time we want to shift from @@ -1693,6 +1371,7 @@ static void deliver_cb(cpg_handle_t handle, struct lockspace *ls; struct dlm_header *hd; int ignore_plock; + int rv; ls = find_ls_handle(handle); if (!ls) { @@ -1700,7 +1379,7 @@ static void deliver_cb(cpg_handle_t handle, return; } - if (len < sizeof(*hd)) { + if (len < sizeof(struct dlm_header)) { log_error("deliver_cb short message %zd", len); return; } @@ -1708,20 +1387,9 @@ static void deliver_cb(cpg_handle_t handle, hd = (struct dlm_header *)data; dlm_header_in(hd); - if (hd->version[0] != our_protocol.daemon_run[0] || - hd->version[1] != our_protocol.daemon_run[1]) { - log_error("reject message from %d version %u.%u.%u vs %u.%u.%u", - nodeid, hd->version[0], hd->version[1], - hd->version[2], our_protocol.daemon_run[0], - our_protocol.daemon_run[1], - our_protocol.daemon_run[2]); + rv = dlm_header_validate(hd, nodeid); + if (rv < 0) return; - } - - if (hd->nodeid != nodeid) { - log_error("bad msg nodeid %d %d", hd->nodeid, nodeid); - return; - } ignore_plock = 0; @@ -1892,31 +1560,6 @@ static cpg_model_v1_data_t cpg_callbacks = { .flags = CPG_MODEL_V1_DELIVER_INITIAL_TOTEM_CONF, }; -void update_flow_control_status(void) -{ - cpg_flow_control_state_t flow_control_state; - cs_error_t error; - - error = cpg_flow_control_state_get(cpg_handle_daemon, - &flow_control_state); - if (error != CS_OK) { - log_error("cpg_flow_control_state_get %d", error); - return; - } - - if (flow_control_state == CPG_FLOW_CONTROL_ENABLED) { - if (message_flow_control_on == 0) { - log_debug("flow control on"); - } - message_flow_control_on = 1; - } else { - if (message_flow_control_on) { - log_debug("flow control off"); - } - message_flow_control_on = 0; - } -} 
- static void process_cpg_lockspace(int ci) { struct lockspace *ls; @@ -1933,8 +1576,6 @@ static void process_cpg_lockspace(int ci) log_error("cpg_dispatch error %d", error); return; } - - update_flow_control_status(); } /* received an "online" uevent from dlm-kernel */ @@ -1945,14 +1586,6 @@ int dlm_join_lockspace(struct lockspace *ls) cpg_handle_t h; struct cpg_name name; int i = 0, fd, ci, rv; - int unused; - - rv = fence_in_progress(&unused); - if (cfgd_enable_fencing && rv < 0) { - log_error("dlm_join_lockspace no fence domain"); - rv = -1; - goto fail_free; - } error = cpg_model_initialize(&h, CPG_MODEL_V1, (cpg_model_data_t *)&cpg_callbacks, NULL); @@ -2038,645 +1671,6 @@ int dlm_leave_lockspace(struct lockspace *ls) return 0; } -static struct node_daemon *get_node_daemon(int nodeid) -{ - struct node_daemon *node; - - list_for_each_entry(node, &daemon_nodes, list) { - if (node->nodeid == nodeid) - return node; - } - return NULL; -} - -static void add_node_daemon(int nodeid) -{ - struct node_daemon *node; - - if (get_node_daemon(nodeid)) - return; - - node = malloc(sizeof(struct node_daemon)); - if (!node) { - log_error("add_node_daemon no mem"); - return; - } - memset(node, 0, sizeof(struct node_daemon)); - node->nodeid = nodeid; - list_add_tail(&node->list, &daemon_nodes); -} - -static void pv_in(struct protocol_version *pv) -{ - pv->major = le16_to_cpu(pv->major); - pv->minor = le16_to_cpu(pv->minor); - pv->patch = le16_to_cpu(pv->patch); - pv->flags = le16_to_cpu(pv->flags); -} - -static void pv_out(struct protocol_version *pv) -{ - pv->major = cpu_to_le16(pv->major); - pv->minor = cpu_to_le16(pv->minor); - pv->patch = cpu_to_le16(pv->patch); - pv->flags = cpu_to_le16(pv->flags); -} - -static void protocol_in(struct protocol *proto) -{ - pv_in(&proto->dm_ver); - pv_in(&proto->km_ver); - pv_in(&proto->dr_ver); - pv_in(&proto->kr_ver); -} - -static void protocol_out(struct protocol *proto) -{ - pv_out(&proto->dm_ver); - pv_out(&proto->km_ver); - 
pv_out(&proto->dr_ver); - pv_out(&proto->kr_ver); -} - -/* go through member list saved in last confchg, see if we have received a - proto message from each */ - -static int all_protocol_messages(void) -{ - struct node_daemon *node; - int i; - - if (!daemon_member_count) - return 0; - - for (i = 0; i < daemon_member_count; i++) { - node = get_node_daemon(daemon_member[i].nodeid); - if (!node) { - log_error("all_protocol_messages no node %d", - daemon_member[i].nodeid); - return 0; - } - - if (!node->proto.daemon_max[0]) - return 0; - } - return 1; -} - -static int pick_min_protocol(struct protocol *proto) -{ - uint16_t mind[4]; - uint16_t mink[4]; - struct node_daemon *node; - int i; - - memset(&mind, 0, sizeof(mind)); - memset(&mink, 0, sizeof(mink)); - - /* first choose the minimum major */ - - for (i = 0; i < daemon_member_count; i++) { - node = get_node_daemon(daemon_member[i].nodeid); - if (!node) { - log_error("pick_min_protocol no node %d", - daemon_member[i].nodeid); - return -1; - } - - if (!mind[0] || node->proto.daemon_max[0] < mind[0]) - mind[0] = node->proto.daemon_max[0]; - - if (!mink[0] || node->proto.kernel_max[0] < mink[0]) - mink[0] = node->proto.kernel_max[0]; - } - - if (!mind[0] || !mink[0]) { - log_error("pick_min_protocol zero major number"); - return -1; - } - - /* second pick the minimum minor with the chosen major */ - - for (i = 0; i < daemon_member_count; i++) { - node = get_node_daemon(daemon_member[i].nodeid); - if (!node) - continue; - - if (mind[0] == node->proto.daemon_max[0]) { - if (!mind[1] || node->proto.daemon_max[1] < mind[1]) - mind[1] = node->proto.daemon_max[1]; - } - - if (mink[0] == node->proto.kernel_max[0]) { - if (!mink[1] || node->proto.kernel_max[1] < mink[1]) - mink[1] = node->proto.kernel_max[1]; - } - } - - if (!mind[1] || !mink[1]) { - log_error("pick_min_protocol zero minor number"); - return -1; - } - - /* third pick the minimum patch with the chosen major.minor */ - - for (i = 0; i < daemon_member_count; i++) 
{ - node = get_node_daemon(daemon_member[i].nodeid); - if (!node) - continue; - - if (mind[0] == node->proto.daemon_max[0] && - mind[1] == node->proto.daemon_max[1]) { - if (!mind[2] || node->proto.daemon_max[2] < mind[2]) - mind[2] = node->proto.daemon_max[2]; - } - - if (mink[0] == node->proto.kernel_max[0] && - mink[1] == node->proto.kernel_max[1]) { - if (!mink[2] || node->proto.kernel_max[2] < mink[2]) - mink[2] = node->proto.kernel_max[2]; - } - } - - if (!mind[2] || !mink[2]) { - log_error("pick_min_protocol zero patch number"); - return -1; - } - - memcpy(&proto->daemon_run, &mind, sizeof(mind)); - memcpy(&proto->kernel_run, &mink, sizeof(mink)); - return 0; -} - -static void receive_protocol(struct dlm_header *hd, int len) -{ - struct protocol *p; - struct node_daemon *node; - - p = (struct protocol *)((char *)hd + sizeof(struct dlm_header)); - protocol_in(p); - - if (len < sizeof(struct dlm_header) + sizeof(struct protocol)) { - log_error("receive_protocol invalid len %d from %d", - len, hd->nodeid); - return; - } - - /* zero is an invalid version value */ - - if (!p->daemon_max[0] || !p->daemon_max[1] || !p->daemon_max[2] || - !p->kernel_max[0] || !p->kernel_max[1] || !p->kernel_max[2]) { - log_error("receive_protocol invalid max value from %d " - "daemon %u.%u.%u kernel %u.%u.%u", hd->nodeid, - p->daemon_max[0], p->daemon_max[1], p->daemon_max[2], - p->kernel_max[0], p->kernel_max[1], p->kernel_max[2]); - return; - } - - /* the run values will be zero until a version is set, after - which none of the run values can be zero */ - - if (p->daemon_run[0] && (!p->daemon_run[1] || !p->daemon_run[2] || - !p->kernel_run[0] || !p->kernel_run[1] || !p->kernel_run[2])) { - log_error("receive_protocol invalid run value from %d " - "daemon %u.%u.%u kernel %u.%u.%u", hd->nodeid, - p->daemon_run[0], p->daemon_run[1], p->daemon_run[2], - p->kernel_run[0], p->kernel_run[1], p->kernel_run[2]); - return; - } - - /* save this node's proto so we can tell when we've got all, 
and - use it to select a minimum protocol from all */ - - node = get_node_daemon(hd->nodeid); - if (!node) { - log_error("receive_protocol no node %d", hd->nodeid); - return; - } - - if (!node->daemon_member) { - log_error("receive_protocol node %d not member", hd->nodeid); - return; - } - - log_debug("receive_protocol from %d max %u.%u.%u.%x run %u.%u.%u.%x", - hd->nodeid, - p->daemon_max[0], p->daemon_max[1], - p->daemon_max[2], p->daemon_max[3], - p->daemon_run[0], p->daemon_run[1], - p->daemon_run[2], p->daemon_run[3]); - log_debug("daemon node %d max %u.%u.%u.%x run %u.%u.%u.%x", - hd->nodeid, - node->proto.daemon_max[0], node->proto.daemon_max[1], - node->proto.daemon_max[2], node->proto.daemon_max[3], - node->proto.daemon_run[0], node->proto.daemon_run[1], - node->proto.daemon_run[2], node->proto.daemon_run[3]); - log_debug("daemon node %d join %llu left %llu local quorum %llu", - hd->nodeid, - (unsigned long long)node->daemon_add_time, - (unsigned long long)node->daemon_rem_time, - (unsigned long long)quorate_time); - - /* checking zero node->daemon_max[0] is a way to tell if we've received - an acceptable (non-stateful) proto message from the node since we - saw it join the daemon cpg */ - - if (hd->nodeid != our_nodeid && - !node->proto.daemon_max[0] && - (p->dr_ver.flags & PV_STATEFUL) && - (our_protocol.dr_ver.flags & PV_STATEFUL)) { - - log_debug("daemon node %d stateful merge", hd->nodeid); - - if (cluster_quorate && node->daemon_rem_time && - quorate_time < node->daemon_rem_time) { - log_debug("daemon node %d kill due to stateful merge", hd->nodeid); - if (!node->killed) - kick_node_from_cluster(hd->nodeid); - node->killed = 1; - } - - /* don't save p->proto into node->proto; we need to come - through here based on zero daemon_max[0] for other proto - messages like this one from the same node */ - - return; - } - - memcpy(&node->proto, p, sizeof(struct protocol)); - - /* if we have zero run values, and this msg has non-zero run values, - then adopt 
them as ours; otherwise save this proto message */ - - if (our_protocol.daemon_run[0]) - return; - - if (p->daemon_run[0]) { - our_protocol.daemon_run[0] = p->daemon_run[0]; - our_protocol.daemon_run[1] = p->daemon_run[1]; - our_protocol.daemon_run[2] = p->daemon_run[2]; - - our_protocol.kernel_run[0] = p->kernel_run[0]; - our_protocol.kernel_run[1] = p->kernel_run[1]; - our_protocol.kernel_run[2] = p->kernel_run[2]; - - log_debug("run protocol from nodeid %d", hd->nodeid); - } -} - -static void send_protocol(struct protocol *proto) -{ - struct dlm_header *hd; - struct protocol *pr; - char *buf; - int len; - - len = sizeof(struct dlm_header) + sizeof(struct protocol); - buf = malloc(len); - if (!buf) { - log_error("send_protocol no mem %d", len); - return; - } - memset(buf, 0, len); - - hd = (struct dlm_header *)buf; - pr = (struct protocol *)(buf + sizeof(*hd)); - - hd->type = cpu_to_le16(DLM_MSG_PROTOCOL); - hd->nodeid = cpu_to_le32(our_nodeid); - - memcpy(pr, proto, sizeof(struct protocol)); - protocol_out(pr); - - _send_message(cpg_handle_daemon, buf, len, DLM_MSG_PROTOCOL); -} - -int set_protocol(void) -{ - struct protocol proto; - struct pollfd pollfd; - int sent_proposal = 0; - int rv; - - memset(&pollfd, 0, sizeof(pollfd)); - pollfd.fd = cpg_fd_daemon; - pollfd.events = POLLIN; - - while (1) { - if (our_protocol.daemon_run[0]) - break; - - if (!sent_proposal && all_protocol_messages()) { - /* propose a protocol; look through info from all - nodes and pick the min for both daemon and kernel, - and propose that */ - - sent_proposal = 1; - - /* copy our max values */ - memcpy(&proto, &our_protocol, sizeof(struct protocol)); - - rv = pick_min_protocol(&proto); - if (rv < 0) - return rv; - - log_debug("set_protocol member_count %d propose " - "daemon %u.%u.%u kernel %u.%u.%u", - daemon_member_count, - proto.daemon_run[0], proto.daemon_run[1], - proto.daemon_run[2], proto.kernel_run[0], - proto.kernel_run[1], proto.kernel_run[2]); - - send_protocol(&proto); - } - 
- /* only process messages/events from daemon cpg until protocol - is established */ - - rv = poll(&pollfd, 1, -1); - if (rv == -1 && errno == EINTR) { - if (daemon_quit) - return -1; - continue; - } - if (rv < 0) { - log_error("set_protocol poll errno %d", errno); - return -1; - } - - if (pollfd.revents & POLLIN) - process_cpg_daemon(0); - if (pollfd.revents & (POLLERR | POLLHUP | POLLNVAL)) { - log_error("set_protocol poll revents %u", - pollfd.revents); - return -1; - } - } - - if (our_protocol.daemon_run[0] != our_protocol.daemon_max[0] || - our_protocol.daemon_run[1] > our_protocol.daemon_max[1]) { - log_error("incompatible daemon protocol run %u.%u.%u max %u.%u.%u", - our_protocol.daemon_run[0], - our_protocol.daemon_run[1], - our_protocol.daemon_run[2], - our_protocol.daemon_max[0], - our_protocol.daemon_max[1], - our_protocol.daemon_max[2]); - return -1; - } - - if (our_protocol.kernel_run[0] != our_protocol.kernel_max[0] || - our_protocol.kernel_run[1] > our_protocol.kernel_max[1]) { - log_error("incompatible kernel protocol run %u.%u.%u max %u.%u.%u", - our_protocol.kernel_run[0], - our_protocol.kernel_run[1], - our_protocol.kernel_run[2], - our_protocol.kernel_max[0], - our_protocol.kernel_max[1], - our_protocol.kernel_max[2]); - return -1; - } - - log_debug("daemon run %u.%u.%u max %u.%u.%u " - "kernel run %u.%u.%u max %u.%u.%u", - our_protocol.daemon_run[0], - our_protocol.daemon_run[1], - our_protocol.daemon_run[2], - our_protocol.daemon_max[0], - our_protocol.daemon_max[1], - our_protocol.daemon_max[2], - our_protocol.kernel_run[0], - our_protocol.kernel_run[1], - our_protocol.kernel_run[2], - our_protocol.kernel_max[0], - our_protocol.kernel_max[1], - our_protocol.kernel_max[2]); - - send_protocol(&our_protocol); - return 0; -} - -static void deliver_cb_daemon(cpg_handle_t handle, - const struct cpg_name *group_name, - uint32_t nodeid, uint32_t pid, - void *data, size_t len) -{ - struct dlm_header *hd; - - if (len < sizeof(*hd)) { - 
log_error("deliver_cb short message %zd", len); - return; - } - - hd = (struct dlm_header *)data; - dlm_header_in(hd); - - switch (hd->type) { - case DLM_MSG_PROTOCOL: - receive_protocol(hd, len); - break; - default: - log_error("deliver_cb_daemon unknown msg type %d", hd->type); - } -} - -static int in_daemon_member_list(int nodeid) -{ - int i; - - for (i = 0; i < daemon_member_count; i++) { - if (daemon_member[i].nodeid == nodeid) - return 1; - } - return 0; -} - -static void confchg_cb_daemon(cpg_handle_t handle, - const struct cpg_name *group_name, - const struct cpg_address *member_list, - size_t member_list_entries, - const struct cpg_address *left_list, - size_t left_list_entries, - const struct cpg_address *joined_list, - size_t joined_list_entries) -{ - struct node_daemon *node; - uint64_t now = monotime(); - int nodedown = 0, procdown = 0, leave = 0; - int i; - - log_config(group_name, member_list, member_list_entries, - left_list, left_list_entries, - joined_list, joined_list_entries); - - for (i = 0; i < left_list_entries; i++) { - if (left_list[i].reason == CPG_REASON_NODEDOWN) - nodedown++; - else if (left_list[i].reason == CPG_REASON_PROCDOWN) - procdown++; - else if (left_list[i].reason == CPG_REASON_LEAVE) - leave++; - } - - if (nodedown || procdown || leave) - log_debug("%s left nodedown %d procdown %d leave %d", - group_name->value, nodedown, procdown, leave); - - if (joined_list_entries) - send_protocol(&our_protocol); - - memset(&daemon_member, 0, sizeof(daemon_member)); - daemon_member_count = member_list_entries; - - for (i = 0; i < member_list_entries; i++) { - daemon_member[i] = member_list[i]; - /* add struct for nodes we've not seen before */ - add_node_daemon(member_list[i].nodeid); - } - - list_for_each_entry(node, &daemon_nodes, list) { - if (in_daemon_member_list(node->nodeid)) { - if (node->daemon_member) - continue; - - /* node joined daemon cpg */ - node->daemon_member = 1; - node->daemon_add_time = now; - } else { - if 
(!node->daemon_member) - continue; - - /* node left daemon cpg */ - node->daemon_member = 0; - node->killed = 0; - memset(&node->proto, 0, sizeof(struct protocol)); - node->daemon_rem_time = now; - } - } -} - -static void totem_cb_daemon(cpg_handle_t handle, - struct cpg_ring_id ring_id, - uint32_t member_list_entries, - const uint32_t *member_list) -{ - log_ringid("dlm:controld", &ring_id, member_list, member_list_entries); -} - -static cpg_model_v1_data_t cpg_callbacks_daemon = { - .cpg_deliver_fn = deliver_cb_daemon, - .cpg_confchg_fn = confchg_cb_daemon, - .cpg_totem_confchg_fn = totem_cb_daemon, - .flags = CPG_MODEL_V1_DELIVER_INITIAL_TOTEM_CONF, -}; - -void process_cpg_daemon(int ci) -{ - cs_error_t error; - - error = cpg_dispatch(cpg_handle_daemon, CS_DISPATCH_ALL); - if (error != CS_OK) - log_error("daemon cpg_dispatch error %d", error); -} - -int setup_cpg_daemon(void) -{ - cs_error_t error; - struct cpg_name name; - int i = 0; - - INIT_LIST_HEAD(&daemon_nodes); - - /* daemon 1.1.1 was cluster3/STABLE3/RHEL6 which is incompatible - with cluster4/RHEL7 */ - - memset(&our_protocol, 0, sizeof(our_protocol)); - - if (cfgd_enable_fscontrol) - our_protocol.daemon_max[0] = 2; - else - our_protocol.daemon_max[0] = 3; - - our_protocol.daemon_max[1] = 1; - our_protocol.daemon_max[2] = 1; - our_protocol.kernel_max[0] = 1; - our_protocol.kernel_max[1] = 1; - our_protocol.kernel_max[2] = 1; - - error = cpg_model_initialize(&cpg_handle_daemon, CPG_MODEL_V1, - (cpg_model_data_t *)&cpg_callbacks_daemon, - NULL); - if (error != CS_OK) { - log_error("daemon cpg_initialize error %d", error); - return -1; - } - - cpg_fd_get(cpg_handle_daemon, &cpg_fd_daemon); - - memset(&name, 0, sizeof(name)); - sprintf(name.value, "dlm:controld"); - name.length = strlen(name.value) + 1; - - log_debug("cpg_join %s ...", name.value); - retry: - error = cpg_join(cpg_handle_daemon, &name); - if (error == CS_ERR_TRY_AGAIN) { - sleep(1); - if (!(++i % 10)) - log_error("daemon cpg_join error 
retrying"); - goto retry; - } - if (error != CS_OK) { - log_error("daemon cpg_join error %d", error); - goto fail; - } - - log_debug("setup_cpg_daemon %d", cpg_fd_daemon); - return cpg_fd_daemon; - - fail: - cpg_finalize(cpg_handle_daemon); - return -1; -} - -void close_cpg_daemon(void) -{ - struct lockspace *ls; - cs_error_t error; - struct cpg_name name; - int i = 0; - - if (!cpg_handle_daemon) - return; - if (cluster_down) - goto fin; - - memset(&name, 0, sizeof(name)); - sprintf(name.value, "dlm:controld"); - name.length = strlen(name.value) + 1; - - log_debug("cpg_leave %s ...", name.value); - retry: - error = cpg_leave(cpg_handle_daemon, &name); - if (error == CS_ERR_TRY_AGAIN) { - sleep(1); - if (!(++i % 10)) - log_error("daemon cpg_leave error retrying"); - goto retry; - } - if (error != CS_OK) - log_error("daemon cpg_leave error %d", error); - fin: - list_for_each_entry(ls, &lockspaces, list) { - if (ls->cpg_handle) - cpg_finalize(ls->cpg_handle); - } - cpg_finalize(cpg_handle_daemon); -} - -/* fs_controld has seen nodedown for nodeid; it's now ok for dlm to do - recovery for the failed node */ - int set_fs_notified(struct lockspace *ls, int nodeid) { struct node *node; @@ -2755,12 +1749,8 @@ int set_lockspace_info(struct lockspace *ls, struct dlmc_lockspace *lockspace) if (cg->state == CGST_WAIT_CONDITIONS) lockspace->cg_next.wait_condition = 5; - if (poll_ringid) - lockspace->cg_next.wait_condition = 1; else if (poll_fencing) lockspace->cg_next.wait_condition = 2; - else if (poll_quorum) - lockspace->cg_next.wait_condition = 3; else if (poll_fs) lockspace->cg_next.wait_condition = 4; @@ -2795,7 +1785,7 @@ static int _set_node_info(struct lockspace *ls, struct change *cg, int nodeid, if (!n) goto out; - if (n->check_fencing) + if (n->need_fencing) node->flags |= DLMC_NF_CHECK_FENCING; if (n->check_fs) node->flags |= DLMC_NF_CHECK_FS; diff --git a/dlm_controld/daemon_cpg.c b/dlm_controld/daemon_cpg.c new file mode 100644 index 0000000..e5e9270 --- 
/dev/null +++ b/dlm_controld/daemon_cpg.c @@ -0,0 +1,1921 @@ +/* + * Copyright 2004-2012 Red Hat, Inc. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v2 or (at your option) any later version. + */ + +#include "dlm_daemon.h" + +/* protocol_version flags */ +#define PV_STATEFUL 0x0001 + +struct protocol_version { + uint16_t major; + uint16_t minor; + uint16_t patch; + uint16_t flags; +}; + +struct protocol { + union { + struct protocol_version dm_ver; + uint16_t daemon_max[4]; + }; + union { + struct protocol_version km_ver; + uint16_t kernel_max[4]; + }; + union { + struct protocol_version dr_ver; + uint16_t daemon_run[4]; + }; + union { + struct protocol_version kr_ver; + uint16_t kernel_run[4]; + }; +}; + +/* fence_result flags */ +#define FR_FIPU 0x00000001 +#define FR_CLEAR_STARTUP 0x00000002 +#define FR_CLEAR_FIPU 0x00000004 + +struct fence_result { + uint32_t version; + uint32_t flags; + uint32_t nodeid; + uint32_t result; + uint64_t fence_walltime; + char unused[1000]; +}; + +struct node_daemon { + struct list_head list; + int nodeid; + + uint64_t daemon_add_time; + uint64_t daemon_rem_time; + int daemon_member; + + int killed; + + struct protocol proto; + + struct fence_config fence_config; + + int fence_in_progress_unknown; + int left_reason; + int recover_setup; + int need_fence_clear; + int need_fencing; + int fence_pid; + int fence_pid_wait; + int fence_actor_done; + int fence_actors[MAX_NODES]; + uint64_t fail_walltime; + uint64_t fail_monotime; + uint64_t fence_request_time; + uint64_t fence_walltime; + uint64_t fence_monotime; +}; + +#define REASON_STARTUP_FENCING -1 + +static cpg_handle_t cpg_handle_daemon; +static int cpg_fd_daemon; +static struct protocol our_protocol; +static struct list_head daemon_nodes; +static struct list_head startup_nodes; +static struct cpg_address daemon_member[MAX_NODES]; +static 
struct cpg_address daemon_joined[MAX_NODES]; +static struct cpg_address daemon_remove[MAX_NODES]; +static int daemon_member_count; +static int daemon_joined_count; +static int daemon_remove_count; +static int daemon_ringid_wait; +static struct cpg_ring_id daemon_ringid; +static int daemon_clear_nodeid; +static int daemon_clear_pid; +static uint64_t last_join_monotime; +static uint32_t last_join_seq; +static uint32_t send_fipu_seq; +static int fence_in_progress_unknown = 1; + +static void send_fence_result(int nodeid, int result, uint32_t flags, uint64_t walltime); +static void send_fence_clear(int nodeid, int result, uint32_t flags, uint64_t walltime); + +void log_config(const struct cpg_name *group_name, + const struct cpg_address *member_list, + size_t member_list_entries, + const struct cpg_address *left_list, + size_t left_list_entries, + const struct cpg_address *joined_list, + size_t joined_list_entries) +{ + char m_buf[128]; + char j_buf[32]; + char l_buf[32]; + size_t i, len, pos; + int ret; + + memset(m_buf, 0, sizeof(m_buf)); + memset(j_buf, 0, sizeof(j_buf)); + memset(l_buf, 0, sizeof(l_buf)); + + len = sizeof(m_buf); + pos = 0; + for (i = 0; i < member_list_entries; i++) { + ret = snprintf(m_buf + pos, len - pos, " %d", + member_list[i].nodeid); + if (ret >= len - pos) + break; + pos += ret; + } + + len = sizeof(j_buf); + pos = 0; + for (i = 0; i < joined_list_entries; i++) { + ret = snprintf(j_buf + pos, len - pos, " %d", + joined_list[i].nodeid); + if (ret >= len - pos) + break; + pos += ret; + } + + len = sizeof(l_buf); + pos = 0; + for (i = 0; i < left_list_entries; i++) { + ret = snprintf(l_buf + pos, len - pos, " %d", + left_list[i].nodeid); + if (ret >= len - pos) + break; + pos += ret; + } + + log_debug("%s conf %zu %zu %zu memb%s join%s left%s", group_name->value, + member_list_entries, joined_list_entries, left_list_entries, + m_buf, j_buf, l_buf); +} + +void log_ringid(const char *name, + struct cpg_ring_id *ringid, + const uint32_t 
*member_list, + size_t member_list_entries) +{ + char m_buf[128]; + size_t i, len, pos; + int ret; + + memset(m_buf, 0, sizeof(m_buf)); + + len = sizeof(m_buf); + pos = 0; + for (i = 0; i < member_list_entries; i++) { + ret = snprintf(m_buf + pos, len - pos, " %u", + member_list[i]); + if (ret >= len - pos) + break; + pos += ret; + } + + log_debug("%s ring %u:%llu %zu memb%s", + name, ringid->nodeid, (unsigned long long)ringid->seq, + member_list_entries, m_buf); +} + +const char *reason_str(int reason) +{ + switch (reason) { + case CPG_REASON_JOIN: + return "join"; + case CPG_REASON_LEAVE: + return "leave"; + case CPG_REASON_NODEDOWN: + return "nodedown"; + case CPG_REASON_NODEUP: + return "nodeup"; + case CPG_REASON_PROCDOWN: + return "procdown"; + default: + return "unknown"; + }; +} + +const char *msg_name(int type) +{ + switch (type) { + case DLM_MSG_PROTOCOL: + return "protocol"; + case DLM_MSG_FENCE_RESULT: + return "fence_result"; + case DLM_MSG_FENCE_CLEAR: + return "fence_clear"; + + case DLM_MSG_START: + return "start"; + case DLM_MSG_PLOCK: + return "plock"; + case DLM_MSG_PLOCK_OWN: + return "plock_own"; + case DLM_MSG_PLOCK_DROP: + return "plock_drop"; + case DLM_MSG_PLOCK_SYNC_LOCK: + return "plock_sync_lock"; + case DLM_MSG_PLOCK_SYNC_WAITER: + return "plock_sync_waiter"; + case DLM_MSG_PLOCKS_DATA: + return "plocks_data"; + case DLM_MSG_PLOCKS_DONE: + return "plocks_done"; + case DLM_MSG_DEADLK_CYCLE_START: + return "deadlk_cycle_start"; + case DLM_MSG_DEADLK_CYCLE_END: + return "deadlk_cycle_end"; + case DLM_MSG_DEADLK_CHECKPOINT_READY: + return "deadlk_checkpoint_ready"; + case DLM_MSG_DEADLK_CANCEL_LOCK: + return "deadlk_cancel_lock"; + default: + return "unknown"; + } +} + +static int _send_message(cpg_handle_t h, void *buf, int len, int type) +{ + struct iovec iov; + cs_error_t error; + int retries = 0; + + iov.iov_base = buf; + iov.iov_len = len; + + retry: + error = cpg_mcast_joined(h, CPG_TYPE_AGREED, &iov, 1); + if (error == 
CS_ERR_TRY_AGAIN) { + retries++; + usleep(1000); + if (!(retries % 100)) + log_error("cpg_mcast_joined retry %d %s", + retries, msg_name(type)); + goto retry; + } + if (error != CS_OK) { + log_error("cpg_mcast_joined error %d handle %llx %s", + error, (unsigned long long)h, msg_name(type)); + return -1; + } + + if (retries) + log_debug("cpg_mcast_joined retried %d %s", + retries, msg_name(type)); + + return 0; +} + +/* header fields caller needs to set: type, to_nodeid, flags, msgdata */ + +void dlm_send_message(struct lockspace *ls, char *buf, int len) +{ + struct dlm_header *hd = (struct dlm_header *) buf; + int type = hd->type; + + hd->version[0] = cpu_to_le16(our_protocol.daemon_run[0]); + hd->version[1] = cpu_to_le16(our_protocol.daemon_run[1]); + hd->version[2] = cpu_to_le16(our_protocol.daemon_run[2]); + hd->type = cpu_to_le16(hd->type); + hd->nodeid = cpu_to_le32(our_nodeid); + hd->to_nodeid = cpu_to_le32(hd->to_nodeid); + hd->global_id = cpu_to_le32(ls->global_id); + hd->flags = cpu_to_le32(hd->flags); + hd->msgdata = cpu_to_le32(hd->msgdata); + hd->msgdata2 = cpu_to_le32(hd->msgdata2); + + _send_message(ls->cpg_handle, buf, len, type); +} + +void dlm_header_in(struct dlm_header *hd) +{ + hd->version[0] = le16_to_cpu(hd->version[0]); + hd->version[1] = le16_to_cpu(hd->version[1]); + hd->version[2] = le16_to_cpu(hd->version[2]); + hd->type = le16_to_cpu(hd->type); + hd->nodeid = le32_to_cpu(hd->nodeid); + hd->to_nodeid = le32_to_cpu(hd->to_nodeid); + hd->global_id = le32_to_cpu(hd->global_id); + hd->flags = le32_to_cpu(hd->flags); + hd->msgdata = le32_to_cpu(hd->msgdata); + hd->msgdata2 = le32_to_cpu(hd->msgdata2); +} + +int dlm_header_validate(struct dlm_header *hd, int nodeid) +{ + if (hd->version[0] != our_protocol.daemon_run[0] || + hd->version[1] != our_protocol.daemon_run[1]) { + log_error("reject message from %d version %u.%u.%u vs %u.%u.%u", + nodeid, hd->version[0], hd->version[1], + hd->version[2], our_protocol.daemon_run[0], + 
our_protocol.daemon_run[1], + our_protocol.daemon_run[2]); + return -1; + } + + if (hd->nodeid != nodeid) { + log_error("bad msg nodeid %d %d", hd->nodeid, nodeid); + return -1; + } + + return 0; +} + +static struct node_daemon *get_node_daemon(int nodeid) +{ + struct node_daemon *node; + + list_for_each_entry(node, &daemon_nodes, list) { + if (node->nodeid == nodeid) + return node; + } + return NULL; +} + +static int nodes_need_fencing(void) +{ + struct node_daemon *node; + + list_for_each_entry(node, &daemon_nodes, list) { + if (node->need_fencing) + return 1; + } + return 0; +} + +static int all_daemon_members_fipu(void) +{ + struct node_daemon *node; + + list_for_each_entry(node, &daemon_nodes, list) { + if (!node->daemon_member) + continue; + if (!node->fence_in_progress_unknown) + return 0; + } + + list_for_each_entry(node, &daemon_nodes, list) { + if (!node->daemon_member) + continue; + node->fence_in_progress_unknown = 0; + } + + return 1; +} + +int fence_node_time(int nodeid, uint64_t *last_fenced) +{ + struct node_daemon *node; + + node = get_node_daemon(nodeid); + if (!node) + return -1; + + *last_fenced = node->fence_monotime; + return 0; +} + +int fence_in_progress(int *in_progress) +{ + if (fence_in_progress_unknown) { + *in_progress = 1; + } else if (!list_empty(&startup_nodes)) { + *in_progress = 2; + } else if (nodes_need_fencing()) { + *in_progress = 3; + } else { + *in_progress = 0; + } + return 0; +} + +void add_startup_node(int nodeid) +{ + struct node_daemon *node; + + node = malloc(sizeof(struct node_daemon)); + if (!node) { + log_error("add_startup_node no mem"); + return; + } + memset(node, 0, sizeof(struct node_daemon)); + node->nodeid = nodeid; + list_add_tail(&node->list, &startup_nodes); +} + +static int clear_startup_node(int nodeid, int all) +{ + struct node_daemon *node, *safe; + int count = 0; + + list_for_each_entry_safe(node, safe, &startup_nodes, list) { + if (all || node->nodeid == nodeid) { + list_del(&node->list); + 
free(node); + count++; + } + } + return count; +} + +static struct node_daemon *add_node_daemon(int nodeid) +{ + struct node_daemon *node; + struct fence_config *fc; + + node = get_node_daemon(nodeid); + if (node) + return node; + + node = malloc(sizeof(struct node_daemon)); + if (!node) { + log_error("add_node_daemon no mem"); + return NULL; + } + memset(node, 0, sizeof(struct node_daemon)); + node->nodeid = nodeid; + list_add_tail(&node->list, &daemon_nodes); + + /* TODO: allow the config to be reread */ + + fc = &node->fence_config; + fc->nodeid = nodeid; + + /* explicit command line arg has first priority */ + + if (optd_fence_all_agent) { + fc->dev[0] = &fence_all_device; + goto out; + } + + /* explicit config file setting has second priority */ + + fence_config_init(fc, (unsigned int)nodeid, (char *)CONF_FILE_PATH); + + /* no command line, no config file, use default, third priority */ + + if (!fc->dev[0] && fence_all_agent[0]) + fc->dev[0] = &fence_all_device; + out: + return node; +} + +/* A clean daemon member is a node that has joined the daemon cpg + from a "clean state", i.e. not a stateful merge. It would not + have joined the daemon cpg if it found uncontrolled dlm kernel + state (check_uncontrolled_lockspaces). We would not have + accepted and saved its protocol in node->proto.daemon if it + was a stateful merge. */ + +static int is_clean_daemon_member(int nodeid) +{ + struct node_daemon *node; + + node = get_node_daemon(nodeid); + if (node && node->daemon_member && node->proto.daemon_max[0]) + return 1; + return 0; +} + +static int in_daemon_list(int nodeid, struct cpg_address *daemon_list, int count) +{ + int i; + + for (i = 0; i < count; i++) { + if (daemon_list[i].nodeid == nodeid) + return 1; + } + return 0; +} + +/* save in node->fence_actors[] any nodeid present when the node + failed which therefore saw it fail, knows it needs fencing, and + can request fencing for it if it becomes the low actor. 
A node + added in the same change with the removed node does not qualify. */ + +static int set_fence_actors(struct node_daemon *node, int all_memb) +{ + int i, nodeid, count = 0, low = 0; + + memset(node->fence_actors, 0, sizeof(node->fence_actors)); + + for (i = 0; i < daemon_member_count; i++) { + nodeid = daemon_member[i].nodeid; + + if (!all_memb && in_daemon_list(nodeid, daemon_joined, daemon_joined_count)) + continue; + + node->fence_actors[count++] = nodeid; + + if (!low || nodeid < low) + low = nodeid; + } + + log_debug("set_fence_actors for %d low %d count %d", + node->nodeid, low, count); + return low; +} + +static int get_fence_actor(struct node_daemon *node) +{ + int i, low, low_i; + + retry: + low = 0; + + for (i = 0; i < MAX_NODES; i++) { + if (!node->fence_actors[i]) + continue; + + if (!low || node->fence_actors[i] < low) { + low = node->fence_actors[i]; + low_i = i; + } + } + + if (low && !in_daemon_list(low, daemon_member, daemon_member_count)) { + log_debug("get_fence_actor for %d low actor %d is gone", + node->nodeid, low); + + node->fence_actors[low_i] = 0; + goto retry; + } + + return low; +} + +/* if an actor fails to fence, it will send that result, and others + will clear it from the actors, which will cause the next lowest + actor to try */ + +static void clear_fence_actor(int nodeid, int actor) +{ + struct node_daemon *node; + int i; + + node = get_node_daemon(nodeid); + if (!node) + return; + + for (i = 0; i < MAX_NODES; i++) { + if (node->fence_actors[i] == actor) { + node->fence_actors[i] = 0; + return; + } + } +} + +/* TODO: handle delayed cleanup of more than one pid */ + +static void fence_pid_cancel(int nodeid, int pid) +{ + struct node_daemon *node; + int rv, result; + + log_debug("fence_pid_cancel nodeid %d pid %d", nodeid, pid); + + kill(pid, SIGKILL); + usleep(500000); + + rv = fence_result(nodeid, pid, &result); + if (rv == -EAGAIN) { + /* Try again later */ + daemon_clear_nodeid = nodeid; + daemon_clear_pid = pid; + } else { 
+ log_debug("fence_pid_cancel nodeid %d pid %d done %d", + nodeid, pid, result); + + daemon_clear_nodeid = 0; + daemon_clear_pid = 0; + + node = get_node_daemon(nodeid); + if (node && node->fence_pid == pid) { + node->fence_pid_wait = 0; + node->fence_pid = 0; + } + } +} + +/* + * fence_in_progress_unknown (fipu) + * + * If current daemon members are fencing someone, and a new node + * joins, that new node needs to wait for the previous members to + * finish any fencing they're doing before it can start a lockspace. + * + * The previous members may be fencing the last node that was using + * the lockspace the new node is going to use, so if it doesn't wait, + * it could start using a lockspace with an unfenced user. + * + * So, the daemon starts with fence_in_progress_unknown set to + * indicate that other nodes may be fencing someone, and it won't + * start any lockspaces until it is clear. + * + * A node starts with fence_in_progress_unknown set and won't + * start any lockspaces until it's clear. + * + * When using startup_fencing: + * + * . When all nodes start up together, all have fipu set, + * and will go through startup fencing, which will eventually + * result in all nodes either being clean daemon members or fenced, + * so everyone will clear fipu by seeing that. + * + * . The more common case is when a new node joins other previously + * running nodes. The new node needs to be told that the others + * have no outstanding fencing ops before it can clear fipu. + * A previous member does send_fence_clear(0) to a new node once + * all fencing is complete. The two flags in send_fence_clear are + * usually sent together but may sometimes be in separate messages: + * send_fence_clear(0, CLEAR_STARTUP) to clear startup_nodes right away + * send_fence_clear(0, CLEAR_FIPU) to clear fipu once all fencing is done + * + * When not using startup_fencing: + * + * . 
When all nodes start up together, all have fipu set, and all + * will be waiting to receive_fence_clear from a previous node + * in order to clear it. The nodes need to detect this situation, + * and when they do, they will know that everyone is in startup, + * so there can be no pending fencing on a previous node, so all + * can clear fipu. To detect this case, when a node starts up + * with !startup_fence, it sends a special send_fence_clear(-ENODATA, FIPU) + * message about itself to indicate it has fipu set and needs it cleared. + * After sending this, it checks to see if all present nodes have sent + * this same message about themselves. If so, then this startup + * case has been detected, and all will clear fipu. + * + * . New nodes that join after this startup initialization will be + * handled the same as when startup_fencing is set (above). + * + * + * startup_fencing + * --------------- + * + * case A + * all nodes start up, + * all have fipu set, + * all wait for startup_nodes to be empty, (joined or moved to need_fencing) + * all wait for no daemon_nodes to need_fencing, (joined or were fenced) + * all clear fipu + * + * later, + * + * case B + * new node starts, + * new node has fipu set, + * cur node sees need_fence_clear on new node + * cur node sees no pending fencing ops, + * cur node send_fence_clear(0) to new node, + * new node clears startup_nodes and fipu + * + * !startup_fencing + * ---------------- + * + * case C + * all nodes start up, + * all have fipu set, + * all send_fence_clear(-ENODATA,FIPU), + * all receive_fence_clear(-ENODATA,FIPU) from everyone, + * all_daemon_members_fipu() is 1, + * all clear fipu + * + * later same as case B above + */ + +/* + * TODO: limit to one agent running at once, in case both + * instances need to log into the same switch, for example. 
+ */ + +static void daemon_fence_work(void) +{ + struct node_daemon *node, *safe; + int rv, nodeid, pid, need, low, actor, result; + uint32_t flags; + + if (daemon_ringid_wait) { + /* We've seen a nodedown confchg callback, but not the + corresponding ringid callback. */ + log_debug("fence work wait for cpg ringid"); + return; + } + + if (cluster_ringid_seq != daemon_ringid.seq) { + /* wait for ringids to be in sync */ + log_debug("fence work wait for cluster ringid"); + return; + } + + /* poll_fencing++; */ + + if (cfgd_enable_quorum_fencing && !cluster_quorate) { + /* wait for quorum before doing any fencing, but if there + is none, send_fence_clear below can unblock new nodes */ + log_debug("fence work wait for quorum"); + goto out_fipu; + } + + /* + * startup fencing + */ + + list_for_each_entry_safe(node, safe, &startup_nodes, list) { + if (is_clean_daemon_member(node->nodeid)) { + log_debug("fence startup %d member skip", node->nodeid); + list_del(&node->list); + free(node); + continue; + } + + if (!cfgd_startup_fence) + continue; + + if (monotime() - last_join_monotime < cfgd_startup_fence) { + log_debug("fence startup %d delay %d from %llu", + node->nodeid, cfgd_startup_fence, + (unsigned long long)last_join_monotime); + poll_fencing++; + continue; + } + + /* clear this entry and create a daemon_nodes entry with + need_fencing and the fence loops below will handle it */ + + nodeid = node->nodeid; + list_del(&node->list); + free(node); + + node = add_node_daemon(nodeid); + if (!node) { + log_debug("fence startup %d add failed", nodeid); + continue; + } + if (node->need_fencing) { + /* don't think this should happen? 
*/ + log_error("fence startup %d already set", nodeid); + continue; + } + node->need_fencing = 1; + node->fence_config.pos = 0; + node->left_reason = REASON_STARTUP_FENCING; + node->fail_monotime = cluster_joined_monotime - 1; + node->fail_walltime = cluster_joined_walltime - 1; + node->fence_monotime = 0; + node->fence_walltime = 0; + node->fence_request_time = 0; + low = set_fence_actors(node, 1); + + log_debug("fence startup nodeid %d act %d", node->nodeid, low); + } + + /* + * request fencing + */ + + list_for_each_entry(node, &daemon_nodes, list) { + if (!node->need_fencing) + continue; + + if (node->fence_pid_wait) + continue; + + if (is_clean_daemon_member(node->nodeid)) { + /* node rejoined cleanly, doesn't need fencing */ + log_debug("fence request %d member skip", node->nodeid); + node->need_fencing = 0; + continue; + } + + /* get_fence_actor picks the low nodeid that existed + when node failed and is still around. if the current + actor fails, get_fence_actor will not find it in the + members list, will clear it, and return the next actor */ + + actor = get_fence_actor(node); + + if (!actor) { + log_error("fence request %d no actor", node->nodeid); + continue; + } + + if (actor != our_nodeid) { + log_debug("fence request %d defer to %d", + node->nodeid, actor); + continue; + } + + log_debug("fence request %d", node->nodeid); + + rv = fence_request(node->nodeid, + node->fail_walltime, node->fail_monotime, + &node->fence_config, &pid); + if (rv < 0) { + send_fence_result(node->nodeid, rv, 0, time(NULL)); + continue; + } + + node->fence_pid_wait = 1; + node->fence_pid = pid; + node->fence_request_time = monotime(); + } + + /* + * check outstanding fence requests + */ + + list_for_each_entry(node, &daemon_nodes, list) { + if (!node->need_fencing) + continue; + + if (!node->fence_pid_wait) { + /* + * another node is the actor, or we were actor, + * sent done msg and are waiting to recv it + */ + log_debug("fence wait %d for done", node->nodeid); + continue; + 
} + + if (is_clean_daemon_member(node->nodeid)) { + /* + * node has rejoined in clean state so we can + * abort outstanding fence op for it. all nodes + * will see and do this, so we don't need to send + * a fence result. + */ + log_debug("fence wait %d member skip", node->nodeid); + node->need_fencing = 0; + node->fence_walltime = time(NULL); + node->fence_monotime = monotime(); + fence_pid_cancel(node->nodeid, node->fence_pid); + continue; + } + + poll_fencing++; + + rv = fence_result(node->nodeid, node->fence_pid, &result); + if (rv == -EAGAIN) { + /* agent pid is still running */ + log_debug("fence wait %d pid %d running", + node->nodeid, node->fence_pid); + continue; + } + + if (rv < 0) { + /* shouldn't happen */ + log_error("fence wait %d pid %d error %d", + node->nodeid, node->fence_pid_wait, rv); + node->fence_pid_wait = 0; + continue; + } + + if (!result) { + /* agent exit 0, if there's another agent to run in + parallel, set it to run next, otherwise success */ + + log_debug("fence nodeid %d pid %d succeeded", + node->nodeid, node->fence_pid); + + node->fence_pid_wait = 0; + node->fence_pid = 0; + + rv = fence_config_next_parallel(&node->fence_config); + if (rv < 0) + send_fence_result(node->nodeid, 0, 0, time(NULL)); + } else { + /* agent exit 1, if there's another agent to run at + next priority, set it to run next, otherwise fail */ + + log_debug("fence nodeid %d pid %d failed %d", + node->nodeid, node->fence_pid, result); + + node->fence_pid_wait = 0; + node->fence_pid = 0; + + rv = fence_config_next_priority(&node->fence_config); + if (rv < 0) + send_fence_result(node->nodeid, result, 0, time(NULL)); + } + } + + /* + * clear fence_in_progress_unknown + */ + out_fipu: + need = nodes_need_fencing(); + + if (cfgd_startup_fence && fence_in_progress_unknown && !need && list_empty(&startup_nodes)) { + /* + * case A in comment above + * all nodes are starting and have fipu set, they all do + * startup fencing, and eventually see unknown nodes become + * 
members or get fenced, so all clear fipu for themselves. + */ + fence_in_progress_unknown = 0; + log_debug("fence_in_progress_unknown 0 startup"); + } + + if (!fence_in_progress_unknown) { + /* + * case B in comment above + * some cur nodes have fipu clear, new nodes have fipu set. + * A current node needs to send_fence_clear to the new nodes + * once all fencing is done so they clear fipu. + */ + low = 0; + + list_for_each_entry(node, &daemon_nodes, list) { + if (!node->daemon_member || node->need_fence_clear) + continue; + if (!low || node->nodeid < low) + low = node->nodeid; + } + + list_for_each_entry(node, &daemon_nodes, list) { + if (!node->daemon_member || !node->need_fence_clear) + continue; + if (node->nodeid == our_nodeid) { + node->need_fence_clear = 0; + continue; + } + if (low != our_nodeid) + continue; + + flags = 0; + + if (node->need_fence_clear & FR_CLEAR_STARTUP) { + flags |= FR_CLEAR_STARTUP; + node->need_fence_clear &= ~FR_CLEAR_STARTUP; + } + + if ((node->need_fence_clear & FR_CLEAR_FIPU) && !need) { + flags |= FR_CLEAR_FIPU; + node->need_fence_clear &= ~FR_CLEAR_FIPU; + } + + if (!flags) + continue; + + send_fence_clear(node->nodeid, 0, flags, 0); + } + } + + if (!cfgd_startup_fence && fence_in_progress_unknown) { + /* + * case C in comment above + * all nodes are starting and have fipu set. All expect a + * previous node to send_fence_clear so they can clear fipu. + * But there are no previous nodes. They need to detect this + * condition. Each node does send_fence_clear(ENODATA,FIPU). + * When all have received this from all, condition is + * detected and all clear fipu. 
+ */ + if (all_daemon_members_fipu()) { + fence_in_progress_unknown = 0; + log_debug("fence_in_progress_unknown 0 all_fipu"); + } else if (last_join_seq > send_fipu_seq) { + /* the seq numbers keep us from spamming this msg */ + send_fence_clear(our_nodeid, -ENODATA, FR_FIPU, 0); + log_debug("send_fence_clear %d fipu", our_nodeid); + send_fipu_seq = last_join_seq; + } + } + + /* + * clean up a zombie pid from an agent we killed + */ + + if (daemon_clear_pid) + fence_pid_cancel(daemon_clear_nodeid, daemon_clear_pid); +} + +void process_fencing_changes(void) +{ + poll_fencing = 0; + daemon_fence_work(); +} + +/* + * Handle a DLM_MSG_FENCE_CLEAR message. hd is the message header and + * len the total message length; the fence_result payload follows the + * header and is converted to host byte order in place. + */ +static void receive_fence_clear(struct dlm_header *hd, int len) +{ + struct fence_result *fr; + struct node_daemon *node; + int count; + + fr = (struct fence_result *)((char *)hd + sizeof(struct dlm_header)); + + /* check the length before byte swapping: the swaps below read and + write every field of the fence_result in place, so a short + message must be rejected before the payload is touched */ + + if (len < sizeof(struct dlm_header) + sizeof(struct fence_result)) { + log_error("receive_fence_clear invalid len %d from %d", + len, hd->nodeid); + return; + } + + fr->flags = le32_to_cpu(fr->flags); + fr->nodeid = le32_to_cpu(fr->nodeid); + fr->result = le32_to_cpu(fr->result); + fr->fence_walltime = le64_to_cpu(fr->fence_walltime); + + node = get_node_daemon(fr->nodeid); + if (!node) { + log_error("receive_fence_clear from %d no daemon node %d", + hd->nodeid, fr->nodeid); + return; + } + + log_debug("receive_fence_clear from %d for %d result %d flags %x", + hd->nodeid, fr->nodeid, fr->result, fr->flags); + + /* + * A node sends this message about itself indicating that it's in + * startup with fipu set. The only time we care about node->fipu + * is when all nodes are fipu in startup. node->need_fence_clear + * and node->fipu are not related, they address different cases. 
+ */ + if ((fr->result == -ENODATA) && (fr->flags & FR_FIPU)) { + if (!fence_in_progress_unknown) + return; + + node->fence_in_progress_unknown = 1; + return; + } + + /* + * A previous member sends this to new members to tell them that + * they can clear startup_nodes and clear fipu. These two flags + * may come in separate messages if there is a pending fencing op + * when the new member joins (CLEAR_STARTUP will come right away, + * but CLEAR_FIPU will come once the fencing op is done.) + */ + if (!fr->result && (node->nodeid == our_nodeid)) { + if ((fr->flags & FR_CLEAR_STARTUP) && !list_empty(&startup_nodes)) { + count = clear_startup_node(0, 1); + log_debug("clear_startup_nodes %d", count); + } + + if ((fr->flags & FR_CLEAR_FIPU) && fence_in_progress_unknown) { + fence_in_progress_unknown = 0; + log_debug("fence_in_progress_unknown 0 recv"); + } + } + + /* this node doesn't need these flags any more */ + if (!fr->result) { + if (fr->flags & FR_CLEAR_STARTUP) + node->need_fence_clear &= ~FR_CLEAR_STARTUP; + if (fr->flags & FR_CLEAR_FIPU) + node->need_fence_clear &= ~FR_CLEAR_FIPU; + } +} + +/* + * Build a DLM_MSG_FENCE_CLEAR message (dlm_header followed by a + * fence_result holding nodeid, result, flags and walltime in little + * endian) and pass it to _send_message on the daemon cpg handle. + */ +static void send_fence_clear(int nodeid, int result, uint32_t flags, uint64_t walltime) +{ + struct dlm_header *hd; + struct fence_result *fr; + char *buf; + int len; + + len = sizeof(struct dlm_header) + sizeof(struct fence_result); + buf = malloc(len); + if (!buf) { + log_error("send_fence_clear no mem %d", len); + return; + } + memset(buf, 0, len); + + hd = (struct dlm_header *)buf; + fr = (struct fence_result *)(buf + sizeof(*hd)); + + hd->type = cpu_to_le16(DLM_MSG_FENCE_CLEAR); + hd->nodeid = cpu_to_le32(our_nodeid); + + fr->flags = cpu_to_le32(flags); + fr->nodeid = cpu_to_le32(nodeid); + fr->result = cpu_to_le32(result); + fr->fence_walltime = cpu_to_le64(walltime); + + /* NOTE(review): buf is not freed in this function; confirm whether + _send_message takes ownership, otherwise this leaks per call */ + _send_message(cpg_handle_daemon, buf, len, DLM_MSG_FENCE_CLEAR); +} + +static void receive_fence_result(struct dlm_header *hd, int len) +{ + struct fence_result *fr; + struct node_daemon *node; + int 
count; + + fr = (struct fence_result *)((char *)hd + sizeof(struct dlm_header)); + + /* check the length before byte swapping: the swaps below read and + write every field of the fence_result in place, so a short + message must be rejected before the payload is touched */ + + if (len < sizeof(struct dlm_header) + sizeof(struct fence_result)) { + log_error("receive_fence_result invalid len %d from %d", + len, hd->nodeid); + return; + } + + fr->flags = le32_to_cpu(fr->flags); + fr->nodeid = le32_to_cpu(fr->nodeid); + fr->result = le32_to_cpu(fr->result); + fr->fence_walltime = le64_to_cpu(fr->fence_walltime); + + count = clear_startup_node(fr->nodeid, 0); + if (count) { + log_debug("receive_fence_result from %d for %d clear startup", + hd->nodeid, fr->nodeid); + } + + node = get_node_daemon(fr->nodeid); + if (!node) { + log_error("receive_fence_result from %d for %d no daemon node", + hd->nodeid, fr->nodeid); + return; + } + + log_debug("receive_fence_result from %d for %d result %d walltime %llu", + hd->nodeid, fr->nodeid, fr->result, + (unsigned long long)fr->fence_walltime); + + if (!node->need_fencing) { + /* should never happen */ + log_error("receive_fence_result from %d for %d result %d no need_fencing", + hd->nodeid, fr->nodeid, fr->result); + return; + } + + if (fr->result == -ECANCELED) { + /* if an agent pid is running, kill it and clean up */ + if (node->fence_pid_wait && node->fence_pid) + fence_pid_cancel(node->nodeid, node->fence_pid); + fr->result = 0; /* force success below */ + } + + if (!fr->result) { + node->need_fencing = 0; + node->fence_walltime = fr->fence_walltime; + node->fence_monotime = monotime(); + node->fence_actor_done = hd->nodeid; + } else { + clear_fence_actor(fr->nodeid, hd->nodeid); + } +} + +static void send_fence_result(int nodeid, int result, uint32_t flags, uint64_t walltime) +{ + struct dlm_header *hd; + struct fence_result *fr; + char *buf; + int len; + + len = sizeof(struct dlm_header) + sizeof(struct fence_result); + buf = malloc(len); + if (!buf) { + log_error("send_fence_result no mem %d", len); + return; + } + memset(buf, 0, len); + + hd = (struct dlm_header *)buf; + fr = (struct fence_result *)(buf + 
sizeof(*hd)); + + hd->type = cpu_to_le16(DLM_MSG_FENCE_RESULT); + hd->nodeid = cpu_to_le32(our_nodeid); + + fr->flags = cpu_to_le32(flags); + fr->nodeid = cpu_to_le32(nodeid); + fr->result = cpu_to_le32(result); + fr->fence_walltime = cpu_to_le64(walltime); + + _send_message(cpg_handle_daemon, buf, len, DLM_MSG_FENCE_RESULT); +} + +void fence_ack_node(int nodeid) +{ + send_fence_result(nodeid, -ECANCELED, 0, time(NULL)); +} + +void set_protocol_stateful(void) +{ + our_protocol.dr_ver.flags |= PV_STATEFUL; +} + +static void pv_in(struct protocol_version *pv) +{ + pv->major = le16_to_cpu(pv->major); + pv->minor = le16_to_cpu(pv->minor); + pv->patch = le16_to_cpu(pv->patch); + pv->flags = le16_to_cpu(pv->flags); +} + +static void pv_out(struct protocol_version *pv) +{ + pv->major = cpu_to_le16(pv->major); + pv->minor = cpu_to_le16(pv->minor); + pv->patch = cpu_to_le16(pv->patch); + pv->flags = cpu_to_le16(pv->flags); +} + +static void protocol_in(struct protocol *proto) +{ + pv_in(&proto->dm_ver); + pv_in(&proto->km_ver); + pv_in(&proto->dr_ver); + pv_in(&proto->kr_ver); +} + +static void protocol_out(struct protocol *proto) +{ + pv_out(&proto->dm_ver); + pv_out(&proto->km_ver); + pv_out(&proto->dr_ver); + pv_out(&proto->kr_ver); +} + +/* go through member list saved in last confchg, see if we have received a + proto message from each */ + +static int all_protocol_messages(void) +{ + struct node_daemon *node; + int i; + + if (!daemon_member_count) + return 0; + + for (i = 0; i < daemon_member_count; i++) { + node = get_node_daemon(daemon_member[i].nodeid); + if (!node) { + log_error("all_protocol_messages no node %d", + daemon_member[i].nodeid); + return 0; + } + + if (!node->proto.daemon_max[0]) + return 0; + } + return 1; +} + +static int pick_min_protocol(struct protocol *proto) +{ + uint16_t mind[4]; + uint16_t mink[4]; + struct node_daemon *node; + int i; + + memset(&mind, 0, sizeof(mind)); + memset(&mink, 0, sizeof(mink)); + + /* first choose the minimum major 
*/ + + for (i = 0; i < daemon_member_count; i++) { + node = get_node_daemon(daemon_member[i].nodeid); + if (!node) { + log_error("pick_min_protocol no node %d", + daemon_member[i].nodeid); + return -1; + } + + if (!mind[0] || node->proto.daemon_max[0] < mind[0]) + mind[0] = node->proto.daemon_max[0]; + + if (!mink[0] || node->proto.kernel_max[0] < mink[0]) + mink[0] = node->proto.kernel_max[0]; + } + + if (!mind[0] || !mink[0]) { + log_error("pick_min_protocol zero major number"); + return -1; + } + + /* second pick the minimum minor with the chosen major */ + + for (i = 0; i < daemon_member_count; i++) { + node = get_node_daemon(daemon_member[i].nodeid); + if (!node) + continue; + + if (mind[0] == node->proto.daemon_max[0]) { + if (!mind[1] || node->proto.daemon_max[1] < mind[1]) + mind[1] = node->proto.daemon_max[1]; + } + + if (mink[0] == node->proto.kernel_max[0]) { + if (!mink[1] || node->proto.kernel_max[1] < mink[1]) + mink[1] = node->proto.kernel_max[1]; + } + } + + if (!mind[1] || !mink[1]) { + log_error("pick_min_protocol zero minor number"); + return -1; + } + + /* third pick the minimum patch with the chosen major.minor */ + + for (i = 0; i < daemon_member_count; i++) { + node = get_node_daemon(daemon_member[i].nodeid); + if (!node) + continue; + + if (mind[0] == node->proto.daemon_max[0] && + mind[1] == node->proto.daemon_max[1]) { + if (!mind[2] || node->proto.daemon_max[2] < mind[2]) + mind[2] = node->proto.daemon_max[2]; + } + + if (mink[0] == node->proto.kernel_max[0] && + mink[1] == node->proto.kernel_max[1]) { + if (!mink[2] || node->proto.kernel_max[2] < mink[2]) + mink[2] = node->proto.kernel_max[2]; + } + } + + if (!mind[2] || !mink[2]) { + log_error("pick_min_protocol zero patch number"); + return -1; + } + + memcpy(&proto->daemon_run, &mind, sizeof(mind)); + memcpy(&proto->kernel_run, &mink, sizeof(mink)); + return 0; +} + +static void receive_protocol(struct dlm_header *hd, int len) +{ + struct protocol *p; + struct node_daemon *node; + int 
new = 0; + + p = (struct protocol *)((char *)hd + sizeof(struct dlm_header)); + + /* check the length before protocol_in(), which byte swaps the + struct protocol payload in place; a short message must be + rejected before the payload is touched */ + + if (len < sizeof(struct dlm_header) + sizeof(struct protocol)) { + log_error("receive_protocol invalid len %d from %d", + len, hd->nodeid); + return; + } + + protocol_in(p); + + /* zero is an invalid version value */ + + if (!p->daemon_max[0] || !p->daemon_max[1] || !p->daemon_max[2] || + !p->kernel_max[0] || !p->kernel_max[1] || !p->kernel_max[2]) { + log_error("receive_protocol invalid max value from %d " + "daemon %u.%u.%u kernel %u.%u.%u", hd->nodeid, + p->daemon_max[0], p->daemon_max[1], p->daemon_max[2], + p->kernel_max[0], p->kernel_max[1], p->kernel_max[2]); + return; + } + + /* the run values will be zero until a version is set, after + which none of the run values can be zero */ + + if (p->daemon_run[0] && (!p->daemon_run[1] || !p->daemon_run[2] || + !p->kernel_run[0] || !p->kernel_run[1] || !p->kernel_run[2])) { + log_error("receive_protocol invalid run value from %d " + "daemon %u.%u.%u kernel %u.%u.%u", hd->nodeid, + p->daemon_run[0], p->daemon_run[1], p->daemon_run[2], + p->kernel_run[0], p->kernel_run[1], p->kernel_run[2]); + return; + } + + /* save this node's proto so we can tell when we've got all, and + use it to select a minimum protocol from all */ + + node = get_node_daemon(hd->nodeid); + if (!node) { + log_error("receive_protocol no node %d", hd->nodeid); + return; + } + + if (!node->daemon_member) { + log_error("receive_protocol node %d not member", hd->nodeid); + return; + } + + log_debug("receive_protocol %d max %u.%u.%u.%x run %u.%u.%u.%x", + hd->nodeid, + p->daemon_max[0], p->daemon_max[1], + p->daemon_max[2], p->daemon_max[3], + p->daemon_run[0], p->daemon_run[1], + p->daemon_run[2], p->daemon_run[3]); + + if (memcmp(&node->proto, p, sizeof(struct protocol))) { + log_debug("daemon node %d prot max %u.%u.%u.%x run %u.%u.%u.%x", + hd->nodeid, + node->proto.daemon_max[0], node->proto.daemon_max[1], + node->proto.daemon_max[2], node->proto.daemon_max[3], + 
node->proto.daemon_run[0], node->proto.daemon_run[1], + node->proto.daemon_run[2], node->proto.daemon_run[3]); + new = 1; + } + + /* checking zero node->daemon_max[0] is a way to tell if we've received + an acceptable (non-stateful) proto message from the node since we + saw it join the daemon cpg */ + + if (hd->nodeid != our_nodeid && + !node->proto.daemon_max[0] && + (p->dr_ver.flags & PV_STATEFUL) && + (our_protocol.dr_ver.flags & PV_STATEFUL)) { + + log_debug("daemon node %d stateful merge", hd->nodeid); + log_debug("daemon node %d join %llu left %llu local quorum %llu", + hd->nodeid, + (unsigned long long)node->daemon_add_time, + (unsigned long long)node->daemon_rem_time, + (unsigned long long)cluster_quorate_monotime); + + if (cluster_quorate && node->daemon_rem_time && + cluster_quorate_monotime < node->daemon_rem_time) { + log_debug("daemon node %d kill due to stateful merge", hd->nodeid); + if (!node->killed) + kick_node_from_cluster(hd->nodeid); + node->killed = 1; + } + + /* don't save p->proto into node->proto; we need to come + through here based on zero daemon_max[0] for other proto + messages like this one from the same node */ + + return; + } + + if (new) { + memcpy(&node->proto, p, sizeof(struct protocol)); + + log_debug("daemon node %d save max %u.%u.%u.%x run %u.%u.%u.%x", + node->nodeid, + node->proto.daemon_max[0], node->proto.daemon_max[1], + node->proto.daemon_max[2], node->proto.daemon_max[3], + node->proto.daemon_run[0], node->proto.daemon_run[1], + node->proto.daemon_run[2], node->proto.daemon_run[3]); + } + + /* if we have zero run values, and this msg has non-zero run values, + then adopt them as ours; otherwise save this proto message */ + + if (our_protocol.daemon_run[0]) + return; + + if (p->daemon_run[0]) { + our_protocol.daemon_run[0] = p->daemon_run[0]; + our_protocol.daemon_run[1] = p->daemon_run[1]; + our_protocol.daemon_run[2] = p->daemon_run[2]; + + our_protocol.kernel_run[0] = p->kernel_run[0]; + our_protocol.kernel_run[1] = 
p->kernel_run[1]; + our_protocol.kernel_run[2] = p->kernel_run[2]; + + log_debug("run protocol from nodeid %d", hd->nodeid); + } +} + +static void send_protocol(struct protocol *proto) +{ + struct dlm_header *hd; + struct protocol *pr; + char *buf; + int len; + + len = sizeof(struct dlm_header) + sizeof(struct protocol); + buf = malloc(len); + if (!buf) { + log_error("send_protocol no mem %d", len); + return; + } + memset(buf, 0, len); + + hd = (struct dlm_header *)buf; + pr = (struct protocol *)(buf + sizeof(*hd)); + + hd->type = cpu_to_le16(DLM_MSG_PROTOCOL); + hd->nodeid = cpu_to_le32(our_nodeid); + + memcpy(pr, proto, sizeof(struct protocol)); + protocol_out(pr); + + _send_message(cpg_handle_daemon, buf, len, DLM_MSG_PROTOCOL); +} + +int set_protocol(void) +{ + struct protocol proto; + struct pollfd pollfd; + int sent_proposal = 0; + int rv; + + memset(&pollfd, 0, sizeof(pollfd)); + pollfd.fd = cpg_fd_daemon; + pollfd.events = POLLIN; + + while (1) { + if (our_protocol.daemon_run[0]) + break; + + if (!sent_proposal && all_protocol_messages()) { + /* propose a protocol; look through info from all + nodes and pick the min for both daemon and kernel, + and propose that */ + + sent_proposal = 1; + + /* copy our max values */ + memcpy(&proto, &our_protocol, sizeof(struct protocol)); + + rv = pick_min_protocol(&proto); + if (rv < 0) + return rv; + + log_debug("set_protocol member_count %d propose " + "daemon %u.%u.%u kernel %u.%u.%u", + daemon_member_count, + proto.daemon_run[0], proto.daemon_run[1], + proto.daemon_run[2], proto.kernel_run[0], + proto.kernel_run[1], proto.kernel_run[2]); + + send_protocol(&proto); + } + + /* only process messages/events from daemon cpg until protocol + is established */ + + rv = poll(&pollfd, 1, -1); + if (rv == -1 && errno == EINTR) { + if (daemon_quit) + return -1; + continue; + } + if (rv < 0) { + log_error("set_protocol poll errno %d", errno); + return -1; + } + + if (pollfd.revents & POLLIN) + process_cpg_daemon(0); + if 
(pollfd.revents & (POLLERR | POLLHUP | POLLNVAL)) { + log_error("set_protocol poll revents %u", + pollfd.revents); + return -1; + } + } + + if (our_protocol.daemon_run[0] != our_protocol.daemon_max[0] || + our_protocol.daemon_run[1] > our_protocol.daemon_max[1]) { + log_error("incompatible daemon protocol run %u.%u.%u max %u.%u.%u", + our_protocol.daemon_run[0], + our_protocol.daemon_run[1], + our_protocol.daemon_run[2], + our_protocol.daemon_max[0], + our_protocol.daemon_max[1], + our_protocol.daemon_max[2]); + return -1; + } + + if (our_protocol.kernel_run[0] != our_protocol.kernel_max[0] || + our_protocol.kernel_run[1] > our_protocol.kernel_max[1]) { + log_error("incompatible kernel protocol run %u.%u.%u max %u.%u.%u", + our_protocol.kernel_run[0], + our_protocol.kernel_run[1], + our_protocol.kernel_run[2], + our_protocol.kernel_max[0], + our_protocol.kernel_max[1], + our_protocol.kernel_max[2]); + return -1; + } + + log_debug("daemon run %u.%u.%u max %u.%u.%u " + "kernel run %u.%u.%u max %u.%u.%u", + our_protocol.daemon_run[0], + our_protocol.daemon_run[1], + our_protocol.daemon_run[2], + our_protocol.daemon_max[0], + our_protocol.daemon_max[1], + our_protocol.daemon_max[2], + our_protocol.kernel_run[0], + our_protocol.kernel_run[1], + our_protocol.kernel_run[2], + our_protocol.kernel_max[0], + our_protocol.kernel_max[1], + our_protocol.kernel_max[2]); + + send_protocol(&our_protocol); + return 0; +} + +static void deliver_cb_daemon(cpg_handle_t handle, + const struct cpg_name *group_name, + uint32_t nodeid, uint32_t pid, + void *data, size_t len) +{ + struct dlm_header *hd; + + if (len < sizeof(*hd)) { + log_error("deliver_cb short message %zd", len); + return; + } + + hd = (struct dlm_header *)data; + dlm_header_in(hd); + + switch (hd->type) { + case DLM_MSG_PROTOCOL: + receive_protocol(hd, len); + break; + case DLM_MSG_FENCE_RESULT: + receive_fence_result(hd, len); + break; + case DLM_MSG_FENCE_CLEAR: + receive_fence_clear(hd, len); + break; + default: + 
log_error("deliver_cb_daemon unknown msg type %d", hd->type); + } + + daemon_fence_work(); +} + +static void confchg_cb_daemon(cpg_handle_t handle, + const struct cpg_name *group_name, + const struct cpg_address *member_list, + size_t member_list_entries, + const struct cpg_address *left_list, + size_t left_list_entries, + const struct cpg_address *joined_list, + size_t joined_list_entries) +{ + struct node_daemon *node; + uint64_t now, now_wall; + int nodedown = 0, procdown = 0, leave = 0; + int we_joined = 0; + int i, reason, low; + + now = monotime(); + now_wall = time(NULL); + + log_config(group_name, member_list, member_list_entries, + left_list, left_list_entries, + joined_list, joined_list_entries); + + memset(&daemon_member, 0, sizeof(daemon_member)); + daemon_member_count = member_list_entries; + + for (i = 0; i < member_list_entries; i++) { + daemon_member[i] = member_list[i]; + /* add struct for nodes we've not seen before */ + add_node_daemon(member_list[i].nodeid); + } + + memset(&daemon_joined, 0, sizeof(daemon_joined)); + daemon_joined_count = joined_list_entries; + + for (i = 0; i < joined_list_entries; i++) { + daemon_joined[i] = joined_list[i]; + if (joined_list[i].nodeid == our_nodeid) + we_joined = 1; + } + + memset(&daemon_remove, 0, sizeof(daemon_remove)); + daemon_remove_count = left_list_entries; + + for (i = 0; i < left_list_entries; i++) { + daemon_remove[i] = left_list[i]; + + if (left_list[i].reason == CPG_REASON_NODEDOWN) + nodedown++; + else if (left_list[i].reason == CPG_REASON_PROCDOWN) + procdown++; + else if (left_list[i].reason == CPG_REASON_LEAVE) + leave++; + } + + if (nodedown || procdown || leave) + log_debug("%s left nodedown %d procdown %d leave %d", + group_name->value, nodedown, procdown, leave); + + if (nodedown) + daemon_ringid_wait = 1; + + if (joined_list_entries) + send_protocol(&our_protocol); + + list_for_each_entry(node, &daemon_nodes, list) { + if (in_daemon_list(node->nodeid, daemon_member, daemon_member_count)) 
{ + if (node->daemon_member) + continue; + + /* node joined daemon cpg */ + node->daemon_member = 1; + node->daemon_add_time = now; + + last_join_monotime = now; + last_join_seq++; + + /* a joining node shows prev members in joined list */ + if (!we_joined) + node->need_fence_clear = FR_CLEAR_STARTUP|FR_CLEAR_FIPU; + + if (node->need_fencing) { + /* need_fencing will be cleared if we accept a + valid proto from it */ + log_error("daemon new nodeid %d needs fencing", + node->nodeid); + } + + } else { + if (!node->daemon_member) + continue; + + /* node left daemon cpg */ + node->daemon_member = 0; + node->killed = 0; + memset(&node->proto, 0, sizeof(struct protocol)); + node->daemon_rem_time = now; + + /* tell loop below to look at this node */ + node->recover_setup = 1; + } + } + + /* set up recovery work for nodes that just failed */ + + /* TODO: limit to nodes with a valid proto? + * node_history_lockspace_fail() would only set + * need_fencing if node->start_time was non-zero. */ + + list_for_each_entry(node, &daemon_nodes, list) { + if (!node->recover_setup) + continue; + + node->recover_setup = 0; + reason = 0; + low = 0; + + if (!cfgd_enable_fencing) + continue; + + if (node->need_fencing) { + log_error("daemon remove nodeid %d already needs fencing", + node->nodeid); + continue; + } + + for (i = 0; i < left_list_entries; i++) { + if (left_list[i].nodeid != node->nodeid) + continue; + reason = left_list[i].reason; + break; + } + + if (reason == CPG_REASON_NODEDOWN || reason == CPG_REASON_PROCDOWN) { + node->need_fencing = 1; + node->fence_config.pos = 0; + node->left_reason = reason; + node->fail_monotime = now; + node->fail_walltime = now_wall; + node->fence_monotime = 0; + node->fence_walltime = 0; + node->fence_request_time = 0; + low = set_fence_actors(node, 0); + } + + log_debug("daemon remove %d %s need_fencing %d low %d", + node->nodeid, reason_str(reason), node->need_fencing, low); + } + + daemon_fence_work(); +} + +static void 
totem_cb_daemon(cpg_handle_t handle, + struct cpg_ring_id ring_id, + uint32_t member_list_entries, + const uint32_t *member_list) +{ + daemon_ringid.nodeid = ring_id.nodeid; + daemon_ringid.seq = ring_id.seq; + daemon_ringid_wait = 0; + + log_ringid("dlm:controld", &ring_id, member_list, member_list_entries); + + daemon_fence_work(); +} + +static cpg_model_v1_data_t cpg_callbacks_daemon = { + .cpg_deliver_fn = deliver_cb_daemon, + .cpg_confchg_fn = confchg_cb_daemon, + .cpg_totem_confchg_fn = totem_cb_daemon, + .flags = CPG_MODEL_V1_DELIVER_INITIAL_TOTEM_CONF, +}; + +void process_cpg_daemon(int ci) +{ + cs_error_t error; + + error = cpg_dispatch(cpg_handle_daemon, CS_DISPATCH_ALL); + if (error != CS_OK) + log_error("daemon cpg_dispatch error %d", error); +} + +int setup_cpg_daemon(void) +{ + cs_error_t error; + struct cpg_name name; + int i = 0; + + /* daemon 1.1.1 was cluster3/STABLE3/RHEL6 which is incompatible + with cluster4/RHEL7 */ + + memset(&our_protocol, 0, sizeof(our_protocol)); + + if (cfgd_enable_fscontrol) + our_protocol.daemon_max[0] = 2; + else + our_protocol.daemon_max[0] = 3; + + our_protocol.daemon_max[1] = 1; + our_protocol.daemon_max[2] = 1; + our_protocol.kernel_max[0] = 1; + our_protocol.kernel_max[1] = 1; + our_protocol.kernel_max[2] = 1; + + error = cpg_model_initialize(&cpg_handle_daemon, CPG_MODEL_V1, + (cpg_model_data_t *)&cpg_callbacks_daemon, + NULL); + if (error != CS_OK) { + log_error("daemon cpg_initialize error %d", error); + return -1; + } + + cpg_fd_get(cpg_handle_daemon, &cpg_fd_daemon); + + memset(&name, 0, sizeof(name)); + sprintf(name.value, "dlm:controld"); + name.length = strlen(name.value) + 1; + + log_debug("cpg_join %s ...", name.value); + retry: + error = cpg_join(cpg_handle_daemon, &name); + if (error == CS_ERR_TRY_AGAIN) { + sleep(1); + if (!(++i % 10)) + log_error("daemon cpg_join error retrying"); + goto retry; + } + if (error != CS_OK) { + log_error("daemon cpg_join error %d", error); + goto fail; + } + + 
log_debug("setup_cpg_daemon %d", cpg_fd_daemon); + return cpg_fd_daemon; + + fail: + cpg_finalize(cpg_handle_daemon); + return -1; +} + +void close_cpg_daemon(void) +{ + struct lockspace *ls; + cs_error_t error; + struct cpg_name name; + int i = 0; + + if (!cpg_handle_daemon) + return; + if (cluster_down) + goto fin; + + memset(&name, 0, sizeof(name)); + sprintf(name.value, "dlm:controld"); + name.length = strlen(name.value) + 1; + + log_debug("cpg_leave %s ...", name.value); + retry: + error = cpg_leave(cpg_handle_daemon, &name); + if (error == CS_ERR_TRY_AGAIN) { + sleep(1); + if (!(++i % 10)) + log_error("daemon cpg_leave error retrying"); + goto retry; + } + if (error != CS_OK) + log_error("daemon cpg_leave error %d", error); + fin: + list_for_each_entry(ls, &lockspaces, list) { + if (ls->cpg_handle) + cpg_finalize(ls->cpg_handle); + } + cpg_finalize(cpg_handle_daemon); +} + +void init_daemon(void) +{ + INIT_LIST_HEAD(&daemon_nodes); + INIT_LIST_HEAD(&startup_nodes); + +} diff --git a/dlm_controld/dlm_controld.h b/dlm_controld/dlm_controld.h index 638b0de..96b425c 100644 --- a/dlm_controld/dlm_controld.h +++ b/dlm_controld/dlm_controld.h @@ -29,6 +29,7 @@ #define DLMC_CMD_FS_NOTIFIED 9 #define DLMC_CMD_DEADLOCK_CHECK 10 #define DLMC_CMD_DUMP_LOG_PLOCK 11 +#define DLMC_CMD_FENCE_ACK 12 struct dlmc_header { unsigned int magic; diff --git a/dlm_controld/dlm_daemon.h b/dlm_controld/dlm_daemon.h index 5fca041..5e83db3 100644 --- a/dlm_controld/dlm_daemon.h +++ b/dlm_controld/dlm_daemon.h @@ -9,16 +9,18 @@ #ifndef __DLM_DAEMON_DOT_H__ #define __DLM_DAEMON_DOT_H__ -#include <sys/types.h> #include <asm/types.h> +#include <sys/types.h> #include <sys/uio.h> -#include <netinet/in.h> #include <sys/socket.h> #include <sys/un.h> #include <sys/ioctl.h> #include <sys/stat.h> #include <sys/utsname.h> #include <sys/poll.h> +#include <sys/time.h> +#include <sys/wait.h> +#include <netinet/in.h> #include <netinet/in.h> #include <arpa/inet.h> #include <net/if.h> @@ -37,7 +39,6 @@ 
#include <syslog.h> #include <sched.h> #include <signal.h> -#include <sys/time.h> #include <dirent.h> #include <corosync/cpg.h> @@ -45,6 +46,7 @@ #include <linux/dlmconstants.h> #include "libdlmcontrol.h" #include "dlm_controld.h" +#include "fence_config.h" #include "list.h" #include "rbtree.h" #include "linux_endian.h" @@ -60,9 +62,9 @@ /* TODO: get CONFDIR, LOGDIR, RUNDIR from build */ -#define RUNDIR "/var/run/cluster" -#define LOGDIR "/var/log/cluster" -#define CONFDIR "/etc" +#define RUNDIR "/var/run/dlm" +#define LOGDIR "/var/log/dlm" +#define CONFDIR "/etc/dlm" #define RUN_FILE_NAME "dlm_controld.pid" #define LOG_FILE_NAME "dlm_controld.log" @@ -80,7 +82,8 @@ #define DEFAULT_DEBUG_LOGFILE 0 #define DEFAULT_ENABLE_FENCING 1 -#define DEFAULT_ENABLE_QUORUM 0 +#define DEFAULT_ENABLE_QUORUM_FENCING 1 +#define DEFAULT_ENABLE_QUORUM_LOCKSPACE 0 #define DEFAULT_ENABLE_FSCONTROL 0 #define DEFAULT_ENABLE_PLOCK 1 #define DEFAULT_PLOCK_DEBUG 0 @@ -89,6 +92,8 @@ #define DEFAULT_DROP_RESOURCES_TIME 10000 /* 10 sec */ #define DEFAULT_DROP_RESOURCES_COUNT 10 #define DEFAULT_DROP_RESOURCES_AGE 10000 /* 10 sec */ +#define DEFAULT_FENCE_ALL_AGENT "dlm_stonith" +#define DEFAULT_STARTUP_FENCE 30 /* DLM_LOCKSPACE_LEN: maximum lockspace name length, from linux/dlmconstants.h. Copied in libdlm.h so apps don't need to include the kernel header. 
@@ -118,9 +123,8 @@ EXTERN int daemon_debug_opt; EXTERN int daemon_quit; EXTERN int cluster_down; -EXTERN int poll_ringid; +EXTERN int poll_lockspaces; EXTERN int poll_fencing; -EXTERN int poll_quorum; EXTERN int poll_fs; EXTERN int poll_ignore_plock; EXTERN int poll_drop_plock; @@ -128,20 +132,24 @@ EXTERN int plock_fd; EXTERN int plock_ci; EXTERN struct list_head lockspaces; EXTERN int cluster_quorate; -EXTERN uint64_t quorate_time; +EXTERN uint64_t cluster_quorate_monotime; +EXTERN uint64_t cluster_joined_monotime; +EXTERN uint64_t cluster_joined_walltime; EXTERN uint32_t cluster_ringid_seq; EXTERN char cluster_name[DLM_LOCKSPACE_LEN+1]; EXTERN int our_nodeid; EXTERN uint32_t control_minor; EXTERN uint32_t monitor_minor; EXTERN uint32_t plock_minor; +EXTERN struct fence_device fence_all_device; EXTERN int optk_debug; EXTERN int optk_timewarn; EXTERN int optk_protocol; EXTERN int optd_debug_logfile; EXTERN int optd_enable_fencing; -EXTERN int optd_enable_quorum; +EXTERN int optd_enable_quorum_fencing; +EXTERN int optd_enable_quorum_lockspace; EXTERN int optd_enable_fscontrol; EXTERN int optd_enable_plock; EXTERN int optd_plock_debug; @@ -150,13 +158,16 @@ EXTERN int optd_plock_ownership; EXTERN int optd_drop_resources_time; EXTERN int optd_drop_resources_count; EXTERN int optd_drop_resources_age; +EXTERN int optd_startup_fence; +EXTERN int optd_fence_all_agent; EXTERN int cfgk_debug; EXTERN int cfgk_timewarn; EXTERN int cfgk_protocol; EXTERN int cfgd_debug_logfile; EXTERN int cfgd_enable_fencing; -EXTERN int cfgd_enable_quorum; +EXTERN int cfgd_enable_quorum_fencing; +EXTERN int cfgd_enable_quorum_lockspace; EXTERN int cfgd_enable_fscontrol; EXTERN int cfgd_enable_plock; EXTERN int cfgd_plock_debug; @@ -165,6 +176,8 @@ EXTERN int cfgd_plock_ownership; EXTERN int cfgd_drop_resources_time; EXTERN int cfgd_drop_resources_count; EXTERN int cfgd_drop_resources_age; +EXTERN int cfgd_startup_fence; +EXTERN char fence_all_agent[PATH_MAX]; #define LOG_DUMP_SIZE 
DLMC_DUMP_SIZE @@ -194,7 +207,9 @@ enum { DLM_MSG_DEADLK_CYCLE_START, DLM_MSG_DEADLK_CYCLE_END, DLM_MSG_DEADLK_CHECKPOINT_READY, - DLM_MSG_DEADLK_CANCEL_LOCK + DLM_MSG_DEADLK_CANCEL_LOCK, + DLM_MSG_FENCE_RESULT, + DLM_MSG_FENCE_CLEAR, }; /* dlm_header flags */ @@ -288,15 +303,10 @@ void setup_config(int update); int get_weight(int nodeid, char *lockspace); /* cpg.c */ -int setup_cpg_daemon(void); -void close_cpg_daemon(void); -void process_cpg_daemon(int ci); -int set_protocol(void); void process_lockspace_changes(void); -void dlm_send_message(struct lockspace *ls, char *buf, int len); +void process_fencing_changes(void); int dlm_join_lockspace(struct lockspace *ls); int dlm_leave_lockspace(struct lockspace *ls); -const char *msg_name(int type); void update_flow_control_status(void); int set_node_info(struct lockspace *ls, int nodeid, struct dlmc_node *node); int set_lockspace_info(struct lockspace *ls, struct dlmc_lockspace *lockspace); @@ -305,6 +315,36 @@ int set_lockspace_nodes(struct lockspace *ls, int option, int *node_count, struct dlmc_node **nodes_out); int set_fs_notified(struct lockspace *ls, int nodeid); +/* daemon_cpg.c */ +void init_daemon(void); +void fence_ack_node(int nodeid); +void add_startup_node(int nodeid); +const char *reason_str(int reason); +const char *msg_name(int type); +void dlm_send_message(struct lockspace *ls, char *buf, int len); +void dlm_header_in(struct dlm_header *hd); +int dlm_header_validate(struct dlm_header *hd, int nodeid); +int fence_node_time(int nodeid, uint64_t *last_fenced); +int fence_in_progress(int *in_progress); +int setup_cpg_daemon(void); +void close_cpg_daemon(void); +void process_cpg_daemon(int ci); +void set_protocol_stateful(void); +int set_protocol(void); + +void log_config(const struct cpg_name *group_name, + const struct cpg_address *member_list, + size_t member_list_entries, + const struct cpg_address *left_list, + size_t left_list_entries, + const struct cpg_address *joined_list, + size_t 
joined_list_entries); + +void log_ringid(const char *name, + struct cpg_ring_id *ringid, + const uint32_t *member_list, + size_t member_list_entries); + /* deadlock.c */ void setup_deadlock(void); void send_cycle_start(struct lockspace *ls); @@ -346,11 +386,12 @@ int setup_cluster_cfg(void); void close_cluster_cfg(void); void process_cluster_cfg(int ci); void kick_node_from_cluster(int nodeid); +int setup_node_config(void); /* fence.c */ -int fence_request(int nodeid); -int fence_node_time(int nodeid, uint64_t *last_fenced_time); -int fence_in_progress(int *count); +int fence_request(int nodeid, uint64_t fail_walltime, uint64_t fail_monotime, + struct fence_config *fc, int *pid_out); +int fence_result(int nodeid, int pid, int *result); /* netlink.c */ int setup_netlink(void); diff --git a/dlm_controld/fence.c b/dlm_controld/fence.c index 50b7d4d..91c5e67 100644 --- a/dlm_controld/fence.c +++ b/dlm_controld/fence.c @@ -7,28 +7,147 @@ */ #include "dlm_daemon.h" -#include <pacemaker/crm/stonith-ng.h> -int fence_request(int nodeid) +static int run_agent(char *agent, char *args, int *pid_out) { - int rv; - rv = stonith_api_kick_helper(nodeid, 300, 1); - if (rv) { - log_error("stonith_api_kick_helper %d error %d", nodeid, rv); - return rv; + int pid, len; + int pw_fd = -1; /* parent write file descriptor */ + int cr_fd = -1; /* child read file descriptor */ + int pfd[2]; + + len = strlen(args); + + if (pipe(pfd)) + return -errno; + + cr_fd = pfd[0]; + pw_fd = pfd[1]; + + pid = fork(); + if (pid < 0) { + close(cr_fd); + close(pw_fd); + return -errno; } - return 0; + + if (pid) { + /* parent */ + int ret; + + do { + ret = write(pw_fd, args, len); + } while (ret < 0 && errno == EINTR); + + if (ret != len) + goto fail; + + close(cr_fd); + close(pw_fd); + + *pid_out = pid; + return 0; + } else { + /* child */ + int c_stdout, c_stderr; + + /* redirect agent stdout/stderr to /dev/null */ + close(1); + c_stdout = open("/dev/null", O_WRONLY); + if (c_stdout < 0) + goto fail; + 
close(2); + c_stderr = open("/dev/null", O_WRONLY); + if (c_stderr < 0) + goto fail; + + /* redirect agent stdin from parent */ + close(0); + if (dup(cr_fd) < 0) + goto fail; + + close(cr_fd); + close(pw_fd); + + execlp(agent, agent, NULL); + exit(EXIT_FAILURE); + } + fail: + close(cr_fd); + close(pw_fd); + return -1; } -int fence_node_time(int nodeid, uint64_t *last_fenced_time) +int fence_request(int nodeid, uint64_t fail_walltime, uint64_t fail_monotime, + struct fence_config *fc, int *pid_out) { - *last_fenced_time = stonith_api_time_helper(nodeid, 0); + struct fence_device *dev; + char args[FENCE_CONFIG_ARGS_MAX]; + char extra[FENCE_CONFIG_NAME_MAX]; + int rv, pid = -1; + + memset(args, 0, sizeof(args)); + + memset(extra, 0, sizeof(extra)); + snprintf(extra, sizeof(extra)-1, "fail_time=%llu\n", (unsigned long long)fail_walltime); + + dev = fc->dev[fc->pos]; + if (!dev) + return -1; + + rv = fence_config_agent_args(fc, extra, args); + if (rv < 0) { + log_error("fence_request %d args error %d", nodeid, rv); + return rv; + } + + rv = run_agent(dev->agent, args, &pid); + if (rv < 0) { + log_error("fence_request %d agent %s pid %d run error %d", + nodeid, dev->agent, pid, rv); + return rv; + } + + log_debug("fence_request %d pos %d agent %s pid %d running", + nodeid, fc->pos, dev->agent, pid); + + *pid_out = pid; return 0; } -int fence_in_progress(int *count) +/* + * if pid has exited, return 0 with exit code in result + * if pid is running, return -EAGAIN + * other error, return -EXXX + */ + +int fence_result(int nodeid, int pid, int *result) { - *count = 0; - return 0; + int status, rv; + + rv = waitpid(pid, &status, WNOHANG); + + if (rv < 0) { + /* shouldn't happen */ + log_error("agent pid %d nodeid %d errno %d", + pid, nodeid, errno); + return rv; + + } else if (!rv) { + /* pid still running */ + return -EAGAIN; + + } else if (WIFEXITED(status)) { + /* pid exited */ + + *result = WEXITSTATUS(status); + + log_error("agent pid %d nodeid %d result %d", + pid, 
nodeid, *result); + return 0; + + } else { + /* pid state changed but still running */ + return -EAGAIN; + } } diff --git a/dlm_controld/fence_config.c b/dlm_controld/fence_config.c new file mode 100644 index 0000000..9f72e36 --- /dev/null +++ b/dlm_controld/fence_config.c @@ -0,0 +1,403 @@ +/* + * Copyright 2012 Red Hat, Inc. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v2 or (at your option) any later version. + */ + + +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> + +#include "fence_config.h" + +#if 0 + +Empty new line separates the config for each fence device. + +- + +fence_all fence_foo key=val ... + +Special fence config format that applies to all nodes, allows +no per node config parameters, and cannot be used with any +other fence device configuration. + +- + +device <dev_name> <agent> <dev_args> +connect <dev_name> node=<nodeid> <con_args> + +General fence config format, allowing per node config +parameters. + +- + +device foo fence_foo ipaddr=1.1.1.1 login=x password=y +connect foo node=1 port=1 +connect foo node=2 port=2 +connect foo node=3 port=3 + +Simple example of nodes connected to switch ports. +If fencing with the device fails, the next device +listed for the node, if any, will be tried. + +- + +device foo:1 fence_foo ipaddr=1.1.1.1 login=x password=y +connect foo:1 node=1 port=1 +connect foo:1 node=2 port=2 +connect foo:1 node=3 port=3 + +device foo:2 fence_foo ipaddr=2.2.2.2 login=x password=y +connect foo:2 node=1 port=1 +connect foo:2 node=2 port=2 +connect foo:2 node=3 port=3 + +Associate two parallel path/power devices that must both +succeed for fencing to succeed. Devices have same base +name with :1 :2 suffix. 
+ +- + +device foo fence_foo ipaddr=1.1.1.1 login=x password=y +connect foo node=1 port=1 +connect foo node=2 port=2 +connect foo node=3 port=3 +unfence foo + +Add unfence line to indicate nodes connected to the device +should be unfenced. + +#endif + +#define MAX_LINE (FENCE_CONFIG_ARGS_MAX + (3 * FENCE_CONFIG_NAME_MAX)) + +static unsigned int con_args_nodeid(char *args) +{ + char *k; + unsigned int v; + int rv; + + k = strstr(args, "node="); + + rv = sscanf(k, "node=%u", &v); + if (rv != 1) + return 0; + return v; +} + +static int read_config_section(unsigned int nodeid, FILE *file, char *dev_line, + struct fence_device **dev_out, + struct fence_connect **con_out) +{ + struct fence_device *dev; + struct fence_connect *con; + char line[MAX_LINE]; + char unused[FENCE_CONFIG_NAME_MAX]; + char agent[FENCE_CONFIG_NAME_MAX]; + char dev_name[FENCE_CONFIG_NAME_MAX]; + char con_name[FENCE_CONFIG_NAME_MAX]; + char dev_args[FENCE_CONFIG_ARGS_MAX]; + char con_args[FENCE_CONFIG_ARGS_MAX]; + int rv, unfence = 0; + + if (strlen(dev_line) > MAX_LINE) + return -1; + + memset(dev_name, 0, sizeof(dev_name)); + memset(agent, 0, sizeof(agent)); + memset(dev_args, 0, sizeof(dev_args)); + + rv = sscanf(dev_line, "%s %s %s %[^\n]s\n", unused, dev_name, agent, dev_args); + if (rv < 3) + return -1; + + while (fgets(line, MAX_LINE, file)) { + if (line[0] == '\n') + break; + if (line[0] == '#') + continue; + + if (!strncmp(line, "unfence", strlen("unfence"))) { + /* TODO: verify dev_name matches */ + unfence = 1; + continue; + } + + /* invalid config */ + if (strncmp(line, "connect", strlen("connect"))) + return -1; + + memset(con_name, 0, sizeof(con_name)); + memset(con_args, 0, sizeof(con_args)); + + rv = sscanf(line, "%s %s %[^\n]s", unused, con_name, con_args); + if (rv < 3) + return -1; + + /* invalid config */ + if (strncmp(dev_name, con_name, FENCE_CONFIG_NAME_MAX)) + return -1; + + /* skip connection for another node */ + if (con_args_nodeid(con_args) != nodeid) + continue; + + dev 
= malloc(sizeof(struct fence_device)); + if (!dev) + return -ENOMEM; + + con = malloc(sizeof(struct fence_connect)); + if (!con) { + free(dev); + return -ENOMEM; + } + + memset(dev, 0, sizeof(struct fence_device)); + memset(con, 0, sizeof(struct fence_connect)); + + strncpy(dev->name, dev_name, FENCE_CONFIG_NAME_MAX-1); + strncpy(dev->agent, agent, FENCE_CONFIG_NAME_MAX-1); + strncpy(dev->args, dev_args, FENCE_CONFIG_ARGS_MAX-1); + strncpy(con->name, con_name, FENCE_CONFIG_NAME_MAX-1); + strncpy(con->args, con_args, FENCE_CONFIG_ARGS_MAX-1); + dev->unfence = unfence; + + *dev_out = dev; + *con_out = con; + return 0; + } + + return -1; +} + +void fence_config_free(struct fence_config *fc) +{ + struct fence_device *dev; + struct fence_connect *con; + int i; + + for (i = 0; i < FENCE_CONFIG_DEVS_MAX; i++) { + dev = fc->dev[i]; + con = fc->con[i]; + if (dev) + free(dev); + if (con) + free(con); + } + + memset(fc, 0, sizeof(struct fence_config)); +} + +int fence_config_init(struct fence_config *fc, unsigned int nodeid, char *path) +{ + char line[MAX_LINE]; + struct fence_device *dev; + struct fence_connect *con; + FILE *file; + int pos = 0; + int rv; + + fc->nodeid = nodeid; + + file = fopen(path, "r"); + if (!file) + return -1; + + while (fgets(line, MAX_LINE, file)) { + if (line[0] == '#') + continue; + if (line[0] == '\n') + continue; + + if (!strncmp(line, "fence_all", strlen("fence_all"))) { + /* fence_all cannot be used with other fence devices */ + if (pos) { + rv = -EINVAL; + goto out; + } + + dev = malloc(sizeof(struct fence_device)); + if (!dev) { + rv = -ENOMEM; + goto out; + } + + rv = sscanf(line, "%s %s %[^\n]s\n", dev->name, dev->agent, dev->args); + if (rv < 2) { + rv = -EINVAL; + goto out; + } + + fc->dev[0] = dev; + fc->pos = 0; + rv = 0; + goto out; + } + + if (strncmp(line, "device", strlen("device"))) + continue; + + /* read connect and unfence lines following a device line */ + + rv = read_config_section(nodeid, file, line, &dev, &con); + if (rv < 
0) + continue; + + fc->dev[pos] = dev; + fc->con[pos] = con; + pos++; + } + + if (pos) + rv = 0; + out: + fclose(file); + return rv; +} + +static int same_base_name(struct fence_device *a, + struct fence_device *b) +{ + int len, i; + + len = strlen(a->name); + if (len > strlen(b->name)) + len = strlen(b->name); + + for (i = 0; i < len; i++) { + if (a->name[i] == ':' && b->name[i] == ':') + return 1; + if (a->name[i] == b->name[i]) + continue; + return 0; + } + return 0; +} + +/* + * if next dev is in parallel with last one, + * set d,c return 0, else -1 + * + * two consecutive devs with same basename are parallel + */ + +int fence_config_next_parallel(struct fence_config *fc) +{ + struct fence_device *prev, *next; + int d = fc->pos; + + if (d >= FENCE_CONFIG_DEVS_MAX) + return -1; + + prev = fc->dev[d]; + next = fc->dev[d+1]; + + if (!next) + return -1; + + if (same_base_name(prev, next)) { + fc->pos = d+1; + return 0; + } + return -1; +} + +/* + * if there's a dev with the next priority, + * set d,c return 0, else -1 + * + * look for another dev with a non-matching basename + */ + +int fence_config_next_priority(struct fence_config *fc) +{ + struct fence_device *prev, *next; + int d = fc->pos; + int i; + + if (d >= FENCE_CONFIG_DEVS_MAX) + return -1; + + prev = fc->dev[d]; + + for (i = d+1; i < FENCE_CONFIG_DEVS_MAX; i++) { + next = fc->dev[i]; + + if (!next) + return -1; + + if (same_base_name(prev, next)) + continue; + + fc->pos = d+1; + return 0; + } + return -1; +} + +int fence_config_agent_args(struct fence_config *fc, char *extra, char *args) +{ + struct fence_device *dev; + struct fence_connect *con; + char node[FENCE_CONFIG_NAME_MAX]; + char *p; + int n = 0; + int i, len; + + dev = fc->dev[fc->pos]; + con = fc->con[fc->pos]; + + memset(node, 0, sizeof(node)); + snprintf(node, FENCE_CONFIG_NAME_MAX-1, "node=%u\n", fc->nodeid); + len = strlen(node); + + if (dev) + len += strlen(dev->args) + 1; /* +1 for \n */ + if (con) + len += strlen(con->args) + 1; + if 
(extra) + len += strlen(extra) + 1; + + if (len > FENCE_CONFIG_ARGS_MAX - 1) + return -1; + + if (dev && dev->args[0]) { + p = dev->args; + + for (i = 0; i < strlen(dev->args); i++) { + if (*p == ' ') + args[n++] = '\n'; + else + args[n++] = *p; + p++; + } + args[n++] = '\n'; + } + + if (con && con->args[0]) { + p = con->args; + + for (i = 0; i < strlen(con->args); i++) { + if (*p == ' ') + args[n++] = '\n'; + else + args[n++] = *p; + p++; + } + args[n++] = '\n'; + } + + if (!strstr(args, "node=")) + strcat(args, node); + if (extra) + strcat(args, extra); + + return 0; +} + diff --git a/dlm_controld/fence_config.h b/dlm_controld/fence_config.h new file mode 100644 index 0000000..47e7bda --- /dev/null +++ b/dlm_controld/fence_config.h @@ -0,0 +1,62 @@ +/* + * Copyright 2012 Red Hat, Inc. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v2 or (at your option) any later version. + */ + +#ifndef _FENCE_CONFIG_H_ +#define _FENCE_CONFIG_H_ + +#define FENCE_CONFIG_DEVS_MAX 4 /* max devs per node */ +#define FENCE_CONFIG_NAME_MAX 256 /* including terminating \0 */ +#define FENCE_CONFIG_ARGS_MAX 4096 /* including terminating \0 */ + +struct fence_device { + char name[FENCE_CONFIG_NAME_MAX]; + char agent[FENCE_CONFIG_NAME_MAX]; + char args[FENCE_CONFIG_ARGS_MAX]; + int unfence; +}; + +struct fence_connect { + char name[FENCE_CONFIG_NAME_MAX]; + char args[FENCE_CONFIG_ARGS_MAX]; +}; + +/* describes fence config for one node */ + +struct fence_config { + struct fence_device *dev[FENCE_CONFIG_DEVS_MAX]; + struct fence_connect *con[FENCE_CONFIG_DEVS_MAX]; + unsigned int nodeid; + int pos; +}; + + +int fence_config_init(struct fence_config *fc, unsigned int nodeid, char *path); +void fence_config_free(struct fence_config *fc); + +/* + * Iterate through fence_config, sets pos to indicate next to try. 
+ * Based on two rules: + * + * - next_parallel is the next device with the same base name + * as the current device (base name is name preceding ":") + * + * - next_priority is the next device without the same base name + * as the current device + */ + +int fence_config_next_parallel(struct fence_config *fc); +int fence_config_next_priority(struct fence_config *fc); + +/* + * Combine dev->args and con->args, replacing ' ' with '\n'. + * Also add "node=nodeid" if "node=" does not already exist. + */ + +int fence_config_agent_args(struct fence_config *fc, char *extra, char *args); + +#endif diff --git a/dlm_controld/lib.c b/dlm_controld/lib.c index 09f84cf..228f037 100644 --- a/dlm_controld/lib.c +++ b/dlm_controld/lib.c @@ -425,3 +425,22 @@ int dlmc_deadlock_check(char *name) return rv; } +int dlmc_fence_ack(char *name) +{ + struct dlmc_header h; + int fd, rv; + + init_header(&h, DLMC_CMD_FENCE_ACK, name, 0); + + fd = do_connect(DLMC_SOCK_PATH); + if (fd < 0) { + rv = fd; + goto out; + } + + rv = do_write(fd, &h, sizeof(h)); + close(fd); + out: + return rv; +} + diff --git a/dlm_controld/libdlmcontrol.h b/dlm_controld/libdlmcontrol.h index 609372d..4034372 100644 --- a/dlm_controld/libdlmcontrol.h +++ b/dlm_controld/libdlmcontrol.h @@ -93,6 +93,7 @@ int dlmc_fs_notified(int fd, char *name, int nodeid); int dlmc_fs_result(int fd, char *name, int *type, int *nodeid, int *result); int dlmc_deadlock_check(char *name); +int dlmc_fence_ack(char *name); #endif diff --git a/dlm_controld/main.c b/dlm_controld/main.c index e0420b6..e623a6f 100644 --- a/dlm_controld/main.c +++ b/dlm_controld/main.c @@ -163,6 +163,10 @@ static void sigterm_handler(int sig) daemon_quit = 1; } +static void sigchld_handler(int sig) +{ +} + static struct lockspace *create_ls(char *name) { struct lockspace *ls; @@ -649,6 +653,10 @@ static void process_connection(int ci) } switch (h.command) { + case DLMC_CMD_FENCE_ACK: + fence_ack_node(atoi(h.name)); + break; + case DLMC_CMD_FS_REGISTER: if 
(cfgd_enable_fscontrol) { rv = fs_register_add(h.name); @@ -869,6 +877,8 @@ static void loop(void) void (*workfn) (int ci); void (*deadfn) (int ci); + setup_config(0); + rv = setup_queries(); if (rv < 0) goto out; @@ -884,13 +894,15 @@ static void loop(void) if (rv > 0) client_add(rv, process_cluster_cfg, cluster_dead); + rv = setup_node_config(); + if (rv < 0) + goto out; + rv = setup_cluster(); if (rv < 0) goto out; client_add(rv, process_cluster, cluster_dead); - setup_config(0); - rv = check_uncontrolled_lockspaces(); if (rv < 0) goto out; @@ -945,8 +957,10 @@ static void loop(void) if (rv == -1 && errno == EINTR) { if (daemon_quit && list_empty(&lockspaces)) goto out; - log_error("shutdown ignored, active lockspaces"); - daemon_quit = 0; + if (daemon_quit) { + log_error("shutdown ignored, active lockspaces"); + daemon_quit = 0; + } continue; } if (rv < 0) { @@ -977,7 +991,12 @@ static void loop(void) poll_timeout = -1; - if (poll_fencing || poll_fs) { + if (poll_fencing) { + process_fencing_changes(); + poll_timeout = 1000; + } + + if (poll_lockspaces || poll_fs) { process_lockspace_changes(); poll_timeout = 1000; } @@ -1097,7 +1116,7 @@ static void print_usage(void) printf(" -f <num> Enable (1) or disable (0) fencing recovery dependency\n"); printf(" Default is %d\n", DEFAULT_ENABLE_FENCING); printf(" -q <num> Enable (1) or disable (0) quorum recovery dependency\n"); - printf(" Default is %d\n", DEFAULT_ENABLE_QUORUM); + printf(" Default is %d\n", DEFAULT_ENABLE_QUORUM_FENCING); printf(" -s <num> Enable (1) or disable (0) fs_controld recovery coordination\n"); printf(" Default is %d\n", DEFAULT_ENABLE_FSCONTROL); #if 0 @@ -1121,7 +1140,7 @@ static void print_usage(void) printf(" -V Print program version information, then exit\n"); } -#define OPTION_STRING "LDKf:q:p:Pl:o:t:c:a:hVr:s:" +#define OPTION_STRING "LDKf:q:p:Pl:o:t:c:a:hVr:s:e:d:" static void read_arguments(int argc, char **argv) { @@ -1151,21 +1170,35 @@ static void read_arguments(int argc, char 
**argv) cfgk_protocol = atoi(optarg); break; + case 's': + optd_enable_fscontrol = 1; + cfgd_enable_fscontrol = atoi(optarg); + break; + + /* fencing options */ + case 'f': optd_enable_fencing = 1; cfgd_enable_fencing = atoi(optarg); break; case 'q': - optd_enable_quorum = 1; - cfgd_enable_quorum = atoi(optarg); + optd_enable_quorum_fencing = 1; + cfgd_enable_quorum_fencing = atoi(optarg); break; - case 's': - optd_enable_fscontrol = 1; - cfgd_enable_fscontrol = atoi(optarg); + case 'e': + optd_fence_all_agent = 1; + strcpy(fence_all_agent, optarg); break; + case 'd': + optd_startup_fence = 1; + cfgd_startup_fence = atoi(optarg); + + + /* plock options */ + case 'p': optd_enable_plock = 1; cfgd_enable_plock = atoi(optarg); @@ -1256,15 +1289,15 @@ static void set_scheduler(void) int main(int argc, char **argv) { - int fd; + struct sigaction act; + int fd, rv; cfgk_debug = -1; cfgk_timewarn = -1; cfgk_protocol = PROTO_DETECT; cfgd_debug_logfile = DEFAULT_DEBUG_LOGFILE; - cfgd_enable_fencing = DEFAULT_ENABLE_FENCING; - cfgd_enable_quorum = DEFAULT_ENABLE_QUORUM; - cfgd_enable_quorum = DEFAULT_ENABLE_FSCONTROL; + cfgd_enable_fscontrol = DEFAULT_ENABLE_FSCONTROL; + cfgd_enable_plock = DEFAULT_ENABLE_PLOCK; cfgd_plock_debug = DEFAULT_PLOCK_DEBUG; cfgd_plock_rate_limit = DEFAULT_PLOCK_RATE_LIMIT; @@ -1273,8 +1306,20 @@ int main(int argc, char **argv) cfgd_drop_resources_count = DEFAULT_DROP_RESOURCES_COUNT; cfgd_drop_resources_age = DEFAULT_DROP_RESOURCES_AGE; + cfgd_enable_fencing = DEFAULT_ENABLE_FENCING; + cfgd_enable_quorum_lockspace= DEFAULT_ENABLE_QUORUM_LOCKSPACE; + cfgd_enable_quorum_fencing = DEFAULT_ENABLE_QUORUM_FENCING; + cfgd_startup_fence = DEFAULT_STARTUP_FENCE; + + strcpy(fence_all_agent, DEFAULT_FENCE_ALL_AGENT); + memset(&fence_all_device, 0, sizeof(struct fence_device)); + strcpy(fence_all_device.name, "fence_all"); + strcpy(fence_all_device.agent, fence_all_agent); + INIT_LIST_HEAD(&lockspaces); INIT_LIST_HEAD(&fs_register_list); + + init_daemon(); 
read_arguments(argc, argv); @@ -1293,7 +1338,18 @@ int main(int argc, char **argv) log_level(NULL, LOG_INFO, "dlm_controld %s started", RELEASE_VERSION); - signal(SIGTERM, sigterm_handler); + memset(&act, 0, sizeof(act)); + act.sa_handler = sigterm_handler; + rv = sigaction(SIGTERM, &act, NULL); + if (rv < 0) + return -rv; + + memset(&act, 0, sizeof(act)); + act.sa_handler = sigchld_handler; + act.sa_flags = SA_NOCLDSTOP; + rv = sigaction(SIGCHLD, &act, NULL); + if (rv < 0) + return -rv; set_scheduler(); diff --git a/dlm_controld/member.c b/dlm_controld/member.c index 7af581c..841e5e3 100644 --- a/dlm_controld/member.c +++ b/dlm_controld/member.c @@ -9,6 +9,7 @@ #include "dlm_daemon.h" #include <corosync/corotypes.h> #include <corosync/cfg.h> +#include <corosync/cmap.h> #include <corosync/quorum.h> static corosync_cfg_handle_t ch; @@ -111,8 +112,13 @@ static void quorum_callback(quorum_handle_t h, uint32_t quorate, int i, j, num_addrs; uint64_t now = monotime(); + if (!cluster_joined_monotime) { + cluster_joined_monotime = now; + cluster_joined_walltime = time(NULL); + } + if (!cluster_quorate && quorate) - quorate_time = now; + cluster_quorate_monotime = now; cluster_quorate = quorate; cluster_ringid_seq = (uint32_t)ring_seq; @@ -174,8 +180,10 @@ void process_cluster(int ci) cs_error_t err; err = quorum_dispatch(qh, CS_DISPATCH_ALL); - if (err != CS_OK) + if (err != CS_OK) { + log_error("process_cluster quorum_dispatch %d", err); cluster_dead(0); + } } /* Force re-read of quorum nodes */ @@ -184,8 +192,10 @@ void update_cluster(void) cs_error_t err; err = quorum_dispatch(qh, CS_DISPATCH_ONE); - if (err != CS_OK) + if (err != CS_OK) { + log_error("update_cluster quorum_dispatch %d", err); cluster_dead(0); + } } int setup_cluster(void) @@ -272,8 +282,10 @@ void process_cluster_cfg(int ci) cs_error_t err; err = corosync_cfg_dispatch(ch, CS_DISPATCH_ALL); - if (err != CS_OK) + if (err != CS_OK) { + log_error("process_cluster_cfg cfg_dispatch %d", err); 
cluster_dead(0); + } } int setup_cluster_cfg(void) @@ -319,3 +331,34 @@ void close_cluster_cfg(void) corosync_cfg_finalize(ch); } +int setup_node_config(void) +{ + char key[CMAP_KEYNAME_MAXLEN+1]; + cmap_handle_t h; + cs_error_t err; + uint32_t nodeid; + int i; + + err = cmap_initialize(&h); + if (err != CS_OK) { + log_error("corosync cmap init error %d", err); + return -1; + } + + for (i = 0; i < MAX_NODES; i++) { + snprintf(key, CMAP_KEYNAME_MAXLEN, "nodelist.node.%d.nodeid", i); + + err = cmap_get_uint32(h, key, &nodeid); + if (err != CS_OK) + break; + + log_debug("node_config %d", nodeid); + + if (cfgd_enable_fencing && cfgd_startup_fence) + add_startup_node(nodeid); + } + + cmap_finalize(h); + return 0; +} + diff --git a/dlm_controld/plock.c b/dlm_controld/plock.c index b6dc0a4..ed98e9c 100644 --- a/dlm_controld/plock.c +++ b/dlm_controld/plock.c @@ -27,8 +27,6 @@ static struct timeval plock_rate_last; static int plock_device_fd = -1; -extern int message_flow_control_on; - #define RD_CONTINUE 0x00000001 struct resource_data { @@ -1480,14 +1478,6 @@ int limit_plocks(void) { struct timeval now; - /* Don't send more messages while the cpg message queue is backed up */ - - if (message_flow_control_on) { - update_flow_control_status(); - if (message_flow_control_on) - return 1; - } - if (!cfgd_plock_rate_limit || !plock_read_count) return 0; diff --git a/dlm_tool/main.c b/dlm_tool/main.c index c9a15db..c378f2b 100644 --- a/dlm_tool/main.c +++ b/dlm_tool/main.c @@ -38,6 +38,7 @@ #define OP_LOCKDUMP 8 #define OP_LOCKDEBUG 9 #define OP_LOG_PLOCK 10 +#define OP_FENCE_ACK 11 static char *prog_name; static char *lsname; @@ -182,7 +183,7 @@ static void print_usage(void) printf("\n"); printf("dlm_tool [options] [join | leave | lockdump | lockdebug |\n" " ls | dump | log_plock | plocks |\n" - " deadlock_check]\n"); + " fence_ack]\n"); printf("\n"); printf("Options:\n"); printf(" -n Show all node information in ls\n"); @@ -321,6 +322,11 @@ static void decode_arguments(int 
argc, char **argv) operation = OP_DEADLOCK_CHECK; opt_ind = optind + 1; break; + } else if (!strncmp(argv[optind], "fence_ack", 9) && + (strlen(argv[optind]) == 9)) { + operation = OP_FENCE_ACK; + opt_ind = optind + 1; + break; } else if (!strncmp(argv[optind], "dump", 4) && (strlen(argv[optind]) == 4)) { operation = OP_DUMP; @@ -1241,6 +1247,11 @@ static void do_deadlock_check(char *name) dlmc_deadlock_check(name); } +static void do_fence_ack(char *name) +{ + dlmc_fence_ack(name); +} + static void do_plocks(char *name) { char buf[DLMC_DUMP_SIZE]; @@ -1335,6 +1346,10 @@ int main(int argc, char **argv) case OP_LOCKDEBUG: do_lockdebug(lsname); break; + + case OP_FENCE_ACK: + do_fence_ack(lsname); + break; } return 0; } diff --git a/fence/Makefile b/fence/Makefile new file mode 100644 index 0000000..c6ff2c0 --- /dev/null +++ b/fence/Makefile @@ -0,0 +1,56 @@ +DESTDIR= +PREFIX=/usr +BINDIR=$(PREFIX)/sbin +#MANDIR=$(PREFIX)/share/man + +BIN_TARGET = dlm_stonith +#MAN_TARGET = dlm_stonith.8 + +BIN_SOURCE = stonith_helper.c + +BIN_CFLAGS += -D_GNU_SOURCE -g \ + -Wall \ + -Wformat \ + -Wformat-security \ + -Wmissing-prototypes \ + -Wnested-externs \ + -Wpointer-arith \ + -Wextra -Wshadow \ + -Wcast-align \ + -Wwrite-strings \ + -Waggregate-return \ + -Wstrict-prototypes \ + -Winline \ + -Wredundant-decls \ + -Wno-sign-compare \ + -Wno-unused-parameter \ + -Wp,-D_FORTIFY_SOURCE=2 \ + -fexceptions \ + -fasynchronous-unwind-tables \ + -fdiagnostics-show-option \ + +BIN_CFLAGS += -fPIE -DPIE +BIN_CFLAGS += `xml2-config --cflags` +BIN_CFLAGS += -I../include + +BIN_LDFLAGS += -Wl,-z,now -Wl,-z,relro -pie +BIN_LDFLAGS += `xml2-config --libs` + +all: $(BIN_TARGET) + +$(BIN_TARGET): $(BIN_SOURCE) + $(CC) $(BIN_CFLAGS) $(BIN_LDFLAGS) $(BIN_SOURCE) -o $@ -L. 
+ +clean: + rm -f *.o *.so *.so.* $(BIN_TARGET) + + +INSTALL=$(shell which install) + +.PHONY: install +install: all + $(INSTALL) -d $(DESTDIR)/$(BINDIR) + $(INSTALL) -d $(DESTDIR)/$(MANDIR)/man8 + $(INSTALL) -c -m 755 $(BIN_TARGET) $(DESTDIR)/$(BINDIR) +# $(INSTALL) -m 644 $(MAN_TARGET) $(DESTDIR)/$(MANDIR)/man8/ + diff --git a/fence/stonith_helper.c b/fence/stonith_helper.c new file mode 100644 index 0000000..df44219 --- /dev/null +++ b/fence/stonith_helper.c @@ -0,0 +1,96 @@ +/* + * Copyright 2012 Red Hat, Inc. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v2 or (at your option) any later version. + */ + +#include <stdio.h> +#include <stdint.h> +#include <string.h> +#include <unistd.h> +#include <syslog.h> +#include <pacemaker/crm/stonith-ng.h> + +int nodeid; +uint64_t fail_time; + +#define MAX_ARG_LEN 1024 + +static int get_options(int argc, char *argv[]) +{ + char arg[MAX_ARG_LEN]; + char key[MAX_ARG_LEN]; + char val[MAX_ARG_LEN]; + char c; + int rv; + + if (argc > 1) { + while ((c = getopt(argc, argv, "n:t:")) != -1) { + switch (c) { + case 'n': + nodeid = atoi(optarg); + break; + case 't': + fail_time = strtoull(optarg, NULL, 0); + break; + } + } + } else { + while (fgets(arg, sizeof(arg), stdin)) { + rv = sscanf(arg, "%[^=]=%s\n", key, val); + if (rv != 2) + continue; + + if (!strcmp(key, "node")) + nodeid = atoi(val); + else if (!strcmp(key, "fail_time")) + fail_time = strtoull(val, NULL, 0); + } + } + + if (!nodeid) { + fprintf(stderr, "no node\n"); + return -1; + } + + if (!fail_time) { + fprintf(stderr, "no fail_time\n"); + return -1; + } + + return 0; +} + +int main(int argc, char *argv[]) +{ + uint64_t t; + int rv; + + rv = get_options(argc, argv); + if (rv) + return rv; + + t = stonith_api_time_helper(nodeid, 0); + if (t >= fail_time) + return 0; + + rv = stonith_api_kick_helper(nodeid, 300, 1); + if (rv) { + 
fprintf(stderr, "kick_helper error %d nodeid %d\n", rv, nodeid); + openlog("stonith_helper", LOG_CONS | LOG_PID, LOG_DAEMON); + syslog(LOG_ERR, "kick_helper error %d nodeid %d\n", rv, nodeid); + return rv; + } + + while (1) { + t = stonith_api_time_helper(nodeid, 0); + if (t >= fail_time) + return 0; + sleep(1); + } + + return -1; +} +

12 years, 1 month

1
0
0 / 0

fence-agents: master - fence agents: Using "delay" option can end with timeout problems

by Marek Grác

Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=... Commit: b5fe319735a628d153c09098cea0f90df8b62ead Parent: 9fb1a85034eb8f94a766ba9766a75924947f5e77 Author: Marek 'marx' Grac <mgrac(a)redhat.com> AuthorDate: Mon Mar 19 16:45:23 2012 +0100 Committer: Marek 'marx' Grac <mgrac(a)redhat.com> CommitterDate: Mon Mar 19 16:45:23 2012 +0100 fence agents: Using "delay" option can ends with timeout problems Resolves: rhbz#804169 --- fence/agents/lib/fencing.py.py | 7 +++++-- 1 files changed, 5 insertions(+), 2 deletions(-) diff --git a/fence/agents/lib/fencing.py.py b/fence/agents/lib/fencing.py.py index e93f59a..02a01be 100644 --- a/fence/agents/lib/fencing.py.py +++ b/fence/agents/lib/fencing.py.py @@ -787,8 +787,6 @@ def fence_action(tn, options, set_power_fn, get_power_fn, get_outlet_list = None print o + options["-C"] + alias return - if options["-o"] in ["off", "reboot"]: - time.sleep(int(options["-f"])) status = get_power_fn(tn, options) if status != "on" and status != "off": @@ -871,6 +869,11 @@ def fence_login(options): else: login_eol = "\r\n" + ## Do the delay of the fence device before logging in + ## Delay is important for two-node clusters fencing but we do not need to delay 'status' operations + if options["-o"] in ["off", "reboot"]: + time.sleep(int(options["-f"])) + try: re_login = re.compile("(login\s*: )|(Login Name: )|(username: )|(User Name :)", re.IGNORECASE) re_pass = re.compile("(password)|(pass phrase)", re.IGNORECASE)

12 years, 1 month

1
0
0 / 0

cluster: RHEL6 - config: drastically improve cman RRP configuration handling

by Fabio M. Di Nitto

Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=... Commit: a59ecbb74af3e4385497ea84bfaa1219f6a5cab6 Parent: e48f97cd32ecb5b595449fd946226e47f69880d3 Author: Fabio M. Di Nitto <fdinitto(a)redhat.com> AuthorDate: Wed Nov 30 12:57:30 2011 +0100 Committer: Fabio M. Di Nitto <fdinitto(a)redhat.com> CommitterDate: Wed Mar 7 14:51:34 2012 +0100 config: drastically improve cman RRP configuration handling - don't allow configuration of more than 2 rings - allow overrided of alternate mcast address and port via envars - when using broadcast, set different ports on second ring. this also required a substantial change in transport handling - add support for <cman> <multicast addr= port= ttl=/> <altmulticast addr= port= ttl=/> </cman> - don't allow overlap of addresses/ports - remove redundant port settings in cman - change relaxng schema to reflect above changes Resolves: rhbz#733298 Reviewed-by: Jan Friesse <jfriesse(a)redhat.com> Signed-off-by: Fabio M. Di Nitto <fdinitto(a)redhat.com> --- cman/daemon/cman-preconfig.c | 214 ++++++++++++++++++++++------------ config/tools/xml/cluster.rng.in.head | 64 +++++++--- 2 files changed, 182 insertions(+), 96 deletions(-) diff --git a/cman/daemon/cman-preconfig.c b/cman/daemon/cman-preconfig.c index 13949ec..c8f69e5 100644 --- a/cman/daemon/cman-preconfig.c +++ b/cman/daemon/cman-preconfig.c @@ -50,14 +50,19 @@ static char nodename[MAX_CLUSTER_MEMBER_NAME_LEN]; static int nodeid; static int two_node; static unsigned int disable_openais; -static unsigned int portnum; static int num_nodenames; static char *key_filename=NULL; -static char *mcast_name; static char *cluster_name; static char error_reason[1024] = { '\0' }; static hdb_handle_t cluster_parent_handle; static int use_hashed_cluster_id = 1; +static unsigned int portnum = 0; +static unsigned int altportnum = 0; +static char *mcast_name = NULL; +static char *altmcast_name = NULL; +static unsigned int ttl = 1; +static unsigned int altttl = 1; + /* * 
Exports the interface for the service @@ -292,7 +297,7 @@ static int add_udpu_members(struct objdb_iface_ver0 *objdb, hdb_handle_t interfa #define PRIMARY_IFACE 0 #define ALT_IFACE 1 -static int add_ifaddr(struct objdb_iface_ver0 *objdb, char *mcast, char *ifaddr, int port, int ttl, int altiface, enum tx_mech transport) +static int add_ifaddr(struct objdb_iface_ver0 *objdb, char *mcast, char *ifaddr, int port, int intttl, int altiface, enum tx_mech transport) { hdb_handle_t totem_object_handle; hdb_handle_t find_handle; @@ -308,6 +313,11 @@ static int add_ifaddr(struct objdb_iface_ver0 *objdb, char *mcast, char *ifaddr, [TX_MECH_RDMA] = "iba", }; + if (num_interfaces >= 2) { + snprintf(error_reason, sizeof(error_reason) - 1, "Configuration of more than 2 rings is not supported"); + return -1; + } + /* Check the families match */ if (address_family(mcast, &mcast_addr, 0) != address_family(ifaddr, &if_addr, mcast_addr.ss_family)) { @@ -379,14 +389,14 @@ static int add_ifaddr(struct objdb_iface_ver0 *objdb, char *mcast, char *ifaddr, tmp, strlen(tmp)+1, OBJDB_VALUETYPE_STRING); /* paranoia check. 
corosync already does it */ - if ((ttl < 0) || (ttl > 255)) { - sprintf(error_reason, "TTL value (%u) out of range (0 - 255)", ttl); + if ((intttl < 0) || (intttl > 255)) { + sprintf(error_reason, "TTL value (%u) out of range (0 - 255)", intttl); return -1; } /* add the key to the objdb only if value is not default */ - if (ttl != 1) { - sprintf(tmp, "%d", ttl); + if (intttl != 1) { + sprintf(tmp, "%d", intttl); objdb->object_key_create_typed(interface_object_handle, "ttl", tmp, strlen(tmp)+1, OBJDB_VALUETYPE_STRING); } @@ -433,11 +443,11 @@ static char *default_mcast(char *node, int altiface) if (family == AF_INET) { snprintf(addr, sizeof(addr), "239.192.%d.%d", clusterid >> 8, clusterid % 0xFF); - return addr; + return strdup(addr); } if (family == AF_INET6) { snprintf(addr, sizeof(addr), "ff15::%x", clusterid); - return addr; + return strdup(addr); } return NULL; @@ -583,6 +593,11 @@ static int get_env_overrides(void) portnum = atoi(getenv("CMAN_IP_PORT")); } + /* optional alternate port */ + if (getenv("CMAN_IP_ALTPORT")) { + altportnum = atoi(getenv("CMAN_IP_ALTPORT")); + } + /* optional security key filename */ if (getenv("CMAN_KEYFILE")) { key_filename = strdup(getenv("CMAN_KEYFILE")); @@ -606,6 +621,10 @@ static int get_env_overrides(void) mcast_name = getenv("CMAN_MCAST_ADDR"); } + if (getenv("CMAN_ALTMCAST_ADDR")) { + altmcast_name = getenv("CMAN_ALTMCAST_ADDR"); + } + if (getenv("CMAN_2NODE")) { two_node = 1; expected_votes = 1; @@ -630,11 +649,15 @@ static int get_nodename(struct objdb_iface_ver0 *objdb) hdb_handle_t object_handle; hdb_handle_t find_handle; hdb_handle_t node_object_handle; + hdb_handle_t mcast_handle; hdb_handle_t alt_object; enum tx_mech transport = TX_MECH_UDP; char *str; int error; - unsigned int ttl = 1; + unsigned int mcast_portnum = DEFAULT_PORT; + unsigned int altmcast_portnum = DEFAULT_PORT; + char *altmcast_name_tmp = NULL; + int broadcast = 0; if (!getenv("CMAN_NOCONFIG")) { /* our nodename */ @@ -683,7 +706,7 @@ static int 
get_nodename(struct objdb_iface_ver0 *objdb) } /* Add <cman> bits to pass down to the main module*/ - if ( (node_object_handle = nodelist_byname(objdb, cluster_parent_handle, nodename))) { + if ((node_object_handle = nodelist_byname(objdb, cluster_parent_handle, nodename))) { if (objdb_get_string(objdb, node_object_handle, "nodeid", &nodeid_str)) { sprintf(error_reason, "This node has no nodeid in cluster.conf"); write_cman_pipe("This node has no nodeid in cluster.conf"); @@ -692,84 +715,105 @@ static int get_nodename(struct objdb_iface_ver0 *objdb) } objdb->object_find_create(cluster_parent_handle, "cman", strlen("cman"), &find_handle); - - if (objdb->object_find_next(find_handle, &object_handle) == 0) { - - hdb_handle_t mcast_handle; - hdb_handle_t find_handle2; - - if (!mcast_name) { - - objdb->object_find_create(object_handle, "multicast", strlen("multicast"), &find_handle2); - if (objdb->object_find_next(find_handle2, &mcast_handle) == 0) { - - objdb_get_string(objdb, mcast_handle, "addr", &mcast_name); - objdb_get_int(objdb, mcast_handle, "ttl", &ttl, ttl); - } - objdb->object_find_destroy(find_handle2); - } - - if (!mcast_name) { - mcast_name = default_mcast(nodename, PRIMARY_IFACE); - - } - if (!mcast_name) - return -1; - - objdb->object_key_create_typed(object_handle, "nodename", - nodename, strlen(nodename)+1, OBJDB_VALUETYPE_STRING); + if (objdb->object_find_next(find_handle, &object_handle)) { + sprintf(error_reason, "Unable to find cman in config db"); + write_cman_pipe(error_reason); + return -1; } objdb->object_find_destroy(find_handle); - nodeid = atoi(nodeid_str); - error = 0; - - /* optional port */ - if (!portnum) { - objdb_get_int(objdb, object_handle, "port", &portnum, DEFAULT_PORT); - } - /* Check for broadcast */ if (!objdb_get_string(objdb, object_handle, "broadcast", &str)) { if (strcmp(str, "yes") == 0) { - mcast_name = strdup("255.255.255.255"); - if (!mcast_name) - return -1; + broadcast = 1; transport = TX_MECH_UDPB; } } /* Check for 
transport */ if (!objdb_get_string(objdb, object_handle, "transport", &str)) { + if ((broadcast) && (strcmp(str, "udpb"))) { + sprintf(error_reason, "Transport and broadcast option are mutually exclusive"); + write_cman_pipe(error_reason); + return -1; + } if (strcmp(str, "udp") == 0) { - if (transport != TX_MECH_UDPB) { - transport = TX_MECH_UDP; - } + transport = TX_MECH_UDP; } else if (strcmp(str, "udpb") == 0) { + broadcast = 1; transport = TX_MECH_UDPB; } else if (strcmp(str, "udpu") == 0) { - if (transport != TX_MECH_UDPB) { - transport = TX_MECH_UDPU; - } else { - sprintf(error_reason, "Transport and broadcast option are mutually exclusive"); - write_cman_pipe("Transport and broadcast option are mutually exclusive"); - return -1; - } + transport = TX_MECH_UDPU; } else if (strcmp(str, "rdma") == 0) { - if (transport != TX_MECH_UDPB) { - transport = TX_MECH_RDMA; - } else { - sprintf(error_reason, "Transport and broadcast option are mutually exclusive"); - write_cman_pipe("Transport and broadcast option are mutually exclusive"); - return -1; - } + transport = TX_MECH_RDMA; } else { sprintf(error_reason, "Transport option value can be one of udp, udpb, udpu, rdma"); - write_cman_pipe("Transport option value can be one of udp, udpb, udpu, rdma"); + write_cman_pipe(error_reason); return -1; } } + if (broadcast) { + mcast_name = strdup("255.255.255.255"); + if (!mcast_name) { + sprintf(error_reason, "Unable to set mcast_name"); + write_cman_pipe(error_reason); + return -1; + } + altmcast_name = strdup("255.255.255.255"); + if (!altmcast_name) { + sprintf(error_reason, "Unable to set altmcast_name"); + write_cman_pipe(error_reason); + return -1; + } + altmcast_portnum = DEFAULT_PORT + 2; + } + + objdb->object_find_create(object_handle, "multicast", strlen("multicast"), &find_handle); + if (objdb->object_find_next(find_handle, &mcast_handle) == 0) { + if (!mcast_name) + objdb_get_string(objdb, mcast_handle, "addr", &mcast_name); + objdb_get_int(objdb, mcast_handle, 
"ttl", &ttl, ttl); + objdb_get_int(objdb, mcast_handle, "port", &mcast_portnum, DEFAULT_PORT); + } + objdb->object_find_destroy(find_handle); + + if (!mcast_name) { + mcast_name = default_mcast(nodename, PRIMARY_IFACE); + } + + if (!mcast_name) { + sprintf(error_reason, "Unable to set mcast_name"); + write_cman_pipe(error_reason); + return -1; + } + + objdb->object_find_create(object_handle, "altmulticast", strlen("altmulticast"), &find_handle); + if (objdb->object_find_next(find_handle, &mcast_handle) == 0) { + objdb_get_string(objdb, mcast_handle, "addr", &altmcast_name_tmp); + objdb_get_int(objdb, mcast_handle, "ttl", &altttl, ttl); + if (!broadcast) { + objdb_get_int(objdb, mcast_handle, "port", &altmcast_portnum, DEFAULT_PORT); + } else { + objdb_get_int(objdb, mcast_handle, "port", &altmcast_portnum, DEFAULT_PORT + 2); + } + } + objdb->object_find_destroy(find_handle); + + /* See if the user wants our default set of openais services (default=yes) */ + objdb_get_int(objdb, object_handle, "disable_openais", &disable_openais, 0); + + objdb->object_key_create_typed(object_handle, "nodename", + nodename, strlen(nodename)+1, OBJDB_VALUETYPE_STRING); + + nodeid = atoi(nodeid_str); + error = 0; + + /* optional port */ + if (!portnum) { + objdb_get_int(objdb, object_handle, "port", &portnum, mcast_portnum); + } + if (add_ifaddr(objdb, mcast_name, nodename, portnum, ttl, PRIMARY_IFACE, transport)) { write_cman_pipe(error_reason); @@ -780,24 +824,43 @@ static int get_nodename(struct objdb_iface_ver0 *objdb) num_nodenames = 1; objdb->object_find_create(node_object_handle,"altname", strlen("altname"), &find_handle); while (objdb->object_find_next(find_handle, &alt_object) == 0) { - unsigned int port; - unsigned int altttl = 1; char *node; - char *mcast; if (objdb_get_string(objdb, alt_object, "name", &node)) { continue; } - objdb_get_int(objdb, alt_object, "port", &port, portnum); + objdb_get_int(objdb, alt_object, "port", &altportnum, altmcast_portnum); - 
objdb_get_int(objdb, alt_object, "ttl", &altttl, ttl); + objdb_get_int(objdb, alt_object, "ttl", &altttl, altttl); - if (objdb_get_string(objdb, alt_object, "mcast", &mcast)) { - mcast = default_mcast(nodename, ALT_IFACE); + if (!altmcast_name) { + if (objdb_get_string(objdb, alt_object, "mcast", &altmcast_name)) { + if (altmcast_name_tmp) { + altmcast_name = altmcast_name_tmp; + } else { + altmcast_name = default_mcast(nodename, ALT_IFACE); + } + } + } + + if (!altmcast_name) { + sprintf(error_reason, "Unable to determine alternate multicast name"); + write_cman_pipe(error_reason); + return -1; } - if (add_ifaddr(objdb, mcast, node, portnum, altttl, + if (!strcmp(altmcast_name, mcast_name) && + ((altportnum == portnum) || (altportnum == portnum - 1) || (portnum == altportnum - 1))) { + sprintf(error_reason, "Alternate communication channel (mcast: %s ports: %d,%d) cannot use\n" + "same address and ports of primary channel (mcast: %s ports: %d,%d)", + altmcast_name, altportnum, altportnum - 1, + mcast_name, portnum, portnum - 1); + write_cman_pipe(error_reason); + return -1; + } + + if (add_ifaddr(objdb, altmcast_name, node, altportnum, altttl, ALT_IFACE, transport)) { write_cman_pipe(error_reason); return -1; @@ -1326,9 +1389,6 @@ static int get_cman_globals(struct objdb_iface_ver0 *objdb) /* Get the <cman> bits that override <totem> bits */ objdb->object_find_create(cluster_parent_handle, "cman", strlen("cman"), &find_handle); if (objdb->object_find_next(find_handle, &object_handle) == 0) { - if (!portnum) - objdb_get_int(objdb, object_handle, "port", &portnum, DEFAULT_PORT); - if (!key_filename) objdb_get_string(objdb, object_handle, "keyfile", &key_filename); diff --git a/config/tools/xml/cluster.rng.in.head b/config/tools/xml/cluster.rng.in.head index 11d5052..a669c98 100644 --- a/config/tools/xml/cluster.rng.in.head +++ b/config/tools/xml/cluster.rng.in.head @@ -144,26 +144,25 @@ To validate your cluster.conf against this schema, run: </optional> <optional> 
<element name="multicast" rha:description="The multicast element - provides the ability for a user to specify a multicast address - instead of using the multicast address generated by cman. If + provides the ability for a user to specify a multicast address, + port and TTL (Time To Live) instead of using cman defaults. If a user does not specify a multicast address, cman creates one. It - forms the upper 16 bits of the multicast address with 239.192 and - forms the lower 16 bits based on the cluster ID."> - <optional> - <attribute name="addr" rha:description="A multicast address specified - by a user. If you do specify a multicast address, you should - use the 239.192.x.x series that cman uses. Otherwise, using a - multicast address outside that range may cause unpredictable - results. For example, using 224.0.0.x (All hosts on the network) - may not be routed correctly, or even routed at all by some - hardware." rha:sample="239.192.0.1"/> - </optional> - <optional> - <attribute name="ttl" rha:description="Define the TTL (time to live) of - a multicast packets. Useful only if nodes are on different subnets and - a multicast router is available in between." rha:default="1" - rha:sample="24"/> - </optional> + forms the upper 16 bits of the multicast address with 239.192 for + IPv4 and ff15:: for IPv6 and forms the lower 16 bits based on + the cluster ID."> + <ref name="MULTICASTOPTS"/> + </element> + </optional> + <optional> + <element name="altmulticast" rha:description="The altmulticast element + provides the ability for a user to specify a multicast address, + port and TTL (Time To Live) instead of using cman defaults for + the alternate communication channel (RRP). If a user does not + specify an alternate multicast address, cman creates one. 
It + forms the upper 16 bits of the multicast address with 239.192 for + IPv4 and ff15:: for IPv6 and forms the lower 16 bits based on + the cluster ID + 1."> + <ref name="MULTICASTOPTS"/> </element> </optional> </element> @@ -1013,6 +1012,33 @@ To validate your cluster.conf against this schema, run: </element>  </start> + + + <define name="MULTICASTOPTS"> + <optional> + <attribute name="addr" rha:description="A multicast address specified + by a user. If you do specify a multicast address, you should + use the 239.192.x.x series that cman uses. Otherwise, using a + multicast address outside that range may cause unpredictable + results. For example, using 224.0.0.x (All hosts on the network) + may not be routed correctly, or even routed at all by some + hardware." rha:sample="239.192.0.1"/> + </optional> + <optional> + <attribute name="ttl" rha:description="Define the TTL (time to live) of + a multicast packets. Useful only if nodes are on different subnets and + a multicast router is available in between." rha:default="1" + rha:sample="24"/> + </optional> + <optional> + <attribute name="port"> + <data type="nonNegativeInteger"/> + </attribute> + </optional> + </define> + + +  <define name="ALTNAME">

12 years, 2 months

1
0
0 / 0

cluster: RHEL6 - config: make altname validation position independent

by Fabio M. Di Nitto

Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=... Commit: e48f97cd32ecb5b595449fd946226e47f69880d3 Parent: d2fdb9c524b53203effa7d4150b5ee20b6cbe8cb Author: Fabio M. Di Nitto <fdinitto(a)redhat.com> AuthorDate: Mon Nov 28 10:47:56 2011 +0100 Committer: Fabio M. Di Nitto <fdinitto(a)redhat.com> CommitterDate: Wed Mar 7 14:48:19 2012 +0100 config: make altname validation position indipendent Resolves: rhbz#740552 Signed-off-by: Lon Hohberger <lhh(a)redhat.com> Signed-off-by: Fabio M. Di Nitto <fdinitto(a)redhat.com> --- config/tools/xml/cluster.rng.in.head | 55 +++++++++++++++++++--------------- 1 files changed, 31 insertions(+), 24 deletions(-) diff --git a/config/tools/xml/cluster.rng.in.head b/config/tools/xml/cluster.rng.in.head index e6e4633..11d5052 100644 --- a/config/tools/xml/cluster.rng.in.head +++ b/config/tools/xml/cluster.rng.in.head @@ -785,32 +785,11 @@ To validate your cluster.conf against this schema, run: dlm_controld(8)"/> </optional> - <optional> - <element name="altname" rha:description="Defines a second network - interface to use for corosync redundant ring mode. cman(5)"> - - <attribute name="name" rha:description="A second hostname or IP - address of the node. cman(5)"/> - - <optional> - <attribute name="port" rha:description="The network port to use - on the second interface. cman(5)"/> - </optional> - - <optional> - <attribute name="mcast" rha:description="The multicast address - to use on the second interface. cman(5)"/> - </optional> - - <optional> - <attribute name="ttl" rha:description="The multicast TTL - to use on the second interface. 
cman(5)"/> - </optional> - </element> - </optional> - <interleave> <optional> + <ref name="ALTNAME"/> + </optional> + <optional> <ref name="FENCE"/> </optional> <optional> @@ -1034,6 +1013,34 @@ To validate your cluster.conf against this schema, run: </element>  </start> + + + <define name="ALTNAME"> + <element name="altname" rha:description="Defines a second network + interface to use for corosync redundant ring mode. cman(5)"> + + <attribute name="name" rha:description="A second hostname or IP + address of the node. cman(5)"/> + + <optional> + <attribute name="port" rha:description="The network port to use + on the second interface. cman(5)"/> + </optional> + + <optional> + <attribute name="mcast" rha:description="The multicast address + to use on the second interface. cman(5)"/> + </optional> + + <optional> + <attribute name="ttl" rha:description="The multicast TTL + to use on the second interface. cman(5)"/> + </optional> + </element> + </define> + + +  <define name="FENCE">

12 years, 2 months

1
0
0 / 0

cluster: RHEL6 - qdisk: Fix man page example

by Fabio M. Di Nitto

Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=... Commit: d2fdb9c524b53203effa7d4150b5ee20b6cbe8cb Parent: 70b15e7afa730166ef0a3c81948d31ab1dd5aa93 Author: Lon Hohberger <lhh(a)redhat.com> AuthorDate: Wed Oct 12 11:23:28 2011 -0400 Committer: Fabio M. Di Nitto <fdinitto(a)redhat.com> CommitterDate: Wed Mar 7 14:46:31 2012 +0100 qdisk: Fix man page example Resolves: rhbz#745538 Signed-off-by: Fabio M. Di Nitto <fdinitto(a)redhat.com> Signed-off-by: Lon Hohberger <lhh(a)redhat.com> --- cman/man/qdisk.5 | 4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cman/man/qdisk.5 b/cman/man/qdisk.5 index 4070f48..e0b0ff6 100644 --- a/cman/man/qdisk.5 +++ b/cman/man/qdisk.5 @@ -1,4 +1,4 @@ -.TH "QDisk" "5" "20 Feb 2007" "" "Cluster Quorum Disk" +.TH "QDisk" "5" "12 Oct 2011" "" "Cluster Quorum Disk" .SH "NAME" qdisk \- a disk-based quorum daemon for CMAN / Linux-Cluster .SH "1. Overview" @@ -502,7 +502,7 @@ by the qdiskd timeout. .br <quorumd interval="1" tko="10" votes="1" label="testing"> .in 12 -<heuristic program="ping A -c1 -t1" score="1" interval="2" tko="3"/> +<heuristic program="ping A -c1 -w1" score="1" interval="2" tko="3"/> .br .in 8 </quorumd>

12 years, 2 months

1
0
0 / 0

cluster: STABLE32 - dlm_controld: fix handling of startup partition merge

by David Teigland

Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=... Commit: 6815deffc9f90106f21d049bd694a6592d935597 Parent: 2d3d65ac704589f3e391486dd08f1c7a7b93a731 Author: David Teigland <teigland(a)redhat.com> AuthorDate: Mon Oct 31 13:04:43 2011 -0500 Committer: David Teigland <teigland(a)redhat.com> CommitterDate: Thu Mar 1 16:14:31 2012 -0600 dlm_controld: fix handling of startup partition merge Nack messages are not matched correctly with previous change structs. Normal start messages should not be matched with changes that preceeded the node's join time, but nack messages should, so the match_change() function needs to distinguish. This can occur when two nodes are joining a lockspace, but neither has completed when a cluster partition+merge happens. bz 750314 Signed-off-by: David Teigland <teigland(a)redhat.com> --- group/dlm_controld/action.c | 6 +++++- group/dlm_controld/cpg.c | 9 ++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/group/dlm_controld/action.c b/group/dlm_controld/action.c index 0570a2f..56401a3 100644 --- a/group/dlm_controld/action.c +++ b/group/dlm_controld/action.c @@ -424,7 +424,11 @@ int set_configfs_members(char *name, int new_count, int *new_members, if (rv) { log_error("%s: renew rmdir failed: %d", path, errno); - goto out; + + /* don't quit here, there's a case where + * this can happen, where a node identified + * for renewal was not really added + * previously */ } } diff --git a/group/dlm_controld/cpg.c b/group/dlm_controld/cpg.c index 12cb202..ffb55b4 100644 --- a/group/dlm_controld/cpg.c +++ b/group/dlm_controld/cpg.c @@ -953,7 +953,14 @@ static int match_change(struct lockspace *ls, struct change *cg, "cluster add %llu", hd->nodeid, seq, cg->seq, (unsigned long long)cg->create_time, (unsigned long long)node->cluster_add_time); - return 0; + + /* nacks can apply to older cg's */ + if (!(hd->flags & DLM_MFLG_NACK)) { + return 0; + } else { + log_group(ls, "match_change %d:%u unskip cg %u 
for nack", + hd->nodeid, seq, cg->seq); + } } /* verify this is the right change by matching the counts

12 years, 2 months

1
0
0 / 0

cluster: STABLE32 - fenced: fix handling of startup partition merge

by David Teigland

Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=... Commit: 2d3d65ac704589f3e391486dd08f1c7a7b93a731 Parent: c7d3938a3856f9cb295dc6aed8b7f86762cbed7c Author: David Teigland <teigland(a)redhat.com> AuthorDate: Mon Oct 31 12:06:41 2011 -0500 Committer: David Teigland <teigland(a)redhat.com> CommitterDate: Thu Mar 1 16:14:18 2012 -0600 fenced: fix handling of startup partition merge The victims created on each side of a partition are cleared after a merge in the receive_complete function, which is meant to clear "initial victims". Clearing the victims is the correct end result, but the code arrives there through an unintended shortcut. Change the code so it clears the victims in a "more correct", and probably safer way: 2 is blocked doing startup fencing 1 joins fence domain partition between 1,2 1 sees confchg for partition, adds victim 2 (and sets init_victim per this patch, since 1 hasn't finished a start cycle yet) partition removed 2 completes startup fencing 2 sees confchg for partition, adds victim 1 1 sees confchg for merge, adds node 2 2 sees confchg for merge, adds node 1 1 processing the merge confchg 2 reduces victim 1 from partition, since 1 had no state (had not yet completed a start cycle) 2 processing the merge confchg 1,2 finish start cycle for merge confchg 2 sends complete for merge confchg 1 clears victim 2 in receive_complete because it set init_victim bz 750314 Signed-off-by: David Teigland <teigland(a)redhat.com> --- fence/fenced/cpg.c | 36 ++++++++++++++++++++++++++++++++++-- 1 files changed, 34 insertions(+), 2 deletions(-) diff --git a/fence/fenced/cpg.c b/fence/fenced/cpg.c index 99e16a0..6634f8c 100644 --- a/fence/fenced/cpg.c +++ b/fence/fenced/cpg.c @@ -1132,8 +1132,11 @@ static void receive_complete(struct fd *fd, struct fd_header *hd, int len) list_for_each_entry_safe(node, safe, &fd->victims, list) { log_debug("receive_complete clear victim nodeid %d init %d", node->nodeid, node->init_victim); - 
list_del(&node->list); - free(node); + + if (node->init_victim) { + list_del(&node->list); + free(node); + } } } @@ -1319,6 +1322,35 @@ static void add_victims(struct fd *fd, struct change *cg) return; list_add(&node->list, &fd->victims); log_debug("add_victims node %d", node->nodeid); + + /* + * If we haven't completed a start cycle yet, set + * init_victim on any failed node so that receive_complete + * will clear it. This is a hack for one specific scenario: + * + * - node 2 joins domain, blocks in startup fencing + * - node 1 joins domain, waiting for messages in start cycle + * - partition between 1,2 + * - 1 adds victim 2 + * (and sets init_victim below since 1 hasn't completed + * a start cycle yet) + * - partition removed + * - node 2 completes startup fencing + * - 2 gets confchg for partition + * - 2 adds victim 1 (due to partition) + * - 2 gets confchg for merge + * - 2 does join for 1 (due to merge), begins start cycle + * - start cycle adding node 1 finishes, 2 sends complete + * - 2 reduces victim 1 + * - 1 receives complete for its join start cycle, + * and clears victim 2 because we've set init_victim here + */ + + if (!fd->started_count) { + log_debug("add_victims node %d set init_victim", + node->nodeid); + node->init_victim = 1; + } } }

12 years, 2 months

1
0
0 / 0

cluster: RHEL6 - dlm_controld: fix handling of startup partition merge

by David Teigland

Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=... Commit: 70b15e7afa730166ef0a3c81948d31ab1dd5aa93 Parent: 5457043e975ba4c44c17138b3751150b974aa35c Author: David Teigland <teigland(a)redhat.com> AuthorDate: Mon Oct 31 13:04:43 2011 -0500 Committer: David Teigland <teigland(a)redhat.com> CommitterDate: Thu Mar 1 16:07:29 2012 -0600 dlm_controld: fix handling of startup partition merge Nack messages are not matched correctly with previous change structs. Normal start messages should not be matched with changes that preceeded the node's join time, but nack messages should, so the match_change() function needs to distinguish. This can occur when two nodes are joining a lockspace, but neither has completed when a cluster partition+merge happens. bz 750314 Signed-off-by: David Teigland <teigland(a)redhat.com> --- group/dlm_controld/action.c | 6 +++++- group/dlm_controld/cpg.c | 9 ++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/group/dlm_controld/action.c b/group/dlm_controld/action.c index 0570a2f..56401a3 100644 --- a/group/dlm_controld/action.c +++ b/group/dlm_controld/action.c @@ -424,7 +424,11 @@ int set_configfs_members(char *name, int new_count, int *new_members, if (rv) { log_error("%s: renew rmdir failed: %d", path, errno); - goto out; + + /* don't quit here, there's a case where + * this can happen, where a node identified + * for renewal was not really added + * previously */ } } diff --git a/group/dlm_controld/cpg.c b/group/dlm_controld/cpg.c index 9b0d223..4463f4d 100644 --- a/group/dlm_controld/cpg.c +++ b/group/dlm_controld/cpg.c @@ -952,7 +952,14 @@ static int match_change(struct lockspace *ls, struct change *cg, "cluster add %llu", hd->nodeid, seq, cg->seq, (unsigned long long)cg->create_time, (unsigned long long)node->cluster_add_time); - return 0; + + /* nacks can apply to older cg's */ + if (!(hd->flags & DLM_MFLG_NACK)) { + return 0; + } else { + log_group(ls, "match_change %d:%u unskip cg %u 
for nack", + hd->nodeid, seq, cg->seq); + } } /* verify this is the right change by matching the counts

12 years, 2 months

1
0
0 / 0

2024

2023

2022

2021

2020

2019

2018

2017

2016

2015

2014

2013

2012

2011

2010

2009

cluster-commits March 2012