cluster: STABLE3 - gfs_controld: set last_plock_time for ownership operations
by David Teigland
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=...
Commit: f6994d1db1688140ae011c733efdbaa0af6808fb
Parent: 41a8154dbe97608a45bcd80666b7b5f74527a5f9
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Wed Mar 31 15:51:27 2010 -0500
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Wed Mar 31 15:53:27 2010 -0500
gfs_controld: set last_plock_time for ownership operations
last_plock_time was not being set for ops related to ownership mode
like "own" and "drop" which change the plock state that is saved in
checkpoints. Not changing last_plock_time means we don't detect
that plock state has changed when it has, and may end up reusing an
old checkpoint, causing a mounting node to read incorrect resource
ownership state.
bz 578626
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
group/gfs_controld/plock.c | 6 ++++++
1 files changed, 6 insertions(+), 0 deletions(-)
diff --git a/group/gfs_controld/plock.c b/group/gfs_controld/plock.c
index 39113cf..710e73a 100644
--- a/group/gfs_controld/plock.c
+++ b/group/gfs_controld/plock.c
@@ -1034,6 +1034,8 @@ static void _receive_own(struct mountgroup *mg, char *buf, int len, int from)
int should_not_happen = 0;
int rv;
+ mg->last_plock_time = time(NULL);
+
memcpy(&info, buf + sizeof(struct gdlm_header), sizeof(info));
info_bswap_in(&info);
@@ -1200,6 +1202,8 @@ static void _receive_sync(struct mountgroup *mg, char *buf, int len, int from)
struct resource *r;
int rv;
+ mg->last_plock_time = time(NULL);
+
memcpy(&info, buf + sizeof(struct gdlm_header), sizeof(info));
info_bswap_in(&info);
@@ -1245,6 +1249,8 @@ static void _receive_drop(struct mountgroup *mg, char *buf, int len, int from)
struct resource *r;
int rv;
+ mg->last_plock_time = time(NULL);
+
memcpy(&info, buf + sizeof(struct gdlm_header), sizeof(info));
info_bswap_in(&info);
14 years
cluster: STABLE3 - dlm_controld: don't skip unlinking checkpoint
by David Teigland
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=...
Commit: 41a8154dbe97608a45bcd80666b7b5f74527a5f9
Parent: 46ae11537137ce73b55b67343e67c6c469fdddd1
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Wed Mar 31 15:06:30 2010 -0500
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Wed Mar 31 15:41:40 2010 -0500
dlm_controld: don't skip unlinking checkpoint
This reverts commit c1e139fb328408891d054e577b33d724230dde43
which was wrong.
bz 578628
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
group/dlm_controld/plock.c | 5 +----
1 files changed, 1 insertions(+), 4 deletions(-)
diff --git a/group/dlm_controld/plock.c b/group/dlm_controld/plock.c
index a534300..bf6ddfa 100644
--- a/group/dlm_controld/plock.c
+++ b/group/dlm_controld/plock.c
@@ -1709,9 +1709,6 @@ static int _unlink_checkpoint(struct lockspace *ls, SaNameT *name)
h = (SaCkptCheckpointHandleT) ls->plock_ckpt_handle;
log_group(ls, "unlink ckpt %llx", (unsigned long long)h);
- if (!h)
- return ret;
-
unlink_retry:
rv = saCkptCheckpointUnlink(system_ckpt_handle, name);
if (rv == SA_AIS_ERR_TRY_AGAIN) {
@@ -1733,7 +1730,7 @@ static int _unlink_checkpoint(struct lockspace *ls, SaNameT *name)
goto status_retry;
}
if (rv != SA_AIS_OK) {
- log_error("unlink ckpt status error %d %s", rv, ls->name);
+ log_group(ls, "unlink ckpt status error %d %s", rv, ls->name);
goto out_close;
}
14 years
cluster: STABLE3 - dlm_controld: set last_plock_time for ownership operations
by David Teigland
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=...
Commit: 46ae11537137ce73b55b67343e67c6c469fdddd1
Parent: e2ccbf90543cf1d163d1a067bf5a8ce049a9c134
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Wed Mar 31 15:05:49 2010 -0500
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Wed Mar 31 15:41:28 2010 -0500
dlm_controld: set last_plock_time for ownership operations
last_plock_time was not being set for ops related to ownership mode
like "own" and "drop" which change the plock state that is saved in
checkpoints. Not changing last_plock_time means we don't detect
that plock state has changed when it has, and may end up reusing an
old checkpoint, causing a mounting node to read incorrect resource
ownership state.
bz 578626
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
group/dlm_controld/plock.c | 6 ++++++
1 files changed, 6 insertions(+), 0 deletions(-)
diff --git a/group/dlm_controld/plock.c b/group/dlm_controld/plock.c
index 1321af4..a534300 100644
--- a/group/dlm_controld/plock.c
+++ b/group/dlm_controld/plock.c
@@ -1073,6 +1073,8 @@ static void _receive_own(struct lockspace *ls, struct dlm_header *hd, int len)
int from = hd->nodeid;
int rv;
+ ls->last_plock_time = time(NULL);
+
memcpy(&info, (char *)hd + sizeof(struct dlm_header), sizeof(info));
info_bswap_in(&info);
@@ -1245,6 +1247,8 @@ static void _receive_sync(struct lockspace *ls, struct dlm_header *hd, int len)
int from = hd->nodeid;
int rv;
+ ls->last_plock_time = time(NULL);
+
memcpy(&info, (char *)hd + sizeof(struct dlm_header), sizeof(info));
info_bswap_in(&info);
@@ -1290,6 +1294,8 @@ static void _receive_drop(struct lockspace *ls, struct dlm_header *hd, int len)
int from = hd->nodeid;
int rv;
+ ls->last_plock_time = time(NULL);
+
memcpy(&info, (char *)hd + sizeof(struct dlm_header), sizeof(info));
info_bswap_in(&info);
14 years
cluster: STABLE3 - dlm_controld: add plock checkpoint signatures
by David Teigland
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=...
Commit: e2ccbf90543cf1d163d1a067bf5a8ce049a9c134
Parent: 71e0466770a402bfbc625169bedf4b872be7bf84
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Tue Mar 30 15:36:45 2010 -0500
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Wed Mar 31 15:41:18 2010 -0500
dlm_controld: add plock checkpoint signatures
A signature of the plock checkpoint data is made by the writer and
included in the "checkpoint ready" message it sends. The reader
then computes the signature of the data it reads from the checkpoint
and compares against the signature of the writer. If they don't
match, the reader has retrieved incorrect plock state so it disables
plock operations for the given lockspace on that node.
bz 578625
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
group/dlm_controld/cpg.c | 56 ++++++++++-----
group/dlm_controld/dlm_daemon.h | 14 +++-
group/dlm_controld/plock.c | 138 ++++++++++++++++++++++++++++-----------
3 files changed, 147 insertions(+), 61 deletions(-)
diff --git a/group/dlm_controld/cpg.c b/group/dlm_controld/cpg.c
index 20d59d5..af7ac40 100644
--- a/group/dlm_controld/cpg.c
+++ b/group/dlm_controld/cpg.c
@@ -263,6 +263,7 @@ void dlm_send_message(struct lockspace *ls, char *buf, int len)
hd->global_id = cpu_to_le32(ls->global_id);
hd->flags = cpu_to_le32(hd->flags);
hd->msgdata = cpu_to_le32(hd->msgdata);
+ hd->msgdata2 = cpu_to_le32(hd->msgdata2);
_send_message(ls->cpg_handle, buf, len, type);
}
@@ -1081,9 +1082,11 @@ static void receive_plocks_stored(struct lockspace *ls, struct dlm_header *hd,
{
struct ls_info *li;
struct id_info *ids;
+ uint32_t sig;
- log_group(ls, "receive_plocks_stored %d:%u need_plocks %d",
- hd->nodeid, hd->msgdata, ls->need_plocks);
+ log_group(ls, "receive_plocks_stored %d:%u flags %x sig %x "
+ "need_plocks %d", hd->nodeid, hd->msgdata, hd->flags,
+ hd->msgdata2, ls->need_plocks);
if (!ls->need_plocks)
return;
@@ -1109,14 +1112,25 @@ static void receive_plocks_stored(struct lockspace *ls, struct dlm_header *hd,
return;
}
- retrieve_plocks(ls);
+ retrieve_plocks(ls, &sig);
+
+ if ((hd->flags & DLM_MFLG_PLOCK_SIG) && (sig != hd->msgdata2)) {
+ log_error("lockspace %s plock disabled our sig %x "
+ "nodeid %d sig %x", ls->name, sig, hd->nodeid,
+ hd->msgdata2);
+ ls->disable_plock = 1;
+ ls->need_plocks = 1; /* don't set HAVEPLOCK */
+ ls->save_plocks = 0;
+ return;
+ }
+
process_saved_plocks(ls);
ls->need_plocks = 0;
ls->save_plocks = 0;
}
static void send_info(struct lockspace *ls, struct change *cg, int type,
- uint32_t flags)
+ uint32_t flags, uint32_t msgdata2)
{
struct dlm_header *hd;
struct ls_info *li;
@@ -1146,6 +1160,7 @@ static void send_info(struct lockspace *ls, struct change *cg, int type,
hd->type = type;
hd->msgdata = cg->seq;
hd->flags = flags;
+ hd->msgdata2 = msgdata2;
if (ls->joining)
hd->flags |= DLM_MFLG_JOINING;
@@ -1170,10 +1185,11 @@ static void send_info(struct lockspace *ls, struct change *cg, int type,
id++;
}
- log_group(ls, "send_%s cg %u flags %x counts %u %d %d %d %d",
+ log_group(ls, "send_%s cg %u flags %x data2 %x counts %u %d %d %d %d",
type == DLM_MSG_START ? "start" : "plocks_stored",
- cg->seq, hd->flags, ls->started_count, cg->member_count,
- cg->joined_count, cg->remove_count, cg->failed_count);
+ cg->seq, hd->flags, hd->msgdata2, ls->started_count,
+ cg->member_count, cg->joined_count, cg->remove_count,
+ cg->failed_count);
dlm_send_message(ls, buf, len);
@@ -1184,14 +1200,14 @@ static void send_start(struct lockspace *ls)
{
struct change *cg = list_first_entry(&ls->changes, struct change, list);
- send_info(ls, cg, DLM_MSG_START, 0);
+ send_info(ls, cg, DLM_MSG_START, 0, 0);
}
-static void send_plocks_stored(struct lockspace *ls)
+static void send_plocks_stored(struct lockspace *ls, uint32_t sig)
{
struct change *cg = list_first_entry(&ls->changes, struct change, list);
- send_info(ls, cg, DLM_MSG_PLOCKS_STORED, 0);
+ send_info(ls, cg, DLM_MSG_PLOCKS_STORED, DLM_MFLG_PLOCK_SIG, sig);
}
static int same_members(struct change *cg1, struct change *cg2)
@@ -1218,7 +1234,7 @@ static void send_nacks(struct lockspace *ls, struct change *startcg)
same_members(cg, startcg)) {
log_group(ls, "send nack old cg %u new cg %u",
cg->seq, startcg->seq);
- send_info(ls, cg, DLM_MSG_START, DLM_MFLG_NACK);
+ send_info(ls, cg, DLM_MSG_START, DLM_MFLG_NACK, 0);
}
}
}
@@ -1238,8 +1254,9 @@ static void prepare_plocks(struct lockspace *ls)
{
struct change *cg = list_first_entry(&ls->changes, struct change, list);
struct member *memb;
+ uint32_t sig;
- if (!cfgd_enable_plock)
+ if (!cfgd_enable_plock || ls->disable_plock)
return;
/* if we're the only node in the lockspace, then we are the ckpt_node
@@ -1297,8 +1314,8 @@ static void prepare_plocks(struct lockspace *ls)
previous ckpt_node upon receiving the stored message from us. */
if (nodes_added(ls))
- store_plocks(ls);
- send_plocks_stored(ls);
+ store_plocks(ls, &sig);
+ send_plocks_stored(ls, sig);
}
static void apply_changes(struct lockspace *ls)
@@ -1532,6 +1549,7 @@ static void dlm_header_in(struct dlm_header *hd)
hd->global_id = le32_to_cpu(hd->global_id);
hd->flags = le32_to_cpu(hd->flags);
hd->msgdata = le32_to_cpu(hd->msgdata);
+ hd->msgdata2 = le32_to_cpu(hd->msgdata2);
}
static void deliver_cb(cpg_handle_t handle,
@@ -1579,7 +1597,7 @@ static void deliver_cb(cpg_handle_t handle,
case DLM_MSG_PLOCK:
if (cfgd_enable_plock)
receive_plock(ls, hd, len);
- else
+ else if (!ls->disable_plock)
log_error("msg %d nodeid %d enable_plock %d",
hd->type, nodeid, cfgd_enable_plock);
break;
@@ -1587,7 +1605,7 @@ static void deliver_cb(cpg_handle_t handle,
case DLM_MSG_PLOCK_OWN:
if (cfgd_enable_plock && cfgd_plock_ownership)
receive_own(ls, hd, len);
- else
+ else if (!ls->disable_plock)
log_error("msg %d nodeid %d enable_plock %d owner %d",
hd->type, nodeid, cfgd_enable_plock,
cfgd_plock_ownership);
@@ -1596,7 +1614,7 @@ static void deliver_cb(cpg_handle_t handle,
case DLM_MSG_PLOCK_DROP:
if (cfgd_enable_plock && cfgd_plock_ownership)
receive_drop(ls, hd, len);
- else
+ else if (!ls->disable_plock)
log_error("msg %d nodeid %d enable_plock %d owner %d",
hd->type, nodeid, cfgd_enable_plock,
cfgd_plock_ownership);
@@ -1606,7 +1624,7 @@ static void deliver_cb(cpg_handle_t handle,
case DLM_MSG_PLOCK_SYNC_WAITER:
if (cfgd_enable_plock && cfgd_plock_ownership)
receive_sync(ls, hd, len);
- else
+ else if (!ls->disable_plock)
log_error("msg %d nodeid %d enable_plock %d owner %d",
hd->type, nodeid, cfgd_enable_plock,
cfgd_plock_ownership);
@@ -1615,7 +1633,7 @@ static void deliver_cb(cpg_handle_t handle,
case DLM_MSG_PLOCKS_STORED:
if (cfgd_enable_plock)
receive_plocks_stored(ls, hd, len);
- else
+ else if (!ls->disable_plock)
log_error("msg %d nodeid %d enable_plock %d",
hd->type, nodeid, cfgd_enable_plock);
break;
diff --git a/group/dlm_controld/dlm_daemon.h b/group/dlm_controld/dlm_daemon.h
index b61a636..c2423d2 100644
--- a/group/dlm_controld/dlm_daemon.h
+++ b/group/dlm_controld/dlm_daemon.h
@@ -168,6 +168,7 @@ enum {
#define DLM_MFLG_HAVEPLOCK 2 /* accompanies start, we have plock state */
#define DLM_MFLG_NACK 4 /* accompanies start, prevent wrong match when
two outstanding changes are the same */
+#define DLM_MFLG_PLOCK_SIG 8 /* msgdata2 is a plock signature */
struct dlm_header {
uint16_t version[3];
@@ -178,8 +179,8 @@ struct dlm_header {
uint32_t flags; /* DLM_MFLG_ */
uint32_t msgdata; /* in-header payload depends on MSG type; lkid
for deadlock, seq for lockspace membership */
- uint32_t pad1;
- uint64_t pad2;
+ uint32_t msgdata2; /* second MSG-specific data */
+ uint64_t pad;
};
struct lockspace {
@@ -207,6 +208,7 @@ struct lockspace {
int plock_ckpt_node;
int need_plocks;
int save_plocks;
+ int disable_plock;
uint32_t associated_mg_id;
struct list_head saved_messages;
struct list_head plock_resources;
@@ -214,6 +216,10 @@ struct lockspace {
time_t last_plock_time;
struct timeval drop_resources_last;
uint64_t plock_ckpt_handle;
+ uint64_t checkpoint_r_num_first;
+ uint64_t checkpoint_r_num_last;
+ uint32_t checkpoint_r_count;
+ uint32_t checkpoint_p_count;
/* save copy of groupd member callback data for queries */
@@ -333,8 +339,8 @@ void receive_sync(struct lockspace *ls, struct dlm_header *hd, int len);
void receive_drop(struct lockspace *ls, struct dlm_header *hd, int len);
void process_saved_plocks(struct lockspace *ls);
void close_plock_checkpoint(struct lockspace *ls);
-void store_plocks(struct lockspace *ls);
-void retrieve_plocks(struct lockspace *ls);
+void store_plocks(struct lockspace *ls, uint32_t *sig);
+void retrieve_plocks(struct lockspace *ls, uint32_t *sig);
void purge_plocks(struct lockspace *ls, int nodeid, int unmount);
int fill_plock_dump_buf(struct lockspace *ls);
diff --git a/group/dlm_controld/plock.c b/group/dlm_controld/plock.c
index b11341d..1321af4 100644
--- a/group/dlm_controld/plock.c
+++ b/group/dlm_controld/plock.c
@@ -1489,6 +1489,11 @@ void process_plocks(int ci)
goto fail;
}
+ if (ls->disable_plock) {
+ rv = -ENOSYS;
+ goto fail;
+ }
+
log_plock(ls, "read plock %llx %s %s %llx-%llx %d/%u/%llx w %d",
(unsigned long long)info.number,
op_str(info.optype),
@@ -1621,7 +1626,8 @@ static void pack_section_buf(struct lockspace *ls, struct resource *r)
section_len = count * sizeof(struct pack_plock);
}
-static int unpack_section_buf(struct lockspace *ls, char *numbuf, int buflen)
+static int unpack_section_buf(struct lockspace *ls, char *numbuf, int buflen,
+ uint64_t *r_num, int *lock_count)
{
struct pack_plock *pp;
struct posix_lock *po;
@@ -1648,6 +1654,8 @@ static int unpack_section_buf(struct lockspace *ls, char *numbuf, int buflen)
r->owner = owner;
r->last_access = now;
+ *r_num = num;
+
pp = (struct pack_plock *) &section_buf;
for (i = 0; i < count; i++) {
@@ -1676,6 +1684,7 @@ static int unpack_section_buf(struct lockspace *ls, char *numbuf, int buflen)
}
list_add_tail(&r->list, &ls->plock_resources);
+ *lock_count = count;
return 0;
}
@@ -1798,7 +1807,7 @@ void close_plock_checkpoint(struct lockspace *ls)
it. The ckpt should then disappear and the new node can create a new ckpt
for the next mounter. */
-void store_plocks(struct lockspace *ls)
+void store_plocks(struct lockspace *ls, uint32_t *sig)
{
SaCkptCheckpointCreationAttributesT attr;
SaCkptCheckpointHandleT h;
@@ -1811,15 +1820,21 @@ void store_plocks(struct lockspace *ls)
struct resource *r;
struct posix_lock *po;
struct lock_waiter *w;
- int r_count, lock_count, total_size, section_size, max_section_size;
+ int total_size, section_size, max_section_size;
int len, owner;
+ uint32_t r_count = 0, p_count = 0;
+ uint64_t r_num_first = 0, r_num_last = 0;
- if (!cfgd_enable_plock)
+ if (!cfgd_enable_plock || ls->disable_plock)
return;
/* no change to plock state since we created the last checkpoint */
if (ls->last_checkpoint_time > ls->last_plock_time) {
- log_group(ls, "store_plocks: saved ckpt uptodate");
+ log_group(ls, "store_plocks saved ckpt uptodate");
+ r_num_first = ls->checkpoint_r_num_first;
+ r_num_last = ls->checkpoint_r_num_last;
+ r_count = ls->checkpoint_r_count;
+ p_count = ls->checkpoint_p_count;
goto out;
}
ls->last_checkpoint_time = time(NULL);
@@ -1834,7 +1849,7 @@ void store_plocks(struct lockspace *ls)
the attr fields */
r_count = 0;
- lock_count = 0;
+ p_count = 0;
total_size = 0;
max_section_size = 0;
@@ -1846,22 +1861,23 @@ void store_plocks(struct lockspace *ls)
section_size = 0;
list_for_each_entry(po, &r->locks, list) {
section_size += sizeof(struct pack_plock);
- lock_count++;
+ p_count++;
}
list_for_each_entry(w, &r->waiters, list) {
section_size += sizeof(struct pack_plock);
- lock_count++;
+ p_count++;
}
total_size += section_size;
if (section_size > max_section_size)
max_section_size = section_size;
}
- log_group(ls, "store_plocks: r_count %d, lock_count %d, pp %u bytes",
- r_count, lock_count, (unsigned int)sizeof(struct pack_plock));
-
- log_group(ls, "store_plocks: total %d bytes, max_section %d bytes",
- total_size, max_section_size);
+ log_group(ls, "store_plocks r_count %u p_count %u "
+ "total_size %d max_section_size %d",
+ r_count, p_count, total_size, max_section_size);
+ log_plock(ls, "store_plocks r_count %u p_count %u "
+ "total_size %d max_section_size %d",
+ r_count, p_count, total_size, max_section_size);
attr.creationFlags = SA_CKPT_WR_ALL_REPLICAS;
attr.checkpointSize = total_size;
@@ -1877,20 +1893,20 @@ void store_plocks(struct lockspace *ls)
open_retry:
rv = saCkptCheckpointOpen(system_ckpt_handle, &name,&attr,flags,0,&h);
if (rv == SA_AIS_ERR_TRY_AGAIN) {
- log_group(ls, "store_plocks: ckpt open retry");
+ log_group(ls, "store_plocks ckpt open retry");
sleep(1);
goto open_retry;
}
if (rv == SA_AIS_ERR_EXIST) {
- log_group(ls, "store_plocks: ckpt already exists");
+ log_group(ls, "store_plocks ckpt already exists");
return;
}
if (rv != SA_AIS_OK) {
- log_error("store_plocks: ckpt open error %d %s", rv, ls->name);
+ log_error("store_plocks ckpt open error %d %s", rv, ls->name);
return;
}
- log_group(ls, "store_plocks: open ckpt handle %llx",
+ log_group(ls, "store_plocks open ckpt handle %llx",
(unsigned long long)h);
ls->plock_ckpt_handle = (uint64_t) h;
@@ -1935,20 +1951,24 @@ void store_plocks(struct lockspace *ls)
pack_section_buf(ls, r);
- log_plock(ls, "store_plocks: section size %u id %u \"%s\"",
+ if (!r_num_first)
+ r_num_first = r->number;
+ r_num_last = r->number;
+
+ log_plock(ls, "store_plocks section size %u id %u \"%s\"",
section_len, section_id.idLen, buf);
create_retry:
rv = saCkptSectionCreate(h, &section_attr, &section_buf,
section_len);
if (rv == SA_AIS_ERR_TRY_AGAIN) {
- log_group(ls, "store_plocks: ckpt create retry");
+ log_group(ls, "store_plocks ckpt create retry");
sleep(1);
goto create_retry;
}
if (rv == SA_AIS_ERR_EXIST) {
/* this shouldn't happen in general */
- log_group(ls, "store_plocks: clearing old ckpt");
+ log_group(ls, "store_plocks clearing old ckpt");
/* do we need this close or will the close in
the unlink function be ok? */
saCkptCheckpointClose(h);
@@ -1956,19 +1976,36 @@ void store_plocks(struct lockspace *ls)
goto open_retry;
}
if (rv != SA_AIS_OK) {
- log_error("store_plocks: ckpt section create err %d %s",
+ log_error("store_plocks ckpt section create err %d %s",
rv, ls->name);
break;
}
}
out:
- return;
+ *sig = (0xFFFFFFFF & r_num_first) ^ (0xFFFFFFFF & r_num_last) ^
+ r_count ^ p_count;
+
+ log_group(ls, "store_plocks first %llu last %llu r_count %u "
+ "p_count %u sig %x",
+ (unsigned long long)r_num_first,
+ (unsigned long long)r_num_last,
+ r_count, p_count, *sig);
+ log_plock(ls, "store_plocks first %llu last %llu r_count %u "
+ "p_count %u sig %x",
+ (unsigned long long)r_num_first,
+ (unsigned long long)r_num_last,
+ r_count, p_count, *sig);
+
+ ls->checkpoint_r_num_first = r_num_first;
+ ls->checkpoint_r_num_last = r_num_last;
+ ls->checkpoint_r_count = r_count;
+ ls->checkpoint_p_count = p_count;
}
/* called by a node that's just been added to the group to get existing plock
state */
-void retrieve_plocks(struct lockspace *ls)
+void retrieve_plocks(struct lockspace *ls, uint32_t *sig)
{
SaCkptCheckpointHandleT h;
SaCkptSectionIterationHandleT itr;
@@ -1977,9 +2014,11 @@ void retrieve_plocks(struct lockspace *ls)
SaNameT name;
SaAisErrorT rv;
char buf[SECTION_NAME_LEN];
- int len;
+ int len, lock_count;
+ uint32_t r_count = 0, p_count = 0;
+ uint64_t r_num, r_num_first = 0, r_num_last = 0;
- if (!cfgd_enable_plock)
+ if (!cfgd_enable_plock || ls->disable_plock)
return;
log_group(ls, "retrieve_plocks");
@@ -1992,12 +2031,12 @@ void retrieve_plocks(struct lockspace *ls)
rv = saCkptCheckpointOpen(system_ckpt_handle, &name, NULL,
SA_CKPT_CHECKPOINT_READ, 0, &h);
if (rv == SA_AIS_ERR_TRY_AGAIN) {
- log_group(ls, "retrieve_plocks: ckpt open retry");
+ log_group(ls, "retrieve_plocks ckpt open retry");
sleep(1);
goto open_retry;
}
if (rv != SA_AIS_OK) {
- log_error("retrieve_plocks: ckpt open error %d %s",
+ log_error("retrieve_plocks ckpt open error %d %s",
rv, ls->name);
return;
}
@@ -2005,12 +2044,12 @@ void retrieve_plocks(struct lockspace *ls)
init_retry:
rv = saCkptSectionIterationInitialize(h, SA_CKPT_SECTIONS_ANY, 0, &itr);
if (rv == SA_AIS_ERR_TRY_AGAIN) {
- log_group(ls, "retrieve_plocks: ckpt iterinit retry");
+ log_group(ls, "retrieve_plocks ckpt iterinit retry");
sleep(1);
goto init_retry;
}
if (rv != SA_AIS_OK) {
- log_error("retrieve_plocks: ckpt iterinit error %d %s",
+ log_error("retrieve_plocks ckpt iterinit error %d %s",
rv, ls->name);
goto out;
}
@@ -2021,12 +2060,12 @@ void retrieve_plocks(struct lockspace *ls)
if (rv == SA_AIS_ERR_NO_SECTIONS)
break;
if (rv == SA_AIS_ERR_TRY_AGAIN) {
- log_group(ls, "retrieve_plocks: ckpt iternext retry");
+ log_group(ls, "retrieve_plocks ckpt iternext retry");
sleep(1);
goto next_retry;
}
if (rv != SA_AIS_OK) {
- log_error("retrieve_plocks: ckpt iternext error %d %s",
+ log_error("retrieve_plocks ckpt iternext error %d %s",
rv, ls->name);
goto out_it;
}
@@ -2043,19 +2082,19 @@ void retrieve_plocks(struct lockspace *ls)
memset(&buf, 0, sizeof(buf));
snprintf(buf, SECTION_NAME_LEN, "%s", desc.sectionId.id);
- log_plock(ls, "retrieve_plocks: section size %llu id %u \"%s\"",
+ log_plock(ls, "retrieve_plocks section size %llu id %u \"%s\"",
(unsigned long long)iov.dataSize, iov.sectionId.idLen,
buf);
read_retry:
rv = saCkptCheckpointRead(h, &iov, 1, NULL);
if (rv == SA_AIS_ERR_TRY_AGAIN) {
- log_group(ls, "retrieve_plocks: ckpt read retry");
+ log_group(ls, "retrieve_plocks ckpt read retry");
sleep(1);
goto read_retry;
}
if (rv != SA_AIS_OK) {
- log_error("retrieve_plocks: ckpt read error %d %s",
+ log_error("retrieve_plocks ckpt read error %d %s",
rv, ls->name);
goto out_it;
}
@@ -2064,24 +2103,47 @@ void retrieve_plocks(struct lockspace *ls)
no locks, which exist in ownership mode; the resource
name and owner come from the section id */
- log_plock(ls, "retrieve_plocks: ckpt read %llu bytes",
+ log_plock(ls, "retrieve_plocks ckpt read %llu bytes",
(unsigned long long)iov.readSize);
section_len = iov.readSize;
if (section_len % sizeof(struct pack_plock)) {
- log_error("retrieve_plocks: bad section len %d %s",
+ log_error("retrieve_plocks bad section len %d %s",
section_len, ls->name);
continue;
}
+ r_num = 0;
+ lock_count = 0;
+
unpack_section_buf(ls, (char *)desc.sectionId.id,
- desc.sectionId.idLen);
+ desc.sectionId.idLen, &r_num, &lock_count);
+ r_count++;
+ p_count += lock_count;
+
+ if (!r_num_first)
+ r_num_first = r_num;
+ r_num_last = r_num;
}
out_it:
saCkptSectionIterationFinalize(itr);
out:
saCkptCheckpointClose(h);
+
+ *sig = (0xFFFFFFFF & r_num_first) ^ (0xFFFFFFFF & r_num_last)
+ ^ r_count ^ p_count;
+
+ log_group(ls, "retrieve_plocks first %llu last %llu r_count %u "
+ "p_count %u sig %x",
+ (unsigned long long)r_num_first,
+ (unsigned long long)r_num_last,
+ r_count, p_count, *sig);
+ log_plock(ls, "retrieve_plocks first %llu last %llu r_count %u "
+ "p_count %u sig %x",
+ (unsigned long long)r_num_first,
+ (unsigned long long)r_num_last,
+ r_count, p_count, *sig);
}
/* Called when a node has failed, or we're unmounting. For a node failure, we
@@ -2095,7 +2157,7 @@ void purge_plocks(struct lockspace *ls, int nodeid, int unmount)
struct resource *r, *r2;
int purged = 0;
- if (!cfgd_enable_plock)
+ if (!cfgd_enable_plock || ls->disable_plock)
return;
list_for_each_entry_safe(r, r2, &ls->plock_resources, list) {
14 years
cluster: STABLE3 - rgmanager: Fix tiny memory leak during reconfig
by Lon Hohberger
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=...
Commit: 71e0466770a402bfbc625169bedf4b872be7bf84
Parent: 06ea41adb746746e67fb489405e46cd6f9c257d8
Author: Lon Hohberger <lhh(a)redhat.com>
AuthorDate: Tue Mar 30 14:52:21 2010 -0400
Committer: Lon Hohberger <lhh(a)redhat.com>
CommitterDate: Tue Mar 30 14:52:21 2010 -0400
rgmanager: Fix tiny memory leak during reconfig
The ev_script_file was not being freed correctly.
Resolves: bz578249
Signed-off-by: Lon Hohberger <lhh(a)redhat.com>
---
rgmanager/src/daemons/event_config.c | 2 ++
1 files changed, 2 insertions(+), 0 deletions(-)
diff --git a/rgmanager/src/daemons/event_config.c b/rgmanager/src/daemons/event_config.c
index f99a980..4ab5bfc 100644
--- a/rgmanager/src/daemons/event_config.c
+++ b/rgmanager/src/daemons/event_config.c
@@ -154,6 +154,8 @@ deconstruct_event(event_t *ev)
{
if (ev->ev_script)
free(ev->ev_script);
+ if (ev->ev_script_file)
+ free(ev->ev_script_file);
if (ev->ev_name)
free(ev->ev_name);
free(ev);
14 years
cluster: STABLE3 - rgmanager: Fix memory leaks during relocation
by Lon Hohberger
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=...
Commit: 06ea41adb746746e67fb489405e46cd6f9c257d8
Parent: 3e03a4a55527718f200f57bec6766e5f3d2ac8a5
Author: Lon Hohberger <lhh(a)redhat.com>
AuthorDate: Tue Mar 30 12:49:17 2010 -0400
Committer: Lon Hohberger <lhh(a)redhat.com>
CommitterDate: Tue Mar 30 12:50:01 2010 -0400
rgmanager: Fix memory leaks during relocation
Resolves: bz#578249
Signed-off-by: Lon Hohberger <lhh(a)redhat.com>
---
rgmanager/src/daemons/rg_state.c | 2 ++
rgmanager/src/daemons/rg_thread.c | 4 +++-
2 files changed, 5 insertions(+), 1 deletions(-)
diff --git a/rgmanager/src/daemons/rg_state.c b/rgmanager/src/daemons/rg_state.c
index d9f02b3..79ff59a 100644
--- a/rgmanager/src/daemons/rg_state.c
+++ b/rgmanager/src/daemons/rg_state.c
@@ -1745,6 +1745,7 @@ handle_relocate_req(char *svcName, int orig_request, int preferred_target,
*/
if (svc_start(svcName, RG_START) == 0) {
*new_owner = me;
+ free_member_list(backup);
return 0;
}
} else if (target == preferred_target) {
@@ -1757,6 +1758,7 @@ handle_relocate_req(char *svcName, int orig_request, int preferred_target,
/*
* Great! We're done...
*/
+ free_member_list(backup);
return 0;
}
diff --git a/rgmanager/src/daemons/rg_thread.c b/rgmanager/src/daemons/rg_thread.c
index 517f8fe..446f2d2 100644
--- a/rgmanager/src/daemons/rg_thread.c
+++ b/rgmanager/src/daemons/rg_thread.c
@@ -701,6 +701,7 @@ rt_enqueue_request(const char *resgroupname, int request,
}
if (resgroup->rt_request == RG_RELOCATE) {
+ ret = -1;
switch(request) {
case RG_RELOCATE:
case RG_START_REMOTE:
@@ -711,13 +712,14 @@ rt_enqueue_request(const char *resgroupname, int request,
request, 0);
msg_close(response_ctx);
msg_free_ctx(response_ctx);
+ ret = 0;
break;
}
fprintf(stderr, "Failed to queue request: Would block\n");
/* EWOULDBLOCK */
pthread_mutex_unlock(resgroup->rt_queue_mutex);
pthread_mutex_unlock(&reslist_mutex);
- return 0;
+ return ret;
}
ret = rq_queue_request(resgroup->rt_queue, resgroup->rt_name,
14 years
cluster: RHEL53 - groupd: clean up leaving failed node
by David Teigland
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=...
Commit: 58a768b6e836ad6406a3da7d71138e2d4ccbe7aa
Parent: b3d878676e3f1984ae8e830c176f0a83443f97c7
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Wed Dec 9 16:53:33 2009 -0600
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Mon Mar 29 11:11:09 2010 -0500
groupd: clean up leaving failed node
bz 521817
Due to shutdown+failure scenarios that aren't fully understood,
a node that fails while shutting down can cause the other nodes
to get stuck trying to restart the clvmd group (whether other
groups could be affected is unknown.)
The other nodes will all show something like this from group_tool -v:
dlm 1 clvmd 00010002 LEAVE_STOP_WAIT 1 100020002 1
and group_tool dump will show things like:
1260396236 1:clvmd waiting for 1 more stopped messages before LEAVE_ALL_STOPPED 1
1260396236 1:clvmd waiting for 1 more stopped messages before LEAVE_ALL_STOPPED 1
This fix is to more or less watch out for this very specific
situation where things get messed up and forcibly clean things
up so the other nodes aren't stuck.
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
group/daemon/app.c | 41 +++++++++++++++++++++++++++++++++++++++++
1 files changed, 41 insertions(+), 0 deletions(-)
diff --git a/group/daemon/app.c b/group/daemon/app.c
index df17896..03952df 100644
--- a/group/daemon/app.c
+++ b/group/daemon/app.c
@@ -228,6 +228,8 @@ struct recovery_set *get_recovery_set(int nodeid)
and goes away, and then we get the add_recovery_set_cpg() matching
the _cman() variant that we ignored? */
+static void clean_up_dead_node(int nodeid);
+
void add_recovery_set_cman(int nodeid)
{
struct recovery_set *rs;
@@ -245,6 +247,9 @@ void add_recovery_set_cman(int nodeid)
log_debug("free recovery set %d not running groupd", nodeid);
list_del(&rs->list);
free(rs);
+
+ clean_up_dead_node(nodeid);
+
return;
}
@@ -1845,3 +1850,39 @@ void groupd_down(int nodeid)
}
}
+/* More hacks to try to work around similar kinds of problems that don't
+ make much sense, bz 521817. I believe the following produces effectively
+ the same problem as in the bz, on one node:
+ service cman start (with groupd -s0, not sure if this could happen otherwise)
+ service clvmd start
+ killall -9 dlm_controld
+ killall -9 groupd
+ killall -9 aisexec
+
+ At this point, the clvmd group in groupd on the other nodes is stuck in
+ LEAVE_ALL_STOPPED waiting for a stopped message from the killed node.
+ The groupd cpg confchg would ordinarily clean things up, but that probably
+ doesn't do anything because the event type is LEAVE instead of a failure.
+ Another way to deal with this would possibly be to do it when we see
+ the nodeid leave the groupd cpg. */
+
+static void clean_up_dead_node(int nodeid)
+{
+ group_t *g;
+ event_t *ev;
+
+ list_for_each_entry(g, &gd_groups, list) {
+ if (g->app && g->app->current_event &&
+ g->app->current_event->nodeid == nodeid) {
+ ev = g->app->current_event;
+
+ log_group(g, "clean_up_dead_node %d ev %d", nodeid,
+ ev->state);
+
+ if (ev->state == EST_LEAVE_STOP_WAIT) {
+ mark_node_stopped(g->app, nodeid);
+ }
+ }
+ }
+}
+
14 years
cluster: RHEL53 - cman/groupd/dlm_controld/gfs_controld: work around ipc deadlock
by David Teigland
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=...
Commit: 6b989a76f19d88d773f3481528ee79cdaeff20a7
Parent: c63b38a92d10c767dad033822f8baaf430be09fb
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Fri Feb 26 12:07:47 2010 -0600
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Fri Feb 26 12:07:47 2010 -0600
cman/groupd/dlm_controld/gfs_controld: work around ipc deadlock
bz 561892
When there are many gfs fs's (approx above 120) in a two
node cluster, and one of the nodes fails, groupd on the
remaining node can deadlock with dlm_controld and gfs_controld.
The problem is caused by so much communication (lots of fs's)
being sent between groupd and the other daemons so quickly
(no other nodes to synchronize with), that the unix socket
buffers fill up, causing both daemons to be blocked writing
stop/start/stop_done/start_done messages to the other.
Since the daemons are single threaded, being blocked on write
means that neither will read to unblock the other.
To determine if you're having this problem, you can strace
groupd, dlm_controld and gfs_controld, and notice that they
are blocked writing strings starting with "stop" or "start".
group_tool will hang since groupd is blocked.
The solution has three main parts:
1. dlm_controld queues its stop_done and start_done messages
and waits to send them to groupd until groupd is finished
sending all the stop/start messages.
2. gfs_controld does the same only for stop_done messages
(start_done messages are already naturally delayed here)
3. groupd skips sending finish messages to dlm_controld,
since dlm_controld does not use them for anything
Each of these changes in behavior are disabled by default and
need to be configured explicitly:
1. <dlm delay_done="2"/> in cluster.conf
2. <gfs_controld delay_done="2"/> in cluster.conf
3. SKIP_DLM_FINISH=1 in /etc/sysconfig/cman
(adds -f0 option to groupd which doesn't read
options from cluster.conf)
The delay_done values are in seconds. If the same problem
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
cman/init.d/cman | 20 +++++-
group/daemon/main.c | 14 ++++-
group/dlm_controld/action.c | 38 ++++++++++
group/dlm_controld/deadlock.c | 1 -
group/dlm_controld/dlm_daemon.h | 12 +++
group/dlm_controld/group.c | 138 +++++++++++++++++++++++++++++++++++---
group/dlm_controld/main.c | 75 +++++++++++++++++++-
group/dlm_controld/member_cman.c | 1 -
group/gfs_controld/group.c | 55 +++++++++++++++
group/gfs_controld/lock_dlm.h | 10 +++
group/gfs_controld/main.c | 60 ++++++++++++++++-
group/gfs_controld/recover.c | 6 ++-
12 files changed, 408 insertions(+), 22 deletions(-)
diff --git a/cman/init.d/cman b/cman/init.d/cman
index 35b121b..b517947 100755
--- a/cman/init.d/cman
+++ b/cman/init.d/cman
@@ -31,6 +31,11 @@
# The default is 60 seconds
[ -z "$CMAN_SHUTDOWN_TIMEOUT" ] && CMAN_SHUTDOWN_TIMEOUT=60
+# SKIP_DLM_FINISH -- setting to 1 will cause groupd to be started with
+# -f0, which causes groupd to not send finish callbacks to dlm_controld
+# as part of a workaround for bz 561892.
+[ -z "$SKIP_DLM_FINISH" ] && SKIP_DLM_FINISH=0
+
# FENCED_START_TIMEOUT -- amount of time to wait for starting fenced
# before giving up. If FENCED_START_TIMEOUT is positive, then we will
# wait FENCED_START_TIMEOUT seconds before giving up and failing when
@@ -61,6 +66,7 @@
[ -n "$NODENAME" ] && cman_join_opts+=" -n $NODENAME"
+
load_modules()
{
errmsg=$( /sbin/modprobe configfs 2>&1 ) || return 1
@@ -145,21 +151,31 @@ start_qdiskd()
start_daemons()
{
status groupd &> /dev/null
- if [ $? -ne 0 ]; then
- errmsg=$( /sbin/groupd 2>&1 ) || return 1
+ if [ $? -ne 0 ]
+ then
+ if [ $SKIP_DLM_FINISH -eq 1 ]
+ then
+ errmsg=$( /sbin/groupd -f0 2>&1 ) || return 1
+ else
+ errmsg=$( /sbin/groupd 2>&1 ) || return 1
+ fi
fi
+
status fenced &> /dev/null
if [ $? -ne 0 ]; then
errmsg=$( /sbin/fenced 2>&1 ) || return 1
fi
+
status dlm_controld &> /dev/null
if [ $? -ne 0 ]; then
errmsg=$( /sbin/dlm_controld 2>&1 ) || return 1
fi
+
status gfs_controld &> /dev/null
if [ $? -ne 0 ]; then
errmsg=$( /sbin/gfs_controld 2>&1 ) || return 1
fi
+
return 0
}
diff --git a/group/daemon/main.c b/group/daemon/main.c
index f5dcc88..7a4fca4 100644
--- a/group/daemon/main.c
+++ b/group/daemon/main.c
@@ -15,7 +15,7 @@
#include "gd_internal.h"
-#define OPTION_STRING "Dhs:Vv"
+#define OPTION_STRING "Dhs:f:Vv"
#define LOCKFILE_NAME "/var/run/groupd.pid"
#define LOG_FILE "/var/log/groupd.log"
@@ -27,6 +27,7 @@ uint32_t gd_event_nr;
char *our_name;
int our_nodeid;
int cman_quorate;
+int dlm_finish = 1;
static int client_maxi;
static int client_size = 0;
@@ -314,6 +315,12 @@ void app_start(app_t *a)
void app_finish(app_t *a)
{
char buf[GROUPD_MSGLEN];
+
+ if (!strncmp(client[a->client].type, "dlm", 3) && !dlm_finish) {
+ log_group(a->g, "skip finish");
+ return;
+ }
+
snprintf(buf, sizeof(buf), "finish %s %d",
a->g->name, a->current_event->event_nr);
app_action(a, buf);
@@ -919,6 +926,7 @@ static void print_usage(void)
printf(" -D Enable debugging code and don't fork\n");
printf(" -h Print this help, then exit\n");
printf(" -s [0|1] Enable (or disable) shutdown mode\n");
+ printf(" -f [0|1] Send finish callbacks to dlm_controld, default 1\n");
printf(" -V Print program version information, then exit\n");
}
@@ -945,6 +953,10 @@ static void decode_arguments(int argc, char **argv)
groupd_shutdown_opt = atoi(optarg);
break;
+ case 'f':
+ dlm_finish = atoi(optarg);
+ break;
+
case 'v':
groupd_debug_verbose++;
break;
diff --git a/group/dlm_controld/action.c b/group/dlm_controld/action.c
index 34e84fe..a7ea3a7 100644
--- a/group/dlm_controld/action.c
+++ b/group/dlm_controld/action.c
@@ -968,6 +968,43 @@ static void set_debug(int cd)
set_configfs_debug(rv);
}
+#define DELAY_DONE_PATH "/cluster/dlm/@delay_done"
+
+static int get_ccs_delay_done(int cd)
+{
+ char path[PATH_MAX], *str;
+ int error, rv;
+
+ memset(path, 0, PATH_MAX);
+ sprintf(path, DELAY_DONE_PATH);
+
+ error = ccs_get(cd, path, &str);
+ if (error || !str)
+ return -1;
+
+ rv = atoi(str);
+
+ if (rv < 0) {
+ log_error("invalid delay_done from ccs");
+ rv = -1;
+ }
+
+ free(str);
+ log_error("ccs dlm/delay_done %d", rv);
+ return rv;
+}
+
+static void set_delay_done(int cd)
+{
+ int rv;
+
+ rv = get_ccs_delay_done(cd);
+ if (rv < 0)
+ return;
+
+ delay_done_cb = rv;
+}
+
void set_ccs_options(void)
{
int cd;
@@ -979,6 +1016,7 @@ void set_ccs_options(void)
set_protocol(cd);
set_timewarn(cd);
set_debug(cd);
+ set_delay_done(cd);
ccs_disconnect(cd);
}
diff --git a/group/dlm_controld/deadlock.c b/group/dlm_controld/deadlock.c
index f21beda..0b1538a 100644
--- a/group/dlm_controld/deadlock.c
+++ b/group/dlm_controld/deadlock.c
@@ -15,7 +15,6 @@
int deadlock_enabled = 0;
-extern struct list_head lockspaces;
extern int our_nodeid;
static SaCkptHandleT global_ckpt_h;
diff --git a/group/dlm_controld/dlm_daemon.h b/group/dlm_controld/dlm_daemon.h
index b95e176..5805242 100644
--- a/group/dlm_controld/dlm_daemon.h
+++ b/group/dlm_controld/dlm_daemon.h
@@ -62,6 +62,16 @@ extern int daemon_debug_opt;
extern int kernel_debug_opt;
extern char daemon_debug_buf[256];
+extern int delay_done_cb;
+extern struct list_head stop_done_list;
+extern int stop_done_entries;
+extern struct list_head start_done_list;
+extern int start_done_entries;
+extern struct list_head lockspaces;
+extern int lockspace_count;
+extern uint64_t last_stop_time;
+extern uint64_t last_start_time;
+
#define log_debug(fmt, args...) \
do { \
snprintf(daemon_debug_buf, 255, "%ld " fmt "\n", time(NULL), ##args); \
@@ -122,6 +132,8 @@ char *nodeid2name(int nodeid);
/* group.c */
int setup_groupd(void);
void process_groupd(int ci);
+void push_stop_done(void);
+void push_start_done(void);
/* main.c */
int client_add(int fd, void (*workfn)(int ci), void (*deadfn)(int ci));
diff --git a/group/dlm_controld/group.c b/group/dlm_controld/group.c
index 2e5b4e6..c80b49e 100644
--- a/group/dlm_controld/group.c
+++ b/group/dlm_controld/group.c
@@ -24,6 +24,7 @@
do the processing within the callback function itself */
group_handle_t gh;
+
static int cb_action;
static char cb_name[MAX_GROUP_NAME_LEN+1];
static int cb_event_nr;
@@ -31,6 +32,7 @@ static unsigned int cb_id;
static int cb_type;
static int cb_member_count;
static int cb_members[MAX_GROUP_MEMBERS];
+static int last_action;
static void stop_cbfn(group_handle_t h, void *private, char *name)
@@ -107,10 +109,108 @@ char *str_members(void)
return str_members_buf;
}
+struct save_done {
+ struct list_head list;
+ int event_nr;
+ char name[MAX_GROUP_NAME_LEN+1];
+};
+
+void push_stop_done(void)
+{
+ struct save_done *sd, *safe;
+ int count;
+
+ if (stop_done_entries > 1)
+ log_error("push_stop_done begin %d", stop_done_entries);
+
+ count = 0;
+
+ list_for_each_entry_safe(sd, safe, &stop_done_list, list) {
+ group_stop_done(gh, sd->name);
+ list_del(&sd->list);
+ free(sd);
+ stop_done_entries--;
+ count++;
+ }
+
+ if (count > 1)
+ log_error("push_stop_done end %d", count);
+}
+
+/* only queue if the last action is also stop? */
+
+static int queue_stop_done(char *name)
+{
+ struct save_done *sd;
+
+ if (!delay_done_cb)
+ return -1;
+
+ if (last_action != DO_STOP)
+ return -1;
+
+ sd = malloc(sizeof(struct save_done));
+ if (!sd)
+ return -1;
+
+ memset(sd, 0, sizeof(struct save_done));
+ strcpy(sd->name, name);
+ list_add_tail(&sd->list, &stop_done_list);
+ stop_done_entries++;
+
+ return 0;
+}
+
+void push_start_done(void)
+{
+ struct save_done *sd, *safe;
+ int count;
+
+ if (start_done_entries > 1)
+ log_error("push_start_done begin %d", start_done_entries);
+
+ count = 0;
+
+ list_for_each_entry_safe(sd, safe, &start_done_list, list) {
+ group_start_done(gh, sd->name, sd->event_nr);
+ list_del(&sd->list);
+ free(sd);
+ start_done_entries--;
+ count++;
+ }
+
+ if (count > 1)
+ log_error("push_start_done end %d", count);
+}
+
+static int queue_start_done(char *name, int event_nr)
+{
+ struct save_done *sd;
+
+ if (!delay_done_cb)
+ return -1;
+
+ if (last_action != DO_START)
+ return -1;
+
+ sd = malloc(sizeof(struct save_done));
+ if (!sd)
+ return -1;
+
+ memset(sd, 0, sizeof(struct save_done));
+ strcpy(sd->name, name);
+ sd->event_nr = event_nr;
+ list_add_tail(&sd->list, &start_done_list);
+ start_done_entries++;
+
+ return 0;
+}
+
void process_groupd(int ci)
{
struct lockspace *ls;
int error = 0, val;
+ int rv;
group_dispatch(gh);
@@ -128,7 +228,10 @@ void process_groupd(int ci)
case DO_STOP:
log_debug("groupd callback: stop %s", cb_name);
set_control(cb_name, 0);
- group_stop_done(gh, cb_name);
+ rv = queue_stop_done(cb_name);
+ if (rv < 0)
+ group_stop_done(gh, cb_name);
+ last_stop_time = time(NULL);
break;
case DO_START:
@@ -145,20 +248,27 @@ void process_groupd(int ci)
/* the dlm doesn't need/use a "finish" stage following
start, so we can just do start_done immediately */
- group_start_done(gh, cb_name, cb_event_nr);
- if (!ls->joining)
+ if (!ls->joining) {
+ rv = queue_start_done(cb_name, cb_event_nr);
+ if (rv < 0)
+ group_start_done(gh, cb_name, cb_event_nr);
+ last_start_time = time(NULL);
break;
+ } else {
+ group_start_done(gh, cb_name, cb_event_nr);
- ls->joining = 0;
- log_debug("join event done %s", cb_name);
+ ls->joining = 0;
+ log_debug("join event done %s", cb_name);
- /* this causes the dlm_new_lockspace() call (typically from
- mount) to complete */
- set_event_done(cb_name, 0);
+ /* this causes the dlm_new_lockspace() call (typically from
+ mount) to complete */
+ set_event_done(cb_name, 0);
- join_deadlock_cpg(ls);
- break;
+ join_deadlock_cpg(ls);
+ last_start_time = time(NULL);
+ break;
+ }
case DO_SETID:
log_debug("groupd callback: set_id %s %x", cb_name, cb_id);
@@ -184,6 +294,7 @@ void process_groupd(int ci)
set_event_done(cb_name, val);
leave_deadlock_cpg(ls);
list_del(&ls->list);
+ lockspace_count--;
free(ls);
break;
@@ -195,6 +306,8 @@ void process_groupd(int ci)
error = -EINVAL;
}
+ last_action = cb_action;
+
cb_action = 0;
out:
return;
@@ -204,6 +317,11 @@ int setup_groupd(void)
{
int rv;
+ INIT_LIST_HEAD(&stop_done_list);
+ INIT_LIST_HEAD(&start_done_list);
+ stop_done_entries = 0;
+ start_done_entries = 0;
+
gh = group_init(NULL, "dlm", 1, &callbacks, GROUPD_TIMEOUT);
if (!gh) {
log_error("group_init error %p %d", gh, errno);
diff --git a/group/dlm_controld/main.c b/group/dlm_controld/main.c
index 1588605..d47ea63 100644
--- a/group/dlm_controld/main.c
+++ b/group/dlm_controld/main.c
@@ -17,15 +17,13 @@
#include <linux/dlm.h>
#include <linux/dlm_netlink.h>
-#define OPTION_STRING "KDhVd:"
+#define OPTION_STRING "KDhVd:y:"
#define LOCKFILE_NAME "/var/run/dlm_controld.pid"
#define DEADLOCK_CHECK_SECS 10
#define NALLOC 16
-struct list_head lockspaces;
-
extern group_handle_t gh;
extern int deadlock_enabled;
@@ -265,6 +263,7 @@ static void process_uevent(int ci)
ls->joining = 1;
list_add(&ls->list, &lockspaces);
+ lockspace_count++;
rv = group_join(gh, argv[3]);
@@ -611,8 +610,11 @@ void cluster_dead(int ci)
static int loop(void)
{
int rv, i;
+ int poll_timeout = -1;
void (*workfn) (int ci);
void (*deadfn) (int ci);
+ uint64_t push_begin = 0;
+ uint64_t now;
rv = setup_listener();
if (rv < 0)
@@ -646,7 +648,7 @@ static int loop(void)
for_loop:
for (;;) {
- rv = poll(pollfd, client_maxi + 1, -1);
+ rv = poll(pollfd, client_maxi + 1, poll_timeout);
if (rv == -1 && errno == EINTR) {
if (daemon_quit && list_empty(&lockspaces)) {
clear_configfs();
@@ -672,6 +674,56 @@ static int loop(void)
deadfn(i);
}
}
+
+ if (delay_done_cb && !list_empty(&stop_done_list)) {
+ if (!push_begin) {
+ push_begin = time(NULL);
+ poll_timeout = 1000;
+ }
+ now = time(NULL);
+
+ if ((stop_done_entries == lockspace_count) ||
+ (now - push_begin >= delay_done_cb)) {
+ if (stop_done_entries > 1) {
+ log_error("stop_done entries %d "
+ "count %d begin %llu "
+ "now %llu last stop %llu",
+ stop_done_entries,
+ lockspace_count,
+ (unsigned long long)push_begin,
+ (unsigned long long)now,
+ (unsigned long long)last_stop_time);
+ }
+ push_stop_done();
+ push_begin = 0;
+ poll_timeout = -1;
+ }
+ }
+
+ if (delay_done_cb && !list_empty(&start_done_list)) {
+ if (!push_begin) {
+ push_begin = time(NULL);
+ poll_timeout = 1000;
+ }
+ now = time(NULL);
+
+ if ((start_done_entries == lockspace_count) ||
+ (now - push_begin >= delay_done_cb)) {
+ if (start_done_entries > 1) {
+ log_error("start_done entries %d "
+ "count %d begin %llu "
+ "now %llu last start %llu",
+ start_done_entries,
+ lockspace_count,
+ (unsigned long long)push_begin,
+ (unsigned long long)now,
+ (unsigned long long)last_start_time);
+ }
+ push_start_done();
+ push_begin = 0;
+ poll_timeout = -1;
+ }
+ }
}
rv = 0;
out:
@@ -754,6 +806,7 @@ static void print_usage(void)
#endif
printf(" -D Enable debugging code and don't fork\n");
printf(" -K Enable kernel dlm debugging messages\n");
+ printf(" -y <sec> Delay done callbacks to groupd by this many seconds, default 0\n");
printf(" -h Print this help, then exit\n");
printf(" -V Print program version information, then exit\n");
}
@@ -780,6 +833,10 @@ static void decode_arguments(int argc, char **argv)
print_usage();
exit(EXIT_SUCCESS);
break;
+
+ case 'y':
+ delay_done_cb = atoi(optarg);
+ break;
#if DEADLOCK
case 'd':
deadlock_enabled = atoi(optarg);
@@ -844,6 +901,7 @@ int main(int argc, char **argv)
prog_name = argv[0];
INIT_LIST_HEAD(&lockspaces);
+ lockspace_count = 0;
decode_arguments(argc, argv);
@@ -871,4 +929,13 @@ char *prog_name;
int daemon_debug_opt;
char daemon_debug_buf[256];
int kernel_debug_opt;
+int delay_done_cb = 0;
+struct list_head stop_done_list;
+int stop_done_entries;
+struct list_head start_done_list;
+int start_done_entries;
+struct list_head lockspaces;
+int lockspace_count;
+uint64_t last_stop_time;
+uint64_t last_start_time;
diff --git a/group/dlm_controld/member_cman.c b/group/dlm_controld/member_cman.c
index 53ba72f..ce280b5 100644
--- a/group/dlm_controld/member_cman.c
+++ b/group/dlm_controld/member_cman.c
@@ -19,7 +19,6 @@ static cman_node_t old_nodes[MAX_NODES];
static int old_node_count;
static cman_node_t cman_nodes[MAX_NODES];
static int cman_node_count;
-extern struct list_head lockspaces;
static int is_member(cman_node_t *node_list, int count, int nodeid)
{
diff --git a/group/gfs_controld/group.c b/group/gfs_controld/group.c
index f797786..f3becb9 100644
--- a/group/gfs_controld/group.c
+++ b/group/gfs_controld/group.c
@@ -23,6 +23,7 @@ static unsigned int cb_id;
static int cb_type;
static int cb_member_count;
static int cb_members[MAX_GROUP_MEMBERS];
+static int last_action;
int do_stop(struct mountgroup *mg);
int do_finish(struct mountgroup *mg);
@@ -99,6 +100,55 @@ char *str_members(void)
return buf;
}
+struct save_done {
+ struct list_head list;
+ char name[MAX_GROUP_NAME_LEN+1];
+};
+
+void push_stop_done(void)
+{
+ struct save_done *sd, *safe;
+ int count;
+
+ if (stop_done_entries > 1)
+ log_error("push_stop_done begin %d", stop_done_entries);
+
+ count = 0;
+
+ list_for_each_entry_safe(sd, safe, &stop_done_list, list) {
+ group_stop_done(gh, sd->name);
+ list_del(&sd->list);
+ free(sd);
+ stop_done_entries--;
+ count++;
+ }
+
+ if (count > 1)
+ log_error("push_stop_done end %d", count);
+}
+
+int queue_stop_done(char *name)
+{
+ struct save_done *sd;
+
+ if (!delay_done_cb)
+ return -1;
+
+ if (last_action != DO_STOP)
+ return -1;
+
+ sd = malloc(sizeof(struct save_done));
+ if (!sd)
+ return -1;
+
+ memset(sd, 0, sizeof(struct save_done));
+ strcpy(sd->name, name);
+ list_add_tail(&sd->list, &stop_done_list);
+ stop_done_entries++;
+
+ return 0;
+}
+
int process_groupd(void)
{
struct mountgroup *mg;
@@ -122,6 +172,7 @@ int process_groupd(void)
switch (cb_action) {
case DO_STOP:
+ last_stop_time = time(NULL);
log_debug("groupd cb: stop %s", cb_name);
mg->last_callback = DO_STOP;
mg->last_stop = mg->last_start;
@@ -158,6 +209,7 @@ int process_groupd(void)
error = -EINVAL;
}
+ last_action = cb_action;
out:
cb_action = 0;
return error;
@@ -167,6 +219,9 @@ int setup_groupd(void)
{
int rv;
+ INIT_LIST_HEAD(&stop_done_list);
+ stop_done_entries = 0;
+
gh = group_init(NULL, LOCK_DLM_GROUP_NAME, LOCK_DLM_GROUP_LEVEL,
&callbacks, 10);
if (!gh) {
diff --git a/group/gfs_controld/lock_dlm.h b/group/gfs_controld/lock_dlm.h
index 746d0c7..cc12e1c 100644
--- a/group/gfs_controld/lock_dlm.h
+++ b/group/gfs_controld/lock_dlm.h
@@ -74,6 +74,13 @@ extern char dump_buf[DUMP_SIZE];
extern int dump_point;
extern int dump_wrap;
+extern int delay_done_cb;
+extern struct list_head stop_done_list;
+extern int stop_done_entries;
+extern struct list_head mounts;
+extern int mountgroup_count;
+extern uint64_t last_stop_time;
+
extern void daemon_dump_save(void);
#define log_debug(fmt, args...) \
@@ -283,6 +290,9 @@ int setup_plocks(void);
int process_plocks(void);
void exit_cman(void);
+void push_stop_done(void);
+int queue_stop_done(char *name);
+
int do_mount(int ci, char *dir, char *type, char *proto, char *table,
char *options, char *dev, struct mountgroup **mg_ret);
int do_unmount(int ci, char *dir, int mnterr);
diff --git a/group/gfs_controld/main.c b/group/gfs_controld/main.c
index b2edc32..7b6fd49 100644
--- a/group/gfs_controld/main.c
+++ b/group/gfs_controld/main.c
@@ -41,7 +41,6 @@ struct client {
int another_mount;
};
-extern struct list_head mounts;
extern struct list_head withdrawn_mounts;
extern group_handle_t gh;
@@ -544,6 +543,8 @@ int setup_uevent(void)
int loop(void)
{
int rv, i, f, error, poll_timeout = -1, ignore_plocks_fd = 0;
+ uint64_t push_begin = 0;
+ uint64_t now;
rv = listen_fd = setup_listen();
if (rv < 0)
@@ -654,6 +655,31 @@ int loop(void)
}
}
}
+
+ if (delay_done_cb && !list_empty(&stop_done_list)) {
+ if (!push_begin) {
+ push_begin = time(NULL);
+ poll_timeout = 1000;
+ }
+ now = time(NULL);
+
+ if ((stop_done_entries == mountgroup_count) ||
+ (now - push_begin >= delay_done_cb)) {
+ if (stop_done_entries > 1) {
+ log_error("stop_done entries %d "
+ "count %d begin %llu "
+ "now %llu last stop %llu",
+ stop_done_entries,
+ mountgroup_count,
+ (unsigned long long)push_begin,
+ (unsigned long long)now,
+ (unsigned long long)last_stop_time);
+ }
+ push_stop_done();
+ push_begin = 0;
+ poll_timeout = -1;
+ }
+ }
}
rv = 0;
out:
@@ -665,11 +691,12 @@ int loop(void)
#define DROP_RESOURCES_TIME_PATH "/cluster/gfs_controld/@drop_resources_time"
#define DROP_RESOURCES_COUNT_PATH "/cluster/gfs_controld/@drop_resources_count"
#define DROP_RESOURCES_AGE_PATH "/cluster/gfs_controld/@drop_resources_age"
+#define DELAY_DONE_PATH "/cluster/gfs_controld/@delay_done"
static void set_ccs_config(void)
{
char path[PATH_MAX], *str;
- int i = 0, cd, error;
+ int i = 0, cd, error, rv;
while ((cd = ccs_connect()) < 0) {
sleep(1);
@@ -737,6 +764,23 @@ static void set_ccs_config(void)
}
if (str)
free(str);
+
+ memset(path, 0, PATH_MAX);
+ snprintf(path, PATH_MAX, "%s", DELAY_DONE_PATH);
+ str = NULL;
+
+ error = ccs_get(cd, path, &str);
+ if (!error) {
+ rv = atoi(str);
+ if (rv < 0) {
+ log_error("invalid delay_done from ccs");
+ } else {
+ delay_done_cb = rv;
+ log_error("ccs gfs_controld/delay_done %d", rv);
+ }
+ }
+ if (str)
+ free(str);
}
static void lockfile(void)
@@ -823,6 +867,7 @@ static void print_usage(void)
printf(" Default is %u\n", DEFAULT_DROP_RESOURCES_COUNT);
printf(" -a <ms> drop resources age (milliseconds)\n");
printf(" Default is %u\n", DEFAULT_DROP_RESOURCES_AGE);
+ printf(" -y <sec> Delay done callbacks to groupd by this many seconds, default 0\n");
printf(" -h Print this help, then exit\n");
printf(" -V Print program version information, then exit\n");
}
@@ -880,6 +925,10 @@ static void decode_arguments(int argc, char **argv)
opt_drop_resources_age = 1;
break;
+ case 'y':
+ delay_done_cb = atoi(optarg);
+ break;
+
case 'h':
print_usage();
exit(EXIT_SUCCESS);
@@ -945,6 +994,7 @@ int main(int argc, char **argv)
INIT_LIST_HEAD(&mounts);
INIT_LIST_HEAD(&withdrawn_mounts);
+ mountgroup_count = 0;
config_no_withdraw = DEFAULT_NO_WITHDRAW;
config_no_plock = DEFAULT_NO_PLOCK;
@@ -1006,4 +1056,10 @@ char daemon_debug_buf[256];
char dump_buf[DUMP_SIZE];
int dump_point;
int dump_wrap;
+int delay_done_cb = 0;
+struct list_head stop_done_list;
+int stop_done_entries;
+struct list_head mounts;
+int mountgroup_count;
+uint64_t last_stop_time;
diff --git a/group/gfs_controld/recover.c b/group/gfs_controld/recover.c
index 52d96ff..3eec64f 100644
--- a/group/gfs_controld/recover.c
+++ b/group/gfs_controld/recover.c
@@ -1676,6 +1676,7 @@ int do_mount(int ci, char *dir, char *type, char *proto, char *table,
}
list_add(&mg->list, &mounts);
+ mountgroup_count++;
group_join(gh, name);
rv = 0;
out:
@@ -1996,6 +1997,7 @@ int do_unmount(int ci, char *dir, int mnterr)
free(mp);
if (list_empty(&mg->mountpoints)) {
list_del(&mg->list);
+ mountgroup_count--;
free(mg);
}
return 0;
@@ -2342,7 +2344,9 @@ int do_stop(struct mountgroup *mg)
}
}
out:
- group_stop_done(gh, mg->name);
+ rv = queue_stop_done(mg->name);
+ if (rv < 0)
+ group_stop_done(gh, mg->name);
return 0;
}
14 years
cluster: RHEL54 - groupd: clean up leaving failed node
by David Teigland
Gitweb: http://git.fedorahosted.org/git/cluster.git?p=cluster.git;a=commitdiff;h=...
Commit: 971ae8c3478df85910f65ea26dd29b37c7ad0a06
Parent: dc78211c8df17704c76ef968f7850495094f8c03
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Wed Dec 9 16:53:33 2009 -0600
Committer: David Teigland <teigland(a)redhat.com>
CommitterDate: Mon Mar 29 11:09:58 2010 -0500
groupd: clean up leaving failed node
bz 521817
Due to shutdown+failure scenarios that aren't fully understood,
a node that fails while shutting down can cause the other nodes
to get stuck trying to restart the clvmd group (whether other
groups could be affected is unknown).
The other nodes will all show something like this from group_tool -v:
dlm 1 clvmd 00010002 LEAVE_STOP_WAIT 1 100020002 1
and group_tool dump will show things like:
1260396236 1:clvmd waiting for 1 more stopped messages before LEAVE_ALL_STOPPED 1
1260396236 1:clvmd waiting for 1 more stopped messages before LEAVE_ALL_STOPPED 1
This fix is to more or less watch out for this very specific
situation where things get messed up and forcibly clean things
up so the other nodes aren't stuck.
Signed-off-by: David Teigland <teigland(a)redhat.com>
---
group/daemon/app.c | 41 +++++++++++++++++++++++++++++++++++++++++
1 files changed, 41 insertions(+), 0 deletions(-)
diff --git a/group/daemon/app.c b/group/daemon/app.c
index df17896..03952df 100644
--- a/group/daemon/app.c
+++ b/group/daemon/app.c
@@ -228,6 +228,8 @@ struct recovery_set *get_recovery_set(int nodeid)
and goes away, and then we get the add_recovery_set_cpg() matching
the _cman() variant that we ignored? */
+static void clean_up_dead_node(int nodeid);
+
void add_recovery_set_cman(int nodeid)
{
struct recovery_set *rs;
@@ -245,6 +247,9 @@ void add_recovery_set_cman(int nodeid)
log_debug("free recovery set %d not running groupd", nodeid);
list_del(&rs->list);
free(rs);
+
+ clean_up_dead_node(nodeid);
+
return;
}
@@ -1845,3 +1850,39 @@ void groupd_down(int nodeid)
}
}
+/* More hacks to try to work around similar kinds of problems that don't
+ make much sense, bz 521817. I believe the following produces effectively
+ the same problem as in the bz, on one node:
+ service cman start (with groupd -s0, not sure if this could happen otherwise)
+ service clvmd start
+ killall -9 dlm_controld
+ killall -9 groupd
+ killall -9 aisexec
+
+ At this point, the clvmd group in groupd on the other nodes is stuck in
+ LEAVE_ALL_STOPPED waiting for a stopped message from the killed node.
+ The groupd cpg confchg would ordinarily clean things up, but that probably
+ doesn't do anything because the event type is LEAVE instead of a failure.
+ Another way to deal with this would possibly be to do it when we see
+ the nodeid leave the groupd cpg. */
+
+static void clean_up_dead_node(int nodeid)
+{
+ group_t *g;
+ event_t *ev;
+
+ list_for_each_entry(g, &gd_groups, list) {
+ if (g->app && g->app->current_event &&
+ g->app->current_event->nodeid == nodeid) {
+ ev = g->app->current_event;
+
+ log_group(g, "clean_up_dead_node %d ev %d", nodeid,
+ ev->state);
+
+ if (ev->state == EST_LEAVE_STOP_WAIT) {
+ mark_node_stopped(g->app, nodeid);
+ }
+ }
+ }
+}
+
14 years