This is an automated email from the git hooks/post-receive script.
teigland pushed a commit to branch master
in repository sanlock.
commit 66ed5ea259da54939ae8c02533a4238c6e3e053d
Author: David Teigland <teigland(a)redhat.com>
AuthorDate: Mon Aug 19 16:22:39 2024 -0500
sanlock: reuse io buffer with hugepages
Avoid repeating malloc/madvise/free every time an io buffer
is needed to read the full paxos lease area.
---
src/paxos_lease.c | 168 ++++++++++++++++++++++++++++++++++++-------------
src/sanlock_internal.h | 6 +-
src/task.c | 3 +
3 files changed, 133 insertions(+), 44 deletions(-)
diff --git a/src/paxos_lease.c b/src/paxos_lease.c
index e62a93f..8f7d816 100644
--- a/src/paxos_lease.c
+++ b/src/paxos_lease.c
@@ -505,7 +505,8 @@ static int run_ballot(struct task *task, struct token *token, uint32_t flags,
int read_len;
int iobuf_len;
int read_mem_align;
- int is_huge = 0;
+ int use_huge = 0;
+ int use_task_hugebuf = 0;
int phase2 = 0;
int d, q, rv = 0;
int q_max = -1;
@@ -517,34 +518,59 @@ static int run_ballot(struct task *task, struct token *token, uint32_t flags,
iobuf_len = token->align_size; /* can increase (from 1MB) for hugepage */
read_mem_align = getpagesize(); /* increases to 2MB for hugepage */
- /* TODO: for hugepages, the task should keep a huge iobuf around to reuse
- rather than alloc+madvise+free each time; after a while of heavy use,
- madvise may be less likely to successfully turn the iobuf into a hugepage. */
-
- if (com.use_hugepages && ((iobuf_len == ONE_MB_IN_BYTES) || !(iobuf_len % TWO_MB_IN_BYTES))) {
+ /* Determine if hugepages/hubebuf is applicable. */
+ if (com.use_hugepages && (num_disks == 1) &&
+ ((iobuf_len == ONE_MB_IN_BYTES) || !(iobuf_len % TWO_MB_IN_BYTES))) {
read_mem_align = TWO_MB_IN_BYTES;
if (iobuf_len < TWO_MB_IN_BYTES)
iobuf_len = TWO_MB_IN_BYTES;
- is_huge = 1;
+ use_huge = 1;
+ }
+
+ /* Reuse a previously allocated buffer. */
+ if (use_huge && task->paxos_hugebuf && (task->paxos_hugebuf_size >= iobuf_len)) {
+ use_task_hugebuf = 1;
+ iobuf_array[0] = task->paxos_hugebuf;
+ task->paxos_hugebuf_used++;
+ goto mem_done;
}
+ /* Allocate a new buffer. */
for (d = 0; d < num_disks; d++) {
p_iobuf[d] = &iobuf_array[d];
rv = posix_memalign((void *)p_iobuf[d], read_mem_align, iobuf_len);
if (rv)
return rv;
+ }
+
+ /* Make the new buffer use hugepages. */
+ if (use_huge) {
+ char *bp;
+ iobuf = iobuf_array[0];
+ madvise(iobuf, iobuf_len, MADV_HUGEPAGE);
+ /* allocate the pages */
+ for (bp = iobuf; bp < iobuf+iobuf_len; bp += TWO_MB_IN_BYTES)
+ memset(bp, 0, 1);
- if (is_huge) {
- char *bp;
- iobuf = iobuf_array[d];
- madvise(iobuf, iobuf_len, MADV_HUGEPAGE);
- /* allocate the pages */
- for (bp = iobuf; bp < iobuf+iobuf_len; bp += TWO_MB_IN_BYTES)
- memset(bp, 0, 1);
+ /*
+ * Save the new buffer to reuse if the task doesn't
+ * have one (Note: the task may already have paxos_hugebuf
+ * with a smaller size than we need here, in which case we
+ * ignore that, allocate a new one, and free it at the end.)
+ * TODO: add config min_hugebuf_size to avoid that case?
+ */
+ if (!task->paxos_hugebuf) {
+ task->paxos_hugebuf = iobuf;
+ task->paxos_hugebuf_size = iobuf_len;
+ task->paxos_hugebuf_created++;
+ use_task_hugebuf = 1;
+ log_taskd(task, "created hugebuf %p %u", iobuf, task->paxos_hugebuf_created);
}
}
+ mem_done:
+
/*
* phase 1
*
@@ -570,7 +596,7 @@ static int run_ballot(struct task *task, struct token *token, uint32_t flags,
num_writes = 0;
for (d = 0; d < num_disks; d++) {
- /* acquire io: write 1 */
+ /* acquire io: write 1 (sector) */
rv = write_dblock(task, token, &token->disks[d], token->host_id, &dblock);
if (rv < 0)
continue;
@@ -599,10 +625,17 @@ static int run_ballot(struct task *task, struct token *token, uint32_t flags,
memset(iobuf, 0, read_len);
- /* acquire io: read 2 */
+ /* acquire io: read 2 (lease area) */
rv = read_iobuf(disk->fd, disk->offset, iobuf, read_len, task, token->io_timeout, NULL);
- if (rv == SANLK_AIO_TIMEOUT)
+ if (rv == SANLK_AIO_TIMEOUT) {
iobuf_array[d] = NULL;
+ if (use_task_hugebuf) {
+ /* hugebuf will be freed when aio completes */
+ task->paxos_hugebuf = NULL;
+ task->paxos_hugebuf_size = 0;
+ use_task_hugebuf = 0;
+ }
+ }
if (rv < 0)
continue;
num_reads++;
@@ -788,7 +821,7 @@ static int run_ballot(struct task *task, struct token *token, uint32_t flags,
num_writes = 0;
for (d = 0; d < num_disks; d++) {
- /* acquire io: write 2 */
+ /* acquire io: write 2 (sector) */
rv = write_dblock(task, token, &token->disks[d], token->host_id, &dblock);
if (rv < 0)
continue;
@@ -817,10 +850,17 @@ static int run_ballot(struct task *task, struct token *token, uint32_t flags,
memset(iobuf, 0, read_len);
- /* acquire io: read 3 */
+ /* acquire io: read 3 (lease area) */
rv = read_iobuf(disk->fd, disk->offset, iobuf, read_len, task, token->io_timeout, NULL);
- if (rv == SANLK_AIO_TIMEOUT)
+ if (rv == SANLK_AIO_TIMEOUT) {
iobuf_array[d] = NULL;
+ if (use_task_hugebuf) {
+ /* hugebuf will be freed when aio completes */
+ task->paxos_hugebuf = NULL;
+ task->paxos_hugebuf_size = 0;
+ use_task_hugebuf = 0;
+ }
+ }
if (rv < 0)
continue;
num_reads++;
@@ -939,13 +979,19 @@ static int run_ballot(struct task *task, struct token *token, uint32_t flags,
memcpy(dblock_out, &dblock, sizeof(struct paxos_dblock));
error = SANLK_OK;
out:
- for (d = 0; d < num_disks; d++) {
- /* don't free iobufs that have timed out */
- if (!iobuf_array[d])
- continue;
-
- /* TODO: the task should keep huge iobuf and reuse it. */
- free(iobuf_array[d]);
+ if (use_task_hugebuf) {
+ if (rv != SANLK_AIO_TIMEOUT) {
+ /* Keep buffer to reuse. */
+ memset(task->paxos_hugebuf, 0, task->paxos_hugebuf_size);
+ }
+ } else {
+ for (d = 0; d < num_disks; d++) {
+ /* don't free iobufs that have timed out */
+ if (!iobuf_array[d])
+ continue;
+ free(iobuf_array[d]);
+ iobuf_array[d] = NULL;
+ }
}
if (phase2 && (error < 0) &&
@@ -1415,7 +1461,8 @@ static int _lease_read_one(struct task *task,
int read_len;
int iobuf_len;
int read_mem_align;
- int is_huge = 0;
+ int use_huge = 0;
+ int use_task_hugebuf = 0;
int q, tmp_q = -1, rv;
/* FIXME: what case is this? */
@@ -1428,27 +1475,48 @@ static int _lease_read_one(struct task *task,
iobuf_len = token->align_size; /* can increase (from 1MB) for hugepage */
read_mem_align = getpagesize(); /* increases to 2MB for hugepage */
+ /* Determine if hugepages/hubebuf is applicable. */
if (com.use_hugepages && ((iobuf_len == ONE_MB_IN_BYTES) || !(iobuf_len % TWO_MB_IN_BYTES))) {
read_mem_align = TWO_MB_IN_BYTES;
if (iobuf_len < TWO_MB_IN_BYTES)
iobuf_len = TWO_MB_IN_BYTES;
- is_huge = 1;
+ use_huge = 1;
}
- p_iobuf = &iobuf;
+ /* Reuse a previously allocated buffer. */
+ if (use_huge && task->paxos_hugebuf && (task->paxos_hugebuf_size >= iobuf_len)) {
+ use_task_hugebuf = 1;
+ iobuf = task->paxos_hugebuf;
+ task->paxos_hugebuf_used++;
+ goto mem_done;
+ }
+ /* Allocate a new buffer. */
+ p_iobuf = &iobuf;
rv = posix_memalign((void *)p_iobuf, read_mem_align, iobuf_len);
if (rv)
return rv;
- if (is_huge) {
+ /* Make the new buffer use hugepages. */
+ if (use_huge) {
char *bp;
madvise(iobuf, iobuf_len, MADV_HUGEPAGE);
/* allocate the pages */
for (bp = iobuf; bp < iobuf+iobuf_len; bp += TWO_MB_IN_BYTES)
memset(bp, 0, 1);
+
+ /* Save the new buffer to reuse if the task doesn't have one. */
+ if (!task->paxos_hugebuf) {
+ task->paxos_hugebuf = iobuf;
+ task->paxos_hugebuf_size = iobuf_len;
+ task->paxos_hugebuf_created++;
+ use_task_hugebuf = 1;
+ log_taskd(task, "created hugebuf %p %u", iobuf, task->paxos_hugebuf_created);
+ }
}
+ mem_done:
+
rv = read_iobuf(disk->fd, disk->offset, iobuf, read_len, task, token->io_timeout, NULL);
if (rv < 0)
goto out;
@@ -1520,8 +1588,20 @@ static int _lease_read_one(struct task *task,
bk_debug);
out:
- if (rv != SANLK_AIO_TIMEOUT)
- free(iobuf);
+ if (use_task_hugebuf) {
+ if (rv != SANLK_AIO_TIMEOUT) {
+ /* Keep buffer to reuse. */
+ memset(task->paxos_hugebuf, 0, task->paxos_hugebuf_size);
+ } else {
+ /* hugebuf will be freed when aio completes */
+ task->paxos_hugebuf = NULL;
+ task->paxos_hugebuf_size = 0;
+ }
+ } else {
+ if (rv != SANLK_AIO_TIMEOUT)
+ free(iobuf);
+ /* else iobuf will be freed when aio completes */
+ }
return rv;
}
@@ -1717,19 +1797,19 @@ static int write_new_leader(struct task *task,
*/
/*
- * i/o required to acquire a free lease
- * (1 disk in token, 512 byte sectors, default num_hosts of 2000)
+ * acquire io: disk ios needed to acquire a free and uncontended lease
*
* paxos_lease_acquire()
- * paxos_lease_read() 1 read 1 MB (entire lease area)
- * run_ballot()
- * write_dblock() 1 write 512 bytes (1 dblock sector)
- * read_iobuf() 1 read 1 MB (round up num_hosts + 2 sectors)
- * write_dblock() 1 write 512 bytes (1 dblock sector)
- * read_iobuf() 1 read 1 MB (round up num_hosts + 2 sectors)
- * write_new_leader() 1 write 512 bytes (1 leader sector)
+ * R1 paxos_lease_read() 1 read 1 MB (lease area size)
+ * run_ballot()
+ * W1 write_dblock() 1 write 512 bytes (1 dblock sector)
+ * R2 read_iobuf() 1 read 1 MB (lease area size)
+ * W2 write_dblock() 1 write 512 bytes (1 dblock sector)
+ * R3 read_iobuf() 1 read 1 MB (lease area size)
+ * W3 write_new_leader() 1 write 512 bytes (1 leader sector)
*
* 6 i/os = 3 1MB reads, 3 512 byte writes
+ * (assuming 512 sector size and 1MB area size)
*/
int paxos_lease_acquire(struct task *task,
@@ -1778,7 +1858,7 @@ int paxos_lease_acquire(struct task *task,
memset(&tmp_leader, 0, sizeof(tmp_leader));
copy_cur_leader = 0;
- /* acquire io: read 1 */
+ /* acquire io: read 1 (lease area) */
error = paxos_lease_read(task, token, flags, &cur_leader, &max_mbal, "paxos_acquire", 1);
if (error < 0)
goto out;
@@ -2216,6 +2296,7 @@ int paxos_lease_acquire(struct task *task,
next_lver = larger_lver + 1;
}
+ /* acquire io: write 1 (sector), read 2 (lease area), write 2 (sector), read 3 (lease area) */
error = run_ballot(task, token, flags, cur_leader.num_hosts, next_lver, our_mbal, &dblock, &other_max_mbal, &larger_lver);
if ((error == SANLK_DBLOCK_MBAL) || (error == SANLK_DBLOCK_LVER)) {
@@ -2296,6 +2377,7 @@ int paxos_lease_acquire(struct task *task,
new_leader.checksum = 0; /* set after leader_record_out */
+ /* acquire io: write 3 (sector) */
error = write_new_leader(task, token, &new_leader, "paxos_acquire");
if (error < 0) {
/* See comment in run_ballot about this flag. */
diff --git a/src/sanlock_internal.h b/src/sanlock_internal.h
index 7725eb6..22b7698 100644
--- a/src/sanlock_internal.h
+++ b/src/sanlock_internal.h
@@ -276,10 +276,14 @@ struct task {
unsigned int io_count; /* stats */
unsigned int to_count; /* stats */
+ unsigned int paxos_hugebuf_used; /* stats */
+ unsigned int paxos_hugebuf_created; /* stats */
int use_aio;
int cb_size;
- char *iobuf;
+ int paxos_hugebuf_size;
+ char *paxos_hugebuf; /* paxos_lease_acquire reads of lease area */
+ char *iobuf; /* delta_lease_renew reads of lease area */
io_context_t aio_ctx;
struct aicb *read_iobuf_timeout_aicb;
struct aicb *callbacks;
diff --git a/src/task.c b/src/task.c
index ad2e761..8b73097 100644
--- a/src/task.c
+++ b/src/task.c
@@ -145,6 +145,9 @@ void close_task_aio(struct task *task)
if (task->iobuf)
free(task->iobuf);
+ if (task->paxos_hugebuf)
+ free(task->paxos_hugebuf);
+
skip_aio:
if (task->callbacks)
free(task->callbacks);
--
To stop receiving notification emails like this one, please contact
the administrator of this repository.