@@ -59,6 +59,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += eal_common_fbarray.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_malloc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_elem.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_heap.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += malloc_mp.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_keepalive.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_service.c
SRCS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += rte_reciprocal.c
@@ -654,6 +654,9 @@ rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
int i, ms_idx, ret = 0;
+ /* do not allow allocations/frees/init while we iterate */
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+
for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
struct rte_memseg_list *msl = &mcfg->memsegs[i];
const struct rte_memseg *ms;
@@ -678,15 +681,20 @@ rte_memseg_contig_walk(rte_memseg_contig_walk_t func, void *arg)
len = n_segs * msl->page_sz;
ret = func(msl, ms, len, arg);
- if (ret < 0)
- return -1;
- else if (ret > 0)
- return 1;
+ if (ret < 0) {
+ ret = -1;
+ goto out;
+ } else if (ret > 0) {
+ ret = 1;
+ goto out;
+ }
ms_idx = rte_fbarray_find_next_used(arr,
ms_idx + n_segs);
}
}
- return 0;
+out:
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+ return ret;
}
int __rte_experimental
@@ -695,6 +703,9 @@ rte_memseg_walk(rte_memseg_walk_t func, void *arg)
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
int i, ms_idx, ret = 0;
+ /* do not allow allocations/frees/init while we iterate */
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+
for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
struct rte_memseg_list *msl = &mcfg->memsegs[i];
const struct rte_memseg *ms;
@@ -709,14 +720,19 @@ rte_memseg_walk(rte_memseg_walk_t func, void *arg)
while (ms_idx >= 0) {
ms = rte_fbarray_get(arr, ms_idx);
ret = func(msl, ms, arg);
- if (ret < 0)
- return -1;
- else if (ret > 0)
- return 1;
+ if (ret < 0) {
+ ret = -1;
+ goto out;
+ } else if (ret > 0) {
+ ret = 1;
+ goto out;
+ }
ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
}
}
- return 0;
+out:
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+ return ret;
}
int __rte_experimental
@@ -725,6 +741,9 @@ rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg)
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
int i, ret = 0;
+ /* do not allow allocations/frees/init while we iterate */
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+
for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
struct rte_memseg_list *msl = &mcfg->memsegs[i];
@@ -732,12 +751,18 @@ rte_memseg_list_walk(rte_memseg_list_walk_t func, void *arg)
continue;
ret = func(msl, arg);
- if (ret < 0)
- return -1;
- if (ret > 0)
- return 1;
+ if (ret < 0) {
+ ret = -1;
+ goto out;
+ }
+ if (ret > 0) {
+ ret = 1;
+ goto out;
+ }
}
- return 0;
+out:
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+ return ret;
}
/* init memory subsystem */
@@ -751,6 +776,9 @@ rte_eal_memory_init(void)
if (!mcfg)
return -1;
+ /* lock mem hotplug here, to prevent races while we init */
+ rte_rwlock_read_lock(&mcfg->memory_hotplug_lock);
+
retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
#ifndef RTE_ARCH_64
memseg_primary_init_32() :
@@ -760,16 +788,19 @@ rte_eal_memory_init(void)
memseg_secondary_init();
if (retval < 0)
- return -1;
+ goto fail;
retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
rte_eal_hugepage_init() :
rte_eal_hugepage_attach();
if (retval < 0)
- return -1;
+ goto fail;
if (internal_config.no_shconf == 0 && rte_eal_memdevice_init() < 0)
- return -1;
+ goto fail;
return 0;
+fail:
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
+ return -1;
}
@@ -59,6 +59,9 @@ struct rte_mem_config {
rte_rwlock_t qlock; /**< used for tailq operation for thread safe. */
rte_rwlock_t mplock; /**< only used by mempool LIB for thread-safe. */
+ rte_rwlock_t memory_hotplug_lock;
+ /**< indicates whether memory hotplug request is in progress. */
+
/* memory segments and zones */
struct rte_fbarray memzones; /**< Memzone descriptors. */
@@ -460,74 +460,80 @@ malloc_elem_hide_region(struct malloc_elem *elem, void *start, size_t len)
prev = elem->prev;
next = elem->next;
- len_before = RTE_PTR_DIFF(hide_start, elem);
- len_after = RTE_PTR_DIFF(next, hide_end);
-
- if (len_after >= MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
- /* split after */
- split_elem(elem, hide_end);
-
- malloc_elem_free_list_insert(hide_end);
- } else if (len_after >= MALLOC_ELEM_HEADER_LEN) {
- /* shrink current element */
- elem->size -= len_after;
- memset(hide_end, 0, sizeof(*hide_end));
-
- /* copy next element's data to our pad */
- memcpy(hide_end, next, sizeof(*hide_end));
-
- /* pad next element */
- next->state = ELEM_PAD;
- next->pad = len_after;
-
- /* next element is busy, would've been merged otherwise */
- hide_end->pad = len_after;
- hide_end->size += len_after;
-
- /* adjust pointers to point to our new pad */
- if (next->next)
- next->next->prev = hide_end;
- elem->next = hide_end;
- } else if (len_after > 0) {
- RTE_LOG(ERR, EAL, "Unaligned element, heap is probably corrupt\n");
- rte_panic("blow up\n");
- return;
+ /* we cannot do anything with non-adjacent elements */
+ if (next && next_elem_is_adjacent(elem)) {
+ len_after = RTE_PTR_DIFF(next, hide_end);
+ if (len_after >= MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
+ /* split after */
+ split_elem(elem, hide_end);
+
+ malloc_elem_free_list_insert(hide_end);
+ } else if (len_after >= MALLOC_ELEM_HEADER_LEN) {
+ /* shrink current element */
+ elem->size -= len_after;
+ memset(hide_end, 0, sizeof(*hide_end));
+
+ /* copy next element's data to our pad */
+ memcpy(hide_end, next, sizeof(*hide_end));
+
+ /* pad next element */
+ next->state = ELEM_PAD;
+ next->pad = len_after;
+ next->size -= len_after;
+
+ /* next element busy, would've been merged otherwise */
+ hide_end->pad = len_after;
+ hide_end->size += len_after;
+
+ /* adjust pointers to point to our new pad */
+ if (next->next)
+ next->next->prev = hide_end;
+ elem->next = hide_end;
+ } else if (len_after > 0) {
+ RTE_LOG(ERR, EAL, "Unaligned element, heap is probably corrupt\n");
+ return;
+ }
}
- if (len_before >= MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
- /* split before */
- split_elem(elem, hide_start);
+ /* we cannot do anything with non-adjacent elements */
+ if (prev && prev_elem_is_adjacent(elem)) {
+ len_before = RTE_PTR_DIFF(hide_start, elem);
+ if (len_before >= MALLOC_ELEM_OVERHEAD + MIN_DATA_SIZE) {
+ /* split before */
+ split_elem(elem, hide_start);
- prev = elem;
- elem = hide_start;
+ prev = elem;
+ elem = hide_start;
- malloc_elem_free_list_insert(prev);
- } else if (len_before > 0) {
- /*
- * unlike with elements after current, here we don't need to
- * pad elements, but rather just increase the size of previous
- * element, copy the old header and and set up trailer.
- */
- void *trailer = RTE_PTR_ADD(prev,
- prev->size - MALLOC_ELEM_TRAILER_LEN);
+ malloc_elem_free_list_insert(prev);
+ } else if (len_before > 0) {
+ /*
+ * unlike with elements after current, here we don't
+ * need to pad elements, but rather just increase the
+ * size of previous element, copy the old header and set
+ * up trailer.
+ */
+ void *trailer = RTE_PTR_ADD(prev,
+ prev->size - MALLOC_ELEM_TRAILER_LEN);
- memcpy(hide_start, elem, sizeof(*elem));
- hide_start->size = len;
+ memcpy(hide_start, elem, sizeof(*elem));
+ hide_start->size = len;
- prev->size += len_before;
- set_trailer(prev);
+ prev->size += len_before;
+ set_trailer(prev);
- /* update pointers */
- prev->next = hide_start;
- if (next)
- next->prev = hide_start;
+ /* update pointers */
+ prev->next = hide_start;
+ if (next)
+ next->prev = hide_start;
- elem = hide_start;
+ /* erase old trailer */
+ memset(trailer, 0, MALLOC_ELEM_TRAILER_LEN);
+ /* erase old header */
+ memset(elem, 0, sizeof(*elem));
- /* erase old trailer */
- memset(trailer, 0, MALLOC_ELEM_TRAILER_LEN);
- /* erase old header */
- memset(elem, 0, sizeof(*elem));
+ elem = hide_start;
+ }
}
remove_elem(elem);
@@ -10,6 +10,7 @@
#include <sys/queue.h>
#include <rte_memory.h>
+#include <rte_errno.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
#include <rte_launch.h>
@@ -26,6 +27,7 @@
#include "eal_memalloc.h"
#include "malloc_elem.h"
#include "malloc_heap.h"
+#include "malloc_mp.h"
static unsigned
check_hugepage_sz(unsigned flags, uint64_t hugepage_sz)
@@ -81,8 +83,6 @@ malloc_heap_add_memory(struct malloc_heap *heap, struct rte_memseg_list *msl,
malloc_elem_free_list_insert(elem);
- heap->total_size += len;
-
return elem;
}
@@ -171,68 +171,118 @@ heap_alloc(struct malloc_heap *heap, const char *type __rte_unused, size_t size,
return elem == NULL ? NULL : (void *)(&elem[1]);
}
-static int
-try_expand_heap(struct malloc_heap *heap, size_t pg_sz, size_t elt_size,
+/* this function is exposed in malloc_mp.h */
+void
+rollback_expand_heap(struct rte_memseg **ms, int n_segs,
+ struct malloc_elem *elem, void *map_addr, size_t map_len)
+{
+ if (elem != NULL) {
+ malloc_elem_free_list_remove(elem);
+ malloc_elem_hide_region(elem, map_addr, map_len);
+ }
+
+ eal_memalloc_free_seg_bulk(ms, n_segs);
+}
+
+/* this function is exposed in malloc_mp.h */
+struct malloc_elem *
+alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
int socket, unsigned int flags, size_t align, size_t bound,
- bool contig)
+ bool contig, struct rte_memseg **ms, int n_segs)
{
- size_t map_len;
struct rte_memseg_list *msl;
- struct rte_memseg **ms;
- struct malloc_elem *elem;
- int n_segs, allocd_pages;
+ struct malloc_elem *elem = NULL;
+ size_t alloc_sz;
+ int allocd_pages;
void *ret, *map_addr;
- align = RTE_MAX(align, MALLOC_ELEM_HEADER_LEN);
- map_len = RTE_ALIGN_CEIL(align + elt_size + MALLOC_ELEM_TRAILER_LEN,
- pg_sz);
-
- n_segs = map_len / pg_sz;
-
- /* we can't know in advance how many pages we'll need, so malloc */
- ms = malloc(sizeof(*ms) * n_segs);
-
allocd_pages = eal_memalloc_alloc_seg_bulk(ms, n_segs, pg_sz,
socket, true);
/* make sure we've allocated our pages... */
if (allocd_pages < 0)
- goto free_ms;
+ return NULL;
map_addr = ms[0]->addr;
msl = rte_mem_virt2memseg_list(map_addr);
+ alloc_sz = (size_t)msl->page_sz * allocd_pages;
/* check if we wanted contiguous memory but didn't get it */
- if (contig && !eal_memalloc_is_contig(msl, map_addr, map_len)) {
+ if (contig && !eal_memalloc_is_contig(msl, map_addr, alloc_sz)) {
RTE_LOG(DEBUG, EAL, "%s(): couldn't allocate physically contiguous space\n",
__func__);
- goto free_pages;
+ goto fail;
}
/* add newly minted memsegs to malloc heap */
- elem = malloc_heap_add_memory(heap, msl, map_addr, map_len);
+ elem = malloc_heap_add_memory(heap, msl, map_addr, alloc_sz);
/* try once more, as now we have allocated new memory */
ret = find_suitable_element(heap, elt_size, flags, align, bound,
contig);
if (ret == NULL)
+ goto fail;
+
+ return elem;
+
+fail:
+ rollback_expand_heap(ms, n_segs, elem, map_addr, alloc_sz);
+ return NULL;
+}
+
+static int
+try_expand_heap_primary(struct malloc_heap *heap, uint64_t pg_sz,
+ size_t elt_size, int socket, unsigned int flags, size_t align,
+ size_t bound, bool contig)
+{
+ struct malloc_elem *elem;
+ struct rte_memseg **ms;
+ void *map_addr;
+ size_t alloc_sz;
+ int n_segs;
+
+ alloc_sz = RTE_ALIGN_CEIL(align + elt_size +
+ MALLOC_ELEM_TRAILER_LEN, pg_sz);
+ n_segs = alloc_sz / pg_sz;
+
+ /* we can't know in advance how many pages we'll need, so we malloc */
+ ms = malloc(sizeof(*ms) * n_segs);
+
+ memset(ms, 0, sizeof(*ms) * n_segs);
+
+ if (ms == NULL)
+ return -1;
+
+ elem = alloc_pages_on_heap(heap, pg_sz, elt_size, socket, flags, align,
+ bound, contig, ms, n_segs);
+
+ if (elem == NULL)
+ goto free_ms;
+
+ map_addr = ms[0]->addr;
+
+ /* notify other processes that this has happened */
+ if (request_sync()) {
+ /* we couldn't ensure all processes have mapped memory,
+ * so free it back and notify everyone that it's been
+ * freed back.
+ */
goto free_elem;
+ }
+ heap->total_size += alloc_sz;
RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n",
- socket, map_len >> 20ULL);
+ socket, alloc_sz >> 20ULL);
free(ms);
return 0;
free_elem:
- malloc_elem_free_list_remove(elem);
- malloc_elem_hide_region(elem, map_addr, map_len);
- heap->total_size -= map_len;
+ rollback_expand_heap(ms, n_segs, elem, map_addr, alloc_sz);
-free_pages:
- eal_memalloc_free_seg_bulk(ms, n_segs);
+ request_sync();
free_ms:
free(ms);
@@ -240,6 +290,59 @@ try_expand_heap(struct malloc_heap *heap, size_t pg_sz, size_t elt_size,
}
static int
+try_expand_heap_secondary(struct malloc_heap *heap, uint64_t pg_sz,
+ size_t elt_size, int socket, unsigned int flags, size_t align,
+ size_t bound, bool contig)
+{
+ struct malloc_mp_req req;
+ int req_result;
+
+ memset(&req, 0, sizeof(req));
+
+ req.t = REQ_TYPE_ALLOC;
+ req.alloc_req.align = align;
+ req.alloc_req.bound = bound;
+ req.alloc_req.contig = contig;
+ req.alloc_req.flags = flags;
+ req.alloc_req.elt_size = elt_size;
+ req.alloc_req.page_sz = pg_sz;
+ req.alloc_req.socket = socket;
+ req.alloc_req.heap = heap; /* it's in shared memory */
+
+ req_result = request_to_primary(&req);
+
+ if (req_result != 0)
+ return -1;
+
+ if (req.result != REQ_RESULT_SUCCESS)
+ return -1;
+
+ return 0;
+}
+
+static int
+try_expand_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
+ int socket, unsigned int flags, size_t align, size_t bound,
+ bool contig)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int ret;
+
+ rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);
+
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ ret = try_expand_heap_primary(heap, pg_sz, elt_size, socket,
+ flags, align, bound, contig);
+ } else {
+ ret = try_expand_heap_secondary(heap, pg_sz, elt_size, socket,
+ flags, align, bound, contig);
+ }
+
+ rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);
+ return ret;
+}
+
+static int
compare_pagesz(const void *a, const void *b)
{
const struct rte_memseg_list * const*mpa = a;
@@ -257,11 +360,10 @@ compare_pagesz(const void *a, const void *b)
}
static int
-alloc_mem_on_socket(size_t size, int socket, unsigned int flags, size_t align,
- size_t bound, bool contig)
+alloc_more_mem_on_socket(struct malloc_heap *heap, size_t size, int socket,
+ unsigned int flags, size_t align, size_t bound, bool contig)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- struct malloc_heap *heap = &mcfg->malloc_heaps[socket];
struct rte_memseg_list *requested_msls[RTE_MAX_MEMSEG_LISTS];
struct rte_memseg_list *other_msls[RTE_MAX_MEMSEG_LISTS];
uint64_t requested_pg_sz[RTE_MAX_MEMSEG_LISTS];
@@ -393,7 +495,8 @@ heap_alloc_on_socket(const char *type, size_t size, int socket,
if (ret != NULL)
goto alloc_unlock;
- if (!alloc_mem_on_socket(size, socket, flags, align, bound, contig)) {
+ if (!alloc_more_mem_on_socket(heap, size, socket, flags, align, bound,
+ contig)) {
ret = heap_alloc(heap, type, size, flags, align, bound, contig);
/* this should have succeeded */
@@ -446,14 +549,41 @@ malloc_heap_alloc(const char *type, size_t size, int socket_arg,
return NULL;
}
+/* this function is exposed in malloc_mp.h */
+int
+malloc_heap_free_pages(void *aligned_start, size_t aligned_len)
+{
+ int n_segs, seg_idx, max_seg_idx;
+ struct rte_memseg_list *msl;
+ size_t page_sz;
+
+ msl = rte_mem_virt2memseg_list(aligned_start);
+ if (msl == NULL)
+ return -1;
+
+ page_sz = (size_t)msl->page_sz;
+ n_segs = aligned_len / page_sz;
+ seg_idx = RTE_PTR_DIFF(aligned_start, msl->base_va) / page_sz;
+ max_seg_idx = seg_idx + n_segs;
+
+ for (; seg_idx < max_seg_idx; seg_idx++) {
+ struct rte_memseg *ms;
+
+ ms = rte_fbarray_get(&msl->memseg_arr, seg_idx);
+ eal_memalloc_free_seg(ms);
+ }
+ return 0;
+}
+
int
malloc_heap_free(struct malloc_elem *elem)
{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct malloc_heap *heap;
void *start, *aligned_start, *end, *aligned_end;
size_t len, aligned_len, page_sz;
struct rte_memseg_list *msl;
- int n_segs, seg_idx, max_seg_idx, ret;
+ int ret;
if (!malloc_elem_cookies_ok(elem) || elem->state != ELEM_BUSY)
return -1;
@@ -494,25 +624,56 @@ malloc_heap_free(struct malloc_elem *elem)
if (aligned_len < page_sz)
goto free_unlock;
+ rte_rwlock_write_lock(&mcfg->memory_hotplug_lock);
+
+ /*
+ * we allow secondary processes to clear the heap of this allocated
+ * memory because it is safe to do so, as even if notifications about
+ * unmapped pages don't make it to other processes, heap is shared
+ * across all processes, and will become empty of this memory anyway,
+ * and nothing can allocate it back unless primary process will be able
+ * to deliver allocation message to every single running process.
+ */
+
malloc_elem_free_list_remove(elem);
malloc_elem_hide_region(elem, (void *) aligned_start, aligned_len);
- /* we don't really care if we fail to deallocate memory */
- n_segs = aligned_len / page_sz;
- seg_idx = RTE_PTR_DIFF(aligned_start, msl->base_va) / page_sz;
- max_seg_idx = seg_idx + n_segs;
+ heap->total_size -= aligned_len;
- for (; seg_idx < max_seg_idx; seg_idx++) {
- struct rte_memseg *ms;
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ /* don't care if any of this fails */
+ malloc_heap_free_pages(aligned_start, aligned_len);
- ms = rte_fbarray_get(&msl->memseg_arr, seg_idx);
- eal_memalloc_free_seg(ms);
+ request_sync();
+ } else {
+ struct malloc_mp_req req;
+
+ memset(&req, 0, sizeof(req));
+
+ req.t = REQ_TYPE_FREE;
+ req.free_req.addr = aligned_start;
+ req.free_req.len = aligned_len;
+
+ /*
+ * we request primary to deallocate pages, but we don't do it
+ * in this thread. instead, we notify primary that we would like
+ * to deallocate pages, and this process will receive another
+ * request (in parallel) that will do it for us on another
+ * thread.
+ *
+ * we also don't really care if this succeeds - the data is
+ * already removed from the heap, so it is, for all intents and
+ * purposes, hidden from the rest of DPDK even if some other
+ * process (including this one) may have these pages mapped.
+ */
+ request_to_primary(&req);
}
- heap->total_size -= aligned_len;
RTE_LOG(DEBUG, EAL, "Heap on socket %d was shrunk by %zdMB\n",
msl->socket_id, aligned_len >> 20ULL);
+
+ rte_rwlock_write_unlock(&mcfg->memory_hotplug_lock);
free_unlock:
rte_spinlock_unlock(&(heap->lock));
return ret;
@@ -600,8 +761,16 @@ rte_eal_malloc_heap_init(void)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- if (mcfg == NULL)
+ if (register_mp_requests()) {
+ RTE_LOG(ERR, EAL, "Couldn't register malloc multiprocess actions\n");
return -1;
+ }
+
+ /* unlock mem hotplug here. it's safe for primary as no requests can
+ * even come before primary itself is fully initialized, and secondaries
+ * do not need to initialize the heap.
+ */
+ rte_rwlock_read_unlock(&mcfg->memory_hotplug_lock);
/* secondary process does not need to initialize anything */
if (rte_eal_process_type() != RTE_PROC_PRIMARY)
new file mode 100644
@@ -0,0 +1,744 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#include <string.h>
+#include <sys/time.h>
+
+#include <rte_alarm.h>
+#include <rte_errno.h>
+
+#include "eal_memalloc.h"
+
+#include "malloc_elem.h"
+#include "malloc_mp.h"
+
+#define MP_ACTION_SYNC "mp_malloc_sync"
+/**< request sent by primary process to notify of changes in memory map */
+#define MP_ACTION_ROLLBACK "mp_malloc_rollback"
+/**< request sent by primary process to notify of changes in memory map. this is
+ * essentially a regular sync request, but we cannot send sync requests while
+ * another one is in progress, and we might have to - therefore, we do this as
+ * a separate callback.
+ */
+#define MP_ACTION_REQUEST "mp_malloc_request"
+/**< request sent by secondary process to ask for allocation/deallocation */
+#define MP_ACTION_RESPONSE "mp_malloc_response"
+/**< response sent to secondary process to indicate result of request */
+
+/* forward declarations */
+static int
+handle_sync_response(const struct rte_mp_msg *request,
+ const struct rte_mp_reply *reply);
+static int
+handle_rollback_response(const struct rte_mp_msg *request,
+ const struct rte_mp_reply *reply);
+
+#define MP_TIMEOUT_S 5 /**< 5 seconds timeouts */
+
+/* when we're allocating, we need to store some state to ensure that we can
+ * roll back later
+ */
+struct primary_alloc_req_state {
+ struct malloc_heap *heap;
+ struct rte_memseg **ms;
+ int ms_len;
+ struct malloc_elem *elem;
+ void *map_addr;
+ size_t map_len;
+};
+
+enum req_state {
+ REQ_STATE_INACTIVE = 0,
+ REQ_STATE_ACTIVE,
+ REQ_STATE_COMPLETE
+};
+
+struct mp_request {
+ TAILQ_ENTRY(mp_request) next;
+ struct malloc_mp_req user_req; /**< contents of request */
+ pthread_cond_t cond; /**< variable we use to time out on this request */
+ enum req_state state; /**< indicate status of this request */
+ struct primary_alloc_req_state alloc_state;
+};
+
+/*
+ * We could've used just a single request, but it may be possible for
+ * secondaries to timeout earlier than the primary, and send a new request while
+ * primary is still expecting replies to the old one. Therefore, each new
+ * request will get assigned a new ID, which is how we will distinguish between
+ * expected and unexpected messages.
+ */
+TAILQ_HEAD(mp_request_list, mp_request);
+static struct {
+ struct mp_request_list list;
+ pthread_mutex_t lock;
+} mp_request_list = {
+ .list = TAILQ_HEAD_INITIALIZER(mp_request_list.list),
+ .lock = PTHREAD_MUTEX_INITIALIZER
+};
+
+/**
+ * General workflow is the following:
+ *
+ * Allocation:
+ * S: send request to primary
+ * P: attempt to allocate memory
+ * if failed, sendmsg failure
+ * if success, send sync request
+ * S: if received msg of failure, quit
+ * if received sync request, synchronize memory map and reply with result
+ * P: if received sync request result
+ * if success, sendmsg success
+ * if failure, roll back allocation and send a rollback request
+ * S: if received msg of success, quit
+ * if received rollback request, synchronize memory map and reply with result
+ * P: if received sync request result
+ * sendmsg sync request result
+ * S: if received msg, quit
+ *
+ * Aside from timeouts, there are three points where we can quit:
+ * - if allocation failed straight away
+ * - if allocation and sync request succeeded
+ * - if allocation succeeded, sync request failed, allocation rolled back and
+ * rollback request received (irrespective of whether it succeeded or failed)
+ *
+ * Deallocation:
+ * S: send request to primary
+ * P: attempt to deallocate memory
+ * if failed, sendmsg failure
+ * if success, send sync request
+ * S: if received msg of failure, quit
+ * if received sync request, synchronize memory map and reply with result
+ * P: if received sync request result
+ * sendmsg sync request result
+ * S: if received msg, quit
+ *
+ * There is no "rollback" from deallocation, as it's safe to have some memory
+ * mapped in some processes - it's absent from the heap, so it won't get used.
+ */
+
+static struct mp_request *
+find_request_by_id(uint64_t id)
+{
+ struct mp_request *req;
+ TAILQ_FOREACH(req, &mp_request_list.list, next) {
+ if (req->user_req.id == id)
+ break;
+ }
+ return req;
+}
+
+/* this ID is, like, totally guaranteed to be absolutely unique. pinky swear. */
+static uint64_t
+get_unique_id(void)
+{
+ uint64_t id;
+ do {
+ id = rte_rand();
+ } while (find_request_by_id(id) != NULL);
+ return id;
+}
+
+/* secondary will respond to sync requests thusly */
+static int
+handle_sync(const struct rte_mp_msg *msg, const void *peer)
+{
+ struct rte_mp_msg reply;
+ const struct malloc_mp_req *req =
+ (const struct malloc_mp_req *)msg->param;
+ struct malloc_mp_req *resp =
+ (struct malloc_mp_req *)reply.param;
+ int ret;
+
+ if (req->t != REQ_TYPE_SYNC) {
+ RTE_LOG(ERR, EAL, "Unexpected request from primary\n");
+ return -1;
+ }
+
+ memset(&reply, 0, sizeof(reply));
+
+ reply.num_fds = 0;
+ snprintf(reply.name, sizeof(reply.name), "%s", msg->name);
+ reply.len_param = sizeof(*resp);
+
+ ret = eal_memalloc_sync_with_primary();
+
+ resp->t = REQ_TYPE_SYNC;
+ resp->id = req->id;
+ resp->result = ret == 0 ? REQ_RESULT_SUCCESS : REQ_RESULT_FAIL;
+
+ rte_mp_reply(&reply, peer);
+
+ return 0;
+}
+
+static int
+handle_alloc_request(const struct malloc_mp_req *m,
+ struct mp_request *req)
+{
+ const struct malloc_req_alloc *ar = &m->alloc_req;
+ struct malloc_heap *heap;
+ struct malloc_elem *elem;
+ struct rte_memseg **ms;
+ size_t alloc_sz;
+ int n_segs;
+ void *map_addr;
+
+ alloc_sz = RTE_ALIGN_CEIL(ar->align + ar->elt_size +
+ MALLOC_ELEM_TRAILER_LEN, ar->page_sz);
+ n_segs = alloc_sz / ar->page_sz;
+
+ heap = ar->heap;
+
+ /* we can't know in advance how many pages we'll need, so we malloc */
+ ms = malloc(sizeof(*ms) * n_segs);
+
+ memset(ms, 0, sizeof(*ms) * n_segs);
+
+ if (ms == NULL) {
+ RTE_LOG(ERR, EAL, "Couldn't allocate memory for request state\n");
+ goto fail;
+ }
+
+ elem = alloc_pages_on_heap(heap, ar->page_sz, ar->elt_size, ar->socket,
+ ar->flags, ar->align, ar->bound, ar->contig, ms,
+ n_segs);
+
+ if (elem == NULL)
+ goto fail;
+
+ map_addr = ms[0]->addr;
+
+ /* we have succeeded in allocating memory, but we still need to sync
+ * with other processes. however, since DPDK IPC is single-threaded, we
+ * send an asynchronous request and exit this callback.
+ */
+
+ req->alloc_state.ms = ms;
+ req->alloc_state.ms_len = n_segs;
+ req->alloc_state.map_addr = map_addr;
+ req->alloc_state.map_len = alloc_sz;
+ req->alloc_state.elem = elem;
+ req->alloc_state.heap = heap;
+
+ return 0;
+fail:
+ free(ms);
+ return -1;
+}
+
+/* first stage of primary handling requests from secondary */
+static int
+handle_request(const struct rte_mp_msg *msg, const void *peer __rte_unused)
+{
+ const struct malloc_mp_req *m =
+ (const struct malloc_mp_req *)msg->param;
+ struct mp_request *entry;
+ int ret;
+
+ /* lock access to request */
+ pthread_mutex_lock(&mp_request_list.lock);
+
+ /* make sure it's not a dupe */
+ entry = find_request_by_id(m->id);
+ if (entry != NULL) {
+ RTE_LOG(ERR, EAL, "Duplicate request id\n");
+ goto fail;
+ }
+
+ entry = malloc(sizeof(*entry));
+ if (entry == NULL) {
+ RTE_LOG(ERR, EAL, "Unable to allocate memory for request\n");
+ goto fail;
+ }
+
+ /* erase all data */
+ memset(entry, 0, sizeof(*entry));
+
+ if (m->t == REQ_TYPE_ALLOC) {
+ ret = handle_alloc_request(m, entry);
+ } else if (m->t == REQ_TYPE_FREE) {
+ ret = malloc_heap_free_pages(m->free_req.addr,
+ m->free_req.len);
+ } else {
+ RTE_LOG(ERR, EAL, "Unexpected request from secondary\n");
+ goto fail;
+ }
+
+ if (ret != 0) {
+ struct rte_mp_msg resp_msg;
+ struct malloc_mp_req *resp =
+ (struct malloc_mp_req *)resp_msg.param;
+
+ /* send failure message straight away */
+ resp_msg.num_fds = 0;
+ resp_msg.len_param = sizeof(*resp);
+ snprintf(resp_msg.name, sizeof(resp_msg.name), "%s",
+ MP_ACTION_RESPONSE);
+
+ resp->t = m->t;
+ resp->result = REQ_RESULT_FAIL;
+ resp->id = m->id;
+
+ if (rte_mp_sendmsg(&resp_msg)) {
+ RTE_LOG(ERR, EAL, "Couldn't send response\n");
+ goto fail;
+ }
+ /* we did not modify the request */
+ free(entry);
+ } else {
+ struct rte_mp_msg sr_msg;
+ struct malloc_mp_req *sr =
+ (struct malloc_mp_req *)sr_msg.param;
+ struct timespec ts;
+
+ memset(&sr_msg, 0, sizeof(sr_msg));
+
+ /* we can do something, so send sync request asynchronously */
+ sr_msg.num_fds = 0;
+ sr_msg.len_param = sizeof(*sr);
+ snprintf(sr_msg.name, sizeof(sr_msg.name), "%s",
+ MP_ACTION_SYNC);
+
+ ts.tv_nsec = 0;
+ ts.tv_sec = MP_TIMEOUT_S;
+
+ /* sync requests carry no data */
+ sr->t = REQ_TYPE_SYNC;
+ sr->id = m->id;
+
+ /* there may be stray timeout still waiting */
+ do {
+ ret = rte_mp_request_async(&sr_msg, &ts,
+ handle_sync_response);
+ } while (ret != 0 && rte_errno == EEXIST);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "Couldn't send sync request\n");
+ if (m->t == REQ_TYPE_ALLOC)
+ free(entry->alloc_state.ms);
+ goto fail;
+ }
+
+ /* mark request as in progress */
+ memcpy(&entry->user_req, m, sizeof(*m));
+ entry->state = REQ_STATE_ACTIVE;
+
+ TAILQ_INSERT_TAIL(&mp_request_list.list, entry, next);
+ }
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return 0;
+fail:
+ pthread_mutex_unlock(&mp_request_list.lock);
+ free(entry);
+ return -1;
+}
+
+/* callback for asynchronous sync requests for primary. this will either do a
+ * sendmsg with results, or trigger rollback request.
+ */
+static int
+handle_sync_response(const struct rte_mp_msg *request,
+ const struct rte_mp_reply *reply)
+{
+ enum malloc_req_result result;
+ struct mp_request *entry;
+ const struct malloc_mp_req *mpreq =
+ (const struct malloc_mp_req *)request->param;
+ int i;
+
+ /* lock the request */
+ pthread_mutex_lock(&mp_request_list.lock);
+
+ entry = find_request_by_id(mpreq->id);
+ if (entry == NULL) {
+ RTE_LOG(ERR, EAL, "Wrong request ID\n");
+ goto fail;
+ }
+
+ result = REQ_RESULT_SUCCESS;
+
+ if (reply->nb_received != reply->nb_sent)
+ result = REQ_RESULT_FAIL;
+
+ for (i = 0; i < reply->nb_received; i++) {
+ struct malloc_mp_req *resp =
+ (struct malloc_mp_req *)reply->msgs[i].param;
+
+ if (resp->t != REQ_TYPE_SYNC) {
+ RTE_LOG(ERR, EAL, "Unexpected response to sync request\n");
+ result = REQ_RESULT_FAIL;
+ break;
+ }
+ if (resp->id != entry->user_req.id) {
+ RTE_LOG(ERR, EAL, "Response to wrong sync request\n");
+ result = REQ_RESULT_FAIL;
+ break;
+ }
+ if (resp->result == REQ_RESULT_FAIL) {
+ result = REQ_RESULT_FAIL;
+ break;
+ }
+ }
+
+ if (entry->user_req.t == REQ_TYPE_FREE) {
+ struct rte_mp_msg msg;
+ struct malloc_mp_req *resp = (struct malloc_mp_req *)msg.param;
+
+ memset(&msg, 0, sizeof(msg));
+
+ /* this is a free request, just sendmsg result */
+ resp->t = REQ_TYPE_FREE;
+ resp->result = result;
+ resp->id = entry->user_req.id;
+ msg.num_fds = 0;
+ msg.len_param = sizeof(*resp);
+ snprintf(msg.name, sizeof(msg.name), "%s", MP_ACTION_RESPONSE);
+
+ if (rte_mp_sendmsg(&msg))
+ RTE_LOG(ERR, EAL, "Could not send message to secondary process\n");
+
+ TAILQ_REMOVE(&mp_request_list.list, entry, next);
+ free(entry);
+ } else if (entry->user_req.t == REQ_TYPE_ALLOC &&
+ result == REQ_RESULT_SUCCESS) {
+ struct malloc_heap *heap = entry->alloc_state.heap;
+ struct rte_mp_msg msg;
+ struct malloc_mp_req *resp =
+ (struct malloc_mp_req *)msg.param;
+
+ memset(&msg, 0, sizeof(msg));
+
+ heap->total_size += entry->alloc_state.map_len;
+
+ /* result is success, so just notify secondary about this */
+ resp->t = REQ_TYPE_ALLOC;
+ resp->result = result;
+ resp->id = entry->user_req.id;
+ msg.num_fds = 0;
+ msg.len_param = sizeof(*resp);
+ snprintf(msg.name, sizeof(msg.name), "%s", MP_ACTION_RESPONSE);
+
+ if (rte_mp_sendmsg(&msg))
+ RTE_LOG(ERR, EAL, "Could not send message to secondary process\n");
+
+ TAILQ_REMOVE(&mp_request_list.list, entry, next);
+ free(entry->alloc_state.ms);
+ free(entry);
+ } else if (entry->user_req.t == REQ_TYPE_ALLOC &&
+ result == REQ_RESULT_FAIL) {
+ struct rte_mp_msg rb_msg;
+ struct malloc_mp_req *rb =
+ (struct malloc_mp_req *)rb_msg.param;
+ struct timespec ts;
+ struct primary_alloc_req_state *state =
+ &entry->alloc_state;
+ int ret;
+
+ memset(&rb_msg, 0, sizeof(rb_msg));
+
+ /* we've failed to sync, so do a rollback */
+ rollback_expand_heap(state->ms, state->ms_len, state->elem,
+ state->map_addr, state->map_len);
+
+ /* send rollback request */
+ rb_msg.num_fds = 0;
+ rb_msg.len_param = sizeof(*rb);
+ snprintf(rb_msg.name, sizeof(rb_msg.name), "%s",
+ MP_ACTION_ROLLBACK);
+
+ ts.tv_nsec = 0;
+ ts.tv_sec = MP_TIMEOUT_S;
+
+ /* sync requests carry no data */
+ rb->t = REQ_TYPE_SYNC;
+ rb->id = entry->user_req.id;
+
+ /* there may be stray timeout still waiting */
+ do {
+ ret = rte_mp_request_async(&rb_msg, &ts,
+ handle_rollback_response);
+ } while (ret != 0 && rte_errno == EEXIST);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "Could not send rollback request to secondary process\n");
+
+ /* we couldn't send rollback request, but that's OK -
+ * secondary will time out, and memory has been removed
+ * from heap anyway.
+ */
+ TAILQ_REMOVE(&mp_request_list.list, entry, next);
+ free(state->ms);
+ free(entry);
+ goto fail;
+ }
+ } else {
+ RTE_LOG(ERR, EAL, " to sync request of unknown type\n");
+ goto fail;
+ }
+
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return 0;
+fail:
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return -1;
+}
+
+static int
+handle_rollback_response(const struct rte_mp_msg *request,
+ const struct rte_mp_reply *reply __rte_unused)
+{
+ struct rte_mp_msg msg;
+ struct malloc_mp_req *resp = (struct malloc_mp_req *)msg.param;
+ const struct malloc_mp_req *mpreq =
+ (const struct malloc_mp_req *)request->param;
+ struct mp_request *entry;
+
+ /* lock the request */
+ pthread_mutex_lock(&mp_request_list.lock);
+
+ memset(&msg, 0, sizeof(0));
+
+ entry = find_request_by_id(mpreq->id);
+ if (entry == NULL) {
+ RTE_LOG(ERR, EAL, "Wrong request ID\n");
+ goto fail;
+ }
+
+ if (entry->user_req.t != REQ_TYPE_ALLOC) {
+ RTE_LOG(ERR, EAL, "Unexpected active request\n");
+ goto fail;
+ }
+
+ /* we don't care if rollback succeeded, request still failed */
+ resp->t = REQ_TYPE_ALLOC;
+ resp->result = REQ_RESULT_FAIL;
+ resp->id = mpreq->id;
+ msg.num_fds = 0;
+ msg.len_param = sizeof(*resp);
+ snprintf(msg.name, sizeof(msg.name), "%s", MP_ACTION_RESPONSE);
+
+ if (rte_mp_sendmsg(&msg))
+ RTE_LOG(ERR, EAL, "Could not send message to secondary process\n");
+
+ /* clean up */
+ TAILQ_REMOVE(&mp_request_list.list, entry, next);
+ free(entry->alloc_state.ms);
+ free(entry);
+
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return 0;
+fail:
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return -1;
+}
+
+/* final stage of the request from secondary */
+static int
+handle_response(const struct rte_mp_msg *msg, const void *peer __rte_unused)
+{
+ const struct malloc_mp_req *m =
+ (const struct malloc_mp_req *)msg->param;
+ struct mp_request *entry;
+
+ pthread_mutex_lock(&mp_request_list.lock);
+
+ entry = find_request_by_id(m->id);
+ if (entry != NULL) {
+ /* update request status */
+ entry->user_req.result = m->result;
+
+ entry->state = REQ_STATE_COMPLETE;
+
+ /* trigger thread wakeup */
+ pthread_cond_signal(&entry->cond);
+ }
+
+ pthread_mutex_unlock(&mp_request_list.lock);
+
+ return 0;
+}
+
+/* synchronously request memory map sync, this is only called whenever primary
+ * process initiates the allocation.
+ */
+int
+request_sync(void)
+{
+ struct rte_mp_msg msg;
+ struct rte_mp_reply reply;
+ struct malloc_mp_req *req = (struct malloc_mp_req *)msg.param;
+ struct timespec ts;
+ int i, ret;
+
+ memset(&msg, 0, sizeof(msg));
+ memset(&reply, 0, sizeof(reply));
+
+ /* no need to create tailq entries as this is entirely synchronous */
+
+ msg.num_fds = 0;
+ msg.len_param = sizeof(*req);
+ snprintf(msg.name, sizeof(msg.name), "%s", MP_ACTION_SYNC);
+
+ /* sync request carries no data */
+ req->t = REQ_TYPE_SYNC;
+ req->id = get_unique_id();
+
+ ts.tv_nsec = 0;
+ ts.tv_sec = MP_TIMEOUT_S;
+
+ /* there may be stray timeout still waiting */
+ do {
+ ret = rte_mp_request_sync(&msg, &reply, &ts);
+ } while (ret != 0 && rte_errno == EEXIST);
+ if (ret != 0) {
+ RTE_LOG(ERR, EAL, "Could not send sync request to secondary process\n");
+ ret = -1;
+ goto out;
+ }
+
+ if (reply.nb_received != reply.nb_sent) {
+ RTE_LOG(ERR, EAL, "Not all secondaries have responded\n");
+ ret = -1;
+ goto out;
+ }
+
+ for (i = 0; i < reply.nb_received; i++) {
+ struct malloc_mp_req *resp =
+ (struct malloc_mp_req *)reply.msgs[i].param;
+ if (resp->t != REQ_TYPE_SYNC) {
+ RTE_LOG(ERR, EAL, "Unexpected response from secondary\n");
+ ret = -1;
+ goto out;
+ }
+ if (resp->id != req->id) {
+ RTE_LOG(ERR, EAL, "Wrong request ID\n");
+ ret = -1;
+ goto out;
+ }
+ if (resp->result != REQ_RESULT_SUCCESS) {
+ RTE_LOG(ERR, EAL, "Secondary process failed to synchronize\n");
+ ret = -1;
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ free(reply.msgs);
+ return ret;
+}
+
+/* this is a synchronous wrapper around a bunch of asynchronous requests to
+ * primary process. this will initiate a request and wait until responses come.
+ */
+int
+request_to_primary(struct malloc_mp_req *user_req)
+{
+ struct rte_mp_msg msg;
+ struct malloc_mp_req *msg_req = (struct malloc_mp_req *)msg.param;
+ struct mp_request *entry;
+ struct timespec ts;
+ struct timeval now;
+ int ret;
+
+ memset(&msg, 0, sizeof(msg));
+ memset(&ts, 0, sizeof(ts));
+
+ pthread_mutex_lock(&mp_request_list.lock);
+
+ entry = malloc(sizeof(*entry));
+ if (entry == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot allocate memory for request\n");
+ goto fail;
+ }
+
+ memset(entry, 0, sizeof(*entry));
+
+ if (gettimeofday(&now, NULL) < 0) {
+ RTE_LOG(ERR, EAL, "Cannot get current time\n");
+ goto fail;
+ }
+
+ ts.tv_nsec = (now.tv_usec * 1000) % 1000000000;
+ ts.tv_sec = now.tv_sec + MP_TIMEOUT_S +
+ (now.tv_usec * 1000) / 1000000000;
+
+ /* initialize the request */
+ pthread_cond_init(&entry->cond, NULL);
+
+ msg.num_fds = 0;
+ msg.len_param = sizeof(*msg_req);
+ snprintf(msg.name, sizeof(msg.name), "%s", MP_ACTION_REQUEST);
+
+ /* (attempt to) get a unique id */
+ user_req->id = get_unique_id();
+
+ /* copy contents of user request into the message */
+ memcpy(msg_req, user_req, sizeof(*msg_req));
+
+ if (rte_mp_sendmsg(&msg)) {
+ RTE_LOG(ERR, EAL, "Cannot send message to primary\n");
+ goto fail;
+ }
+
+ /* copy contents of user request into active request */
+ memcpy(&entry->user_req, user_req, sizeof(*user_req));
+
+ /* mark request as in progress */
+ entry->state = REQ_STATE_ACTIVE;
+
+ TAILQ_INSERT_TAIL(&mp_request_list.list, entry, next);
+
+ /* finally, wait on timeout */
+ do {
+ ret = pthread_cond_timedwait(&entry->cond,
+ &mp_request_list.lock, &ts);
+ } while (ret != 0 && ret != ETIMEDOUT);
+
+ if (entry->state != REQ_STATE_COMPLETE) {
+ RTE_LOG(ERR, EAL, "Request timed out\n");
+ ret = -1;
+ } else {
+ ret = 0;
+ user_req->result = entry->user_req.result;
+ }
+ TAILQ_REMOVE(&mp_request_list.list, entry, next);
+ free(entry);
+
+ pthread_mutex_unlock(&mp_request_list.lock);
+ return ret;
+fail:
+ pthread_mutex_unlock(&mp_request_list.lock);
+ free(entry);
+ return -1;
+}
+
+int
+register_mp_requests(void)
+{
+ if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
+ if (rte_mp_action_register(MP_ACTION_REQUEST, handle_request)) {
+ RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+ MP_ACTION_REQUEST);
+ return -1;
+ }
+ } else {
+ if (rte_mp_action_register(MP_ACTION_SYNC, handle_sync)) {
+ RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+ MP_ACTION_SYNC);
+ return -1;
+ }
+ if (rte_mp_action_register(MP_ACTION_ROLLBACK, handle_sync)) {
+ RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+ MP_ACTION_SYNC);
+ return -1;
+ }
+ if (rte_mp_action_register(MP_ACTION_RESPONSE,
+ handle_response)) {
+ RTE_LOG(ERR, EAL, "Couldn't register '%s' action\n",
+ MP_ACTION_RESPONSE);
+ return -1;
+ }
+ }
+ return 0;
+}
new file mode 100644
@@ -0,0 +1,86 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef MALLOC_MP_H
+#define MALLOC_MP_H
+
+#include <stdbool.h>
+#include <stdint.h>
+
+#include <rte_common.h>
+#include <rte_random.h>
+#include <rte_spinlock.h>
+#include <rte_tailq.h>
+
+/* forward declarations */
+struct malloc_heap;
+struct rte_memseg;
+
+/* multiprocess synchronization structures for malloc */
+enum malloc_req_type {
+ REQ_TYPE_ALLOC, /**< ask primary to allocate */
+ REQ_TYPE_FREE, /**< ask primary to free */
+ REQ_TYPE_SYNC /**< ask secondary to synchronize its memory map */
+};
+
+enum malloc_req_result {
+ REQ_RESULT_SUCCESS,
+ REQ_RESULT_FAIL
+};
+
+struct malloc_req_alloc {
+ struct malloc_heap *heap;
+ uint64_t page_sz;
+ size_t elt_size;
+ int socket;
+ unsigned int flags;
+ size_t align;
+ size_t bound;
+ bool contig;
+};
+
+struct malloc_req_free {
+ RTE_STD_C11
+ union {
+ void *addr;
+ uint64_t addr_64;
+ };
+ uint64_t len;
+};
+
+struct malloc_mp_req {
+ enum malloc_req_type t;
+ RTE_STD_C11
+ union {
+ struct malloc_req_alloc alloc_req;
+ struct malloc_req_free free_req;
+ };
+ uint64_t id; /**< not to be populated by caller */
+ enum malloc_req_result result;
+};
+
+int
+register_mp_requests(void);
+
+int
+request_to_primary(struct malloc_mp_req *req);
+
+/* synchronous memory map sync request */
+int
+request_sync(void);
+
+/* functions from malloc_heap exposed here */
+int
+malloc_heap_free_pages(void *aligned_start, size_t aligned_len);
+
+struct malloc_elem *
+alloc_pages_on_heap(struct malloc_heap *heap, uint64_t pg_sz, size_t elt_size,
+ int socket, unsigned int flags, size_t align, size_t bound,
+ bool contig, struct rte_memseg **ms, int n_segs);
+
+void
+rollback_expand_heap(struct rte_memseg **ms, int n_segs,
+ struct malloc_elem *elem, void *map_addr, size_t map_len);
+
+#endif // MALLOC_MP_H
@@ -27,6 +27,7 @@ common_sources = files(
'eal_common_timer.c',
'malloc_elem.c',
'malloc_heap.c',
+ 'malloc_mp.c',
'rte_keepalive.c',
'rte_malloc.c',
'rte_reciprocal.c',
@@ -67,6 +67,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_common_fbarray.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_malloc.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_elem.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_heap.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += malloc_mp.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_keepalive.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_service.c
SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += rte_reciprocal.c
@@ -211,6 +211,32 @@ is_zero_length(int fd)
return st.st_blocks == 0;
}
+/* we cannot use rte_memseg_list_walk() here because we will be holding a
+ * write lock whenever we enter every function in this file, however copying
+ * the same iteration code everywhere is not ideal as well. so, use a lockless
+ * copy of memseg list walk here.
+ */
+static int
+memseg_list_walk_thread_unsafe(rte_memseg_list_walk_t func, void *arg)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int i, ret = 0;
+
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+
+ if (msl->base_va == NULL)
+ continue;
+
+ ret = func(msl, arg);
+ if (ret < 0)
+ return -1;
+ if (ret > 0)
+ return 1;
+ }
+ return 0;
+}
+
static int
get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
unsigned int list_idx, unsigned int seg_idx)
@@ -739,7 +765,7 @@ eal_memalloc_alloc_seg_bulk(struct rte_memseg **ms, int n_segs, size_t page_sz,
wa.socket = socket;
wa.segs_allocated = 0;
- ret = rte_memseg_list_walk(alloc_seg_walk, &wa);
+ ret = memseg_list_walk_thread_unsafe(alloc_seg_walk, &wa);
if (ret == 0) {
RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n",
__func__);
@@ -798,7 +824,7 @@ eal_memalloc_free_seg_bulk(struct rte_memseg **ms, int n_segs)
wa.ms = cur;
wa.hi = hi;
- walk_res = rte_memseg_list_walk(free_seg_walk, &wa);
+ walk_res = memseg_list_walk_thread_unsafe(free_seg_walk, &wa);
if (walk_res == 1)
continue;
if (walk_res == 0)
@@ -1055,7 +1081,7 @@ eal_memalloc_sync_with_primary(void)
if (rte_eal_process_type() == RTE_PROC_PRIMARY)
return 0;
- if (rte_memseg_list_walk(sync_walk, NULL))
+ if (memseg_list_walk_thread_unsafe(sync_walk, NULL))
return -1;
return 0;
}
@@ -53,6 +53,42 @@ static const struct vfio_iommu_type iommu_types[] = {
},
};
+/* for sPAPR IOMMU, we will need to walk memseg list, but we cannot use
+ * rte_memseg_walk() because by the time we enter callback we will be holding a
+ * write lock, so regular rte-memseg_walk will deadlock. copying the same
+ * iteration code everywhere is not ideal as well. so, use a lockless copy of
+ * memseg walk here.
+ */
+static int
+memseg_walk_thread_unsafe(rte_memseg_walk_t func, void *arg)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int i, ms_idx, ret = 0;
+
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+ const struct rte_memseg *ms;
+ struct rte_fbarray *arr;
+
+ if (msl->memseg_arr.count == 0)
+ continue;
+
+ arr = &msl->memseg_arr;
+
+ ms_idx = rte_fbarray_find_next_used(arr, 0);
+ while (ms_idx >= 0) {
+ ms = rte_fbarray_get(arr, ms_idx);
+ ret = func(msl, ms, arg);
+ if (ret < 0)
+ return -1;
+ if (ret > 0)
+ return 1;
+ ms_idx = rte_fbarray_find_next_used(arr, ms_idx + 1);
+ }
+ }
+ return 0;
+}
+
int
vfio_get_group_fd(int iommu_group_no)
{
@@ -884,7 +920,7 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
/* check if window size needs to be adjusted */
memset(¶m, 0, sizeof(param));
- if (rte_memseg_walk(vfio_spapr_window_size_walk, ¶m) < 0) {
+ if (memseg_walk_thread_unsafe(vfio_spapr_window_size_walk, ¶m) < 0) {
RTE_LOG(ERR, EAL, "Could not get window size\n");
return -1;
}
@@ -902,7 +938,7 @@ vfio_spapr_dma_mem_map(int vfio_container_fd, uint64_t vaddr, uint64_t iova,
RTE_LOG(ERR, EAL, "Could not create new DMA window\n");
return -1;
}
- if (rte_memseg_walk(vfio_spapr_map_walk,
+ if (memseg_walk_thread_unsafe(vfio_spapr_map_walk,
&vfio_container_fd) < 0) {
RTE_LOG(ERR, EAL, "Could not recreate DMA maps\n");
return -1;