@@ -90,7 +90,8 @@ CONFIG_RTE_CACHE_LINE_SIZE=64
CONFIG_RTE_LIBRTE_EAL=y
CONFIG_RTE_MAX_LCORE=128
CONFIG_RTE_MAX_NUMA_NODES=8
-CONFIG_RTE_MAX_MEMSEG=256
+CONFIG_RTE_MAX_MEMSEG_LISTS=16
+CONFIG_RTE_MAX_MEMSEG_PER_LIST=32768
CONFIG_RTE_MAX_MEMZONE=2560
CONFIG_RTE_MAX_TAILQ=32
CONFIG_RTE_ENABLE_ASSERT=n
@@ -148,19 +148,30 @@ rte_pci_unmap_device(struct rte_pci_device *dev)
void *
pci_find_max_end_va(void)
{
- const struct rte_memseg *seg = rte_eal_get_physmem_layout();
- const struct rte_memseg *last = seg;
- unsigned i = 0;
+ void *cur_end, *max_end = NULL;
+ int list_idx, ms_idx;
- for (i = 0; i < RTE_MAX_MEMSEG; i++, seg++) {
- if (seg->addr == NULL)
- break;
+ for (list_idx = 0; list_idx < RTE_MAX_MEMSEG_LISTS; list_idx++) {
+ const struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
+ const struct rte_memseg_list *msl = &mcfg->memsegs[list_idx];
+ const struct rte_fbarray *arr = &msl->memseg_arr;
- if (seg->addr > last->addr)
- last = seg;
+ if (arr->capacity == 0)
+ continue;
+ /*
+ * we need to handle legacy mem case, so don't rely on page size
+ * to calculate max VA end
+ */
+ ms_idx = 0;
+ while ((ms_idx = rte_fbarray_find_next_used(arr, ms_idx)) >= 0) {
+ struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx);
+ cur_end = RTE_PTR_ADD(ms->addr, ms->len);
+ if (cur_end > max_end)
+ max_end = cur_end;
+ ms_idx++;
+ }
}
- return RTE_PTR_ADD(last->addr, last->len);
+ return max_end;
}
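Note: the find-next-used loop above is the memseg iteration idiom used throughout this series. A minimal sketch of walking every used memseg (assuming the rte_fbarray API introduced earlier in this patchset; the explicit increment matters, since rte_fbarray_find_next_used() is inclusive of its start index):

    const struct rte_mem_config *mcfg =
            rte_eal_get_configuration()->mem_config;
    int list_idx, ms_idx;

    for (list_idx = 0; list_idx < RTE_MAX_MEMSEG_LISTS; list_idx++) {
        const struct rte_fbarray *arr =
                &mcfg->memsegs[list_idx].memseg_arr;

        ms_idx = 0;
        while ((ms_idx = rte_fbarray_find_next_used(arr, ms_idx)) >= 0) {
            const struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx);
            /* use ms->addr, ms->len, ms->socket_id, etc. here */
            ms_idx++; /* step past the hit, or the loop never advances */
        }
    }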
/* parse one line of the "resource" sysfs file (note that the 'line'
@@ -99,6 +99,40 @@ static uint64_t vhost_req_user_to_kernel[] = {
[VHOST_USER_SET_MEM_TABLE] = VHOST_SET_MEM_TABLE,
};
+/* returns index of first memseg past the last one processed */
+static int
+add_memory_region(struct vhost_memory_region *mr, const struct rte_fbarray *arr,
+ int reg_start_idx, int max)
+{
+ const struct rte_memseg *ms;
+ void *start_addr, *expected_addr;
+ uint64_t len;
+ int idx;
+
+ idx = reg_start_idx;
+ len = 0;
+ start_addr = NULL;
+ expected_addr = NULL;
+
+ /* we could've relied on page size, but we have to support legacy mem */
+ while (idx < max) {
+ ms = rte_fbarray_get(arr, idx);
+ if (expected_addr == NULL) {
+ start_addr = ms->addr;
+ expected_addr = RTE_PTR_ADD(ms->addr, ms->len);
+ } else if (ms->addr != expected_addr)
+ break;
+ len += ms->len;
+ idx++;
+ }
+
+ mr->guest_phys_addr = (uint64_t)(uintptr_t) start_addr;
+ mr->userspace_addr = (uint64_t)(uintptr_t) start_addr;
+ mr->memory_size = len;
+ mr->mmap_offset = 0;
+
+ return idx;
+}
+
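Note: to illustrate the merge, suppose a list holds used memsegs at VAs 0x100000 and 0x300000 (each 0x200000 long), followed by one at 0x700000: a call starting at the first segment emits a single region of 0x400000 bytes and returns the index of the 0x700000 segment, which then seeds the next region (addresses invented for illustration).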
/* By default, vhost kernel module allows 64 regions, but DPDK allows
* 256 segments. As a relief, below function merges those virtually
* adjacent memsegs into one region.
@@ -106,8 +140,7 @@ static uint64_t vhost_req_user_to_kernel[] = {
static struct vhost_memory_kernel *
prepare_vhost_memory_kernel(void)
{
- uint32_t i, j, k = 0;
- struct rte_memseg *seg;
+ uint32_t list_idx, region_nr = 0;
struct vhost_memory_region *mr;
struct vhost_memory_kernel *vm;
@@ -117,52 +150,41 @@ prepare_vhost_memory_kernel(void)
if (!vm)
return NULL;
- for (i = 0; i < RTE_MAX_MEMSEG; ++i) {
- seg = &rte_eal_get_configuration()->mem_config->memseg[i];
- if (!seg->addr)
- break;
-
- int new_region = 1;
-
- for (j = 0; j < k; ++j) {
- mr = &vm->regions[j];
+ for (list_idx = 0; list_idx < RTE_MAX_MEMSEG_LISTS; ++list_idx) {
+ const struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
+ const struct rte_memseg_list *msl = &mcfg->memsegs[list_idx];
+ const struct rte_fbarray *arr = &msl->memseg_arr;
+ int reg_start_idx, search_idx;
- if (mr->userspace_addr + mr->memory_size ==
- (uint64_t)(uintptr_t)seg->addr) {
- mr->memory_size += seg->len;
- new_region = 0;
- break;
- }
-
- if ((uint64_t)(uintptr_t)seg->addr + seg->len ==
- mr->userspace_addr) {
- mr->guest_phys_addr =
- (uint64_t)(uintptr_t)seg->addr;
- mr->userspace_addr =
- (uint64_t)(uintptr_t)seg->addr;
- mr->memory_size += seg->len;
- new_region = 0;
- break;
- }
- }
-
- if (new_region == 0)
+ /* skip empty segment lists */
+ if (arr->count == 0)
continue;
- mr = &vm->regions[k++];
- /* use vaddr here! */
- mr->guest_phys_addr = (uint64_t)(uintptr_t)seg->addr;
- mr->userspace_addr = (uint64_t)(uintptr_t)seg->addr;
- mr->memory_size = seg->len;
- mr->mmap_offset = 0;
-
- if (k >= max_regions) {
- free(vm);
- return NULL;
+ search_idx = 0;
+ while ((reg_start_idx = rte_fbarray_find_next_used(arr,
+ search_idx)) >= 0) {
+ int reg_n_pages;
+ if (region_nr >= max_regions) {
+ free(vm);
+ return NULL;
+ }
+ mr = &vm->regions[region_nr++];
+
+ /*
+ * we know a used memseg starts at reg_start_idx; check
+ * how many contiguous segments follow it
+ */
+ reg_n_pages = rte_fbarray_find_contig_used(arr,
+ reg_start_idx);
+
+ /* process at most reg_n_pages memsegs */
+ search_idx = add_memory_region(mr, arr, reg_start_idx,
+ reg_start_idx + reg_n_pages);
}
}
- vm->nregions = k;
+ vm->nregions = region_nr;
vm->padding = 0;
return vm;
}
@@ -42,6 +42,7 @@
#include <sys/mman.h>
#include <sys/queue.h>
+#include <rte_fbarray.h>
#include <rte_memory.h>
#include <rte_eal.h>
#include <rte_eal_memconfig.h>
@@ -58,6 +59,8 @@
* which is a multiple of hugepage size.
*/
+#define MEMSEG_LIST_FMT "memseg-%" PRIu64 "k-%i"
+
static uint64_t baseaddr_offset;
void *
@@ -117,6 +120,178 @@ eal_get_virtual_area(void *requested_addr, uint64_t *size,
return addr;
}
+static uint64_t
+get_mem_amount(uint64_t page_sz)
+{
+ uint64_t area_sz;
+
+ /* TODO: saner heuristics */
+ /* limit to RTE_MAX_MEMSEG_PER_LIST pages or 128G worth of memory */
+ area_sz = RTE_MIN(page_sz * RTE_MAX_MEMSEG_PER_LIST, 1ULL << 37);
+
+ return rte_align64pow2(area_sz);
+}
+
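Note: a worked example with the CONFIG_RTE_MAX_MEMSEG_PER_LIST=32768 default set above: for 2 MB pages this gives RTE_MIN(2 MB * 32768, 128 GB) = 64 GB, already a power of two; for 1 GB pages the 128 GB cap wins, so each such list reserves 128 GB of address space.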
+static int
+get_max_num_pages(uint64_t page_sz, uint64_t mem_amount)
+{
+ return mem_amount / page_sz;
+}
+
+static int
+get_min_num_pages(int max_pages)
+{
+ return RTE_MIN(256, max_pages);
+}
+
+static int
+alloc_memseg_list(struct rte_memseg_list *msl, uint64_t page_sz,
+ int socket_id)
+{
+ char name[RTE_FBARRAY_NAME_LEN];
+ int min_pages, max_pages;
+ uint64_t mem_amount;
+ void *addr;
+
+ if (!internal_config.legacy_mem) {
+ mem_amount = get_mem_amount(page_sz);
+ max_pages = get_max_num_pages(page_sz, mem_amount);
+ min_pages = get_min_num_pages(max_pages);
+
+ /* TODO: allow shrink? */
+ addr = eal_get_virtual_area(NULL, &mem_amount, page_sz, 0);
+ if (addr == NULL) {
+ RTE_LOG(ERR, EAL, "Cannot reserve memory\n");
+ return -1;
+ }
+ } else {
+ addr = NULL;
+ min_pages = 256;
+ max_pages = 256;
+ }
+
+ snprintf(name, sizeof(name), MEMSEG_LIST_FMT, page_sz >> 10, socket_id);
+ if (rte_fbarray_alloc(&msl->memseg_arr, name, min_pages, max_pages,
+ sizeof(struct rte_memseg))) {
+ RTE_LOG(ERR, EAL, "Cannot allocate memseg list\n");
+ return -1;
+ }
+
+ msl->hugepage_sz = page_sz;
+ msl->socket_id = socket_id;
+ msl->base_va = addr;
+
+ return 0;
+}
+
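Note: with MEMSEG_LIST_FMT as defined above, a 2 MB list on socket 0 gets the fbarray name "memseg-2048k-0" and a 1 GB list on socket 1 gets "memseg-1048576k-1", since page_sz is shifted down to kilobytes before formatting.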
+static int
+memseg_init(void)
+{
+ struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+ int socket_id, hpi_idx, msl_idx = 0;
+ struct rte_memseg_list *msl;
+
+ if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
+ RTE_LOG(ERR, EAL, "Secondary process not supported\n");
+ return -1;
+ }
+
+ /* create memseg lists */
+ for (hpi_idx = 0; hpi_idx < (int) internal_config.num_hugepage_sizes;
+ hpi_idx++) {
+ struct hugepage_info *hpi;
+ uint64_t hugepage_sz;
+
+ hpi = &internal_config.hugepage_info[hpi_idx];
+ hugepage_sz = hpi->hugepage_sz;
+
+ for (socket_id = 0; socket_id < (int) rte_num_sockets();
+ socket_id++) {
+ if (msl_idx >= RTE_MAX_MEMSEG_LISTS) {
+ RTE_LOG(ERR, EAL,
+ "No more space in memseg lists\n");
+ return -1;
+ }
+ msl = &mcfg->memsegs[msl_idx++];
+
+ if (alloc_memseg_list(msl, hugepage_sz, socket_id))
+ return -1;
+ }
+ }
+ return 0;
+}
+
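Note: concretely, a two-socket system with 2 MB and 1 GB hugepages configured ends up with four lists (memseg-2048k-0, memseg-2048k-1, memseg-1048576k-0, memseg-1048576k-1), occupying four of the RTE_MAX_MEMSEG_LISTS slots.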
+static const struct rte_memseg *
+virt2memseg(const void *addr, const struct rte_memseg_list *msl)
+{
+ const struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
+ const struct rte_fbarray *arr;
+ int msl_idx, ms_idx;
+
+ /* first, find appropriate memseg list, if it wasn't specified */
+ if (msl == NULL) {
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ void *start, *end;
+ msl = &mcfg->memsegs[msl_idx];
+
+ start = msl->base_va;
+ end = RTE_PTR_ADD(start, msl->hugepage_sz *
+ msl->memseg_arr.capacity);
+ if (addr >= start && addr < end)
+ break;
+ }
+ /* if we didn't find our memseg list */
+ if (msl_idx == RTE_MAX_MEMSEG_LISTS)
+ return NULL;
+ } else {
+ /* a memseg list was specified, check if it's the right one */
+ void *start, *end;
+ start = msl->base_va;
+ end = RTE_PTR_ADD(start, msl->hugepage_sz *
+ msl->memseg_arr.capacity);
+
+ if (addr < start || addr >= end)
+ return NULL;
+ }
+
+ /* now, calculate index */
+ arr = &msl->memseg_arr;
+ ms_idx = RTE_PTR_DIFF(addr, msl->base_va) / msl->hugepage_sz;
+ return rte_fbarray_get(arr, ms_idx);
+}
+
+static const struct rte_memseg *
+virt2memseg_legacy(const void *addr)
+{
+ const struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
+ const struct rte_memseg_list *msl;
+ const struct rte_fbarray *arr;
+ int msl_idx, ms_idx;
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ msl = &mcfg->memsegs[msl_idx];
+ arr = &msl->memseg_arr;
+
+ ms_idx = 0;
+ while ((ms_idx = rte_fbarray_find_next_used(arr, ms_idx)) >= 0) {
+ const struct rte_memseg *ms;
+ void *start, *end;
+ ms = rte_fbarray_get(arr, ms_idx);
+ start = ms->addr;
+ end = RTE_PTR_ADD(start, ms->len);
+ if (addr >= start && addr < end)
+ return ms;
+ ms_idx++;
+ }
+ }
+ return NULL;
+}
+
+const struct rte_memseg *
+rte_mem_virt2memseg(const void *addr, const struct rte_memseg_list *msl)
+{
+ /* for legacy memory, we just walk the list, like in the old days */
+ if (internal_config.legacy_mem)
+ return virt2memseg_legacy(addr);
+ return virt2memseg(addr, msl);
+}
+
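Note: a short usage sketch of the new lookup (hypothetical snippet; passing NULL for msl searches every list, per the dispatch above):

    #include <stdio.h>
    #include <inttypes.h>
    #include <rte_malloc.h>
    #include <rte_memory.h>

    void *obj = rte_malloc(NULL, 4096, 0);
    const struct rte_memseg *ms = rte_mem_virt2memseg(obj, NULL);

    if (ms != NULL)
        printf("%p lives in page va=%p iova=0x%" PRIx64 "\n",
                obj, ms->addr, ms->iova);
    rte_free(obj);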
/*
* Return a pointer to a read-only table of struct rte_physmem_desc
@@ -126,7 +301,9 @@ eal_get_virtual_area(void *requested_addr, uint64_t *size,
const struct rte_memseg *
rte_eal_get_physmem_layout(void)
{
- return rte_eal_get_configuration()->mem_config->memseg;
+ struct rte_fbarray *arr;
+ arr = &rte_eal_get_configuration()->mem_config->memsegs[0].memseg_arr;
+ return rte_fbarray_get(arr, 0);
}
@@ -141,11 +318,24 @@ rte_eal_get_physmem_size(void)
/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- if (mcfg->memseg[i].addr == NULL)
- break;
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ const struct rte_memseg_list *msl = &mcfg->memsegs[i];
+
+ if (msl->memseg_arr.count == 0)
+ continue;
+
+ /* for legacy mem mode, walk the memsegs */
+ if (internal_config.legacy_mem) {
+ const struct rte_fbarray *arr = &msl->memseg_arr;
+ int ms_idx = 0;
- total_len += mcfg->memseg[i].len;
+ while ((ms_idx = rte_fbarray_find_next_used(arr,
+ ms_idx)) >= 0) {
+ const struct rte_memseg *ms =
+ rte_fbarray_get(arr, ms_idx);
+ total_len += ms->len;
+ ms_idx++;
+ }
+ } else
+ total_len += msl->hugepage_sz * msl->memseg_arr.count;
}
return total_len;
@@ -161,21 +351,29 @@ rte_dump_physmem_layout(FILE *f)
/* get pointer to global configuration */
mcfg = rte_eal_get_configuration()->mem_config;
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- if (mcfg->memseg[i].addr == NULL)
- break;
-
- fprintf(f, "Segment %u: IOVA:0x%"PRIx64", len:%zu, "
- "virt:%p, socket_id:%"PRId32", "
- "hugepage_sz:%"PRIu64", nchannel:%"PRIx32", "
- "nrank:%"PRIx32"\n", i,
- mcfg->memseg[i].iova,
- mcfg->memseg[i].len,
- mcfg->memseg[i].addr,
- mcfg->memseg[i].socket_id,
- mcfg->memseg[i].hugepage_sz,
- mcfg->memseg[i].nchannel,
- mcfg->memseg[i].nrank);
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ const struct rte_memseg_list *msl = &mcfg->memsegs[i];
+ const struct rte_fbarray *arr = &msl->memseg_arr;
+ int m_idx = 0;
+
+ if (arr->count == 0)
+ continue;
+
+ while ((m_idx = rte_fbarray_find_next_used(arr, m_idx)) >= 0) {
+ struct rte_memseg *ms = rte_fbarray_get(arr, m_idx);
+ fprintf(f, "Page %u-%u: iova:0x%"PRIx64", len:%zu, "
+ "virt:%p, socket_id:%"PRId32", "
+ "hugepage_sz:%"PRIu64", nchannel:%"PRIx32", "
+ "nrank:%"PRIx32"\n", i, m_idx,
+ ms->iova,
+ ms->len,
+ ms->addr,
+ ms->socket_id,
+ ms->hugepage_sz,
+ ms->nchannel,
+ ms->nrank);
+ m_idx++;
+ }
}
}
@@ -220,9 +418,14 @@ rte_mem_lock_page(const void *virt)
int
rte_eal_memory_init(void)
{
+ int retval;
RTE_LOG(DEBUG, EAL, "Setting up physically contiguous memory...\n");
- const int retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
+ retval = memseg_init();
+ if (retval < 0)
+ return -1;
+
+ retval = rte_eal_process_type() == RTE_PROC_PRIMARY ?
rte_eal_hugepage_init() :
rte_eal_hugepage_attach();
if (retval < 0)
@@ -254,10 +254,9 @@ memzone_reserve_aligned_thread_unsafe(const char *name, size_t len,
mz->iova = rte_malloc_virt2iova(mz_addr);
mz->addr = mz_addr;
mz->len = (requested_len == 0 ? elem->size : requested_len);
- mz->hugepage_sz = elem->ms->hugepage_sz;
- mz->socket_id = elem->ms->socket_id;
+ mz->hugepage_sz = elem->msl->hugepage_sz;
+ mz->socket_id = elem->msl->socket_id;
mz->flags = 0;
- mz->memseg_id = elem->ms - rte_eal_get_configuration()->mem_config->memseg;
return mz;
}
@@ -52,6 +52,7 @@ struct hugepage_file {
int socket_id; /**< NUMA socket ID */
int file_id; /**< the '%d' in HUGEFILE_FMT */
int memseg_id; /**< the memory segment to which page belongs */
+ int memseg_list_id; /**< the memory segment list to which page belongs */
char filepath[MAX_HUGEPAGE_PATH]; /**< path to backing file on filesystem */
};
@@ -40,12 +40,30 @@
#include <rte_malloc_heap.h>
#include <rte_rwlock.h>
#include <rte_pause.h>
+#include <rte_fbarray.h>
#ifdef __cplusplus
extern "C" {
#endif
/**
+ * memseg list is a special case as we need to store a bunch of other data
+ * together with the array itself.
+ */
+struct rte_memseg_list {
+ RTE_STD_C11
+ union {
+ void *base_va;
+ /**< Base virtual address for this memseg list. */
+ uint64_t addr_64;
+ /**< Makes sure base_va is always 64 bits wide. */
+ };
+ int socket_id; /**< Socket ID for all memsegs in this list. */
+ uint64_t hugepage_sz; /**< Page size for all memsegs in this list. */
+ struct rte_fbarray memseg_arr;
+};
+
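Note: because each list is one contiguous VA reservation, slot index and address convert both ways with plain arithmetic (a sketch mirroring the virt2memseg() math earlier in this patch; msl, n and addr are placeholders):

    /* VA of the n-th segment slot in a list */
    void *seg_va = RTE_PTR_ADD(msl->base_va, (size_t)n * msl->hugepage_sz);

    /* and back: which slot does an address fall into? */
    int ms_idx = RTE_PTR_DIFF(addr, msl->base_va) / msl->hugepage_sz;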
+/**
* the structure for the memory configuration for the RTE.
* Used by the rte_config structure. It is separated out, as for multi-process
* support, the memory details should be shared across instances
@@ -71,9 +89,11 @@ struct rte_mem_config {
uint32_t memzone_cnt; /**< Number of allocated memzones */
/* memory segments and zones */
- struct rte_memseg memseg[RTE_MAX_MEMSEG]; /**< Physmem descriptors. */
struct rte_memzone memzone[RTE_MAX_MEMZONE]; /**< Memzone descriptors. */
+ struct rte_memseg_list memsegs[RTE_MAX_MEMSEG_LISTS];
+ /**< list of dynamic arrays holding memsegs */
+
struct rte_tailq_head tailq_head[RTE_MAX_TAILQ]; /**< Tailqs for objects */
/* Heaps of Malloc per socket */
@@ -50,6 +50,9 @@ extern "C" {
#include <rte_common.h>
+/* forward declaration for pointers */
+struct rte_memseg_list;
+
__extension__
enum rte_page_sizes {
RTE_PGSIZE_4K = 1ULL << 12,
@@ -158,6 +161,19 @@ phys_addr_t rte_mem_virt2phy(const void *virt);
rte_iova_t rte_mem_virt2iova(const void *virt);
/**
+ * Get memseg corresponding to virtual memory address.
+ *
+ * @param virt
+ * The virtual address.
+ * @param msl
+ * Memseg list in which to look for memsegs (can be NULL).
+ * @return
+ *   Memseg to which this virtual address belongs, or NULL if not found.
+ */
+const struct rte_memseg *rte_mem_virt2memseg(const void *virt,
+ const struct rte_memseg_list *msl);
+
+/**
* Get the layout of the available physical memory.
*
* It can be useful for an application to have the full physical
@@ -54,11 +54,11 @@
* Initialize a general malloc_elem header structure
*/
void
-malloc_elem_init(struct malloc_elem *elem,
- struct malloc_heap *heap, const struct rte_memseg *ms, size_t size)
+malloc_elem_init(struct malloc_elem *elem, struct malloc_heap *heap,
+ const struct rte_memseg_list *msl, size_t size)
{
elem->heap = heap;
- elem->ms = ms;
+ elem->msl = msl;
elem->prev = NULL;
elem->next = NULL;
memset(&elem->free_list, 0, sizeof(elem->free_list));
@@ -172,7 +172,7 @@ split_elem(struct malloc_elem *elem, struct malloc_elem *split_pt)
const size_t old_elem_size = (uintptr_t)split_pt - (uintptr_t)elem;
const size_t new_elem_size = elem->size - old_elem_size;
- malloc_elem_init(split_pt, elem->heap, elem->ms, new_elem_size);
+ malloc_elem_init(split_pt, elem->heap, elem->msl, new_elem_size);
split_pt->prev = elem;
split_pt->next = next_elem;
if (next_elem)
@@ -34,7 +34,7 @@
#ifndef MALLOC_ELEM_H_
#define MALLOC_ELEM_H_
-#include <rte_memory.h>
+#include <rte_eal_memconfig.h>
/* dummy definition of struct so we can use pointers to it in malloc_elem struct */
struct malloc_heap;
@@ -50,7 +50,7 @@ struct malloc_elem {
struct malloc_elem *volatile prev; /* points to prev elem in memseg */
struct malloc_elem *volatile next; /* points to next elem in memseg */
LIST_ENTRY(malloc_elem) free_list; /* list of free elements in heap */
- const struct rte_memseg *ms;
+ const struct rte_memseg_list *msl;
volatile enum elem_state state;
uint32_t pad;
size_t size;
@@ -137,7 +137,7 @@ malloc_elem_from_data(const void *data)
void
malloc_elem_init(struct malloc_elem *elem,
struct malloc_heap *heap,
- const struct rte_memseg *ms,
+ const struct rte_memseg_list *msl,
size_t size);
void
@@ -50,6 +50,7 @@
#include <rte_memcpy.h>
#include <rte_atomic.h>
+#include "eal_internal_cfg.h"
#include "malloc_elem.h"
#include "malloc_heap.h"
@@ -91,22 +92,25 @@ check_hugepage_sz(unsigned flags, uint64_t hugepage_sz)
}
/*
- * Expand the heap with a memseg.
- * This reserves the zone and sets a dummy malloc_elem header at the end
- * to prevent overflow. The rest of the zone is added to free list as a single
- * large free block
+ * Expand the heap with a memory area.
*/
-static void
-malloc_heap_add_memseg(struct malloc_heap *heap, struct rte_memseg *ms)
+static struct malloc_elem *
+malloc_heap_add_memory(struct malloc_heap *heap, struct rte_memseg_list *msl,
+ void *start, size_t len)
{
- struct malloc_elem *start_elem = (struct malloc_elem *)ms->addr;
- const size_t elem_size = ms->len - MALLOC_ELEM_OVERHEAD;
+ struct malloc_elem *elem = start;
+
+ malloc_elem_init(elem, heap, msl, len);
+
+ malloc_elem_insert(elem);
+
+ elem = malloc_elem_join_adjacent_free(elem);
- malloc_elem_init(start_elem, heap, ms, elem_size);
- malloc_elem_insert(start_elem);
- malloc_elem_free_list_insert(start_elem);
+ malloc_elem_free_list_insert(elem);
- heap->total_size += elem_size;
+ heap->total_size += len;
+
+ return elem;
}
/*
@@ -127,7 +131,7 @@ find_suitable_element(struct malloc_heap *heap, size_t size,
for (elem = LIST_FIRST(&heap->free_head[idx]);
!!elem; elem = LIST_NEXT(elem, free_list)) {
if (malloc_elem_can_hold(elem, size, align, bound)) {
- if (check_hugepage_sz(flags, elem->ms->hugepage_sz))
+ if (check_hugepage_sz(flags, elem->msl->hugepage_sz))
return elem;
if (alt_elem == NULL)
alt_elem = elem;
@@ -249,16 +253,62 @@ int
rte_eal_malloc_heap_init(void)
{
struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
- unsigned ms_cnt;
- struct rte_memseg *ms;
+ int msl_idx;
+ struct rte_memseg_list *msl;
if (mcfg == NULL)
return -1;
- for (ms = &mcfg->memseg[0], ms_cnt = 0;
- (ms_cnt < RTE_MAX_MEMSEG) && (ms->len > 0);
- ms_cnt++, ms++) {
- malloc_heap_add_memseg(&mcfg->malloc_heaps[ms->socket_id], ms);
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ int start;
+ struct rte_fbarray *arr;
+ struct malloc_heap *heap;
+
+ msl = &mcfg->memsegs[msl_idx];
+ arr = &msl->memseg_arr;
+ heap = &mcfg->malloc_heaps[msl->socket_id];
+
+ if (arr->capacity == 0)
+ continue;
+
+ /* for legacy mode, just walk the list */
+ if (internal_config.legacy_mem) {
+ int ms_idx = 0;
+ while ((ms_idx = rte_fbarray_find_next_used(arr, ms_idx)) >= 0) {
+ struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx);
+ malloc_heap_add_memory(heap, msl, ms->addr, ms->len);
+ ms_idx++;
+ RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n",
+ msl->socket_id, ms->len >> 20ULL);
+ }
+ continue;
+ }
+
+ /* find first segment */
+ start = rte_fbarray_find_next_used(arr, 0);
+
+ while (start >= 0) {
+ int contig_segs;
+ struct rte_memseg *start_seg;
+ size_t len, hugepage_sz = msl->hugepage_sz;
+
+ /* find how many pages we can lump in together */
+ contig_segs = rte_fbarray_find_contig_used(arr, start);
+ start_seg = rte_fbarray_get(arr, start);
+ len = contig_segs * hugepage_sz;
+
+ /*
+ * we've found (hopefully) a bunch of contiguous
+ * segments, so add them to the heap.
+ */
+ malloc_heap_add_memory(heap, msl, start_seg->addr, len);
+
+ RTE_LOG(DEBUG, EAL, "Heap on socket %d was expanded by %zdMB\n",
+ msl->socket_id, len >> 20ULL);
+
+ start = rte_fbarray_find_next_used(arr,
+ start + contig_segs);
+ }
}
return 0;
@@ -251,17 +251,21 @@ rte_malloc_set_limit(__rte_unused const char *type,
rte_iova_t
rte_malloc_virt2iova(const void *addr)
{
- rte_iova_t iova;
+ const struct rte_memseg *ms;
const struct malloc_elem *elem = malloc_elem_from_data(addr);
+
if (elem == NULL)
return RTE_BAD_IOVA;
- if (elem->ms->iova == RTE_BAD_IOVA)
- return RTE_BAD_IOVA;
if (rte_eal_iova_mode() == RTE_IOVA_VA)
- iova = (uintptr_t)addr;
- else
- iova = elem->ms->iova +
- RTE_PTR_DIFF(addr, elem->ms->addr);
- return iova;
+ return (uintptr_t) addr;
+
+ ms = rte_mem_virt2memseg(addr, elem->msl);
+ if (ms == NULL)
+ return RTE_BAD_IOVA;
+
+ if (ms->iova == RTE_BAD_IOVA)
+ return RTE_BAD_IOVA;
+
+ return ms->iova + RTE_PTR_DIFF(addr, ms->addr);
}
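Note: for example, if addr sits 0x1100 bytes into a page whose ms->iova is 0x240000000, the PA-mode result is 0x240000000 + 0x1100 = 0x240001100; VA mode short-circuits and returns the address itself (values invented for illustration).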
@@ -102,8 +102,8 @@ static int mem_cfg_fd = -1;
static struct flock wr_lock = {
.l_type = F_WRLCK,
.l_whence = SEEK_SET,
- .l_start = offsetof(struct rte_mem_config, memseg),
- .l_len = sizeof(early_mem_config.memseg),
+ .l_start = offsetof(struct rte_mem_config, memsegs),
+ .l_len = sizeof(early_mem_config.memsegs),
};
/* Address of global and public configuration */
@@ -661,17 +661,20 @@ eal_parse_args(int argc, char **argv)
static void
eal_check_mem_on_local_socket(void)
{
- const struct rte_memseg *ms;
+ const struct rte_memseg_list *msl;
int i, socket_id;
socket_id = rte_lcore_to_socket_id(rte_config.master_lcore);
- ms = rte_eal_get_physmem_layout();
-
- for (i = 0; i < RTE_MAX_MEMSEG; i++)
- if (ms[i].socket_id == socket_id &&
- ms[i].len > 0)
- return;
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ msl = &rte_eal_get_configuration()->mem_config->memsegs[i];
+ if (msl->socket_id != socket_id)
+ continue;
+ /* for legacy memory, check if there's anything allocated */
+ if (internal_config.legacy_mem && msl->memseg_arr.count == 0)
+ continue;
+ return;
+ }
RTE_LOG(WARNING, EAL, "WARNING: Master core has no "
"memory on local socket!\n");
@@ -929,6 +929,24 @@ huge_recover_sigbus(void)
}
}
+static struct rte_memseg_list *
+get_memseg_list(int socket, uint64_t page_sz)
+{
+ struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl;
+ int msl_idx;
+
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ msl = &mcfg->memsegs[msl_idx];
+ if (msl->hugepage_sz != page_sz)
+ continue;
+ if (msl->socket_id != socket)
+ continue;
+ return msl;
+ }
+ return NULL;
+}
+
/*
* Prepare physical memory mapping: fill configuration structure with
* these infos, return 0 on success.
@@ -946,11 +964,14 @@ eal_legacy_hugepage_init(void)
struct rte_mem_config *mcfg;
struct hugepage_file *hugepage = NULL, *tmp_hp = NULL;
struct hugepage_info used_hp[MAX_HUGEPAGE_SIZES];
+ struct rte_fbarray *arr;
+ struct rte_memseg *ms;
uint64_t memory[RTE_MAX_NUMA_NODES];
unsigned hp_offset;
int i, j, new_memseg;
+ int ms_idx, msl_idx;
int nr_hugefiles, nr_hugepages = 0;
void *addr;
@@ -963,6 +984,9 @@ eal_legacy_hugepage_init(void)
/* hugetlbfs can be disabled */
if (internal_config.no_hugetlbfs) {
+ arr = &mcfg->memsegs[0].memseg_arr;
+ ms = rte_fbarray_get(arr, 0);
+
addr = mmap(NULL, internal_config.memory, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
if (addr == MAP_FAILED) {
@@ -970,14 +994,15 @@ eal_legacy_hugepage_init(void)
strerror(errno));
return -1;
}
+ rte_fbarray_set_used(arr, 0, true);
if (rte_eal_iova_mode() == RTE_IOVA_VA)
- mcfg->memseg[0].iova = (uintptr_t)addr;
+ ms->iova = (uintptr_t)addr;
else
- mcfg->memseg[0].iova = RTE_BAD_IOVA;
- mcfg->memseg[0].addr = addr;
- mcfg->memseg[0].hugepage_sz = RTE_PGSIZE_4K;
- mcfg->memseg[0].len = internal_config.memory;
- mcfg->memseg[0].socket_id = 0;
+ ms->iova = RTE_BAD_IOVA;
+ ms->addr = addr;
+ ms->hugepage_sz = RTE_PGSIZE_4K;
+ ms->len = internal_config.memory;
+ ms->socket_id = 0;
return 0;
}
@@ -1218,27 +1243,59 @@ eal_legacy_hugepage_init(void)
#endif
if (new_memseg) {
- j += 1;
- if (j == RTE_MAX_MEMSEG)
- break;
+ struct rte_memseg_list *msl;
+ int socket;
+ uint64_t page_sz;
- mcfg->memseg[j].iova = hugepage[i].physaddr;
- mcfg->memseg[j].addr = hugepage[i].final_va;
- mcfg->memseg[j].len = hugepage[i].size;
- mcfg->memseg[j].socket_id = hugepage[i].socket_id;
- mcfg->memseg[j].hugepage_sz = hugepage[i].size;
+ socket = hugepage[i].socket_id;
+ page_sz = hugepage[i].size;
+
+ if (page_sz == 0)
+ continue;
+
+ /* figure out where to put this memseg */
+ msl = get_memseg_list(socket, page_sz);
+ if (!msl)
+ rte_panic("Unknown socket or page sz: %i %lx\n",
+ socket, page_sz);
+ msl_idx = msl - &mcfg->memsegs[0];
+ arr = &msl->memseg_arr;
+ /*
+ * we may run out of space, so check if we have enough
+ * and expand if necessary
+ */
+ if (arr->count >= arr->len) {
+ int new_len = arr->len * 2;
+ new_len = RTE_MIN(new_len, arr->capacity);
+ if (rte_fbarray_resize(arr, new_len)) {
+ RTE_LOG(ERR, EAL, "Couldn't expand memseg list\n");
+ break;
+ }
+ }
+ ms_idx = rte_fbarray_find_next_free(arr, 0);
+ if (ms_idx < 0) {
+ RTE_LOG(ERR, EAL, "No space in memseg list\n");
+ break;
+ }
+ ms = rte_fbarray_get(arr, ms_idx);
+
+ ms->iova = hugepage[i].physaddr;
+ ms->addr = hugepage[i].final_va;
+ ms->len = page_sz;
+ ms->socket_id = socket;
+ ms->hugepage_sz = page_sz;
+
+ /* mark this segment as used */
+ rte_fbarray_set_used(arr, ms_idx, true);
}
/* continuation of previous memseg */
else {
#ifdef RTE_ARCH_PPC_64
/* Use the phy and virt address of the last page as segment
* address for IBM Power architecture */
- mcfg->memseg[j].iova = hugepage[i].physaddr;
- mcfg->memseg[j].addr = hugepage[i].final_va;
+ ms->iova = hugepage[i].physaddr;
+ ms->addr = hugepage[i].final_va;
#endif
- mcfg->memseg[j].len += mcfg->memseg[j].hugepage_sz;
+ ms->len += ms->hugepage_sz;
}
- hugepage[i].memseg_id = j;
+ hugepage[i].memseg_id = ms_idx;
+ hugepage[i].memseg_list_id = msl_idx;
}
if (i < nr_hugefiles) {
@@ -1248,7 +1305,7 @@ eal_legacy_hugepage_init(void)
"Please either increase it or request less amount "
"of memory.\n",
- i, nr_hugefiles, RTE_STR(CONFIG_RTE_MAX_MEMSEG),
- RTE_MAX_MEMSEG);
+ i, nr_hugefiles,
+ RTE_STR(CONFIG_RTE_MAX_MEMSEG_PER_LIST),
+ RTE_MAX_MEMSEG_PER_LIST);
goto fail;
}
@@ -1289,8 +1346,9 @@ eal_legacy_hugepage_attach(void)
const struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
struct hugepage_file *hp = NULL;
unsigned num_hp = 0;
- unsigned i, s = 0; /* s used to track the segment number */
- unsigned max_seg = RTE_MAX_MEMSEG;
+ unsigned i;
+ int ms_idx, msl_idx;
+ unsigned cur_seg, max_seg;
off_t size = 0;
int fd, fd_zero = -1, fd_hugepage = -1;
@@ -1315,53 +1373,63 @@ eal_legacy_hugepage_attach(void)
}
/* map all segments into memory to make sure we get the addrs */
- for (s = 0; s < RTE_MAX_MEMSEG; ++s) {
- void *base_addr;
+ max_seg = 0;
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ const struct rte_memseg_list *msl = &mcfg->memsegs[msl_idx];
+ const struct rte_fbarray *arr = &msl->memseg_arr;
- /*
- * the first memory segment with len==0 is the one that
- * follows the last valid segment.
- */
- if (mcfg->memseg[s].len == 0)
- break;
+ ms_idx = 0;
+ while ((ms_idx = rte_fbarray_find_next_used(arr, ms_idx)) >= 0) {
+ struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx);
+ void *base_addr;
- /*
- * fdzero is mmapped to get a contiguous block of virtual
- * addresses of the appropriate memseg size.
- * use mmap to get identical addresses as the primary process.
- */
- base_addr = mmap(mcfg->memseg[s].addr, mcfg->memseg[s].len,
- PROT_READ,
+
+ /*
+ * fdzero is mmapped to get a contiguous block of virtual
+ * addresses of the appropriate memseg size.
+ * use mmap to get identical addresses as the primary process.
+ */
+ base_addr = mmap(ms->addr, ms->len,
+ PROT_READ,
#ifdef RTE_ARCH_PPC_64
- MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
#else
- MAP_PRIVATE,
+ MAP_PRIVATE,
#endif
- fd_zero, 0);
- if (base_addr == MAP_FAILED ||
- base_addr != mcfg->memseg[s].addr) {
- max_seg = s;
- if (base_addr != MAP_FAILED) {
- /* errno is stale, don't use */
- RTE_LOG(ERR, EAL, "Could not mmap %llu bytes "
- "in /dev/zero at [%p], got [%p] - "
- "please use '--base-virtaddr' option\n",
- (unsigned long long)mcfg->memseg[s].len,
- mcfg->memseg[s].addr, base_addr);
- munmap(base_addr, mcfg->memseg[s].len);
- } else {
- RTE_LOG(ERR, EAL, "Could not mmap %llu bytes "
- "in /dev/zero at [%p]: '%s'\n",
- (unsigned long long)mcfg->memseg[s].len,
- mcfg->memseg[s].addr, strerror(errno));
- }
- if (aslr_enabled() > 0) {
- RTE_LOG(ERR, EAL, "It is recommended to "
- "disable ASLR in the kernel "
- "and retry running both primary "
- "and secondary processes\n");
+ fd_zero, 0);
+ if (base_addr == MAP_FAILED || base_addr != ms->addr) {
+ if (base_addr != MAP_FAILED) {
+ /* errno is stale, don't use */
+ RTE_LOG(ERR, EAL, "Could not mmap %llu bytes "
+ "in /dev/zero at [%p], got [%p] - "
+ "please use '--base-virtaddr' option\n",
+ (unsigned long long)ms->len,
+ ms->addr, base_addr);
+ munmap(base_addr, ms->len);
+ } else {
+ RTE_LOG(ERR, EAL, "Could not mmap %llu bytes "
+ "in /dev/zero at [%p]: '%s'\n",
+ (unsigned long long)ms->len,
+ ms->addr, strerror(errno));
+ }
+ if (aslr_enabled() > 0) {
+ RTE_LOG(ERR, EAL, "It is recommended to "
+ "disable ASLR in the kernel "
+ "and retry running both primary "
+ "and secondary processes\n");
+ }
+ goto error;
}
- goto error;
+ max_seg++;
+ ms_idx++;
}
}
@@ -1375,46 +1443,54 @@ eal_legacy_hugepage_attach(void)
num_hp = size / sizeof(struct hugepage_file);
RTE_LOG(DEBUG, EAL, "Analysing %u files\n", num_hp);
- s = 0;
- while (s < RTE_MAX_MEMSEG && mcfg->memseg[s].len > 0){
- void *addr, *base_addr;
- uintptr_t offset = 0;
- size_t mapping_size;
- /*
- * free previously mapped memory so we can map the
- * hugepages into the space
- */
- base_addr = mcfg->memseg[s].addr;
- munmap(base_addr, mcfg->memseg[s].len);
-
- /* find the hugepages for this segment and map them
- * we don't need to worry about order, as the server sorted the
- * entries before it did the second mmap of them */
- for (i = 0; i < num_hp && offset < mcfg->memseg[s].len; i++){
- if (hp[i].memseg_id == (int)s){
- fd = open(hp[i].filepath, O_RDWR);
- if (fd < 0) {
- RTE_LOG(ERR, EAL, "Could not open %s\n",
- hp[i].filepath);
- goto error;
- }
- mapping_size = hp[i].size;
- addr = mmap(RTE_PTR_ADD(base_addr, offset),
- mapping_size, PROT_READ | PROT_WRITE,
- MAP_SHARED, fd, 0);
- close(fd); /* close file both on success and on failure */
- if (addr == MAP_FAILED ||
- addr != RTE_PTR_ADD(base_addr, offset)) {
- RTE_LOG(ERR, EAL, "Could not mmap %s\n",
- hp[i].filepath);
- goto error;
+ /* now, map the hugepage files over the space we reserved earlier */
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ const struct rte_memseg_list *msl = &mcfg->memsegs[msl_idx];
+ const struct rte_fbarray *arr = &msl->memseg_arr;
+
+ ms_idx = 0;
+ while ((ms_idx = rte_fbarray_find_next_used(arr, ms_idx)) >= 0) {
+ struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx);
+ void *addr, *base_addr;
+ uintptr_t offset = 0;
+ size_t mapping_size;
+
+ /*
+ * free previously mapped memory so we can map the
+ * hugepages into the space
+ */
+ base_addr = ms->addr;
+ munmap(base_addr, ms->len);
+
+ /* find the hugepages for this segment and map them
+ * we don't need to worry about order, as the server sorted the
+ * entries before it did the second mmap of them */
+ for (i = 0; i < num_hp && offset < ms->len; i++) {
+ if (hp[i].memseg_id == ms_idx &&
+ hp[i].memseg_list_id == msl_idx) {
+ fd = open(hp[i].filepath, O_RDWR);
+ if (fd < 0) {
+ RTE_LOG(ERR, EAL, "Could not open %s\n",
+ hp[i].filepath);
+ goto error;
+ }
+ mapping_size = hp[i].size;
+ addr = mmap(RTE_PTR_ADD(base_addr, offset),
+ mapping_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0);
+ close(fd); /* close file both on success and on failure */
+ if (addr == MAP_FAILED ||
+ addr != RTE_PTR_ADD(base_addr, offset)) {
+ RTE_LOG(ERR, EAL, "Could not mmap %s\n",
+ hp[i].filepath);
+ goto error;
+ }
+ offset += mapping_size;
}
- offset+=mapping_size;
}
- }
- RTE_LOG(DEBUG, EAL, "Mapped segment %u of size 0x%llx\n", s,
- (unsigned long long)mcfg->memseg[s].len);
- s++;
+ RTE_LOG(DEBUG, EAL, "Mapped segment of size 0x%llx\n",
+ (unsigned long long)ms->len); }
}
/* unmap the hugepage config file, since we are done using it */
munmap(hp, size);
@@ -1423,8 +1499,27 @@ eal_legacy_hugepage_attach(void)
return 0;
error:
- for (i = 0; i < max_seg && mcfg->memseg[i].len > 0; i++)
- munmap(mcfg->memseg[i].addr, mcfg->memseg[i].len);
+ /* unmap whatever segments we mapped before the failure */
+ cur_seg = 0;
+ for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+ const struct rte_memseg_list *msl = &mcfg->memsegs[msl_idx];
+ const struct rte_fbarray *arr = &msl->memseg_arr;
+
+ if (cur_seg >= max_seg)
+ break;
+
+ ms_idx = 0;
+ while ((ms_idx = rte_fbarray_find_next_used(arr, ms_idx)) >= 0) {
+ struct rte_memseg *ms = rte_fbarray_get(arr, ms_idx);
+
+ if (cur_seg >= max_seg)
+ break;
+ munmap(ms->addr, ms->len);
+
+ cur_seg++;
+ ms_idx++;
+ }
+ }
if (hp != NULL && hp != MAP_FAILED)
munmap(hp, size);
if (fd_zero >= 0)
@@ -696,33 +696,52 @@ vfio_get_group_no(const char *sysfs_base,
static int
vfio_type1_dma_map(int vfio_container_fd)
{
- const struct rte_memseg *ms = rte_eal_get_physmem_layout();
int i, ret;
/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
struct vfio_iommu_type1_dma_map dma_map;
+ const struct rte_memseg_list *msl;
+ const struct rte_fbarray *arr;
+ int ms_idx, next_idx;
- if (ms[i].addr == NULL)
- break;
+ msl = &rte_eal_get_configuration()->mem_config->memsegs[i];
+ arr = &msl->memseg_arr;
- memset(&dma_map, 0, sizeof(dma_map));
- dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
- dma_map.vaddr = ms[i].addr_64;
- dma_map.size = ms[i].len;
- if (rte_eal_iova_mode() == RTE_IOVA_VA)
- dma_map.iova = dma_map.vaddr;
- else
- dma_map.iova = ms[i].iova;
- dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+ /* skip empty memseg lists */
+ if (arr->count == 0)
+ continue;
- ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+ next_idx = 0;
- if (ret) {
- RTE_LOG(ERR, EAL, " cannot set up DMA remapping, "
- "error %i (%s)\n", errno,
- strerror(errno));
- return -1;
+ /* TODO: don't bother with physical addresses? */
+ while ((ms_idx = rte_fbarray_find_next_used(arr,
+ next_idx)) >= 0) {
+ uint64_t addr, len, hw_addr;
+ const struct rte_memseg *ms;
+ next_idx = ms_idx + 1;
+
+ ms = rte_fbarray_get(arr, ms_idx);
+
+ addr = ms->addr_64;
+ len = ms->len;
+ hw_addr = ms->iova;
+
+ memset(&dma_map, 0, sizeof(dma_map));
+ dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+ dma_map.vaddr = addr;
+ dma_map.size = len;
+ dma_map.iova = hw_addr;
+ dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot set up DMA remapping, "
+ "error %i (%s)\n", errno,
+ strerror(errno));
+ return -1;
+ }
}
}
@@ -732,8 +751,8 @@ vfio_type1_dma_map(int vfio_container_fd)
static int
vfio_spapr_dma_map(int vfio_container_fd)
{
- const struct rte_memseg *ms = rte_eal_get_physmem_layout();
int i, ret;
+ uint64_t hugepage_sz = 0;
struct vfio_iommu_spapr_register_memory reg = {
.argsz = sizeof(reg),
@@ -767,17 +786,31 @@ vfio_spapr_dma_map(int vfio_container_fd)
}
/* create DMA window from 0 to max(phys_addr + len) */
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- if (ms[i].addr == NULL)
- break;
-
- create.window_size = RTE_MAX(create.window_size,
- ms[i].iova + ms[i].len);
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ const struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
+ const struct rte_memseg_list *msl = &mcfg->memsegs[i];
+ const struct rte_fbarray *arr = &msl->memseg_arr;
+ int idx, next_idx;
+
+ if (msl->base_va == NULL)
+ continue;
+ if (msl->memseg_arr.count == 0)
+ continue;
+
+ next_idx = 0;
+ while ((idx = rte_fbarray_find_next_used(arr, next_idx)) >= 0) {
+ const struct rte_memseg *ms = rte_fbarray_get(arr, idx);
+ hugepage_sz = RTE_MAX(hugepage_sz, ms->hugepage_sz);
+ create.window_size = RTE_MAX(create.window_size,
+ ms->iova + ms->len);
+ next_idx = idx + 1;
+ }
}
/* sPAPR requires window size to be a power of 2 */
create.window_size = rte_align64pow2(create.window_size);
- create.page_shift = __builtin_ctzll(ms->hugepage_sz);
+ create.page_shift = __builtin_ctzll(hugepage_sz);
create.levels = 1;
ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
@@ -793,41 +826,60 @@ vfio_spapr_dma_map(int vfio_container_fd)
}
/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
struct vfio_iommu_type1_dma_map dma_map;
+ const struct rte_memseg_list *msl;
+ const struct rte_fbarray *arr;
+ int ms_idx, next_idx;
- if (ms[i].addr == NULL)
- break;
+ msl = &rte_eal_get_configuration()->mem_config->memsegs[i];
+ arr = &msl->memseg_arr;
- reg.vaddr = (uintptr_t) ms[i].addr;
- reg.size = ms[i].len;
- ret = ioctl(vfio_container_fd,
- VFIO_IOMMU_SPAPR_REGISTER_MEMORY, ®);
- if (ret) {
- RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, "
- "error %i (%s)\n", errno, strerror(errno));
- return -1;
- }
+ /* skip empty memseg lists */
+ if (arr->count == 0)
+ continue;
- memset(&dma_map, 0, sizeof(dma_map));
- dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
- dma_map.vaddr = ms[i].addr_64;
- dma_map.size = ms[i].len;
- if (rte_eal_iova_mode() == RTE_IOVA_VA)
- dma_map.iova = dma_map.vaddr;
- else
- dma_map.iova = ms[i].iova;
- dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
- VFIO_DMA_MAP_FLAG_WRITE;
+ next_idx = 0;
- ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+ while ((ms_idx = rte_fbarray_find_next_used(arr,
+ next_idx)) >= 0) {
+ uint64_t addr, len, hw_addr;
+ const struct rte_memseg *ms;
+ next_idx = ms_idx + 1;
- if (ret) {
- RTE_LOG(ERR, EAL, " cannot set up DMA remapping, "
- "error %i (%s)\n", errno, strerror(errno));
- return -1;
- }
+ ms = rte_fbarray_get(arr, ms_idx);
+
+ addr = ms->addr_64;
+ len = ms->len;
+ hw_addr = ms->iova;
+ reg.vaddr = (uintptr_t) addr;
+ reg.size = len;
+ ret = ioctl(vfio_container_fd,
+ VFIO_IOMMU_SPAPR_REGISTER_MEMORY, ®);
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot register vaddr for IOMMU, error %i (%s)\n",
+ errno, strerror(errno));
+ return -1;
+ }
+
+ memset(&dma_map, 0, sizeof(dma_map));
+ dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+ dma_map.vaddr = addr;
+ dma_map.size = len;
+ dma_map.iova = hw_addr;
+ dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
+ VFIO_DMA_MAP_FLAG_WRITE;
+
+ ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+
+ if (ret) {
+ RTE_LOG(ERR, EAL, " cannot set up DMA remapping, "
+ "error %i (%s)\n", errno,
+ strerror(errno));
+ return -1;
+ }
+ }
}
return 0;
@@ -41,6 +41,7 @@
#include <rte_common.h>
#include <rte_memory.h>
+#include <rte_eal_memconfig.h>
#include <rte_per_lcore.h>
#include <rte_launch.h>
#include <rte_eal.h>
@@ -734,15 +735,23 @@ test_malloc_bad_params(void)
return -1;
}
/* Check if memory is available on a specific socket */
static int
is_mem_on_socket(int32_t socket)
{
- const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+ const struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
unsigned i;
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- if (socket == ms[i].socket_id)
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ const struct rte_memseg_list *msl =
+ &mcfg->memsegs[i];
+ const struct rte_fbarray *arr = &msl->memseg_arr;
+
+ if (msl->socket_id != socket)
+ continue;
+
+ if (arr->count)
return 1;
}
return 0;
@@ -755,16 +764,8 @@ is_mem_on_socket(int32_t socket)
static int32_t
addr_to_socket(void * addr)
{
- const struct rte_memseg *ms = rte_eal_get_physmem_layout();
- unsigned i;
-
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- if ((ms[i].addr <= addr) &&
- ((uintptr_t)addr <
- ((uintptr_t)ms[i].addr + (uintptr_t)ms[i].len)))
- return ms[i].socket_id;
- }
- return -1;
+ const struct rte_memseg *ms = rte_mem_virt2memseg(addr, NULL);
+ return ms == NULL ? -1 : ms->socket_id;
}
/* Test using rte_[c|m|zm]alloc_socket() on a specific socket */
@@ -34,8 +34,11 @@
#include <stdio.h>
#include <stdint.h>
+#include <rte_eal.h>
+#include <rte_eal_memconfig.h>
#include <rte_memory.h>
#include <rte_common.h>
+#include <rte_memzone.h>
#include "test.h"
@@ -54,10 +57,12 @@
static int
test_memory(void)
{
+ const struct rte_memzone *mz = NULL;
uint64_t s;
unsigned i;
size_t j;
- const struct rte_memseg *mem;
+ const struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
/*
* dump the mapped memory: the python-expect script checks
@@ -69,20 +74,43 @@ test_memory(void)
/* check that memory size is != 0 */
s = rte_eal_get_physmem_size();
if (s == 0) {
- printf("No memory detected\n");
- return -1;
+ printf("No memory detected, attempting to allocate\n");
+ mz = rte_memzone_reserve("tmp", 1000, SOCKET_ID_ANY, 0);
+
+ if (!mz) {
+ printf("Failed to allocate a memzone\n");
+ return -1;
+ }
}
/* try to read memory (should not segfault) */
- mem = rte_eal_get_physmem_layout();
- for (i = 0; i < RTE_MAX_MEMSEG && mem[i].addr != NULL ; i++) {
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ const struct rte_memseg_list *msl = &mcfg->memsegs[i];
+ const struct rte_fbarray *arr = &msl->memseg_arr;
+ int search_idx, cur_idx;
+
+ if (arr->count == 0)
+ continue;
+
+ search_idx = 0;
- /* check memory */
- for (j = 0; j<mem[i].len; j++) {
- *((volatile uint8_t *) mem[i].addr + j);
+ while ((cur_idx = rte_fbarray_find_next_used(arr,
+ search_idx)) >= 0) {
+ const struct rte_memseg *ms;
+
+ ms = rte_fbarray_get(arr, cur_idx);
+
+ /* check memory */
+ for (j = 0; j < ms->len; j++) {
+ *((volatile uint8_t *) ms->addr + j);
+ }
+ search_idx = cur_idx + 1;
}
}
+ if (mz)
+ rte_memzone_free(mz);
+
return 0;
}
@@ -132,22 +132,25 @@ static int
test_memzone_reserve_flags(void)
{
const struct rte_memzone *mz;
- const struct rte_memseg *ms;
int hugepage_2MB_avail = 0;
int hugepage_1GB_avail = 0;
int hugepage_16MB_avail = 0;
int hugepage_16GB_avail = 0;
const size_t size = 100;
int i = 0;
- ms = rte_eal_get_physmem_layout();
- for (i = 0; i < RTE_MAX_MEMSEG; i++) {
- if (ms[i].hugepage_sz == RTE_PGSIZE_2M)
+
+ for (i = 0; i < RTE_MAX_MEMSEG_LISTS; i++) {
+ struct rte_mem_config *mcfg =
+ rte_eal_get_configuration()->mem_config;
+ struct rte_memseg_list *msl = &mcfg->memsegs[i];
+
+ /* empty lists don't back any memory of this page size */
+ if (msl->memseg_arr.count == 0)
+ continue;
+
+ if (msl->hugepage_sz == RTE_PGSIZE_2M)
hugepage_2MB_avail = 1;
- if (ms[i].hugepage_sz == RTE_PGSIZE_1G)
+ if (msl->hugepage_sz == RTE_PGSIZE_1G)
hugepage_1GB_avail = 1;
- if (ms[i].hugepage_sz == RTE_PGSIZE_16M)
+ if (msl->hugepage_sz == RTE_PGSIZE_16M)
hugepage_16MB_avail = 1;
- if (ms[i].hugepage_sz == RTE_PGSIZE_16G)
+ if (msl->hugepage_sz == RTE_PGSIZE_16G)
hugepage_16GB_avail = 1;
}
/* Display the availability of 2MB ,1GB, 16MB, 16GB pages */