[dpdk-dev] [RFC 12/23] eal/memalloc: add support for dynamic memory allocation

Anatoly Burakov anatoly.burakov at intel.com
Tue Dec 19 12:04:36 CET 2017


Nothing uses this code yet. The bulk of it is copied from the old
memory allocation code (eal_memory.c). We provide an API to
allocate either one page or multiple pages, guaranteeing that
we get contiguous VA for all of the pages requested.
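
For illustration, a minimal usage sketch of the proposed API (the
caller example_alloc() is hypothetical and not part of this patch; it
assumes 2 MB hugepages are configured and that socket 0 exists):

#include <stdbool.h>
#include <stdint.h>

#include <rte_memory.h>

#include "eal_memalloc.h"

static int
example_alloc(void)
{
	struct rte_memseg *ms[8];
	struct rte_memseg *one;
	uint64_t sz = RTE_PGSIZE_2M; /* assumes 2 MB hugepages are available */
	int n;

	/* request exactly 8 VA-contiguous 2 MB pages on socket 0;
	 * with exact == true, anything less than 8 is a failure */
	n = eal_memalloc_alloc_page_bulk(ms, 8, sz, 0, true);
	if (n < 0)
		return -1;

	/* with exact == false, the call would instead return however
	 * many pages it managed to allocate */

	/* allocate a single page; returns NULL on failure */
	one = eal_memalloc_alloc_page(sz, 0);
	if (one == NULL)
		return -1;

	return 0;
}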

Signed-off-by: Anatoly Burakov <anatoly.burakov at intel.com>
---
 lib/librte_eal/common/eal_memalloc.h       |  47 ++++
 lib/librte_eal/linuxapp/eal/Makefile       |   2 +
 lib/librte_eal/linuxapp/eal/eal_memalloc.c | 416 +++++++++++++++++++++++++++++
 3 files changed, 465 insertions(+)
 create mode 100755 lib/librte_eal/common/eal_memalloc.h
 create mode 100755 lib/librte_eal/linuxapp/eal/eal_memalloc.c

diff --git a/lib/librte_eal/common/eal_memalloc.h b/lib/librte_eal/common/eal_memalloc.h
new file mode 100755
index 0000000..59fd330
--- /dev/null
+++ b/lib/librte_eal/common/eal_memalloc.h
@@ -0,0 +1,47 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef EAL_MEMALLOC_H
+#define EAL_MEMALLOC_H
+
+#include <stdbool.h>
+
+#include <rte_memory.h>
+
+struct rte_memseg *
+eal_memalloc_alloc_page(uint64_t size, int socket);
+
+int
+eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n, uint64_t size,
+		int socket, bool exact);
+
+#endif /* EAL_MEMALLOC_H */
diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
index 782e1ad..88f10e9 100644
--- a/lib/librte_eal/linuxapp/eal/Makefile
+++ b/lib/librte_eal/linuxapp/eal/Makefile
@@ -62,6 +62,7 @@ SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_thread.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_log.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_vfio_mp_sync.c
+SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_memalloc.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_debug.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_lcore.c
 SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += eal_timer.c
@@ -105,6 +106,7 @@ CFLAGS_eal_interrupts.o := -D_GNU_SOURCE
 CFLAGS_eal_vfio_mp_sync.o := -D_GNU_SOURCE
 CFLAGS_eal_timer.o := -D_GNU_SOURCE
 CFLAGS_eal_lcore.o := -D_GNU_SOURCE
+CFLAGS_eal_memalloc.o := -D_GNU_SOURCE
 CFLAGS_eal_thread.o := -D_GNU_SOURCE
 CFLAGS_eal_log.o := -D_GNU_SOURCE
 CFLAGS_eal_common_log.o := -D_GNU_SOURCE
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
new file mode 100755
index 0000000..527c2f6
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -0,0 +1,416 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2017 Intel Corporation. All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#define _FILE_OFFSET_BITS 64
+#include <errno.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <inttypes.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/queue.h>
+#include <sys/file.h>
+#include <unistd.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+#include <numa.h>
+#include <numaif.h>
+#endif
+
+#include <rte_common.h>
+#include <rte_log.h>
+#include <rte_eal_memconfig.h>
+#include <rte_eal.h>
+#include <rte_memory.h>
+
+#include "eal_filesystem.h"
+#include "eal_internal_cfg.h"
+#include "eal_memalloc.h"
+
+static sigjmp_buf huge_jmpenv;
+
+static void __rte_unused huge_sigbus_handler(int signo __rte_unused)
+{
+	siglongjmp(huge_jmpenv, 1);
+}
+
+/* Put setjmp into a wrapper function to avoid compile errors. Any
+ * non-volatile, non-static local variable in the stack frame calling
+ * sigsetjmp might be clobbered by a call to longjmp.
+ */
+static int __rte_unused huge_wrap_sigsetjmp(void)
+{
+	return sigsetjmp(huge_jmpenv, 1);
+}
+
+static struct sigaction huge_action_old;
+static int huge_need_recover;
+
+static void __rte_unused
+huge_register_sigbus(void)
+{
+	sigset_t mask;
+	struct sigaction action;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGBUS);
+	action.sa_flags = 0;
+	action.sa_mask = mask;
+	action.sa_handler = huge_sigbus_handler;
+
+	huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old);
+}
+
+static void __rte_unused
+huge_recover_sigbus(void)
+{
+	if (huge_need_recover) {
+		sigaction(SIGBUS, &huge_action_old, NULL);
+		huge_need_recover = 0;
+	}
+}
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+static bool
+prepare_numa(int *oldpolicy, struct bitmask *oldmask, int socket_id) {
+	bool have_numa = true;
+
+	/* Check if kernel supports NUMA. */
+	if (numa_available() != 0) {
+		RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
+		have_numa = false;
+	}
+
+	if (have_numa) {
+		RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
+		if (get_mempolicy(oldpolicy, oldmask->maskp,
+				  oldmask->size + 1, 0, 0) < 0) {
+			RTE_LOG(ERR, EAL,
+				"Failed to get current mempolicy: %s. "
+				"Assuming MPOL_DEFAULT.\n", strerror(errno));
+			*oldpolicy = MPOL_DEFAULT;
+		}
+		RTE_LOG(DEBUG, EAL,
+			"Setting policy MPOL_PREFERRED for socket %d\n",
+			socket_id);
+		numa_set_preferred(socket_id);
+	}
+	return have_numa;
+}
+
+static void
+restore_numa(int *oldpolicy, struct bitmask *oldmask) {
+	RTE_LOG(DEBUG, EAL,
+		"Restoring previous memory policy: %d\n", *oldpolicy);
+	if (*oldpolicy == MPOL_DEFAULT) {
+		numa_set_localalloc();
+	} else if (set_mempolicy(*oldpolicy, oldmask->maskp,
+				 oldmask->size + 1) < 0) {
+		RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
+			strerror(errno));
+		numa_set_localalloc();
+	}
+	numa_free_cpumask(oldmask);
+}
+#endif
+
+static int
+alloc_page(struct rte_memseg *ms, void *addr, uint64_t size, int socket_id,
+		struct hugepage_info *hi, unsigned list_idx, unsigned seg_idx) {
+	int cur_socket_id = 0;
+	uint64_t fa_offset;
+	char path[PATH_MAX];
+	int ret = 0;
+
+	if (internal_config.single_file_segments) {
+		eal_get_hugefile_path(path, sizeof(path), hi->hugedir, list_idx);
+	} else {
+		eal_get_hugefile_path(path, sizeof(path), hi->hugedir,
+				list_idx * RTE_MAX_MEMSEG_PER_LIST + seg_idx);
+	}
+
+	/* try to create hugepage file */
+	int fd = open(path, O_CREAT | O_RDWR, 0600);
+	if (fd < 0) {
+		RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
+				strerror(errno));
+		goto fname;
+	}
+	if (internal_config.single_file_segments) {
+		fa_offset = seg_idx * size;
+		if (fallocate(fd, 0, fa_offset, size)) {
+			RTE_LOG(DEBUG, EAL, "%s(): fallocate() failed: %s\n",
+				__func__, strerror(errno));
+			goto opened;
+		}
+	} else {
+		if (ftruncate(fd, size) < 0) {
+			RTE_LOG(DEBUG, EAL, "%s(): ftruncate() failed: %s\n",
+				__func__, strerror(errno));
+			goto opened;
+		}
+		fa_offset = 0;
+	}
+
+	/* map the segment and populate page tables;
+	 * the kernel fills this segment with zeros */
+	void *va = mmap(addr, size, PROT_READ | PROT_WRITE,
+			MAP_SHARED | MAP_POPULATE | MAP_FIXED, fd, fa_offset);
+	if (va == MAP_FAILED) {
+		RTE_LOG(DEBUG, EAL, "%s(): mmap() failed: %s\n", __func__,
+			strerror(errno));
+		goto resized;
+	}
+	if (va != addr) {
+		RTE_LOG(DEBUG, EAL, "%s(): wrong mmap() address\n", __func__);
+		goto mapped;
+	}
+
+	rte_iova_t iova = rte_mem_virt2iova(addr);
+	if (iova == RTE_BAD_PHYS_ADDR) {
+		RTE_LOG(DEBUG, EAL, "%s(): can't get IOVA addr\n",
+			__func__);
+		goto mapped;
+	}
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+	move_pages(getpid(), 1, &addr, NULL, &cur_socket_id, 0);
+
+	if (cur_socket_id != socket_id) {
+		RTE_LOG(DEBUG, EAL,
+				"%s(): allocation happened on wrong socket (wanted %d, got %d)\n",
+			__func__, socket_id, cur_socket_id);
+		goto mapped;
+	}
+#endif
+
+	/* In Linux, hugetlb limitations such as cgroups are
+	 * enforced at fault time rather than at mmap() time,
+	 * even with MAP_POPULATE; the kernel will send a
+	 * SIGBUS signal. To avoid being killed, save the
+	 * stack environment here; if SIGBUS happens, we can
+	 * jump back to it.
+	 */
+	if (huge_wrap_sigsetjmp()) {
+		RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more hugepages of size %uMB\n",
+			(unsigned)(size / 0x100000));
+		goto mapped;
+	}
+	*(int *)addr = *(int *)addr;
+
+	close(fd);
+
+	ms->addr = addr;
+	ms->hugepage_sz = size;
+	ms->len = size;
+	ms->nchannel = rte_memory_get_nchannel();
+	ms->nrank = rte_memory_get_nrank();
+	ms->iova = iova;
+	ms->socket_id = socket_id;
+
+	goto out;
+
+mapped:
+	munmap(addr, size);
+resized:
+	if (internal_config.single_file_segments) {
+		fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+				fa_offset, size);
+	} else {
+		unlink(path);
+	}
+opened:
+	close(fd);
+fname:
+	/* anything but goto out is an error */
+	ret = -1;
+out:
+	return ret;
+}
+
+int
+eal_memalloc_alloc_page_bulk(struct rte_memseg **ms, int n,
+		uint64_t size, int socket, bool exact) {
+	struct rte_mem_config *mcfg = rte_eal_get_configuration()->mem_config;
+	struct rte_memseg_list *msl = NULL;
+	void *addr;
+	unsigned msl_idx;
+	int cur_idx, next_idx, end_idx, i, ret = 0;
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+	bool have_numa;
+	int oldpolicy;
+	struct bitmask *oldmask = numa_allocate_nodemask();
+#endif
+	struct hugepage_info *hi = NULL;
+
+	/* dynamic allocation not supported in legacy mode */
+	if (internal_config.legacy_mem)
+		return -1;
+
+	for (i = 0; i < (int) RTE_DIM(internal_config.hugepage_info); i++) {
+		if (size ==
+				internal_config.hugepage_info[i].hugepage_sz) {
+			hi = &internal_config.hugepage_info[i];
+			break;
+		}
+	}
+	if (!hi) {
+		RTE_LOG(ERR, EAL, "%s(): can't find relevant hugepage_info entry\n",
+			__func__);
+		return -1;
+	}
+
+	/* find our memseg list */
+	for (msl_idx = 0; msl_idx < RTE_MAX_MEMSEG_LISTS; msl_idx++) {
+		struct rte_memseg_list *cur_msl = &mcfg->memsegs[msl_idx];
+
+		if (cur_msl->hugepage_sz != size) {
+			continue;
+		}
+		if (cur_msl->socket_id != socket) {
+			continue;
+		}
+		msl = cur_msl;
+		break;
+	}
+	if (!msl) {
+		RTE_LOG(ERR, EAL, "%s(): couldn't find suitable memseg_list\n",
+			__func__);
+		return -1;
+	}
+
+	/* first, try finding space in already existing list */
+	cur_idx = rte_fbarray_find_next_n_free(&msl->memseg_arr, 0, n);
+
+	if (cur_idx < 0) {
+		int old_len = msl->memseg_arr.len;
+		int space = 0;
+		int new_len = old_len;
+
+		/* grow new len until we can either fit n or can't grow */
+		while (new_len < msl->memseg_arr.capacity &&
+				(space < n)) {
+			new_len = RTE_MIN(new_len * 2, msl->memseg_arr.capacity);
+			space = new_len - old_len;
+		}
+
+		/* check if we can expand the list */
+		if (old_len == new_len) {
+			/* can't expand, the list is full */
+			RTE_LOG(ERR, EAL, "%s(): no space in memseg list\n",
+				__func__);
+			return -1;
+		}
+
+		if (rte_fbarray_resize(&msl->memseg_arr, new_len)) {
+			RTE_LOG(ERR, EAL, "%s(): can't resize memseg list\n",
+				__func__);
+			return -1;
+		}
+
+		/*
+		 * we could conceivably end up with free space at the end of the
+		 * list that wasn't enough to cover everything but can cover
+		 * some of it, so start at (old_len - n) if possible.
+		 */
+		next_idx = RTE_MAX(0, old_len - n);
+
+		cur_idx = rte_fbarray_find_next_n_free(&msl->memseg_arr,
+				next_idx, n);
+
+		if (cur_idx < 0) {
+			/* still no space, bail out */
+			RTE_LOG(ERR, EAL, "%s(): no space in memseg list\n",
+				__func__);
+			return -1;
+		}
+	}
+
+	end_idx = cur_idx + n;
+
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+	have_numa = prepare_numa(&oldpolicy, oldmask, socket);
+#endif
+
+	for (i = 0; cur_idx < end_idx; cur_idx++, i++) {
+		struct rte_memseg *cur;
+
+		cur = rte_fbarray_get(&msl->memseg_arr, cur_idx);
+		addr = RTE_PTR_ADD(msl->base_va,
+				cur_idx * msl->hugepage_sz);
+
+		if (alloc_page(cur, addr, size, socket, hi, msl_idx, cur_idx)) {
+			RTE_LOG(DEBUG, EAL, "attempted to allocate %i pages, but only %i were allocated\n",
+				n, i);
+
+			/* if exact number of pages wasn't requested, stop */
+			if (!exact) {
+				ret = i;
+				goto restore_numa;
+			}
+			if (ms)
+				memset(ms, 0, sizeof(struct rte_memseg *) * n);
+			ret = -1;
+			goto restore_numa;
+		}
+		if (ms)
+			ms[i] = cur;
+
+		rte_fbarray_set_used(&msl->memseg_arr, cur_idx, true);
+	}
+	ret = n;
+
+restore_numa:
+#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
+	if (have_numa)
+		restore_numa(&oldpolicy, oldmask);
+#endif
+	return ret;
+}
+
+struct rte_memseg *
+eal_memalloc_alloc_page(uint64_t size, int socket) {
+	struct rte_memseg *ms;
+	if (eal_memalloc_alloc_page_bulk(&ms, 1, size, socket, true) < 0)
+		return NULL;
+	return ms;
+}
-- 
2.7.4
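
For context, a minimal standalone sketch of the SIGBUS-guarded
"first touch" pattern that alloc_page() above is built around (the
helper touch_page_guarded() is illustrative only and not part of the
patch; it uses a volatile local instead of the wrapper-function trick):

#include <setjmp.h>
#include <signal.h>
#include <string.h>

static sigjmp_buf jmpenv;

static void
sigbus_handler(int signo)
{
	(void)signo;
	siglongjmp(jmpenv, 1);
}

/* Touch the first word of a freshly mapped hugepage. hugetlb limits
 * (e.g. cgroups) are enforced at fault time rather than at mmap()
 * time, so the write may raise SIGBUS even though mmap() succeeded.
 * Returns 0 on success, -1 if the fault failed.
 */
static int
touch_page_guarded(void *addr)
{
	struct sigaction action, old;
	volatile int ret = 0; /* volatile so the value survives siglongjmp */

	memset(&action, 0, sizeof(action));
	sigemptyset(&action.sa_mask);
	action.sa_handler = sigbus_handler;
	sigaction(SIGBUS, &action, &old);

	if (sigsetjmp(jmpenv, 1))
		ret = -1; /* jumped back here from the SIGBUS handler */
	else
		*(volatile int *)addr = *(volatile int *)addr;

	sigaction(SIGBUS, &old, NULL);
	return ret;
}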


