[dpdk-dev,RFC,10/10] mem: enable memfd-based hugepage allocation

Message ID 8b65d1c810721ef2ffe8019ec9504eb2112bb91e.1527776837.git.anatoly.burakov@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Checks

Context               Check     Description
ci/checkpatch         warning   coding style issues
ci/Intel-compilation  success   Compilation OK

Commit Message

Anatoly Burakov May 31, 2018, 2:32 p.m. UTC
  This changes no-shared-files mode to use memfd-based hugepage
allocation instead of hugetlbfs mounts. Since hugepage memfd is only
supported on kernel 4.14+ and glibc 2.27+, a compile-time check is
performed along with runtime checks.
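For background, below is a minimal standalone sketch (not part of the
patch) of the allocation mechanism this mode relies on: an anonymous
hugepage obtained through memfd_create() with MFD_HUGETLB, mapped
without any hugetlbfs mountpoint. The 2 MB page size, the MFD_HUGE_2MB
flag and the raw syscall() wrapper are illustrative assumptions; the
patch itself probes support at runtime and falls back to hugetlbfs
files when memfd hugepages are unavailable.

/*
 * Illustrative sketch only (not part of the patch): allocate one
 * anonymous hugepage via memfd. The 2 MB page size and MFD_HUGE_2MB
 * flag are assumptions for this example.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/memfd.h>	/* MFD_HUGETLB, MFD_HUGE_2MB (kernel 4.14+) */

int main(void)
{
	size_t len = 2 * 1024 * 1024;	/* one 2 MB hugepage */
	void *addr;
	int fd;

	/* no hugetlbfs mountpoint is needed for this */
	fd = syscall(SYS_memfd_create, "hugepage-demo",
			MFD_HUGETLB | MFD_HUGE_2MB);
	if (fd < 0) {
		perror("memfd_create");	/* EINVAL: kernel lacks support */
		return 1;
	}
	if (ftruncate(fd, len) < 0) {
		perror("ftruncate");
		close(fd);
		return 1;
	}
	addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (addr == MAP_FAILED) {
		perror("mmap");	/* fails if no free hugepages are reserved */
		close(fd);
		return 1;
	}
	memset(addr, 0, len);	/* touch the page to fault it in */
	printf("mapped %zu bytes of anonymous hugepage memory\n", len);
	munmap(addr, len);
	close(fd);
	return 0;
}

On kernels older than 4.14, memfd_create() fails with EINVAL when
MFD_HUGETLB is passed, which is the condition the patch's
check_memfd_pagesize_supported() uses to disable the feature at
runtime.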

Signed-off-by: Anatoly Burakov <anatoly.burakov@intel.com>
---
 .../linuxapp/eal/eal_hugepage_info.c          | 136 ++++++++++++++----
 lib/librte_eal/linuxapp/eal/eal_memalloc.c    | 105 +++++++++++++-
 lib/librte_eal/linuxapp/eal/eal_memfd.h       |  28 ++++
 lib/librte_eal/linuxapp/eal/eal_memory.c      |   4 +-
 4 files changed, 234 insertions(+), 39 deletions(-)
 create mode 100644 lib/librte_eal/linuxapp/eal/eal_memfd.h
  

Patch

diff --git a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
index 02b1c4ff1..1a80ee0ee 100644
--- a/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
+++ b/lib/librte_eal/linuxapp/eal/eal_hugepage_info.c
@@ -30,6 +30,7 @@ 
 #include "eal_internal_cfg.h"
 #include "eal_hugepages.h"
 #include "eal_filesystem.h"
+#include "eal_memfd.h"
 
 static const char sys_dir_path[] = "/sys/kernel/mm/hugepages";
 static const char sys_pages_numa_dir_path[] = "/sys/devices/system/node";
@@ -313,11 +314,85 @@  compare_hpi(const void *a, const void *b)
 	return hpi_b->hugepage_sz - hpi_a->hugepage_sz;
 }
 
+static void
+calc_num_pages(struct hugepage_info *hpi, struct dirent *dirent)
+{
+	uint64_t total_pages = 0;
+	unsigned int i;
+
+	/*
+	 * first, try to put all hugepages into relevant sockets, but
+	 * if first attempts fails, fall back to collecting all pages
+	 * in one socket and sorting them later
+	 */
+	total_pages = 0;
+	/* we also don't want to do this for legacy init */
+	if (!internal_config.legacy_mem)
+		for (i = 0; i < rte_socket_count(); i++) {
+			int socket = rte_socket_id_by_idx(i);
+			unsigned int num_pages =
+					get_num_hugepages_on_node(
+						dirent->d_name, socket);
+			hpi->num_pages[socket] = num_pages;
+			total_pages += num_pages;
+		}
+	/*
+	 * we failed to sort memory from the get go, so fall
+	 * back to old way
+	 */
+	if (total_pages == 0) {
+		hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
+
+#ifndef RTE_ARCH_64
+		/* for 32-bit systems, limit number of hugepages to
+		 * 1GB per page size */
+		hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0],
+				RTE_PGSIZE_1G / hpi->hugepage_sz);
+#endif
+	}
+}
+
+static int
+check_memfd_pagesize_supported(uint64_t page_sz)
+{
+#ifdef MEMFD_SUPPORTED
+	int sz_flag, fd;
+
+	/* first, check if this particular pagesize is supported */
+	sz_flag = eal_memalloc_get_memfd_pagesize_flag(page_sz);
+	if (sz_flag == 0) {
+		RTE_LOG(ERR, EAL, "Unexpected memfd hugepage size: %"
+			PRIu64" bytes\n", page_sz);
+		return 0;
+	}
+
+	/* does currently running kernel support it? */
+	fd = memfd_create("memfd_test", sz_flag | MFD_HUGETLB);
+	if (fd >= 0) {
+		/* success */
+		close(fd);
+		return 1;
+	}
+	/* creating memfd failed, but if the error wasn't EINVAL, reserving of
+	 * hugepages via memfd is supported by the kernel
+	 */
+	if (errno != EINVAL) {
+		return 1;
+	}
+	RTE_LOG(DEBUG, EAL, "Kernel does not support memfd hugepages of size %"
+		PRIu64" bytes\n", page_sz);
+#else
+	RTE_LOG(DEBUG, EAL, "Memfd hugepage support not enabled at compile time\n");
+	RTE_SET_USED(page_sz);
+#endif
+	return 0;
+}
+
 static int
 hugepage_info_init(void)
 {	const char dirent_start_text[] = "hugepages-";
 	const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
-	unsigned int i, total_pages, num_sizes = 0;
+	unsigned int i, num_sizes = 0;
 	DIR *dir;
 	struct dirent *dirent;
 
@@ -343,6 +418,10 @@  hugepage_info_init(void)
 		hpi->hugepage_sz =
 			rte_str_to_size(&dirent->d_name[dirent_start_len]);
 
+		/* by default, memfd_hugepage_supported is 1 */
+		memfd_hugepage_supported &=
+			check_memfd_pagesize_supported(hpi->hugepage_sz);
+
 		/* first, check if we have a mountpoint */
 		if (get_hugepage_dir(hpi->hugepage_sz,
 			hpi->hugedir, sizeof(hpi->hugedir)) < 0) {
@@ -355,6 +434,23 @@  hugepage_info_init(void)
 					"%" PRIu64 " reserved, but no mounted "
 					"hugetlbfs found for that size\n",
 					num_pages, hpi->hugepage_sz);
+
+			/* no shared files mode may still be able to allocate
+			 * without a valid mountpoint via memfd, but we cannot
+			 * use memfd in legacy mode, because we cannot sort
+			 * pages, so only allow empty mountpoints in non-legacy
+			 * mode.
+			 */
+			if (internal_config.no_shared_files &&
+					!internal_config.legacy_mem &&
+					memfd_hugepage_supported) {
+				RTE_LOG(NOTICE, EAL, "No shared files mode enabled, "
+					"hugepages of size %" PRIu64 " bytes "
+					"will be allocated anonymously\n",
+					hpi->hugepage_sz);
+				calc_num_pages(hpi, dirent);
+				num_sizes++;
+			}
 			continue;
 		}
 
@@ -371,35 +467,14 @@  hugepage_info_init(void)
 		if (clear_hugedir(hpi->hugedir) == -1)
 			break;
 
-		/*
-		 * first, try to put all hugepages into relevant sockets, but
-		 * if first attempts fails, fall back to collecting all pages
-		 * in one socket and sorting them later
-		 */
-		total_pages = 0;
-		/* we also don't want to do this for legacy init */
-		if (!internal_config.legacy_mem)
-			for (i = 0; i < rte_socket_count(); i++) {
-				int socket = rte_socket_id_by_idx(i);
-				unsigned int num_pages =
-						get_num_hugepages_on_node(
-							dirent->d_name, socket);
-				hpi->num_pages[socket] = num_pages;
-				total_pages += num_pages;
-			}
-		/*
-		 * we failed to sort memory from the get go, so fall
-		 * back to old way
-		 */
-		if (total_pages == 0)
-			hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
+		calc_num_pages(hpi, dirent);
 
-#ifndef RTE_ARCH_64
-		/* for 32-bit systems, limit number of hugepages to
-		 * 1GB per page size */
-		hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0],
-					    RTE_PGSIZE_1G / hpi->hugepage_sz);
-#endif
+		if (internal_config.no_shared_files &&
+				!internal_config.legacy_mem &&
+				memfd_hugepage_supported)
+			RTE_LOG(NOTICE, EAL, "No shared files mode enabled, "
+				"hugepages of size %" PRIu64 " bytes will be "
+				"allocated anonymously\n", hpi->hugepage_sz);
 
 		num_sizes++;
 	}
@@ -423,8 +498,7 @@  hugepage_info_init(void)
 
 		for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
 			num_pages += hpi->num_pages[j];
-		if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0 &&
-				num_pages > 0)
+		if (num_pages > 0)
 			return 0;
 	}
 
diff --git a/lib/librte_eal/linuxapp/eal/eal_memalloc.c b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
index f57d307dd..c4d57c349 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memalloc.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memalloc.c
@@ -39,6 +39,7 @@ 
 #include "eal_filesystem.h"
 #include "eal_internal_cfg.h"
 #include "eal_memalloc.h"
+#include "eal_memfd.h"
 
 /*
  * not all kernel version support fallocate on hugetlbfs, so fall back to
@@ -46,6 +47,11 @@ 
  */
 static int fallocate_supported = -1; /* unknown */
 
+/* not all kernel versions support memfd hugepages. assume supported unless
+ * shown otherwise.
+ */
+int memfd_hugepage_supported = 1;
+
 /* for single-file segments, we need some kind of mechanism to keep track of
  * which hugepages can be freed back to the system, and which cannot. we cannot
  * use flock() because they don't allow locking parts of a file, and we cannot
@@ -293,6 +299,49 @@  static int unlock_segment(int list_idx, int seg_idx)
 	return 0;
 }
 
+int
+eal_memalloc_get_memfd_pagesize_flag(uint64_t page_sz)
+{
+#ifdef MEMFD_SUPPORTED
+	switch (page_sz) {
+	case RTE_PGSIZE_1G:
+		return MFD_HUGE_1GB;
+	case RTE_PGSIZE_2M:
+		return MFD_HUGE_2MB;
+	default:
+		return -1;
+	}
+#endif
+	return 0;
+}
+
+static int
+get_memfd_seg_fd(unsigned int list_idx,
+		unsigned int seg_idx, int sz_flag)
+{
+#ifdef MEMFD_SUPPORTED
+	int flags = MFD_HUGETLB | sz_flag;
+	char name[64];
+	int fd;
+
+	snprintf(name, sizeof(name) - 1, "memseg-%d-%d", list_idx,
+			seg_idx);
+
+	fd = memfd_create(name, flags);
+	if (fd < 0) {
+		RTE_LOG(ERR, EAL, "Couldn't create memfd hugepage: %s\n",
+			strerror(errno));
+		return -1;
+	}
+	return fd;
+#else
+	RTE_SET_USED(list_idx);
+	RTE_SET_USED(seg_idx);
+	RTE_SET_USED(sz_flag);
+	return -1;
+#endif
+}
+
 static int
 get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
 		unsigned int list_idx, unsigned int seg_idx)
@@ -342,6 +391,27 @@  get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
 	return fd;
 }
 
+static int
+get_seg_fd_no_shared(char *path, int buflen, struct hugepage_info *hi,
+		unsigned int list_idx, unsigned int seg_idx)
+{
+	int sz_flag;
+
+	/* if memfd hugepages are not supported, create regular files */
+	if (memfd_hugepage_supported == 0)
+		return get_seg_fd(path, buflen, hi, list_idx, seg_idx);
+
+	/* pick correct page size flags */
+	sz_flag = eal_memalloc_get_memfd_pagesize_flag(hi->hugepage_sz);
+	if (sz_flag == 0) {
+		RTE_LOG(ERR, EAL, "Unexpected page size: %"
+			PRIu64 "\n", hi->hugepage_sz);
+		return -1;
+	}
+
+	return get_memfd_seg_fd(list_idx, seg_idx, sz_flag);
+}
+
 static int
 resize_hugefile(int fd, char *path, int list_idx, int seg_idx,
 		uint64_t fa_offset, uint64_t page_sz, bool grow)
@@ -491,8 +561,16 @@  alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
 	int fd;
 	size_t alloc_sz;
 
-	/* takes out a read lock on segment or segment list */
-	fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
+	if (internal_config.no_shared_files) {
+		/* if allocating memfd hugepages is supported, do that,
+		 * otherwise fallback to regular allocation
+		 */
+		fd = get_seg_fd_no_shared(path, sizeof(path), hi, list_idx,
+				seg_idx);
+	} else {
+		/* takes out a read lock on segment or segment list */
+		fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
+	}
 	if (fd < 0) {
 		RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n");
 		return -1;
@@ -512,7 +590,8 @@  alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
 				__func__, strerror(errno));
 			goto resized;
 		}
-		if (internal_config.no_shared_files) {
+		if (internal_config.no_shared_files &&
+				memfd_hugepage_supported == 0) {
 			if (unlink(path)) {
 				RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n",
 					__func__, strerror(errno));
@@ -616,7 +695,7 @@  free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
 {
 	uint64_t map_offset;
 	char path[PATH_MAX];
-	int fd, ret;
+	int fd, ret = 0;
 
 	/* erase page data */
 	memset(ms->addr, 0, ms->len);
@@ -685,6 +764,7 @@  alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
 	size_t page_sz;
 	int cur_idx, start_idx, j, dir_fd = -1;
 	unsigned int msl_idx, need, i;
+	bool mountpoint_is_empty;
 
 	if (msl->page_sz != wa->page_sz)
 		return 0;
@@ -704,6 +784,12 @@  alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
 		return 0;
 	start_idx = cur_idx;
 
+	/* if we're in no-shared-files mode and memfd is supported, we will
+	 * allow empty mountpoints because memfd doesn't require a mountpoint.
+	 */
+	mountpoint_is_empty =
+			strnlen(wa->hi->hugedir, sizeof(wa->hi->hugedir)) == 0;
+
 	/* do not allow any page allocations during the time we're allocating,
 	 * because file creation and locking operations are not atomic,
 	 * and we might be the first or the last ones to use a particular page,
@@ -712,7 +798,7 @@  alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
 	 * during init, we already hold a write lock, so don't try to take out
 	 * another one.
 	 */
-	if (wa->hi->lock_descriptor == -1) {
+	if (wa->hi->lock_descriptor == -1 && !mountpoint_is_empty) {
 		dir_fd = open(wa->hi->hugedir, O_RDONLY);
 		if (dir_fd < 0) {
 			RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
@@ -794,6 +880,7 @@  free_seg_walk(const struct rte_memseg_list *msl, void *arg)
 	struct free_walk_param *wa = arg;
 	uintptr_t start_addr, end_addr;
 	int msl_idx, seg_idx, ret, dir_fd = -1;
+	bool mountpoint_is_empty;
 
 	start_addr = (uintptr_t) msl->base_va;
 	end_addr = start_addr + msl->memseg_arr.len * (size_t)msl->page_sz;
@@ -802,6 +889,12 @@  free_seg_walk(const struct rte_memseg_list *msl, void *arg)
 			(uintptr_t)wa->ms->addr >= end_addr)
 		return 0;
 
+	/* if we're in no shared files mode and memfd is supported, we will
+	 * allow empty mountpoints because memfd doesn't require a mountpoint.
+	 */
+	mountpoint_is_empty =
+			strnlen(wa->hi->hugedir, sizeof(wa->hi->hugedir)) == 0;
+
 	msl_idx = msl - mcfg->memsegs;
 	seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz;
 
@@ -816,7 +909,7 @@  free_seg_walk(const struct rte_memseg_list *msl, void *arg)
 	 * during init, we already hold a write lock, so don't try to take out
 	 * another one.
 	 */
-	if (wa->hi->lock_descriptor == -1) {
+	if (wa->hi->lock_descriptor == -1 && !mountpoint_is_empty) {
 		dir_fd = open(wa->hi->hugedir, O_RDONLY);
 		if (dir_fd < 0) {
 			RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
diff --git a/lib/librte_eal/linuxapp/eal/eal_memfd.h b/lib/librte_eal/linuxapp/eal/eal_memfd.h
new file mode 100644
index 000000000..55e6dbb2c
--- /dev/null
+++ b/lib/librte_eal/linuxapp/eal/eal_memfd.h
@@ -0,0 +1,28 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef EAL_MEMFD_H
+#define EAL_MEMFD_H
+
+#include <stdint.h>
+
+/*
+ * For memfd hugepages, both kernel and glibc version must support them. So,
+ * check for both.
+ */
+#include <features.h> /* glibc version */
+#if __GLIBC__ >= 2 && __GLIBC_MINOR__ >= 27
+#include <linux/version.h> /* linux kernel version */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0)
+#define MEMFD_SUPPORTED
+#include <linux/memfd.h>
+#endif /* linux version check */
+#endif /* glibc version check */
+
+int
+eal_memalloc_get_memfd_pagesize_flag(uint64_t page_sz);
+
+extern int memfd_hugepage_supported;
+
+#endif /* EAL_MEMFD_H */
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index d7b43b5c1..b26e21be8 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -44,6 +44,7 @@ 
 #include "eal_internal_cfg.h"
 #include "eal_filesystem.h"
 #include "eal_hugepages.h"
+#include "eal_memfd.h"
 
 #define PFN_MASK_SIZE	8
 
@@ -1060,8 +1061,7 @@  get_socket_mem_size(int socket)
 
 	for (i = 0; i < internal_config.num_hugepage_sizes; i++){
 		struct hugepage_info *hpi = &internal_config.hugepage_info[i];
-		if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0)
-			size += hpi->hugepage_sz * hpi->num_pages[socket];
+		size += hpi->hugepage_sz * hpi->num_pages[socket];
 	}
 
 	return size;