@@ -30,6 +30,7 @@
#include "eal_internal_cfg.h"
#include "eal_hugepages.h"
#include "eal_filesystem.h"
+#include "eal_memfd.h"
static const char sys_dir_path[] = "/sys/kernel/mm/hugepages";
static const char sys_pages_numa_dir_path[] = "/sys/devices/system/node";
@@ -313,11 +314,85 @@ compare_hpi(const void *a, const void *b)
return hpi_b->hugepage_sz - hpi_a->hugepage_sz;
}
+static void
+calc_num_pages(struct hugepage_info *hpi, struct dirent *dirent)
+{
+	uint64_t total_pages = 0;
+	unsigned int i;
+
+	/*
+	 * first, try to put all hugepages into relevant sockets, but
+	 * if the first attempt fails, fall back to collecting all pages
+	 * in one socket and sorting them later
+	 */
+	total_pages = 0;
+	/* we also don't want to do this for legacy init */
+	if (!internal_config.legacy_mem)
+		for (i = 0; i < rte_socket_count(); i++) {
+			int socket = rte_socket_id_by_idx(i);
+			unsigned int num_pages =
+					get_num_hugepages_on_node(
+						dirent->d_name, socket);
+			hpi->num_pages[socket] = num_pages;
+			total_pages += num_pages;
+		}
+	/*
+	 * we failed to sort memory from the get-go, so fall back
+	 * to the old way: count all pages under socket 0
+	 */
+	if (total_pages == 0) {
+		hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
+
+#ifndef RTE_ARCH_64
+		/* for 32-bit systems, limit number of hugepages to
+		 * 1GB per page size */
+		hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0],
+				RTE_PGSIZE_1G / hpi->hugepage_sz);
+#endif
+	}
+}
+
+static int
+check_memfd_pagesize_supported(uint64_t page_sz)
+{
+#ifdef MEMFD_SUPPORTED
+	int sz_flag, fd;
+
+	/* first, check if this particular pagesize is supported */
+	sz_flag = eal_memalloc_get_memfd_pagesize_flag(page_sz);
+	if (sz_flag <= 0) {
+		RTE_LOG(ERR, EAL, "Unexpected memfd hugepage size: %"
+			PRIu64" bytes\n", page_sz);
+		return 0;
+	}
+
+	/* does currently running kernel support it? */
+	fd = memfd_create("memfd_test", sz_flag | MFD_HUGETLB);
+	if (fd >= 0) {
+		/* success */
+		close(fd);
+		return 1;
+	}
+	/* creating memfd failed; EINVAL means this size flag is rejected and
+	 * ENOSYS means the kernel has no memfd_create at all - other errors
+	 * still indicate the kernel supports memfd hugepage reservation */
+	if (errno != EINVAL && errno != ENOSYS) {
+		return 1;
+	}
+	RTE_LOG(DEBUG, EAL, "Kernel does not support memfd hugepages of size %"
+		PRIu64" bytes\n", page_sz);
+#else
+	RTE_LOG(DEBUG, EAL, "Memfd hugepage support not enabled at compile time\n");
+	RTE_SET_USED(page_sz);
+#endif
+	return 0;
+}
+
static int
hugepage_info_init(void)
{ const char dirent_start_text[] = "hugepages-";
const size_t dirent_start_len = sizeof(dirent_start_text) - 1;
- unsigned int i, total_pages, num_sizes = 0;
+ unsigned int i, num_sizes = 0;
DIR *dir;
struct dirent *dirent;
@@ -343,6 +418,10 @@ hugepage_info_init(void)
hpi->hugepage_sz =
rte_str_to_size(&dirent->d_name[dirent_start_len]);
+ /* by default, memfd_hugepage_supported is 1 */
+ memfd_hugepage_supported &=
+ check_memfd_pagesize_supported(hpi->hugepage_sz);
+
/* first, check if we have a mountpoint */
if (get_hugepage_dir(hpi->hugepage_sz,
hpi->hugedir, sizeof(hpi->hugedir)) < 0) {
@@ -355,6 +434,23 @@ hugepage_info_init(void)
"%" PRIu64 " reserved, but no mounted "
"hugetlbfs found for that size\n",
num_pages, hpi->hugepage_sz);
+
+ /* no shared files mode may still be able to allocate
+ * without a valid mountpoint via memfd, but we cannot
+ * use memfd in legacy mode, because we cannot sort
+ * pages, so only allow empty mountpoints in non-legacy
+ * mode.
+ */
+ if (internal_config.no_shared_files &&
+ !internal_config.legacy_mem &&
+ memfd_hugepage_supported) {
+ RTE_LOG(NOTICE, EAL, "No shared files mode enabled, "
+ "hugepages of size %" PRIu64 " bytes "
+ "will be allocated anonymously\n",
+ hpi->hugepage_sz);
+ calc_num_pages(hpi, dirent);
+ num_sizes++;
+ }
continue;
}
@@ -371,35 +467,14 @@ hugepage_info_init(void)
if (clear_hugedir(hpi->hugedir) == -1)
break;
- /*
- * first, try to put all hugepages into relevant sockets, but
- * if first attempts fails, fall back to collecting all pages
- * in one socket and sorting them later
- */
- total_pages = 0;
- /* we also don't want to do this for legacy init */
- if (!internal_config.legacy_mem)
- for (i = 0; i < rte_socket_count(); i++) {
- int socket = rte_socket_id_by_idx(i);
- unsigned int num_pages =
- get_num_hugepages_on_node(
- dirent->d_name, socket);
- hpi->num_pages[socket] = num_pages;
- total_pages += num_pages;
- }
- /*
- * we failed to sort memory from the get go, so fall
- * back to old way
- */
- if (total_pages == 0)
- hpi->num_pages[0] = get_num_hugepages(dirent->d_name);
+ calc_num_pages(hpi, dirent);
-#ifndef RTE_ARCH_64
- /* for 32-bit systems, limit number of hugepages to
- * 1GB per page size */
- hpi->num_pages[0] = RTE_MIN(hpi->num_pages[0],
- RTE_PGSIZE_1G / hpi->hugepage_sz);
-#endif
+ if (internal_config.no_shared_files &&
+ !internal_config.legacy_mem &&
+ memfd_hugepage_supported)
+ RTE_LOG(NOTICE, EAL, "No shared files mode enabled, "
+ "hugepages of size %" PRIu64 " bytes will be "
+ "allocated anonymously\n", hpi->hugepage_sz);
num_sizes++;
}
@@ -423,8 +498,7 @@ hugepage_info_init(void)
for (j = 0; j < RTE_MAX_NUMA_NODES; j++)
num_pages += hpi->num_pages[j];
- if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0 &&
- num_pages > 0)
+ if (num_pages > 0)
return 0;
}
@@ -39,6 +39,7 @@
#include "eal_filesystem.h"
#include "eal_internal_cfg.h"
#include "eal_memalloc.h"
+#include "eal_memfd.h"
/*
* not all kernel version support fallocate on hugetlbfs, so fall back to
@@ -46,6 +47,11 @@
*/
static int fallocate_supported = -1; /* unknown */
+/* not all kernel versions support memfd hugepages. assume supported unless
+ * shown otherwise.
+ */
+int memfd_hugepage_supported = 1;
+
/* for single-file segments, we need some kind of mechanism to keep track of
* which hugepages can be freed back to the system, and which cannot. we cannot
* use flock() because they don't allow locking parts of a file, and we cannot
@@ -293,6 +299,49 @@ static int unlock_segment(int list_idx, int seg_idx)
return 0;
}
+int
+eal_memalloc_get_memfd_pagesize_flag(uint64_t page_sz)
+{
+#ifdef MEMFD_SUPPORTED
+	switch (page_sz) {
+	case RTE_PGSIZE_1G:
+		return MFD_HUGE_1GB;
+	case RTE_PGSIZE_2M:
+		return MFD_HUGE_2MB;
+	default:
+		return 0; /* unsupported size - callers test for 0 */
+	}
+#endif
+	return 0;
+}
+
+static int
+get_memfd_seg_fd(unsigned int list_idx,
+		unsigned int seg_idx, int sz_flag)
+{
+#ifdef MEMFD_SUPPORTED
+	int flags = MFD_HUGETLB | sz_flag;
+	char name[64];
+	int fd;
+
+	snprintf(name, sizeof(name), "memseg-%u-%u", list_idx,
+			seg_idx);
+
+	fd = memfd_create(name, flags);
+	if (fd < 0) {
+		RTE_LOG(ERR, EAL, "Couldn't create memfd hugepage: %s\n",
+			strerror(errno));
+		return -1;
+	}
+	return fd;
+#else
+	RTE_SET_USED(list_idx);
+	RTE_SET_USED(seg_idx);
+	RTE_SET_USED(sz_flag);
+	return -1;
+#endif
+}
+
static int
get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
unsigned int list_idx, unsigned int seg_idx)
@@ -342,6 +391,27 @@ get_seg_fd(char *path, int buflen, struct hugepage_info *hi,
return fd;
}
+static int
+get_seg_fd_no_shared(char *path, int buflen, struct hugepage_info *hi,
+		unsigned int list_idx, unsigned int seg_idx)
+{
+	int sz_flag;
+
+	/* if memfd hugepages are not supported, create regular files */
+	if (memfd_hugepage_supported == 0)
+		return get_seg_fd(path, buflen, hi, list_idx, seg_idx);
+
+	/* pick correct page size flags (0 or negative means unsupported) */
+	sz_flag = eal_memalloc_get_memfd_pagesize_flag(hi->hugepage_sz);
+	if (sz_flag <= 0) {
+		RTE_LOG(ERR, EAL, "Unexpected page size: %"
+			PRIu64 "\n", hi->hugepage_sz);
+		return -1;
+	}
+
+	return get_memfd_seg_fd(list_idx, seg_idx, sz_flag);
+}
+
static int
resize_hugefile(int fd, char *path, int list_idx, int seg_idx,
uint64_t fa_offset, uint64_t page_sz, bool grow)
@@ -491,8 +561,16 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
int fd;
size_t alloc_sz;
- /* takes out a read lock on segment or segment list */
- fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
+ if (internal_config.no_shared_files) {
+ /* if allocating memfd hugepages is supported, do that,
+ * otherwise fallback to regular allocation
+ */
+ fd = get_seg_fd_no_shared(path, sizeof(path), hi, list_idx,
+ seg_idx);
+ } else {
+ /* takes out a read lock on segment or segment list */
+ fd = get_seg_fd(path, sizeof(path), hi, list_idx, seg_idx);
+ }
if (fd < 0) {
RTE_LOG(ERR, EAL, "Couldn't get fd on hugepage file\n");
return -1;
@@ -512,7 +590,8 @@ alloc_seg(struct rte_memseg *ms, void *addr, int socket_id,
__func__, strerror(errno));
goto resized;
}
- if (internal_config.no_shared_files) {
+ if (internal_config.no_shared_files &&
+ memfd_hugepage_supported == 0) {
if (unlink(path)) {
RTE_LOG(DEBUG, EAL, "%s(): unlink() failed: %s\n",
__func__, strerror(errno));
@@ -616,7 +695,7 @@ free_seg(struct rte_memseg *ms, struct hugepage_info *hi,
{
uint64_t map_offset;
char path[PATH_MAX];
- int fd, ret;
+ int fd, ret = 0;
/* erase page data */
memset(ms->addr, 0, ms->len);
@@ -685,6 +764,7 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
size_t page_sz;
int cur_idx, start_idx, j, dir_fd = -1;
unsigned int msl_idx, need, i;
+ bool mountpoint_is_empty;
if (msl->page_sz != wa->page_sz)
return 0;
@@ -704,6 +784,12 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
return 0;
start_idx = cur_idx;
+ /* if we're in no-shared-files mode and memfd is supported, we will
+ * allow empty mountpoints because memfd doesn't require a mountpoint.
+ */
+ mountpoint_is_empty =
+ strnlen(wa->hi->hugedir, sizeof(wa->hi->hugedir)) == 0;
+
/* do not allow any page allocations during the time we're allocating,
* because file creation and locking operations are not atomic,
* and we might be the first or the last ones to use a particular page,
@@ -712,7 +798,7 @@ alloc_seg_walk(const struct rte_memseg_list *msl, void *arg)
* during init, we already hold a write lock, so don't try to take out
* another one.
*/
- if (wa->hi->lock_descriptor == -1) {
+ if (wa->hi->lock_descriptor == -1 && !mountpoint_is_empty) {
dir_fd = open(wa->hi->hugedir, O_RDONLY);
if (dir_fd < 0) {
RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
@@ -794,6 +880,7 @@ free_seg_walk(const struct rte_memseg_list *msl, void *arg)
struct free_walk_param *wa = arg;
uintptr_t start_addr, end_addr;
int msl_idx, seg_idx, ret, dir_fd = -1;
+ bool mountpoint_is_empty;
start_addr = (uintptr_t) msl->base_va;
end_addr = start_addr + msl->memseg_arr.len * (size_t)msl->page_sz;
@@ -802,6 +889,12 @@ free_seg_walk(const struct rte_memseg_list *msl, void *arg)
(uintptr_t)wa->ms->addr >= end_addr)
return 0;
+ /* if we're in no shared files mode and memfd is supported, we will
+ * allow empty mountpoints because memfd doesn't require a mountpoint.
+ */
+ mountpoint_is_empty =
+ strnlen(wa->hi->hugedir, sizeof(wa->hi->hugedir)) == 0;
+
msl_idx = msl - mcfg->memsegs;
seg_idx = RTE_PTR_DIFF(wa->ms->addr, start_addr) / msl->page_sz;
@@ -816,7 +909,7 @@ free_seg_walk(const struct rte_memseg_list *msl, void *arg)
* during init, we already hold a write lock, so don't try to take out
* another one.
*/
- if (wa->hi->lock_descriptor == -1) {
+ if (wa->hi->lock_descriptor == -1 && !mountpoint_is_empty) {
dir_fd = open(wa->hi->hugedir, O_RDONLY);
if (dir_fd < 0) {
RTE_LOG(ERR, EAL, "%s(): Cannot open '%s': %s\n",
new file mode 100644
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2018 Intel Corporation
+ */
+
+#ifndef EAL_MEMFD_H
+#define EAL_MEMFD_H
+
+#include <stdint.h>
+
+/*
+ * For memfd hugepages, both kernel and glibc version must support them. So,
+ * check for both (memfd_create() wrapper appeared in glibc 2.27).
+ */
+#include <features.h> /* glibc version */
+#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 27)
+#include <linux/version.h> /* linux kernel version */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 14, 0)
+#define MEMFD_SUPPORTED
+#include <linux/memfd.h>
+#endif /* linux version check */
+#endif /* glibc version check */
+
+int
+eal_memalloc_get_memfd_pagesize_flag(uint64_t page_sz);
+
+extern int memfd_hugepage_supported;
+
+#endif /* EAL_MEMFD_H */
@@ -44,6 +44,7 @@
#include "eal_internal_cfg.h"
#include "eal_filesystem.h"
#include "eal_hugepages.h"
+#include "eal_memfd.h"
#define PFN_MASK_SIZE 8
@@ -1060,8 +1061,7 @@ get_socket_mem_size(int socket)
for (i = 0; i < internal_config.num_hugepage_sizes; i++){
struct hugepage_info *hpi = &internal_config.hugepage_info[i];
- if (strnlen(hpi->hugedir, sizeof(hpi->hugedir)) != 0)
- size += hpi->hugepage_sz * hpi->num_pages[socket];
+ size += hpi->hugepage_sz * hpi->num_pages[socket];
}
return size;