[dpdk-dev] [PATCH v5] eal: fix allocating all free hugepages

Pei, Yulong yulong.pei at intel.com
Mon Jun 6 04:49:26 CEST 2016


Tested-by: Yulong Pei <Yulong.pei at intel.com>

1. Run the dpdk app with multiple mount points; it works as expected.
2. Create a new cgroup with limited hugepages as shown below, and run the dpdk app within the newly created cgroup; it works as expected.

# cgcreate -g hugetlb:/test-subgroup
# cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
# cgexec -g hugetlb:test-subgroup ./x86_64-native-linuxapp-gcc/app/testpmd -c 0x3 -n 4 -- -i

Best Regards
Yulong Pei

-----Original Message-----
From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Jianfeng Tan
Sent: Tuesday, May 31, 2016 11:37 AM
To: dev at dpdk.org
Cc: Gonzalez Monroy, Sergio <sergio.gonzalez.monroy at intel.com>; nhorman at tuxdriver.com; david.marchand at 6wind.com; thomas.monjalon at 6wind.com; Tan, Jianfeng <jianfeng.tan at intel.com>
Subject: [dpdk-dev] [PATCH v5] eal: fix allocating all free hugepages

EAL memory init allocates all free hugepages of the whole system, as seen from sysfs, even when the application does not ask for that many.
When there is a limitation on how many hugepages an application can use (such as cgroup.hugetlb), or hugetlbfs is mounted with a size option (so that the request exceeds the quota of the fs), it just fails to start even if there are enough hugepages allocated.

To fix the above issue, this patch:
 - Changes the logic to continue memory init to see if the hugetlb
   requirement of the application can be satisfied by the already
   allocated hugepages.
 - To make sure each hugepage is allocated successfully, adds a
   recovery mechanism, which relies on a memory access to fault in
   hugepages, and if that fails with SIGBUS, recovers to the previously
   saved stack environment with siglongjmp().

For the case of CONFIG_RTE_EAL_SINGLE_FILE_SEGMENTS (enabled by default when compiling the IVSHMEM target), it's indispensable to map all free hugepages in the system. In this case, it fails to start when allocation fails.

Test example:
  a. cgcreate -g hugetlb:/test-subgroup
  b. cgset -r hugetlb.1GB.limit_in_bytes=2147483648 test-subgroup
  c. cgexec -g hugetlb:test-subgroup \
          ./examples/helloworld/build/helloworld -c 0x2 -n 4

       
Fixes: af75078fece ("first public release")

Signed-off-by: Jianfeng Tan <jianfeng.tan at intel.com>
Acked-by: Neil Horman <nhorman at tuxdriver.com>
---
v5:
 - Make this method the default instead of using an option.
 - When SIGBUS is triggered in the case of RTE_EAL_SINGLE_FILE_SEGMENTS,
   just return error.
 - Add prefix "huge_" to newly added function and static variables.
 - Move the internal_config.memory assignment after the page allocations.
v4:
 - Change map_all_hugepages to return unsigned instead of int.
v3:
 - Reword commit message to include it fixes the hugetlbfs quota issue.
 - setjmp -> sigsetjmp.
 - Fix RTE_LOG complaint from ERR to DEBUG as it does not mean init error
   so far.
 - Fix the second map_all_hugepages's return value check.
v2:
 - Address the compile error by moving setjmp into a wrapper method.

 lib/librte_eal/linuxapp/eal/eal.c        |  20 -----
 lib/librte_eal/linuxapp/eal/eal_memory.c | 138 ++++++++++++++++++++++++++++---
 2 files changed, 125 insertions(+), 33 deletions(-)

diff --git a/lib/librte_eal/linuxapp/eal/eal.c b/lib/librte_eal/linuxapp/eal/eal.c
index 8aafd51..4a8dfbd 100644
--- a/lib/librte_eal/linuxapp/eal/eal.c
+++ b/lib/librte_eal/linuxapp/eal/eal.c
@@ -465,24 +465,6 @@ eal_parse_vfio_intr(const char *mode)
 	return -1;
 }
 
-static inline size_t
-eal_get_hugepage_mem_size(void)
-{
-	uint64_t size = 0;
-	unsigned i, j;
-
-	for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
-		struct hugepage_info *hpi = &internal_config.hugepage_info[i];
-		if (hpi->hugedir != NULL) {
-			for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
-				size += hpi->hugepage_sz * hpi->num_pages[j];
-			}
-		}
-	}
-
-	return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX;
-}
-
 /* Parse the arguments for --log-level only */
 static void
 eal_log_level_parse(int argc, char **argv)
@@ -766,8 +748,6 @@ rte_eal_init(int argc, char **argv)
 	if (internal_config.memory == 0 && internal_config.force_sockets == 0) {
 		if (internal_config.no_hugetlbfs)
 			internal_config.memory = MEMSIZE_IF_NO_HUGE_PAGE;
-		else
-			internal_config.memory = eal_get_hugepage_mem_size();
 	}
 
 	if (internal_config.vmware_tsc_map == 1) {
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index 5b9132c..dc6f49b 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -80,6 +80,8 @@
 #include <errno.h>
 #include <sys/ioctl.h>
 #include <sys/time.h>
+#include <signal.h>
+#include <setjmp.h>
 
 #include <rte_log.h>
 #include <rte_memory.h>
@@ -309,6 +311,21 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
 	return addr;
 }
 
+static sigjmp_buf huge_jmpenv;
+
+static void huge_sigbus_handler(int signo __rte_unused) {
+	siglongjmp(huge_jmpenv, 1);
+}
+
+/* Put setjmp into a wrap method to avoid compiling error. Any non-volatile,
+ * non-static local variable in the stack frame calling sigsetjmp might be
+ * clobbered by a call to longjmp.
+ */
+static int huge_wrap_sigsetjmp(void)
+{
+	return sigsetjmp(huge_jmpenv, 1);
+}
 /*
  * Mmap all hugepages of hugepage table: it first open a file in
  * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
@@ -316,7 +333,7 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
  * in hugepg_tbl[i].final_va. The second mapping (when orig is 0) tries to
  * map continguous physical blocks in contiguous virtual blocks.
  */
-static int
+static unsigned
 map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		struct hugepage_info *hpi, int orig)
 {
@@ -394,9 +411,9 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		/* try to create hugepage file */
 		fd = open(hugepg_tbl[i].filepath, O_CREAT | O_RDWR, 0755);
 		if (fd < 0) {
-			RTE_LOG(ERR, EAL, "%s(): open failed: %s\n", __func__,
+			RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
 					strerror(errno));
-			return -1;
+			return i;
 		}
 
 		/* map the segment, and populate page tables,
@@ -404,10 +421,10 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		virtaddr = mmap(vma_addr, hugepage_sz, PROT_READ | PROT_WRITE,
 				MAP_SHARED | MAP_POPULATE, fd, 0);
 		if (virtaddr == MAP_FAILED) {
-			RTE_LOG(ERR, EAL, "%s(): mmap failed: %s\n", __func__,
+			RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__,
 					strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		if (orig) {
@@ -417,12 +434,33 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 			hugepg_tbl[i].final_va = virtaddr;
 		}
 
+		if (orig) {
+			/* In linux, hugetlb limitations, like cgroup, are
+			 * enforced at fault time instead of mmap(), even
+			 * with the option of MAP_POPULATE. Kernel will send
+			 * a SIGBUS signal. To avoid to be killed, save stack
+			 * environment here, if SIGBUS happens, we can jump
+			 * back here.
+			 */
+			if (huge_wrap_sigsetjmp()) {
+				RTE_LOG(DEBUG, EAL, "SIGBUS: Cannot mmap more "
+					"hugepages of size %u MB\n",
+					(unsigned)(hugepage_sz / 0x100000));
+				munmap(virtaddr, hugepage_sz);
+				close(fd);
+				unlink(hugepg_tbl[i].filepath);
+				return i;
+			}
+			*(int *)virtaddr = 0;
+		}
+
+
 		/* set shared flock on the file. */
 		if (flock(fd, LOCK_SH | LOCK_NB) == -1) {
-			RTE_LOG(ERR, EAL, "%s(): Locking file failed:%s \n",
+			RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n",
 				__func__, strerror(errno));
 			close(fd);
-			return -1;
+			return i;
 		}
 
 		close(fd);
@@ -430,7 +468,8 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		vma_addr = (char *)vma_addr + hugepage_sz;
 		vma_len -= hugepage_sz;
 	}
-	return 0;
+
+	return i;
 }
 
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
@@ -1036,6 +1075,51 @@ calc_num_pages_per_socket(uint64_t * memory,
 	return total_num_pages;
 }
 
+static inline size_t
+eal_get_hugepage_mem_size(void)
+{
+	uint64_t size = 0;
+	unsigned i, j;
+
+	for (i = 0; i < internal_config.num_hugepage_sizes; i++) {
+		struct hugepage_info *hpi = &internal_config.hugepage_info[i];
+		if (hpi->hugedir != NULL) {
+			for (j = 0; j < RTE_MAX_NUMA_NODES; j++) {
+				size += hpi->hugepage_sz * hpi->num_pages[j];
+			}
+		}
+	}
+
+	return (size < SIZE_MAX) ? (size_t)(size) : SIZE_MAX; }
+
+static struct sigaction huge_action_old; static int huge_need_recover;
+
+static void
+huge_register_sigbus(void)
+{
+	sigset_t mask;
+	struct sigaction action;
+
+	sigemptyset(&mask);
+	sigaddset(&mask, SIGBUS);
+	action.sa_flags = 0;
+	action.sa_mask = mask;
+	action.sa_handler = huge_sigbus_handler;
+
+	huge_need_recover = !sigaction(SIGBUS, &action, &huge_action_old); }
+
+static void
+huge_recover_sigbus(void)
+{
+	if (huge_need_recover) {
+		sigaction(SIGBUS, &huge_action_old, NULL);
+		huge_need_recover = 0;
+	}
+}
+
 /*
  * Prepare physical memory mapping: fill configuration structure with
  * these infos, return 0 on success.
@@ -1122,8 +1206,11 @@ rte_eal_hugepage_init(void)
 
 	hp_offset = 0; /* where we start the current page size entries */
 
+	huge_register_sigbus();
+
 	/* map all hugepages and sort them */
 	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
+		unsigned pages_old, pages_new;
 		struct hugepage_info *hpi;
 
 		/*
@@ -1137,10 +1224,28 @@ rte_eal_hugepage_init(void)
 			continue;
 
 		/* map all hugepages available */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 1) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to mmap %u MB hugepages\n",
-					(unsigned)(hpi->hugepage_sz / 0x100000));
+		pages_old = hpi->num_pages[0];
+		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
+		if (pages_new < pages_old) {
+#ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
+			RTE_LOG(ERR, EAL,
+				"%d not %d hugepages of size %u MB allocated\n",
+				pages_new, pages_old,
+				(unsigned)(hpi->hugepage_sz / 0x100000));
 			goto fail;
+#else
+			RTE_LOG(DEBUG, EAL,
+				"%d not %d hugepages of size %u MB allocated\n",
+				pages_new, pages_old,
+				(unsigned)(hpi->hugepage_sz / 0x100000));
+
+			int pages = pages_old - pages_new;
+
+			nr_hugepages -= pages;
+			hpi->num_pages[0] = pages_new;
+			if (pages_new == 0)
+				continue;
+#endif
 		}
 
 		/* find physical addresses and sockets for each hugepage */
@@ -1172,8 +1277,9 @@ rte_eal_hugepage_init(void)
 		hp_offset += new_pages_count[i];
 #else
 		/* remap all hugepages */
-		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) < 0){
-			RTE_LOG(DEBUG, EAL, "Failed to remap %u MB pages\n",
+		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) !=
+		    hpi->num_pages[0]) {
+			RTE_LOG(ERR, EAL, "Failed to remap %u MB pages\n",
 					(unsigned)(hpi->hugepage_sz / 0x100000));
 			goto fail;
 		}
@@ -1187,6 +1293,11 @@ rte_eal_hugepage_init(void)
 #endif
 	}
 
+	huge_recover_sigbus();
+
+	if (internal_config.memory == 0 && internal_config.force_sockets == 0)
+		internal_config.memory = eal_get_hugepage_mem_size();
+
 #ifdef RTE_EAL_SINGLE_FILE_SEGMENTS
 	nr_hugefiles = 0;
 	for (i = 0; i < (int) internal_config.num_hugepage_sizes; i++) {
@@ -1373,6 +1484,7 @@ rte_eal_hugepage_init(void)
 	return 0;
 
 fail:
+	huge_recover_sigbus();
 	free(tmp_hp);
 	return -1;
 }
--
2.1.4



More information about the dev mailing list