[dpdk-dev] [PATCH] mem: balanced allocation of hugepages

Tan, Jianfeng jianfeng.tan at intel.com
Thu Feb 16 14:26:26 CET 2017


Hi,

> -----Original Message-----
> From: Ilya Maximets [mailto:i.maximets at samsung.com]
> Sent: Thursday, February 16, 2017 9:01 PM
> To: dev at dpdk.org; David Marchand; Gonzalez Monroy, Sergio
> Cc: Heetae Ahn; Yuanhan Liu; Tan, Jianfeng; Neil Horman; Pei, Yulong; Ilya
> Maximets; stable at dpdk.org
> Subject: [PATCH] mem: balanced allocation of hugepages
> 
> Currently EAL allocates hugepages one by one, paying no
> attention to which NUMA node each allocation came from.
> 
> Such behaviour leads to allocation failure if the number of
> hugepages available to the application is limited by cgroups
> or hugetlbfs and memory is requested not only from the first
> socket.
> 
> Example:
> 	# 90 x 1GB hugepages available in a system
> 
> 	cgcreate -g hugetlb:/test
> 	# Limit to 32GB of hugepages
> 	cgset -r hugetlb.1GB.limit_in_bytes=34359738368 test
> 	# Request 4GB from each of 2 sockets
> 	cgexec -g hugetlb:test testpmd --socket-mem=4096,4096 ...
> 
> 	EAL: SIGBUS: Cannot mmap more hugepages of size 1024 MB
> 	EAL: 32 not 90 hugepages of size 1024 MB allocated
> 	EAL: Not enough memory available on socket 1!
> 	     Requested: 4096MB, available: 0MB
> 	PANIC in rte_eal_init():
> 	Cannot init memory
> 
> 	This happens because all allocated pages are
> 	on socket 0.

For such a use case, why not just use "numactl --interleave=0,1 <DPDK app> xxx"?

Do you see a use case like --socket-mem 2048,1024 where only three 1GB hugepages are allowed?

Thanks,
Jianfeng

> 
> Fix this issue by setting the mempolicy MPOL_PREFERRED for each
> hugepage to one of the requested nodes in a round-robin fashion.
> In this case all allocated pages will be fairly distributed
> between all the requested nodes.
> 
> A new config option, RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES, is
> introduced and disabled by default because of the external
> dependency on libnuma.
> 
> Cc: <stable at dpdk.org>
> Fixes: 77988fc08dc5 ("mem: fix allocating all free hugepages")
> 
> Signed-off-by: Ilya Maximets <i.maximets at samsung.com>
> ---
>  config/common_base                       |  1 +
>  lib/librte_eal/Makefile                  |  4 ++
>  lib/librte_eal/linuxapp/eal/eal_memory.c | 66
> ++++++++++++++++++++++++++++++++
>  mk/rte.app.mk                            |  3 ++
>  4 files changed, 74 insertions(+)
> 
> diff --git a/config/common_base b/config/common_base
> index 71a4fcb..fbcebbd 100644
> --- a/config/common_base
> +++ b/config/common_base
> @@ -97,6 +97,7 @@ CONFIG_RTE_EAL_ALWAYS_PANIC_ON_ERROR=n
>  CONFIG_RTE_EAL_IGB_UIO=n
>  CONFIG_RTE_EAL_VFIO=n
>  CONFIG_RTE_MALLOC_DEBUG=n
> +CONFIG_RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES=n
> 
>  # Default driver path (or "" to disable)
>  CONFIG_RTE_EAL_PMD_PATH=""
> diff --git a/lib/librte_eal/Makefile b/lib/librte_eal/Makefile
> index cf11a09..5ae3846 100644
> --- a/lib/librte_eal/Makefile
> +++ b/lib/librte_eal/Makefile
> @@ -35,4 +35,8 @@ DIRS-y += common
>  DIRS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += linuxapp
>  DIRS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += bsdapp
> 
> +ifeq ($(CONFIG_RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES),y)
> +LDLIBS += -lnuma
> +endif
> +
>  include $(RTE_SDK)/mk/rte.subdir.mk
> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c
> b/lib/librte_eal/linuxapp/eal/eal_memory.c
> index a956bb2..8536a36 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> @@ -82,6 +82,9 @@
>  #include <sys/time.h>
>  #include <signal.h>
>  #include <setjmp.h>
> +#ifdef RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES
> +#include <numaif.h>
> +#endif
> 
>  #include <rte_log.h>
>  #include <rte_memory.h>
> @@ -359,6 +362,21 @@ static int huge_wrap_sigsetjmp(void)
>  	return sigsetjmp(huge_jmpenv, 1);
>  }
> 
> +#ifdef RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES
> +#ifndef ULONG_SIZE
> +#define ULONG_SIZE sizeof(unsigned long)
> +#endif
> +#ifndef ULONG_BITS
> +#define ULONG_BITS (ULONG_SIZE * CHAR_BIT)
> +#endif
> +#ifndef DIV_ROUND_UP
> +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
> +#endif
> +#ifndef BITS_TO_LONGS
> +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, ULONG_SIZE)
> +#endif
> +#endif
> +
>  /*
>   * Mmap all hugepages of hugepage table: it first open a file in
>   * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
> @@ -375,10 +393,48 @@ map_all_hugepages(struct hugepage_file
> *hugepg_tbl,
>  	void *virtaddr;
>  	void *vma_addr = NULL;
>  	size_t vma_len = 0;
> +#ifdef RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES
> +	unsigned long
> nodemask[BITS_TO_LONGS(RTE_MAX_NUMA_NODES)] = {0UL};
> +	unsigned long maxnode = 0;
> +	int node_id = -1;
> +
> +	for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
> +		if (internal_config.socket_mem[i])
> +			maxnode = i + 1;
> +#endif
> 
>  	for (i = 0; i < hpi->num_pages[0]; i++) {
>  		uint64_t hugepage_sz = hpi->hugepage_sz;
> 
> +#ifdef RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES
> +		if (maxnode) {
> +			node_id = (node_id + 1) % RTE_MAX_NUMA_NODES;
> +			while (!internal_config.socket_mem[node_id])
> +				node_id = (node_id + 1) %
> RTE_MAX_NUMA_NODES;
> +
> +			nodemask[node_id / ULONG_BITS] =
> +						1UL << (node_id %
> ULONG_BITS);
> +
> +			RTE_LOG(DEBUG, EAL,
> +				"Setting policy MPOL_PREFERRED for
> socket %d\n",
> +				node_id);
> +			/*
> +			 * Due to old linux kernel bug (feature?) we have to
> +			 * increase maxnode by 1. It will be unconditionally
> +			 * decreased back to normal value inside the syscall
> +			 * handler.
> +			 */
> +			if (set_mempolicy(MPOL_PREFERRED,
> +					  nodemask, maxnode + 1) < 0) {
> +				RTE_LOG(ERR, EAL,
> +					"Failed to set policy
> MPOL_PREFERRED: "
> +					"%s\n", strerror(errno));
> +				return i;
> +			}
> +
> +			nodemask[node_id / ULONG_BITS] = 0UL;
> +		}
> +#endif
>  		if (orig) {
>  			hugepg_tbl[i].file_id = i;
>  			hugepg_tbl[i].size = hugepage_sz;
> @@ -489,6 +545,10 @@ map_all_hugepages(struct hugepage_file
> *hugepg_tbl,
>  		vma_len -= hugepage_sz;
>  	}
> 
> +#ifdef RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES
> +	if (maxnode && set_mempolicy(MPOL_DEFAULT, NULL, 0) < 0)
> +		RTE_LOG(ERR, EAL, "Failed to set mempolicy
> MPOL_DEFAULT\n");
> +#endif
>  	return i;
>  }
> 
> @@ -573,6 +634,11 @@ find_numasocket(struct hugepage_file *hugepg_tbl,
> struct hugepage_info *hpi)
>  			if (hugepg_tbl[i].orig_va == va) {
>  				hugepg_tbl[i].socket_id = socket_id;
>  				hp_count++;
> +#ifdef RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES
> +				RTE_LOG(DEBUG, EAL,
> +					"Hugepage %s is on socket %d\n",
> +					hugepg_tbl[i].filepath, socket_id);
> +#endif
>  			}
>  		}
>  	}
> diff --git a/mk/rte.app.mk b/mk/rte.app.mk
> index 92f3635..c2153b9 100644
> --- a/mk/rte.app.mk
> +++ b/mk/rte.app.mk
> @@ -159,6 +159,9 @@ ifeq ($(CONFIG_RTE_BUILD_SHARED_LIB),n)
>  # The static libraries do not know their dependencies.
>  # So linking with static library requires explicit dependencies.
>  _LDLIBS-$(CONFIG_RTE_LIBRTE_EAL)            += -lrt
> +ifeq ($(CONFIG_RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES),y)
> +_LDLIBS-$(CONFIG_RTE_LIBRTE_EAL)            += -lnuma
> +endif
>  _LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED)          += -lm
>  _LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED)          += -lrt
>  _LDLIBS-$(CONFIG_RTE_LIBRTE_METER)          += -lm
> --
> 2.7.4



More information about the dev mailing list