[dpdk-dev] [PATCH v9 1/2] mem: balanced allocation of hugepages

Hemant Agrawal hemant.agrawal at nxp.com
Thu Jun 29 07:32:22 CEST 2017


On 6/27/2017 3:54 PM, Ilya Maximets wrote:
> Currently EAL allocates hugepages one by one, without paying attention
> to which NUMA node each allocation was made from.
>
> Such behaviour leads to allocation failure if the number of hugepages
> available to the application is limited by cgroups or hugetlbfs and
> memory is requested not only from the first socket.
>
> Example:
> 	# 90 x 1GB hugepages available in a system
>
> 	cgcreate -g hugetlb:/test
> 	# Limit to 32GB of hugepages
> 	cgset -r hugetlb.1GB.limit_in_bytes=34359738368 test
> 	# Request 4GB from each of 2 sockets
> 	cgexec -g hugetlb:test testpmd --socket-mem=4096,4096 ...
>
> 	EAL: SIGBUS: Cannot mmap more hugepages of size 1024 MB
> 	EAL: 32 not 90 hugepages of size 1024 MB allocated
> 	EAL: Not enough memory available on socket 1!
> 	     Requested: 4096MB, available: 0MB
> 	PANIC in rte_eal_init():
> 	Cannot init memory
>
> 	This happens because all allocated pages are
> 	on socket 0.
>
> Fix this issue by setting mempolicy MPOL_PREFERRED for each hugepage
> to one of the requested nodes using the following scheme:
>
> 	1) Allocate essential hugepages:
> 		1.1) Allocate as many hugepages from numa N to
> 		     only fit requested memory for this numa.
> 		1.2) repeat 1.1 for all numa nodes.
> 	2) Try to map all remaining free hugepages in a round-robin
> 	   fashion.
> 	3) Sort pages and choose the most suitable.
>
> In this case all essential memory will be allocated and all remaining
> pages will be fairly distributed between all requested nodes.
>
> New config option RTE_EAL_NUMA_AWARE_HUGEPAGES is introduced and
> enabled by default for linuxapp except armv7 and dpaa2.
> Enabling of this option adds libnuma as a dependency for EAL.
>
> Fixes: 77988fc08dc5 ("mem: fix allocating all free hugepages")
>
> Signed-off-by: Ilya Maximets <i.maximets at samsung.com>
> ---
>  config/common_base                        |   1 +
>  config/common_linuxapp                    |   1 +
>  config/defconfig_arm-armv7a-linuxapp-gcc  |   3 +
>  config/defconfig_arm64-dpaa2-linuxapp-gcc |   3 +
>  lib/librte_eal/linuxapp/eal/Makefile      |   3 +
>  lib/librte_eal/linuxapp/eal/eal_memory.c  | 120 ++++++++++++++++++++++++++++--
>  mk/rte.app.mk                             |   3 +
>  7 files changed, 126 insertions(+), 8 deletions(-)
>
> diff --git a/config/common_base b/config/common_base
> index f6aafd1..660588a 100644
> --- a/config/common_base
> +++ b/config/common_base
> @@ -103,6 +103,7 @@ CONFIG_RTE_EAL_ALWAYS_PANIC_ON_ERROR=n
>  CONFIG_RTE_EAL_IGB_UIO=n
>  CONFIG_RTE_EAL_VFIO=n
>  CONFIG_RTE_MALLOC_DEBUG=n
> +CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
>
>  #
>  # Recognize/ignore the AVX/AVX512 CPU flags for performance/power testing.
> diff --git a/config/common_linuxapp b/config/common_linuxapp
> index b3cf41b..64bef87 100644
> --- a/config/common_linuxapp
> +++ b/config/common_linuxapp
> @@ -35,6 +35,7 @@
>  CONFIG_RTE_EXEC_ENV="linuxapp"
>  CONFIG_RTE_EXEC_ENV_LINUXAPP=y
>
> +CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=y
>  CONFIG_RTE_EAL_IGB_UIO=y
>  CONFIG_RTE_EAL_VFIO=y
>  CONFIG_RTE_KNI_KMOD=y
> diff --git a/config/defconfig_arm-armv7a-linuxapp-gcc b/config/defconfig_arm-armv7a-linuxapp-gcc
> index 19607eb..e06b1d4 100644
> --- a/config/defconfig_arm-armv7a-linuxapp-gcc
> +++ b/config/defconfig_arm-armv7a-linuxapp-gcc
> @@ -47,6 +47,9 @@ CONFIG_RTE_ARCH_STRICT_ALIGN=y
>  CONFIG_RTE_TOOLCHAIN="gcc"
>  CONFIG_RTE_TOOLCHAIN_GCC=y
>
> +# NUMA is not supported on ARM
> +CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
> +
>  # ARM doesn't have support for vmware TSC map
>  CONFIG_RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT=n
>
> diff --git a/config/defconfig_arm64-dpaa2-linuxapp-gcc b/config/defconfig_arm64-dpaa2-linuxapp-gcc
> index 2304ab6..f78449d 100644
> --- a/config/defconfig_arm64-dpaa2-linuxapp-gcc
> +++ b/config/defconfig_arm64-dpaa2-linuxapp-gcc
> @@ -45,6 +45,9 @@ CONFIG_RTE_CACHE_LINE_SIZE=64
>
>  CONFIG_RTE_PKTMBUF_HEADROOM=256
>
> +# Doesn't support NUMA
> +CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=y
> +

DPAA2 does not support NUMA, so this should be:
CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n

>  #
>  # Compile Support Libraries for DPAA2
>  #
> diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
> index 640afd0..8651e27 100644
> --- a/lib/librte_eal/linuxapp/eal/Makefile
> +++ b/lib/librte_eal/linuxapp/eal/Makefile
> @@ -50,6 +50,9 @@ LDLIBS += -ldl
>  LDLIBS += -lpthread
>  LDLIBS += -lgcc_s
>  LDLIBS += -lrt
> +ifeq ($(CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES),y)
> +LDLIBS += -lnuma
> +endif
>
>  # specific to linuxapp exec-env
>  SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) := eal.c
> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
> index e17c9cb..647d89c 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> @@ -54,6 +54,10 @@
>  #include <sys/time.h>
>  #include <signal.h>
>  #include <setjmp.h>
> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
> +#include <numa.h>
> +#include <numaif.h>
> +#endif
>
>  #include <rte_log.h>
>  #include <rte_memory.h>
> @@ -348,6 +352,14 @@ static int huge_wrap_sigsetjmp(void)
>  	return sigsetjmp(huge_jmpenv, 1);
>  }
>
> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
> +/* Callback for numa library. */
> +void numa_error(char *where)
> +{
> +	RTE_LOG(ERR, EAL, "%s failed: %s\n", where, strerror(errno));
> +}
> +#endif
> +
>  /*
>   * Mmap all hugepages of hugepage table: it first open a file in
>   * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
> @@ -356,18 +368,78 @@ static int huge_wrap_sigsetjmp(void)
>   * map continguous physical blocks in contiguous virtual blocks.
>   */
>  static unsigned
> -map_all_hugepages(struct hugepage_file *hugepg_tbl,
> -		struct hugepage_info *hpi, int orig)
> +map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
> +		  uint64_t *essential_memory __rte_unused, int orig)
>  {
>  	int fd;
>  	unsigned i;
>  	void *virtaddr;
>  	void *vma_addr = NULL;
>  	size_t vma_len = 0;
> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
> +	int node_id = -1;
> +	int essential_prev = 0;
> +	int oldpolicy;
> +	struct bitmask *oldmask = numa_allocate_nodemask();
> +	bool have_numa = true;
> +	unsigned long maxnode = 0;
> +
> +	/* Check if kernel supports NUMA. */
> +	if (numa_available() != 0) {
> +		RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
> +		have_numa = false;
> +	}
> +
> +	if (orig && have_numa) {
> +		RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
> +		if (get_mempolicy(&oldpolicy, oldmask->maskp,
> +				  oldmask->size + 1, 0, 0) < 0) {
> +			RTE_LOG(ERR, EAL,
> +				"Failed to get current mempolicy: %s. "
> +				"Assuming MPOL_DEFAULT.\n", strerror(errno));
> +			oldpolicy = MPOL_DEFAULT;
> +		}
> +		for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
> +			if (internal_config.socket_mem[i])
> +				maxnode = i + 1;
> +	}
> +#endif
>
>  	for (i = 0; i < hpi->num_pages[0]; i++) {
>  		uint64_t hugepage_sz = hpi->hugepage_sz;
>
> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
> +		if (maxnode) {
> +			unsigned int j;
> +
> +			for (j = 0; j < maxnode; j++)
> +				if (essential_memory[j])
> +					break;
> +
> +			if (j == maxnode) {
> +				node_id = (node_id + 1) % maxnode;
> +				while (!internal_config.socket_mem[node_id]) {
> +					node_id++;
> +					node_id %= maxnode;
> +				}
> +				essential_prev = 0;
> +			} else {
> +				node_id = j;
> +				essential_prev = essential_memory[j];
> +
> +				if (essential_memory[j] < hugepage_sz)
> +					essential_memory[j] = 0;
> +				else
> +					essential_memory[j] -= hugepage_sz;
> +			}
> +
> +			RTE_LOG(DEBUG, EAL,
> +				"Setting policy MPOL_PREFERRED for socket %d\n",
> +				node_id);
> +			numa_set_preferred(node_id);
> +		}
> +#endif
> +
>  		if (orig) {
>  			hugepg_tbl[i].file_id = i;
>  			hugepg_tbl[i].size = hugepage_sz;
> @@ -422,7 +494,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>  		if (fd < 0) {
>  			RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
>  					strerror(errno));
> -			return i;
> +			goto out;
>  		}
>
>  		/* map the segment, and populate page tables,
> @@ -433,7 +505,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>  			RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__,
>  					strerror(errno));
>  			close(fd);
> -			return i;
> +			goto out;
>  		}
>
>  		if (orig) {
> @@ -458,7 +530,12 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>  				munmap(virtaddr, hugepage_sz);
>  				close(fd);
>  				unlink(hugepg_tbl[i].filepath);
> -				return i;
> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
> +				if (maxnode)
> +					essential_memory[node_id] =
> +						essential_prev;
> +#endif
> +				goto out;
>  			}
>  			*(int *)virtaddr = 0;
>  		}
> @@ -469,7 +546,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>  			RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n",
>  				__func__, strerror(errno));
>  			close(fd);
> -			return i;
> +			goto out;
>  		}
>
>  		close(fd);
> @@ -478,6 +555,22 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>  		vma_len -= hugepage_sz;
>  	}
>
> +out:
> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
> +	if (maxnode) {
> +		RTE_LOG(DEBUG, EAL,
> +			"Restoring previous memory policy: %d\n", oldpolicy);
> +		if (oldpolicy == MPOL_DEFAULT) {
> +			numa_set_localalloc();
> +		} else if (set_mempolicy(oldpolicy, oldmask->maskp,
> +					 oldmask->size + 1) < 0) {
> +			RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
> +				strerror(errno));
> +			numa_set_localalloc();
> +		}
> +	}
> +	numa_free_cpumask(oldmask);
> +#endif
>  	return i;
>  }
>
> @@ -562,6 +655,11 @@ find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
>  			if (hugepg_tbl[i].orig_va == va) {
>  				hugepg_tbl[i].socket_id = socket_id;
>  				hp_count++;
> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
> +				RTE_LOG(DEBUG, EAL,
> +					"Hugepage %s is on socket %d\n",
> +					hugepg_tbl[i].filepath, socket_id);
> +#endif
>  			}
>  		}
>  	}
> @@ -1000,6 +1098,11 @@ rte_eal_hugepage_init(void)
>
>  	huge_register_sigbus();
>
> +	/* make a copy of socket_mem, needed for balanced allocation. */
> +	for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
> +		memory[i] = internal_config.socket_mem[i];
> +
> +
>  	/* map all hugepages and sort them */
>  	for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
>  		unsigned pages_old, pages_new;
> @@ -1017,7 +1120,8 @@ rte_eal_hugepage_init(void)
>
>  		/* map all hugepages available */
>  		pages_old = hpi->num_pages[0];
> -		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
> +		pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi,
> +					      memory, 1);
>  		if (pages_new < pages_old) {
>  			RTE_LOG(DEBUG, EAL,
>  				"%d not %d hugepages of size %u MB allocated\n",
> @@ -1060,7 +1164,7 @@ rte_eal_hugepage_init(void)
>  		      sizeof(struct hugepage_file), cmp_physaddr);
>
>  		/* remap all hugepages */
> -		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) !=
> +		if (map_all_hugepages(&tmp_hp[hp_offset], hpi, NULL, 0) !=
>  		    hpi->num_pages[0]) {
>  			RTE_LOG(ERR, EAL, "Failed to remap %u MB pages\n",
>  					(unsigned)(hpi->hugepage_sz / 0x100000));
> diff --git a/mk/rte.app.mk b/mk/rte.app.mk
> index bcaf1b3..4fe22d1 100644
> --- a/mk/rte.app.mk
> +++ b/mk/rte.app.mk
> @@ -186,6 +186,9 @@ ifeq ($(CONFIG_RTE_BUILD_SHARED_LIB),n)
>  # The static libraries do not know their dependencies.
>  # So linking with static library requires explicit dependencies.
>  _LDLIBS-$(CONFIG_RTE_LIBRTE_EAL)            += -lrt
> +ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP)$(CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES),yy)
> +_LDLIBS-$(CONFIG_RTE_LIBRTE_EAL)            += -lnuma
> +endif
>  _LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED)          += -lm
>  _LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED)          += -lrt
>  _LDLIBS-$(CONFIG_RTE_LIBRTE_METER)          += -lm
>




More information about the dev mailing list