[dpdk-dev] [PATCH] mem: balanced allocation of hugepages
Tan, Jianfeng
jianfeng.tan at intel.com
Thu Feb 16 14:26:26 CET 2017
Hi,
> -----Original Message-----
> From: Ilya Maximets [mailto:i.maximets at samsung.com]
> Sent: Thursday, February 16, 2017 9:01 PM
> To: dev at dpdk.org; David Marchand; Gonzalez Monroy, Sergio
> Cc: Heetae Ahn; Yuanhan Liu; Tan, Jianfeng; Neil Horman; Pei, Yulong; Ilya
> Maximets; stable at dpdk.org
> Subject: [PATCH] mem: balanced allocation of hugepages
>
> Currently EAL allocates hugepages one by one, not paying
> attention to which NUMA node each allocation was made from.
>
> Such behaviour leads to allocation failure if the number of
> available hugepages for the application is limited by cgroups
> or hugetlbfs and memory is requested not only from the first
> socket.
>
> Example:
> # 90 x 1GB hugepages available in a system
>
> cgcreate -g hugetlb:/test
> # Limit to 32GB of hugepages
> cgset -r hugetlb.1GB.limit_in_bytes=34359738368 test
> # Request 4GB from each of 2 sockets
> cgexec -g hugetlb:test testpmd --socket-mem=4096,4096 ...
>
> EAL: SIGBUS: Cannot mmap more hugepages of size 1024 MB
> EAL: 32 not 90 hugepages of size 1024 MB allocated
> EAL: Not enough memory available on socket 1!
> Requested: 4096MB, available: 0MB
> PANIC in rte_eal_init():
> Cannot init memory
>
> This happens because all allocated pages are
> on socket 0.
For such a use case, why not just use "numactl --interleave=0,1 <DPDK app> xxx"?
Do you see a use case like --socket-mem 2048,1024 where only three 1GB hugepages are allowed?
Thanks,
Jianfeng
>
> Fix this issue by setting mempolicy MPOL_PREFERRED for each
> hugepage to one of the requested nodes in a round-robin fashion.
> In this case all allocated pages will be fairly distributed
> between all requested nodes.
>
> A new config option, RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES, is
> introduced and disabled by default because of the external
> dependency on libnuma.
>
> Cc: <stable at dpdk.org>
> Fixes: 77988fc08dc5 ("mem: fix allocating all free hugepages")
>
> Signed-off-by: Ilya Maximets <i.maximets at samsung.com>
> ---
> config/common_base | 1 +
> lib/librte_eal/Makefile | 4 ++
> lib/librte_eal/linuxapp/eal/eal_memory.c | 66
> ++++++++++++++++++++++++++++++++
> mk/rte.app.mk | 3 ++
> 4 files changed, 74 insertions(+)
>
> diff --git a/config/common_base b/config/common_base
> index 71a4fcb..fbcebbd 100644
> --- a/config/common_base
> +++ b/config/common_base
> @@ -97,6 +97,7 @@ CONFIG_RTE_EAL_ALWAYS_PANIC_ON_ERROR=n
> CONFIG_RTE_EAL_IGB_UIO=n
> CONFIG_RTE_EAL_VFIO=n
> CONFIG_RTE_MALLOC_DEBUG=n
> +CONFIG_RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES=n
>
> # Default driver path (or "" to disable)
> CONFIG_RTE_EAL_PMD_PATH=""
> diff --git a/lib/librte_eal/Makefile b/lib/librte_eal/Makefile
> index cf11a09..5ae3846 100644
> --- a/lib/librte_eal/Makefile
> +++ b/lib/librte_eal/Makefile
> @@ -35,4 +35,8 @@ DIRS-y += common
> DIRS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += linuxapp
> DIRS-$(CONFIG_RTE_EXEC_ENV_BSDAPP) += bsdapp
>
> +ifeq ($(CONFIG_RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES),y)
> +LDLIBS += -lnuma
> +endif
> +
> include $(RTE_SDK)/mk/rte.subdir.mk
> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c
> b/lib/librte_eal/linuxapp/eal/eal_memory.c
> index a956bb2..8536a36 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> @@ -82,6 +82,9 @@
> #include <sys/time.h>
> #include <signal.h>
> #include <setjmp.h>
> +#ifdef RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES
> +#include <numaif.h>
> +#endif
>
> #include <rte_log.h>
> #include <rte_memory.h>
> @@ -359,6 +362,21 @@ static int huge_wrap_sigsetjmp(void)
> return sigsetjmp(huge_jmpenv, 1);
> }
>
> +#ifdef RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES
> +#ifndef ULONG_SIZE
> +#define ULONG_SIZE sizeof(unsigned long)
> +#endif
> +#ifndef ULONG_BITS
> +#define ULONG_BITS (ULONG_SIZE * CHAR_BIT)
> +#endif
> +#ifndef DIV_ROUND_UP
> +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
> +#endif
> +#ifndef BITS_TO_LONGS
> +#define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, ULONG_SIZE)
> +#endif
> +#endif
> +
> /*
> * Mmap all hugepages of hugepage table: it first open a file in
> * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
> @@ -375,10 +393,48 @@ map_all_hugepages(struct hugepage_file
> *hugepg_tbl,
> void *virtaddr;
> void *vma_addr = NULL;
> size_t vma_len = 0;
> +#ifdef RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES
> + unsigned long
> nodemask[BITS_TO_LONGS(RTE_MAX_NUMA_NODES)] = {0UL};
> + unsigned long maxnode = 0;
> + int node_id = -1;
> +
> + for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
> + if (internal_config.socket_mem[i])
> + maxnode = i + 1;
> +#endif
>
> for (i = 0; i < hpi->num_pages[0]; i++) {
> uint64_t hugepage_sz = hpi->hugepage_sz;
>
> +#ifdef RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES
> + if (maxnode) {
> + node_id = (node_id + 1) % RTE_MAX_NUMA_NODES;
> + while (!internal_config.socket_mem[node_id])
> + node_id = (node_id + 1) %
> RTE_MAX_NUMA_NODES;
> +
> + nodemask[node_id / ULONG_BITS] =
> + 1UL << (node_id %
> ULONG_BITS);
> +
> + RTE_LOG(DEBUG, EAL,
> + "Setting policy MPOL_PREFERRED for
> socket %d\n",
> + node_id);
> + /*
> + * Due to old linux kernel bug (feature?) we have to
> + * increase maxnode by 1. It will be unconditionally
> + * decreased back to normal value inside the syscall
> + * handler.
> + */
> + if (set_mempolicy(MPOL_PREFERRED,
> + nodemask, maxnode + 1) < 0) {
> + RTE_LOG(ERR, EAL,
> + "Failed to set policy
> MPOL_PREFERRED: "
> + "%s\n", strerror(errno));
> + return i;
> + }
> +
> + nodemask[node_id / ULONG_BITS] = 0UL;
> + }
> +#endif
> if (orig) {
> hugepg_tbl[i].file_id = i;
> hugepg_tbl[i].size = hugepage_sz;
> @@ -489,6 +545,10 @@ map_all_hugepages(struct hugepage_file
> *hugepg_tbl,
> vma_len -= hugepage_sz;
> }
>
> +#ifdef RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES
> + if (maxnode && set_mempolicy(MPOL_DEFAULT, NULL, 0) < 0)
> + RTE_LOG(ERR, EAL, "Failed to set mempolicy
> MPOL_DEFAULT\n");
> +#endif
> return i;
> }
>
> @@ -573,6 +634,11 @@ find_numasocket(struct hugepage_file *hugepg_tbl,
> struct hugepage_info *hpi)
> if (hugepg_tbl[i].orig_va == va) {
> hugepg_tbl[i].socket_id = socket_id;
> hp_count++;
> +#ifdef RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES
> + RTE_LOG(DEBUG, EAL,
> + "Hugepage %s is on socket %d\n",
> + hugepg_tbl[i].filepath, socket_id);
> +#endif
> }
> }
> }
> diff --git a/mk/rte.app.mk b/mk/rte.app.mk
> index 92f3635..c2153b9 100644
> --- a/mk/rte.app.mk
> +++ b/mk/rte.app.mk
> @@ -159,6 +159,9 @@ ifeq ($(CONFIG_RTE_BUILD_SHARED_LIB),n)
> # The static libraries do not know their dependencies.
> # So linking with static library requires explicit dependencies.
> _LDLIBS-$(CONFIG_RTE_LIBRTE_EAL) += -lrt
> +ifeq ($(CONFIG_RTE_LIBRTE_EAL_NUMA_AWARE_HUGEPAGES),y)
> +_LDLIBS-$(CONFIG_RTE_LIBRTE_EAL) += -lnuma
> +endif
> _LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED) += -lm
> _LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED) += -lrt
> _LDLIBS-$(CONFIG_RTE_LIBRTE_METER) += -lm
> --
> 2.7.4
More information about the dev
mailing list