[dpdk-dev] [PATCH v9 1/2] mem: balanced allocation of hugepages
Ilya Maximets
i.maximets at samsung.com
Thu Jun 29 08:08:35 CEST 2017
On 29.06.2017 08:48, Ilya Maximets wrote:
> On 29.06.2017 08:32, Hemant Agrawal wrote:
>> On 6/27/2017 3:54 PM, Ilya Maximets wrote:
>>> Currently EAL allocates hugepages one by one, not paying attention
>>> to which NUMA node each page was allocated from.
>>>
>>> Such behaviour leads to allocation failure if the number of available
>>> hugepages for the application is limited by cgroups or hugetlbfs and
>>> memory is requested not only from the first socket.
>>>
>>> Example:
>>> # 90 x 1GB hugepages available in a system
>>>
>>> cgcreate -g hugetlb:/test
>>> # Limit to 32GB of hugepages
>>> cgset -r hugetlb.1GB.limit_in_bytes=34359738368 test
>>> # Request 4GB from each of 2 sockets
>>> cgexec -g hugetlb:test testpmd --socket-mem=4096,4096 ...
>>>
>>> EAL: SIGBUS: Cannot mmap more hugepages of size 1024 MB
>>> EAL: 32 not 90 hugepages of size 1024 MB allocated
>>> EAL: Not enough memory available on socket 1!
>>> Requested: 4096MB, available: 0MB
>>> PANIC in rte_eal_init():
>>> Cannot init memory
>>>
>>> This happens because all allocated pages are
>>> on socket 0.
>>>
>>> Fix this issue by setting mempolicy MPOL_PREFERRED for each hugepage
>>> to one of requested nodes using following schema:
>>>
>>> 1) Allocate essential hugepages:
>>> 1.1) Allocate only as many hugepages from NUMA node N as
>>> needed to fit the requested memory for this node.
>>> 1.2) repeat 1.1 for all numa nodes.
>>> 2) Try to map all remaining free hugepages in a round-robin
>>> fashion.
>>> 3) Sort pages and choose the most suitable.
>>>
>>> In this case all essential memory will be allocated and all remaining
>>> pages will be fairly distributed between all requested nodes.
>>>
>>> New config option RTE_EAL_NUMA_AWARE_HUGEPAGES is introduced and
>>> enabled by default for linuxapp except armv7 and dpaa2.
>>> Enabling of this option adds libnuma as a dependency for EAL.
>>>
>>> Fixes: 77988fc08dc5 ("mem: fix allocating all free hugepages")
>>>
>>> Signed-off-by: Ilya Maximets <i.maximets at samsung.com>
>>> ---
>>> config/common_base | 1 +
>>> config/common_linuxapp | 1 +
>>> config/defconfig_arm-armv7a-linuxapp-gcc | 3 +
>>> config/defconfig_arm64-dpaa2-linuxapp-gcc | 3 +
>>> lib/librte_eal/linuxapp/eal/Makefile | 3 +
>>> lib/librte_eal/linuxapp/eal/eal_memory.c | 120 ++++++++++++++++++++++++++++--
>>> mk/rte.app.mk | 3 +
>>> 7 files changed, 126 insertions(+), 8 deletions(-)
>>>
>>> diff --git a/config/common_base b/config/common_base
>>> index f6aafd1..660588a 100644
>>> --- a/config/common_base
>>> +++ b/config/common_base
>>> @@ -103,6 +103,7 @@ CONFIG_RTE_EAL_ALWAYS_PANIC_ON_ERROR=n
>>> CONFIG_RTE_EAL_IGB_UIO=n
>>> CONFIG_RTE_EAL_VFIO=n
>>> CONFIG_RTE_MALLOC_DEBUG=n
>>> +CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
>>>
>>> #
>>> # Recognize/ignore the AVX/AVX512 CPU flags for performance/power testing.
>>> diff --git a/config/common_linuxapp b/config/common_linuxapp
>>> index b3cf41b..64bef87 100644
>>> --- a/config/common_linuxapp
>>> +++ b/config/common_linuxapp
>>> @@ -35,6 +35,7 @@
>>> CONFIG_RTE_EXEC_ENV="linuxapp"
>>> CONFIG_RTE_EXEC_ENV_LINUXAPP=y
>>>
>>> +CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=y
>>> CONFIG_RTE_EAL_IGB_UIO=y
>>> CONFIG_RTE_EAL_VFIO=y
>>> CONFIG_RTE_KNI_KMOD=y
>>> diff --git a/config/defconfig_arm-armv7a-linuxapp-gcc b/config/defconfig_arm-armv7a-linuxapp-gcc
>>> index 19607eb..e06b1d4 100644
>>> --- a/config/defconfig_arm-armv7a-linuxapp-gcc
>>> +++ b/config/defconfig_arm-armv7a-linuxapp-gcc
>>> @@ -47,6 +47,9 @@ CONFIG_RTE_ARCH_STRICT_ALIGN=y
>>> CONFIG_RTE_TOOLCHAIN="gcc"
>>> CONFIG_RTE_TOOLCHAIN_GCC=y
>>>
>>> +# NUMA is not supported on ARM
>>> +CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
>>> +
>>> # ARM doesn't have support for vmware TSC map
>>> CONFIG_RTE_LIBRTE_EAL_VMWARE_TSC_MAP_SUPPORT=n
>>>
>>> diff --git a/config/defconfig_arm64-dpaa2-linuxapp-gcc b/config/defconfig_arm64-dpaa2-linuxapp-gcc
>>> index 2304ab6..f78449d 100644
>>> --- a/config/defconfig_arm64-dpaa2-linuxapp-gcc
>>> +++ b/config/defconfig_arm64-dpaa2-linuxapp-gcc
>>> @@ -45,6 +45,9 @@ CONFIG_RTE_CACHE_LINE_SIZE=64
>>>
>>> CONFIG_RTE_PKTMBUF_HEADROOM=256
>>>
>>> +# Doesn't support NUMA
>>> +CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=y
>>> +
>>
>> DPAA2 does not support NUMA so,
>> CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=n
>
> Oh, sorry. Just a typo.
> Thanks for catching this.
Fixed. Hemant, please, check the new version (v10).
> Sergio, I'll send v10 with only this change and will keep your
> acked-by because the change is trivial.
>
>>> #
>>> # Compile Support Libraries for DPAA2
>>> #
>>> diff --git a/lib/librte_eal/linuxapp/eal/Makefile b/lib/librte_eal/linuxapp/eal/Makefile
>>> index 640afd0..8651e27 100644
>>> --- a/lib/librte_eal/linuxapp/eal/Makefile
>>> +++ b/lib/librte_eal/linuxapp/eal/Makefile
>>> @@ -50,6 +50,9 @@ LDLIBS += -ldl
>>> LDLIBS += -lpthread
>>> LDLIBS += -lgcc_s
>>> LDLIBS += -lrt
>>> +ifeq ($(CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES),y)
>>> +LDLIBS += -lnuma
>>> +endif
>>>
>>> # specific to linuxapp exec-env
>>> SRCS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) := eal.c
>>> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
>>> index e17c9cb..647d89c 100644
>>> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
>>> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
>>> @@ -54,6 +54,10 @@
>>> #include <sys/time.h>
>>> #include <signal.h>
>>> #include <setjmp.h>
>>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>>> +#include <numa.h>
>>> +#include <numaif.h>
>>> +#endif
>>>
>>> #include <rte_log.h>
>>> #include <rte_memory.h>
>>> @@ -348,6 +352,14 @@ static int huge_wrap_sigsetjmp(void)
>>> return sigsetjmp(huge_jmpenv, 1);
>>> }
>>>
>>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>>> +/* Callback for numa library. */
>>> +void numa_error(char *where)
>>> +{
>>> + RTE_LOG(ERR, EAL, "%s failed: %s\n", where, strerror(errno));
>>> +}
>>> +#endif
>>> +
>>> /*
>>> * Mmap all hugepages of hugepage table: it first open a file in
>>> * hugetlbfs, then mmap() hugepage_sz data in it. If orig is set, the
>>> @@ -356,18 +368,78 @@ static int huge_wrap_sigsetjmp(void)
>>> * map continguous physical blocks in contiguous virtual blocks.
>>> */
>>> static unsigned
>>> -map_all_hugepages(struct hugepage_file *hugepg_tbl,
>>> - struct hugepage_info *hpi, int orig)
>>> +map_all_hugepages(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi,
>>> + uint64_t *essential_memory __rte_unused, int orig)
>>> {
>>> int fd;
>>> unsigned i;
>>> void *virtaddr;
>>> void *vma_addr = NULL;
>>> size_t vma_len = 0;
>>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>>> + int node_id = -1;
>>> + int essential_prev = 0;
>>> + int oldpolicy;
>>> + struct bitmask *oldmask = numa_allocate_nodemask();
>>> + bool have_numa = true;
>>> + unsigned long maxnode = 0;
>>> +
>>> + /* Check if kernel supports NUMA. */
>>> + if (numa_available() != 0) {
>>> + RTE_LOG(DEBUG, EAL, "NUMA is not supported.\n");
>>> + have_numa = false;
>>> + }
>>> +
>>> + if (orig && have_numa) {
>>> + RTE_LOG(DEBUG, EAL, "Trying to obtain current memory policy.\n");
>>> + if (get_mempolicy(&oldpolicy, oldmask->maskp,
>>> + oldmask->size + 1, 0, 0) < 0) {
>>> + RTE_LOG(ERR, EAL,
>>> + "Failed to get current mempolicy: %s. "
>>> + "Assuming MPOL_DEFAULT.\n", strerror(errno));
>>> + oldpolicy = MPOL_DEFAULT;
>>> + }
>>> + for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
>>> + if (internal_config.socket_mem[i])
>>> + maxnode = i + 1;
>>> + }
>>> +#endif
>>>
>>> for (i = 0; i < hpi->num_pages[0]; i++) {
>>> uint64_t hugepage_sz = hpi->hugepage_sz;
>>>
>>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>>> + if (maxnode) {
>>> + unsigned int j;
>>> +
>>> + for (j = 0; j < maxnode; j++)
>>> + if (essential_memory[j])
>>> + break;
>>> +
>>> + if (j == maxnode) {
>>> + node_id = (node_id + 1) % maxnode;
>>> + while (!internal_config.socket_mem[node_id]) {
>>> + node_id++;
>>> + node_id %= maxnode;
>>> + }
>>> + essential_prev = 0;
>>> + } else {
>>> + node_id = j;
>>> + essential_prev = essential_memory[j];
>>> +
>>> + if (essential_memory[j] < hugepage_sz)
>>> + essential_memory[j] = 0;
>>> + else
>>> + essential_memory[j] -= hugepage_sz;
>>> + }
>>> +
>>> + RTE_LOG(DEBUG, EAL,
>>> + "Setting policy MPOL_PREFERRED for socket %d\n",
>>> + node_id);
>>> + numa_set_preferred(node_id);
>>> + }
>>> +#endif
>>> +
>>> if (orig) {
>>> hugepg_tbl[i].file_id = i;
>>> hugepg_tbl[i].size = hugepage_sz;
>>> @@ -422,7 +494,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>>> if (fd < 0) {
>>> RTE_LOG(DEBUG, EAL, "%s(): open failed: %s\n", __func__,
>>> strerror(errno));
>>> - return i;
>>> + goto out;
>>> }
>>>
>>> /* map the segment, and populate page tables,
>>> @@ -433,7 +505,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>>> RTE_LOG(DEBUG, EAL, "%s(): mmap failed: %s\n", __func__,
>>> strerror(errno));
>>> close(fd);
>>> - return i;
>>> + goto out;
>>> }
>>>
>>> if (orig) {
>>> @@ -458,7 +530,12 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>>> munmap(virtaddr, hugepage_sz);
>>> close(fd);
>>> unlink(hugepg_tbl[i].filepath);
>>> - return i;
>>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>>> + if (maxnode)
>>> + essential_memory[node_id] =
>>> + essential_prev;
>>> +#endif
>>> + goto out;
>>> }
>>> *(int *)virtaddr = 0;
>>> }
>>> @@ -469,7 +546,7 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>>> RTE_LOG(DEBUG, EAL, "%s(): Locking file failed:%s \n",
>>> __func__, strerror(errno));
>>> close(fd);
>>> - return i;
>>> + goto out;
>>> }
>>>
>>> close(fd);
>>> @@ -478,6 +555,22 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>>> vma_len -= hugepage_sz;
>>> }
>>>
>>> +out:
>>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>>> + if (maxnode) {
>>> + RTE_LOG(DEBUG, EAL,
>>> + "Restoring previous memory policy: %d\n", oldpolicy);
>>> + if (oldpolicy == MPOL_DEFAULT) {
>>> + numa_set_localalloc();
>>> + } else if (set_mempolicy(oldpolicy, oldmask->maskp,
>>> + oldmask->size + 1) < 0) {
>>> + RTE_LOG(ERR, EAL, "Failed to restore mempolicy: %s\n",
>>> + strerror(errno));
>>> + numa_set_localalloc();
>>> + }
>>> + }
>>> + numa_free_cpumask(oldmask);
>>> +#endif
>>> return i;
>>> }
>>>
>>> @@ -562,6 +655,11 @@ find_numasocket(struct hugepage_file *hugepg_tbl, struct hugepage_info *hpi)
>>> if (hugepg_tbl[i].orig_va == va) {
>>> hugepg_tbl[i].socket_id = socket_id;
>>> hp_count++;
>>> +#ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
>>> + RTE_LOG(DEBUG, EAL,
>>> + "Hugepage %s is on socket %d\n",
>>> + hugepg_tbl[i].filepath, socket_id);
>>> +#endif
>>> }
>>> }
>>> }
>>> @@ -1000,6 +1098,11 @@ rte_eal_hugepage_init(void)
>>>
>>> huge_register_sigbus();
>>>
>>> + /* make a copy of socket_mem, needed for balanced allocation. */
>>> + for (i = 0; i < RTE_MAX_NUMA_NODES; i++)
>>> + memory[i] = internal_config.socket_mem[i];
>>> +
>>> +
>>> /* map all hugepages and sort them */
>>> for (i = 0; i < (int)internal_config.num_hugepage_sizes; i ++){
>>> unsigned pages_old, pages_new;
>>> @@ -1017,7 +1120,8 @@ rte_eal_hugepage_init(void)
>>>
>>> /* map all hugepages available */
>>> pages_old = hpi->num_pages[0];
>>> - pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi, 1);
>>> + pages_new = map_all_hugepages(&tmp_hp[hp_offset], hpi,
>>> + memory, 1);
>>> if (pages_new < pages_old) {
>>> RTE_LOG(DEBUG, EAL,
>>> "%d not %d hugepages of size %u MB allocated\n",
>>> @@ -1060,7 +1164,7 @@ rte_eal_hugepage_init(void)
>>> sizeof(struct hugepage_file), cmp_physaddr);
>>>
>>> /* remap all hugepages */
>>> - if (map_all_hugepages(&tmp_hp[hp_offset], hpi, 0) !=
>>> + if (map_all_hugepages(&tmp_hp[hp_offset], hpi, NULL, 0) !=
>>> hpi->num_pages[0]) {
>>> RTE_LOG(ERR, EAL, "Failed to remap %u MB pages\n",
>>> (unsigned)(hpi->hugepage_sz / 0x100000));
>>> diff --git a/mk/rte.app.mk b/mk/rte.app.mk
>>> index bcaf1b3..4fe22d1 100644
>>> --- a/mk/rte.app.mk
>>> +++ b/mk/rte.app.mk
>>> @@ -186,6 +186,9 @@ ifeq ($(CONFIG_RTE_BUILD_SHARED_LIB),n)
>>> # The static libraries do not know their dependencies.
>>> # So linking with static library requires explicit dependencies.
>>> _LDLIBS-$(CONFIG_RTE_LIBRTE_EAL) += -lrt
>>> +ifeq ($(CONFIG_RTE_EXEC_ENV_LINUXAPP)$(CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES),yy)
>>> +_LDLIBS-$(CONFIG_RTE_LIBRTE_EAL) += -lnuma
>>> +endif
>>> _LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED) += -lm
>>> _LDLIBS-$(CONFIG_RTE_LIBRTE_SCHED) += -lrt
>>> _LDLIBS-$(CONFIG_RTE_LIBRTE_METER) += -lm
>>>
>>
>>
>>
>>
>>
More information about the dev
mailing list