[dpdk-stable] [PATCH] mem: Add Power9 support in rte_eal_hugepage_init
dwilder
dwilder at us.ibm.com
Sat Nov 10 03:59:45 CET 2018
On 2018-11-09 11:28, David Wilder wrote:
> Determine if the ppc64 platform is Power9 or Power8 and perform huge
> page mapping appropriately for the selected platform.
>
> Signed-off-by: Pradeep Satyanarayana <pradeep at us.ibm.com>
> Tested-by: David Wilder <wilder at us.ibm.com>
> ---
> On IBM Power8, when mmapping hugepage files, the address hint supplied to
> mmap is not always honored; therefore we let the kernel pick the address by
> specifying a NULL address hint. On Power9 the address hint is honored as
> expected. This patch detects the platform; on Power9 the address hint is
> supplied to mmap and the pages are sorted appropriately. Hugepage mapping
> for both primary and secondary processes now works correctly on Power9. I
> have retained the original behavior and limitations on Power8. Additionally,
> the flags supplied to mmap() have been corrected, eliminating the "Cannot
> get a virtual area" messages previously seen during EAL init on Power.
>
> lib/librte_eal/linuxapp/eal/eal_memory.c | 75 +++++++++++++++++-------
> 1 file changed, 54 insertions(+), 21 deletions(-)
>
> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c
> b/lib/librte_eal/linuxapp/eal/eal_memory.c
> index bac969a12..5b7001be8 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> @@ -50,6 +50,9 @@
> #include <limits.h>
> #include <sys/ioctl.h>
> #include <sys/time.h>
> +#ifdef RTE_ARCH_PPC_64
> +#include <sys/auxv.h>
> +#endif
> #include <signal.h>
> #include <setjmp.h>
> #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
> @@ -107,6 +110,10 @@ static uint64_t baseaddr = 0x100000000;
>
> static bool phys_addrs_available = true;
>
> +#ifdef RTE_ARCH_PPC_64
> +static int p8;
> +#endif
> +
> #define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"
>
> static void
> @@ -309,12 +316,7 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
> addr_hint = get_addr_hint();
>
> addr = mmap(addr_hint,
> - (*size) + hugepage_sz, PROT_READ,
> -#ifdef RTE_ARCH_PPC_64
> - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
> -#else
> - MAP_PRIVATE,
> -#endif
> + (*size) + hugepage_sz, PROT_READ, MAP_PRIVATE,
> fd, 0);
> if (addr == MAP_FAILED) {
> /* map failed. Let's try with less memory */
> @@ -501,6 +503,15 @@ map_all_hugepages(struct hugepage_file
> *hugepg_tbl, struct hugepage_info *hpi,
> * vma_len. If it fails, vma_addr is NULL, so
> * let the kernel provide the address. */
> vma_addr = get_virtual_area(&vma_len, hpi->hugepage_sz);
> +#ifdef RTE_ARCH_PPC_64
> + /*
> + * On power8 the address hint is not consistently
> + * honored, therefore we always let the
> + * kernel provide the address.
> + */
> + if (p8)
> + vma_addr = NULL;
> +#endif
> if (vma_addr == NULL)
> vma_len = hugepage_sz;
> }
> @@ -1059,6 +1070,23 @@ rte_eal_hugepage_init(void)
> int nr_hugefiles, nr_hugepages = 0;
> void *addr;
>
> +#ifdef RTE_ARCH_PPC_64
> + char *platform;
> + platform = (char *)getauxval(AT_BASE_PLATFORM);
> +
> + p8 = 0;
> +
> + /* Alert the user in case our assumptions are incorrect */
> + if (platform == NULL)
> + printf("Some distros on P9 do not support "
> + "getauxval(AT_BASE_PLATFORM). Assuming P9\n");
> +
> + if (platform && !strncmp(platform, "power8", 6)) {
> + RTE_LOG(DEBUG, EAL, "This must be a P8\n");
> + p8 = 1;
> + } else
> + RTE_LOG(DEBUG, EAL, "This must be a P9\n");
> +#endif
> test_phys_addrs_available();
>
> memset(used_hp, 0, sizeof(used_hp));
> @@ -1305,14 +1333,22 @@ rte_eal_hugepage_init(void)
> new_memseg = 1;
>
> #ifdef RTE_ARCH_PPC_64
> - /* On PPC64 architecture, the mmap always start from higher
> - * virtual address to lower address. Here, both the physical
> - * address and virtual address are in descending order */
> + /*
> + * On power8 we let the kernel select the virtual address
> + * for mmaped segments; successive mmaps will start from
> + * higher virtual address to lower address. Physical addresses
> + * are in descending order for both platforms.
> + */
> else if ((hugepage[i-1].physaddr - hugepage[i].physaddr) !=
> hugepage[i].size)
> new_memseg = 1;
> - else if (((unsigned long)hugepage[i-1].final_va -
> - (unsigned long)hugepage[i].final_va) != hugepage[i].size)
> + else if ((((unsigned long)hugepage[i-1].final_va -
> + (unsigned long)hugepage[i].final_va) !=
> + hugepage[i].size) && (p8))
> + new_memseg = 1;
> + else if ((((unsigned long)hugepage[i].final_va -
> + (unsigned long)hugepage[i-1].final_va) !=
> + hugepage[i].size) && (!p8))
> new_memseg = 1;
> #else
> else if ((hugepage[i].physaddr - hugepage[i-1].physaddr) !=
> @@ -1338,9 +1374,12 @@ rte_eal_hugepage_init(void)
> else {
> #ifdef RTE_ARCH_PPC_64
> /* Use the phy and virt address of the last page as segment
> - * address for IBM Power architecture */
> - mcfg->memseg[j].iova = hugepage[i].physaddr;
> - mcfg->memseg[j].addr = hugepage[i].final_va;
> + * address for IBM Power8 architecture.
> + */
> + if (p8) {
> + mcfg->memseg[j].iova = hugepage[i].physaddr;
> + mcfg->memseg[j].addr = hugepage[i].final_va;
> + }
> #endif
> mcfg->memseg[j].len += mcfg->memseg[j].hugepage_sz;
> }
> @@ -1437,13 +1476,7 @@ rte_eal_hugepage_attach(void)
> * use mmap to get identical addresses as the primary process.
> */
> base_addr = mmap(mcfg->memseg[s].addr, mcfg->memseg[s].len,
> - PROT_READ,
> -#ifdef RTE_ARCH_PPC_64
> - MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
> -#else
> - MAP_PRIVATE,
> -#endif
> - fd_zero, 0);
> + PROT_READ, MAP_PRIVATE, fd_zero, 0);
> if (base_addr == MAP_FAILED ||
> base_addr != mcfg->memseg[s].addr) {
> max_seg = s;
Sorry, this breaks Chao's workaround that makes memory initialization
for the secondary process work on Power8 (setting nr_hugepages and
nr_overcommit_hugepages). I need to make the mmap flags change
conditional on Power8/Power9. I am working on a v2 patch.
More information about the stable
mailing list