[dpdk-stable] [PATCH] mem: Add Power9 support in rte_eal_hugepage_init

dwilder dwilder at us.ibm.com
Sat Nov 10 03:59:45 CET 2018


On 2018-11-09 11:28, David Wilder wrote:
> Determine if the ppc64 platform is Power9 or Power8 and perform huge
> page mapping appropriately for the selected platform.
> 
> Signed-off-by: Pradeep Satyanarayana <pradeep at us.ibm.com>
> Tested-by: David Wilder <wilder at us.ibm.com>
> ---
> On IBM Power8, when mmapping hugepage files, the address hint supplied to
> mmap is not always honored; therefore we let the kernel pick the address by
> specifying a NULL address hint. On Power9 the address hint is honored as
> expected. This patch detects the platform; on Power9 the address hint is
> supplied to mmap and the pages are sorted appropriately. Hugepage mapping
> for both primary and secondary processes now works correctly on Power9. I
> have retained the original behavior and limitations on Power8. Additionally,
> the flags supplied to mmap() have been corrected, eliminating the "Cannot
> get a virtual area" messages previously seen during EAL init on Power.
> 
>  lib/librte_eal/linuxapp/eal/eal_memory.c | 75 +++++++++++++++++-------
>  1 file changed, 54 insertions(+), 21 deletions(-)
> 
> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c
> b/lib/librte_eal/linuxapp/eal/eal_memory.c
> index bac969a12..5b7001be8 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> @@ -50,6 +50,9 @@
>  #include <limits.h>
>  #include <sys/ioctl.h>
>  #include <sys/time.h>
> +#ifdef RTE_ARCH_PPC_64
> +#include <sys/auxv.h>
> +#endif
>  #include <signal.h>
>  #include <setjmp.h>
>  #ifdef RTE_EAL_NUMA_AWARE_HUGEPAGES
> @@ -107,6 +110,10 @@ static uint64_t baseaddr = 0x100000000;
> 
>  static bool phys_addrs_available = true;
> 
> +#ifdef RTE_ARCH_PPC_64
> +static int p8;
> +#endif
> +
>  #define RANDOMIZE_VA_SPACE_FILE "/proc/sys/kernel/randomize_va_space"
> 
>  static void
> @@ -309,12 +316,7 @@ get_virtual_area(size_t *size, size_t hugepage_sz)
>  		addr_hint = get_addr_hint();
> 
>  		addr = mmap(addr_hint,
> -				(*size) + hugepage_sz, PROT_READ,
> -#ifdef RTE_ARCH_PPC_64
> -				MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
> -#else
> -				MAP_PRIVATE,
> -#endif
> +				(*size) + hugepage_sz, PROT_READ, MAP_PRIVATE,
>  				fd, 0);
>  		if (addr == MAP_FAILED) {
>  			/* map failed. Let's try with less memory */
> @@ -501,6 +503,15 @@ map_all_hugepages(struct hugepage_file
> *hugepg_tbl, struct hugepage_info *hpi,
>  			 * vma_len. If it fails, vma_addr is NULL, so
>  			 * let the kernel provide the address. */
>  			vma_addr = get_virtual_area(&vma_len, hpi->hugepage_sz);
> +#ifdef RTE_ARCH_PPC_64
> +			/*
> +			 * On power8 the address hint is not consistently
> +			 * honored, therefore we always let the
> +			 * kernel provide the address.
> +			 */
> +			if (p8)
> +				vma_addr = NULL;
> +#endif
>  			if (vma_addr == NULL)
>  				vma_len = hugepage_sz;
>  		}
> @@ -1059,6 +1070,23 @@ rte_eal_hugepage_init(void)
>  	int nr_hugefiles, nr_hugepages = 0;
>  	void *addr;
> 
> +#ifdef RTE_ARCH_PPC_64
> +	char *platform;
> +	platform = (char *)getauxval(AT_BASE_PLATFORM);
> +
> +	p8 = 0;
> +
> +	/* Alert the user in case our assumptions are incorrect */
> +	if (platform == NULL)
> +		printf("Some distros on P9 do not support "
> +			"getauxval(AT_BASE_PLATFORM). Assuming P9\n");
> +
> +	if (platform && !strncmp(platform, "power8", 6)) {
> +		RTE_LOG(DEBUG, EAL, "This must be a P8\n");
> +		p8 = 1;
> +	} else
> +		RTE_LOG(DEBUG, EAL, "This must be a P9\n");
> +#endif
>  	test_phys_addrs_available();
> 
>  	memset(used_hp, 0, sizeof(used_hp));
> @@ -1305,14 +1333,22 @@ rte_eal_hugepage_init(void)
>  			new_memseg = 1;
> 
>  #ifdef RTE_ARCH_PPC_64
> -		/* On PPC64 architecture, the mmap always start from higher
> -		 * virtual address to lower address. Here, both the physical
> -		 * address and virtual address are in descending order */
> +		/*
> +		 * On power8 we let the kernel select the virtual address
> +		 * for mmaped segments; successive mmaps will start from
> +		 * higher virtual address to lower address. Physical addresses
> +		 * are in descending order for both platforms.
> +		 */
>  		else if ((hugepage[i-1].physaddr - hugepage[i].physaddr) !=
>  		    hugepage[i].size)
>  			new_memseg = 1;
> -		else if (((unsigned long)hugepage[i-1].final_va -
> -		    (unsigned long)hugepage[i].final_va) != hugepage[i].size)
> +		else if ((((unsigned long)hugepage[i-1].final_va -
> +		    (unsigned long)hugepage[i].final_va) !=
> +		    hugepage[i].size) && (p8))
> +			new_memseg = 1;
> +		else if ((((unsigned long)hugepage[i].final_va -
> +		    (unsigned long)hugepage[i-1].final_va) !=
> +		    hugepage[i].size) && (!p8))
>  			new_memseg = 1;
>  #else
>  		else if ((hugepage[i].physaddr - hugepage[i-1].physaddr) !=
> @@ -1338,9 +1374,12 @@ rte_eal_hugepage_init(void)
>  		else {
>  #ifdef RTE_ARCH_PPC_64
>  		/* Use the phy and virt address of the last page as segment
> -		 * address for IBM Power architecture */
> -			mcfg->memseg[j].iova = hugepage[i].physaddr;
> -			mcfg->memseg[j].addr = hugepage[i].final_va;
> +		 * address for IBM Power8 architecture.
> +		 */
> +			if (p8) {
> +				mcfg->memseg[j].iova = hugepage[i].physaddr;
> +				mcfg->memseg[j].addr = hugepage[i].final_va;
> +			}
>  #endif
>  			mcfg->memseg[j].len += mcfg->memseg[j].hugepage_sz;
>  		}
> @@ -1437,13 +1476,7 @@ rte_eal_hugepage_attach(void)
>  		 * use mmap to get identical addresses as the primary process.
>  		 */
>  		base_addr = mmap(mcfg->memseg[s].addr, mcfg->memseg[s].len,
> -				 PROT_READ,
> -#ifdef RTE_ARCH_PPC_64
> -				 MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
> -#else
> -				 MAP_PRIVATE,
> -#endif
> -				 fd_zero, 0);
> +				 PROT_READ, MAP_PRIVATE, fd_zero, 0);
>  		if (base_addr == MAP_FAILED ||
>  		    base_addr != mcfg->memseg[s].addr) {
>  			max_seg = s;

Sorry, this breaks Chao's workaround that makes memory initialization
for the secondary process work on Power8 (setting nr_hugepages and
nr_overcommit_hugepages). I need to make the mmap flags change
conditional on Power8/Power9. I am working on a v2 patch.



More information about the stable mailing list