[dpdk-dev,v3] arch/arm: optimization for memcpy on AArch64

Message ID 1513565664-19509-1-git-send-email-herbert.guan@arm.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Checks

Context               Check     Description
ci/checkpatch         warning   coding style issues
ci/Intel-compilation  success   Compilation OK

Commit Message

Herbert Guan Dec. 18, 2017, 2:54 a.m. UTC
  This patch provides an option to implement rte_memcpy() with the
'restrict' qualifier, which allows GCC to optimize the copy with more
efficient instructions, providing some performance gain over memcpy()
on some AArch64 platforms/environments.
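
As an illustration (excerpted and simplified from the rte_memcpy_64.h
changes below), a 32-byte copy written with restrict-qualified pointers
lets GCC schedule both loads ahead of both stores, because the qualifier
guarantees dst and src do not overlap:

#include <stdint.h>

static inline void
rte_mov32(uint8_t *restrict dst, const uint8_t *restrict src)
{
	__uint128_t *restrict dst128 = (__uint128_t *restrict)dst;
	const __uint128_t *restrict src128 = (const __uint128_t *restrict)src;
	/* no-overlap guarantee from 'restrict': GCC may load both halves
	 * before storing either, typically emitting an LDP/STP pair.
	 */
	dst128[0] = src128[0];
	dst128[1] = src128[1];
}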

The memory copy performance differs between AArch64 platforms, and a
more recent glibc (e.g. 2.23 or later) can provide better memcpy()
performance than older glibc versions. It's always suggested to use a
more recent glibc if possible, from which the entire system can
benefit. If for some reason an old glibc has to be used, this patch
provides an alternative.

This implementation can improve memory copy on some AArch64
platforms when an old glibc (e.g. 2.19, 2.17...) is being used. It is
disabled by default and needs "RTE_ARCH_ARM64_MEMCPY" defined to be
activated. It does not always provide better performance than
memcpy(), so users need to run the DPDK unit test
"memcpy_perf_autotest" and customize the parameters in the
"customization section" of rte_memcpy_64.h for best performance.

Compiler version will also impact the rte_memcpy() performance. It
has been observed on some platforms that, with the same code, a binary
compiled with GCC 7.2.0 can provide better performance than one
compiled with GCC 4.8.5. It's suggested to use GCC 5.4.0 or later.

Signed-off-by: Herbert Guan <herbert.guan@arm.com>
---
 config/common_armv8a_linuxapp                      |   6 +
 .../common/include/arch/arm/rte_memcpy_64.h        | 292 +++++++++++++++++++++
 2 files changed, 298 insertions(+)
  

Comments

Jerin Jacob Dec. 18, 2017, 7:43 a.m. UTC | #1
-----Original Message-----
> Date: Mon, 18 Dec 2017 10:54:24 +0800
> From: Herbert Guan <herbert.guan@arm.com>
> To: dev@dpdk.org, jerin.jacob@caviumnetworks.com
> CC: Herbert Guan <herbert.guan@arm.com>
> Subject: [PATCH v3] arch/arm: optimization for memcpy on AArch64
> X-Mailer: git-send-email 1.8.3.1
> 
> Signed-off-by: Herbert Guan <herbert.guan@arm.com>
> ---
>  config/common_armv8a_linuxapp                      |   6 +
>  .../common/include/arch/arm/rte_memcpy_64.h        | 292 +++++++++++++++++++++
>  2 files changed, 298 insertions(+)
> 
> diff --git a/config/common_armv8a_linuxapp b/config/common_armv8a_linuxapp
> index 6732d1e..8f0cbed 100644
> --- a/config/common_armv8a_linuxapp
> +++ b/config/common_armv8a_linuxapp
> @@ -44,6 +44,12 @@ CONFIG_RTE_FORCE_INTRINSICS=y
>  # to address minimum DMA alignment across all arm64 implementations.
>  CONFIG_RTE_CACHE_LINE_SIZE=128
>  
> +# Accelarate rte_memcpy.  Be sure to run unit test to determine the

Additional space before "Be". Rather than just mentioning the unit test,
mention the exact test case name (memcpy_perf_autotest).

> +# best threshold in code.  Refer to notes in source file

Additional space before "Refer"

> +# (lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h) for more
> +# info.
> +CONFIG_RTE_ARCH_ARM64_MEMCPY=n
> +
>  CONFIG_RTE_LIBRTE_FM10K_PMD=n
>  CONFIG_RTE_LIBRTE_SFC_EFX_PMD=n
>  CONFIG_RTE_LIBRTE_AVP_PMD=n
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> index b80d8ba..1ea275d 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> @@ -42,6 +42,296 @@
>  
>  #include "generic/rte_memcpy.h"
>  
> +#ifdef RTE_ARCH_ARM64_MEMCPY

See the comment below at "(GCC_VERSION < 50400)" check

> +#include <rte_common.h>
> +#include <rte_branch_prediction.h>
> +
> +/*
> + * The memory copy performance differs on different AArch64 micro-architectures.
> + * And the most recent glibc (e.g. 2.23 or later) can provide a better memcpy()
> + * performance compared to old glibc versions. It's always suggested to use a
> + * more recent glibc if possible, from which the entire system can get benefit.
> + *
> + * This implementation improves memory copy on some aarch64 micro-architectures,
> + * when an old glibc (e.g. 2.19, 2.17...) is being used. It is disabled by
> + * default and needs "RTE_ARCH_ARM64_MEMCPY" defined to activate. It's not
> + * always providing better performance than memcpy() so users need to run unit
> + * test "memcpy_perf_autotest" and customize parameters in customization section
> + * below for best performance.
> + *
> + * Compiler version will also impact the rte_memcpy() performance. It's observed
> + * on some platforms and with the same code, GCC 7.2.0 compiled binaries can
> + * provide better performance than GCC 4.8.5 compiled binaries.
> + */
> +
> +/**************************************
> + * Beginning of customization section
> + **************************************/
> +#define ALIGNMENT_MASK 0x0F

This symbol will be included in the public rte_memcpy.h for arm64 DPDK builds.
Please use an RTE_ prefix to avoid multiple definitions (RTE_ARCH_ARM64_ALIGN_MASK? or any shorter name).

> +#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
> +/* Only src unalignment will be treaed as unaligned copy */
> +#define IS_UNALIGNED_COPY(dst, src) ((uintptr_t)(dst) & ALIGNMENT_MASK)
> +#else
> +/* Both dst and src unalignment will be treated as unaligned copy */
> +#define IS_UNALIGNED_COPY(dst, src) \
> +		(((uintptr_t)(dst) | (uintptr_t)(src)) & ALIGNMENT_MASK)
> +#endif
> +
> +
> +/*
> + * If copy size is larger than threshold, memcpy() will be used.
> + * Run "memcpy_perf_autotest" to determine the proper threshold.
> + */
> +#define ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
> +#define UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))

Same as above comment.

> +
> +/**************************************
> + * End of customization section
> + **************************************/
> +#ifdef RTE_TOOLCHAIN_GCC
> +#if (GCC_VERSION < 50400)
> +#warning "The GCC version is quite old, which may result in sub-optimal \
> +performance of the compiled code. It is suggested that at least GCC 5.4.0 \
> +be used."

Even though it is only a warning, depending on where this file gets included
it will generate an error (see below).
How about selecting the optimized memcpy only when RTE_ARCH_ARM64_MEMCPY is
set && (GCC_VERSION >= 50400)?

  CC eal_common_options.o
In file included from
/home/jerin/dpdk.org/build/include/rte_memcpy.h:37:0,from
/home/jerin/dpdk.org/lib/librte_eal/common/eal_common_options.c:53:
/home/jerin/dpdk.org/build/include/rte_memcpy_64.h:93:2: error: #warning
                                                        ^^^^^^^^
"The GCC version is quite old, which may result in sub-optimal
performance of the compiled code. It is suggested that at least GCC
5.4.0 be used." [-Werror=cpp]
                ^^^^^^^^^^^^^^
 #warning "The GCC version is quite old, which may result in sub-optimal
\
  ^
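
For illustration, a guard along those lines (just a sketch; the exact
condition is up to the author) could look like:

#if defined(RTE_ARCH_ARM64_MEMCPY) && \
	(!defined(RTE_TOOLCHAIN_GCC) || (GCC_VERSION >= 50400))
/* optimized rte_memcpy() implementation goes here */
#else
/* keep the existing generic path */
#define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
#endif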


> +#endif
> +#endif
> +
> +
> +#if RTE_CACHE_LINE_SIZE >= 128

We can remove this conditional compilation check, i.e. it can get compiled
for both cases, but it will be used only when RTE_CACHE_LINE_SIZE >= 128.

> +static __rte_always_inline void
> +rte_memcpy_ge16_lt128
> +(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
> +{
> +	if (n < 64) {
> +		if (n == 16) {
> +			rte_mov16(dst, src);
> +		} else if (n <= 32) {
> +			rte_mov16(dst, src);
> +			rte_mov16(dst - 16 + n, src - 16 + n);
> +		} else if (n <= 48) {
> +			rte_mov32(dst, src);
> +			rte_mov16(dst - 16 + n, src - 16 + n);
> +		} else {
> +			rte_mov48(dst, src);
> +			rte_mov16(dst - 16 + n, src - 16 + n);
> +		}
> +	} else {
> +		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> +		if (n > 48 + 64)
> +			rte_mov64(dst - 64 + n, src - 64 + n);
> +		else if (n > 32 + 64)
> +			rte_mov48(dst - 48 + n, src - 48 + n);
> +		else if (n > 16 + 64)
> +			rte_mov32(dst - 32 + n, src - 32 + n);
> +		else if (n > 64)
> +			rte_mov16(dst - 16 + n, src - 16 + n);
> +	}
> +}
> +
> +
> +#else

Same as above comment.

> +static __rte_always_inline void
> +rte_memcpy_ge16_lt64
> +(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
> +{
> +	if (n == 16) {
> +		rte_mov16(dst, src);
> +	} else if (n <= 32) {
> +		rte_mov16(dst, src);
> +		rte_mov16(dst - 16 + n, src - 16 + n);
> +	} else if (n <= 48) {
> +		rte_mov32(dst, src);
> +		rte_mov16(dst - 16 + n, src - 16 + n);
> +	} else {
> +		rte_mov48(dst, src);
> +		rte_mov16(dst - 16 + n, src - 16 + n);
> +	}
> +}
> +
> +
> +static __rte_always_inline void *
> +rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
> +{
> +	if (n < 16) {
> +		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
> +		return dst;
> +	}
> +#if RTE_CACHE_LINE_SIZE >= 128
> +	if (n < 128) {
> +		rte_memcpy_ge16_lt128((uint8_t *)dst, (const uint8_t *)src, n);
> +		return dst;
> +	}
> +#else
> +	if (n < 64) {
> +		rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n);
> +		return dst;
> +	}
> +#endif
> +	__builtin_prefetch(src, 0, 0);
> +	__builtin_prefetch(dst, 1, 0);
> +	if (likely(
> +		  (!IS_UNALIGNED_COPY(dst, src) && n <= ALIGNED_THRESHOLD)
> +		   || (IS_UNALIGNED_COPY(dst, src) && n <= UNALIGNED_THRESHOLD)
> +		  )) {
> +#if RTE_CACHE_LINE_SIZE >= 128
> +		rte_memcpy_ge128((uint8_t *)dst, (const uint8_t *)src, n);
> +#else
> +		rte_memcpy_ge64((uint8_t *)dst, (const uint8_t *)src, n);
> +#endif

Can we remove this #ifdef clutter (we have two of them in the same function)?

I suggest removing this clutter by having separate routines, i.e.:
1)
#if RTE_CACHE_LINE_SIZE >= 128
rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
{
}
#else
rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
{
}
#endif

2) Have a separate inline function that resolves the following logic and use
it in both variants (see the sketch after the snippet below).

	if (likely(
		  (!IS_UNALIGNED_COPY(dst, src) && n <= ALIGNED_THRESHOLD)
		   || (IS_UNALIGNED_COPY(dst, src) && n <= UNALIGNED_THRESHOLD)
		  )) {
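
For example, something along these lines (the helper name is only
illustrative):

static __rte_always_inline int
rte_memcpy_under_threshold(const void *dst, const void *src, size_t n)
{
	/* illustrative helper: returns non-zero when the optimized copy
	 * should be used instead of falling back to memcpy()
	 */
	return (!IS_UNALIGNED_COPY(dst, src) && n <= ALIGNED_THRESHOLD) ||
		(IS_UNALIGNED_COPY(dst, src) && n <= UNALIGNED_THRESHOLD);
}

Both rte_memcpy() variants can then use it in their likely() check.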

With above changes:
Acked-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
  
Herbert Guan Dec. 19, 2017, 5:33 a.m. UTC | #2
Jerin,

Thanks for the review and comments.  Please find my feedback below inline.

> -----Original Message-----
> From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> Sent: Monday, December 18, 2017 15:44
> To: Herbert Guan <Herbert.Guan@arm.com>
> Cc: dev@dpdk.org
> Subject: Re: [PATCH v3] arch/arm: optimization for memcpy on AArch64
> 
> -----Original Message-----
> > Date: Mon, 18 Dec 2017 10:54:24 +0800
> > From: Herbert Guan <herbert.guan@arm.com>
> > To: dev@dpdk.org, jerin.jacob@caviumnetworks.com
> > CC: Herbert Guan <herbert.guan@arm.com>
> > Subject: [PATCH v3] arch/arm: optimization for memcpy on AArch64
> > X-Mailer: git-send-email 1.8.3.1
> >
> > Signed-off-by: Herbert Guan <herbert.guan@arm.com>
> > ---
> >  config/common_armv8a_linuxapp                      |   6 +
> >  .../common/include/arch/arm/rte_memcpy_64.h        | 292
> +++++++++++++++++++++
> >  2 files changed, 298 insertions(+)
> >
> > diff --git a/config/common_armv8a_linuxapp
> > b/config/common_armv8a_linuxapp index 6732d1e..8f0cbed 100644
> > --- a/config/common_armv8a_linuxapp
> > +++ b/config/common_armv8a_linuxapp
> > @@ -44,6 +44,12 @@ CONFIG_RTE_FORCE_INTRINSICS=y  # to address
> minimum
> > DMA alignment across all arm64 implementations.
> >  CONFIG_RTE_CACHE_LINE_SIZE=128
> >
> > +# Accelarate rte_memcpy.  Be sure to run unit test to determine the
> 
> Additional space before "Be". Rather than just mentioning the unit test,
> mention the absolute test case name(memcpy_perf_autotest)
> 
> > +# best threshold in code.  Refer to notes in source file
> 
> Additional space before "Refer"

Fixed in new version.

> 
> > +# (lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h) for more
> #
> > +info.
> > +CONFIG_RTE_ARCH_ARM64_MEMCPY=n
> > +
> >  CONFIG_RTE_LIBRTE_FM10K_PMD=n
> >  CONFIG_RTE_LIBRTE_SFC_EFX_PMD=n
> >  CONFIG_RTE_LIBRTE_AVP_PMD=n
> > diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> > b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> > index b80d8ba..1ea275d 100644
> > --- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> > +++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> > @@ -42,6 +42,296 @@
> >
> >  #include "generic/rte_memcpy.h"
> >
> > +#ifdef RTE_ARCH_ARM64_MEMCPY
> 
> See the comment below at "(GCC_VERSION < 50400)" check
> 
> > +#include <rte_common.h>
> > +#include <rte_branch_prediction.h>
> > +
> > +/*
> > + * The memory copy performance differs on different AArch64 micro-
> architectures.
> > + * And the most recent glibc (e.g. 2.23 or later) can provide a
> > +better memcpy()
> > + * performance compared to old glibc versions. It's always suggested
> > +to use a
> > + * more recent glibc if possible, from which the entire system can get
> benefit.
> > + *
> > + * This implementation improves memory copy on some aarch64
> > +micro-architectures,
> > + * when an old glibc (e.g. 2.19, 2.17...) is being used. It is
> > +disabled by
> > + * default and needs "RTE_ARCH_ARM64_MEMCPY" defined to activate.
> > +It's not
> > + * always providing better performance than memcpy() so users need to
> > +run unit
> > + * test "memcpy_perf_autotest" and customize parameters in
> > +customization section
> > + * below for best performance.
> > + *
> > + * Compiler version will also impact the rte_memcpy() performance.
> > +It's observed
> > + * on some platforms and with the same code, GCC 7.2.0 compiled
> > +binaries can
> > + * provide better performance than GCC 4.8.5 compiled binaries.
> > + */
> > +
> > +/**************************************
> > + * Beginning of customization section
> > +**************************************/
> > +#define ALIGNMENT_MASK 0x0F
> 
> This symbol will be included in public rte_memcpy.h version for arm64 DPDK
> build.
> Please use RTE_ prefix to avoid multi
> definition.(RTE_ARCH_ARM64_ALIGN_MASK ? or any shorter name)
> 
Changed to RTE_AARCH64_ALIGN_MASK in new version.

> > +#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
> > +/* Only src unalignment will be treaed as unaligned copy */ #define
> > +IS_UNALIGNED_COPY(dst, src) ((uintptr_t)(dst) & ALIGNMENT_MASK)
> #else
> > +/* Both dst and src unalignment will be treated as unaligned copy */
> > +#define IS_UNALIGNED_COPY(dst, src) \
> > +		(((uintptr_t)(dst) | (uintptr_t)(src)) & ALIGNMENT_MASK)
> #endif
> > +
> > +
> > +/*
> > + * If copy size is larger than threshold, memcpy() will be used.
> > + * Run "memcpy_perf_autotest" to determine the proper threshold.
> > + */
> > +#define ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
> > +#define UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))
> 
> Same as above comment.
Added RTE_AARCH64_ prefix in new version.

> 
> > +
> > +/**************************************
> > + * End of customization section
> > + **************************************/
> > +#ifdef RTE_TOOLCHAIN_GCC
> > +#if (GCC_VERSION < 50400)
> > +#warning "The GCC version is quite old, which may result in sub-optimal \
> > +performance of the compiled code. It is suggested that at least GCC 5.4.0 \
> > +be used."
> 
> Even though it is warning, based on where this file get included it will
> generate error(see below)
> How about, selecting optimized memcpy when RTE_ARCH_ARM64_MEMCPY
> && if (GCC_VERSION >= 50400) ?
> 
Fully understood.  However, I'm not inclined to make it 'silent'.  GCC 4.x is
just quite old and may not produce the best optimized code -- not only for
DPDK applications.  We can provide another option,
RTE_AARCH64_SKIP_GCC_VERSION_CHECK, to allow skipping the GCC version check.
What do you think?
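
Something like the following (just a sketch of how the new option would gate
the existing check):

#ifdef RTE_TOOLCHAIN_GCC
#if (GCC_VERSION < 50400) && !defined(RTE_AARCH64_SKIP_GCC_VERSION_CHECK)
#warning "The GCC version is quite old, which may result in sub-optimal \
performance of the compiled code. It is suggested that at least GCC 5.4.0 \
be used."
#endif
#endif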

>   CC eal_common_options.o
> In file included from
> /home/jerin/dpdk.org/build/include/rte_memcpy.h:37:0,from
> /home/jerin/dpdk.org/lib/librte_eal/common/eal_common_options.c:53:
> /home/jerin/dpdk.org/build/include/rte_memcpy_64.h:93:2: error:
> #warning
>                                                         ^^^^^^^^
> "The GCC version is quite old, which may result in sub-optimal
> performance of the compiled code. It is suggested that at least GCC
> 5.4.0 be used." [-Werror=cpp]
>                 ^^^^^^^^^^^^^^
>  #warning "The GCC version is quite old, which may result in sub-optimal
> \
>   ^
> 
> 
> > +#endif
> > +#endif
> > +
> > +
> > +#if RTE_CACHE_LINE_SIZE >= 128
> 
> We can remove this conditional compilation check. ie. It can get compiled for
> both cases,
> But it will be used only when RTE_CACHE_LINE_SIZE >= 128
> 
OK, it'll be removed in the new version.

> > +static __rte_always_inline void
> > +rte_memcpy_ge16_lt128
> > +(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
> > +{
> > +	if (n < 64) {
> > +		if (n == 16) {
> > +			rte_mov16(dst, src);
> > +		} else if (n <= 32) {
> > +			rte_mov16(dst, src);
> > +			rte_mov16(dst - 16 + n, src - 16 + n);
> > +		} else if (n <= 48) {
> > +			rte_mov32(dst, src);
> > +			rte_mov16(dst - 16 + n, src - 16 + n);
> > +		} else {
> > +			rte_mov48(dst, src);
> > +			rte_mov16(dst - 16 + n, src - 16 + n);
> > +		}
> > +	} else {
> > +		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
> > +		if (n > 48 + 64)
> > +			rte_mov64(dst - 64 + n, src - 64 + n);
> > +		else if (n > 32 + 64)
> > +			rte_mov48(dst - 48 + n, src - 48 + n);
> > +		else if (n > 16 + 64)
> > +			rte_mov32(dst - 32 + n, src - 32 + n);
> > +		else if (n > 64)
> > +			rte_mov16(dst - 16 + n, src - 16 + n);
> > +	}
> > +}
> > +
> > +
> > +#else
> 
> Same as above comment.
> 
> > +static __rte_always_inline void
> > +rte_memcpy_ge16_lt64
> > +(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
> > +{
> > +	if (n == 16) {
> > +		rte_mov16(dst, src);
> > +	} else if (n <= 32) {
> > +		rte_mov16(dst, src);
> > +		rte_mov16(dst - 16 + n, src - 16 + n);
> > +	} else if (n <= 48) {
> > +		rte_mov32(dst, src);
> > +		rte_mov16(dst - 16 + n, src - 16 + n);
> > +	} else {
> > +		rte_mov48(dst, src);
> > +		rte_mov16(dst - 16 + n, src - 16 + n);
> > +	}
> > +}
> > +
> > +
> > +static __rte_always_inline void *
> > +rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
> > +{
> > +	if (n < 16) {
> > +		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
> > +		return dst;
> > +	}
> > +#if RTE_CACHE_LINE_SIZE >= 128
> > +	if (n < 128) {
> > +		rte_memcpy_ge16_lt128((uint8_t *)dst, (const uint8_t *)src,
> n);
> > +		return dst;
> > +	}
> > +#else
> > +	if (n < 64) {
> > +		rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src,
> n);
> > +		return dst;
> > +	}
> > +#endif
> > +	__builtin_prefetch(src, 0, 0);
> > +	__builtin_prefetch(dst, 1, 0);
> > +	if (likely(
> > +		  (!IS_UNALIGNED_COPY(dst, src) && n <=
> ALIGNED_THRESHOLD)
> > +		   || (IS_UNALIGNED_COPY(dst, src) && n <=
> UNALIGNED_THRESHOLD)
> > +		  )) {
> > +#if RTE_CACHE_LINE_SIZE >= 128
> > +		rte_memcpy_ge128((uint8_t *)dst, (const uint8_t *)src, n);
> > +#else
> > +		rte_memcpy_ge64((uint8_t *)dst, (const uint8_t *)src, n);
> > +#endif
> 
> Can we remove this #ifdef clutter(We have two of them in a same function)?
> 
> I suggest to remove this clutter by having the separate routine. ie.
> 1)
> #if RTE_CACHE_LINE_SIZE >= 128
> rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
> {
> }
> #else
> rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
> {
> }
> #endif
> 
> 2) Have separate inline function to resolve following logic and used it
> in both variants.
> 
> 	if (likely(
> 		  (!IS_UNALIGNED_COPY(dst, src) && n <=
> ALIGNED_THRESHOLD)
> 		   || (IS_UNALIGNED_COPY(dst, src) && n <=
> UNALIGNED_THRESHOLD)
> 		  )) {
> 

Implemented as suggested in new version.

> With above changes:
> Acked-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>

Thanks,
Herbert Guan
  
Jerin Jacob Dec. 19, 2017, 7:24 a.m. UTC | #3
-----Original Message-----
> Date: Tue, 19 Dec 2017 05:33:19 +0000
> From: Herbert Guan <Herbert.Guan@arm.com>
> To: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> CC: "dev@dpdk.org" <dev@dpdk.org>, nd <nd@arm.com>
> Subject: RE: [PATCH v3] arch/arm: optimization for memcpy on AArch64
> 
> Jerin,
> 
> Thanks for review and comments.  Please find my feedbacks below inline.
> 
> > -----Original Message-----
> > From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> > Sent: Monday, December 18, 2017 15:44
> > To: Herbert Guan <Herbert.Guan@arm.com>
> > Cc: dev@dpdk.org
> > Subject: Re: [PATCH v3] arch/arm: optimization for memcpy on AArch64
> > 
> > -----Original Message-----
> > > Date: Mon, 18 Dec 2017 10:54:24 +0800
> > > From: Herbert Guan <herbert.guan@arm.com>
> > > To: dev@dpdk.org, jerin.jacob@caviumnetworks.com
> > > CC: Herbert Guan <herbert.guan@arm.com>
> > > Subject: [PATCH v3] arch/arm: optimization for memcpy on AArch64
> > > X-Mailer: git-send-email 1.8.3.1
> > >
> > > Signed-off-by: Herbert Guan <herbert.guan@arm.com>
> > > ---
> > >  config/common_armv8a_linuxapp                      |   6 +
> > >  .../common/include/arch/arm/rte_memcpy_64.h        | 292
> > +++++++++++++++++++++
> > >  2 files changed, 298 insertions(+)
> > >
> > > diff --git a/config/common_armv8a_linuxapp
> > > b/config/common_armv8a_linuxapp index 6732d1e..8f0cbed 100644
> > > --- a/config/common_armv8a_linuxapp
> > > +++ b/config/common_armv8a_linuxapp
> > > @@ -44,6 +44,12 @@ CONFIG_RTE_FORCE_INTRINSICS=y  # to address
> > minimum
> > > DMA alignment across all arm64 implementations.
> > >  CONFIG_RTE_CACHE_LINE_SIZE=128
> > >
> > > +# Accelarate rte_memcpy.  Be sure to run unit test to determine the
> > 
> > Additional space before "Be". Rather than just mentioning the unit test,
> > mention the absolute test case name(memcpy_perf_autotest)
> > 
> > > +# best threshold in code.  Refer to notes in source file
> > 
> > Additional space before "Refer"
> 
> Fixed in new version.
> 
> > 
> > > +# (lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h) for more
> > #
> > > +info.
> > > +CONFIG_RTE_ARCH_ARM64_MEMCPY=n
> > > +
> > >  CONFIG_RTE_LIBRTE_FM10K_PMD=n
> > >  CONFIG_RTE_LIBRTE_SFC_EFX_PMD=n
> > >  CONFIG_RTE_LIBRTE_AVP_PMD=n
> > > diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> > > b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> > > index b80d8ba..1ea275d 100644
> > > --- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> > > +++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
> > > @@ -42,6 +42,296 @@
> > >
> > >  #include "generic/rte_memcpy.h"
> > >
> > > +#ifdef RTE_ARCH_ARM64_MEMCPY
> > 
> > See the comment below at "(GCC_VERSION < 50400)" check
> > 
> > > +#include <rte_common.h>
> > > +#include <rte_branch_prediction.h>
> > > +
> > > +/*
> > > + * The memory copy performance differs on different AArch64 micro-
> > architectures.
> > > + * And the most recent glibc (e.g. 2.23 or later) can provide a
> > > +better memcpy()
> > > + * performance compared to old glibc versions. It's always suggested
> > > +to use a
> > > + * more recent glibc if possible, from which the entire system can get
> > benefit.
> > > + *
> > > + * This implementation improves memory copy on some aarch64
> > > +micro-architectures,
> > > + * when an old glibc (e.g. 2.19, 2.17...) is being used. It is
> > > +disabled by
> > > + * default and needs "RTE_ARCH_ARM64_MEMCPY" defined to activate.
> > > +It's not
> > > + * always providing better performance than memcpy() so users need to
> > > +run unit
> > > + * test "memcpy_perf_autotest" and customize parameters in
> > > +customization section
> > > + * below for best performance.
> > > + *
> > > + * Compiler version will also impact the rte_memcpy() performance.
> > > +It's observed
> > > + * on some platforms and with the same code, GCC 7.2.0 compiled
> > > +binaries can
> > > + * provide better performance than GCC 4.8.5 compiled binaries.
> > > + */
> > > +
> > > +/**************************************
> > > + * Beginning of customization section
> > > +**************************************/
> > > +#define ALIGNMENT_MASK 0x0F
> > 
> > This symbol will be included in public rte_memcpy.h version for arm64 DPDK
> > build.
> > Please use RTE_ prefix to avoid multi
> > definition.(RTE_ARCH_ARM64_ALIGN_MASK ? or any shorter name)
> > 
> Changed to RTE_AARCH64_ALIGN_MASK in new version.

Since it is something to do with memcpy and arm64, I prefer
RTE_ARM64_MEMCPY_ALIGN_MASK.
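
Applied consistently, the customization macros would then read something like
(exact names are up to you):

#define RTE_ARM64_MEMCPY_ALIGN_MASK            0x0F
#define RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD     ((size_t)(0xffffffff))
#define RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD   ((size_t)(0xffffffff))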


> 
> > > +#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
> > > +/* Only src unalignment will be treaed as unaligned copy */ #define
> > > +IS_UNALIGNED_COPY(dst, src) ((uintptr_t)(dst) & ALIGNMENT_MASK)
> > #else
> > > +/* Both dst and src unalignment will be treated as unaligned copy */
> > > +#define IS_UNALIGNED_COPY(dst, src) \
> > > +		(((uintptr_t)(dst) | (uintptr_t)(src)) & ALIGNMENT_MASK)
> > #endif
> > > +
> > > +
> > > +/*
> > > + * If copy size is larger than threshold, memcpy() will be used.
> > > + * Run "memcpy_perf_autotest" to determine the proper threshold.
> > > + */
> > > +#define ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
> > > +#define UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))
> > 
> > Same as above comment.
> Added RTE_AARCH64_ prefix in new version.

Same as above.

> 
> > 
> > > +
> > > +/**************************************
> > > + * End of customization section
> > > + **************************************/
> > > +#ifdef RTE_TOOLCHAIN_GCC
> > > +#if (GCC_VERSION < 50400)
> > > +#warning "The GCC version is quite old, which may result in sub-optimal \
> > > +performance of the compiled code. It is suggested that at least GCC 5.4.0 \
> > > +be used."
> > 
> > Even though it is warning, based on where this file get included it will
> > generate error(see below)
> > How about, selecting optimized memcpy when RTE_ARCH_ARM64_MEMCPY
> > && if (GCC_VERSION >= 50400) ?
> > 
> Fully understand that.  While I'm not tending to make it 'silent'.  GCC 4.x is just
> quite old and may not provide best optimized code -- not only for DPDK app.  
> We can provide another option RTE_AARCH64_SKIP_GCC_VERSION_CHECK to allow
> skipping the GCC version check.  How do you think?

I prefer to reduce the number of options. But no strong opinion on this, as
the RTE_ARCH_ARM64_MEMCPY option is disabled by default (i.e. no risk).
  

Patch

diff --git a/config/common_armv8a_linuxapp b/config/common_armv8a_linuxapp
index 6732d1e..8f0cbed 100644
--- a/config/common_armv8a_linuxapp
+++ b/config/common_armv8a_linuxapp
@@ -44,6 +44,12 @@  CONFIG_RTE_FORCE_INTRINSICS=y
 # to address minimum DMA alignment across all arm64 implementations.
 CONFIG_RTE_CACHE_LINE_SIZE=128
 
+# Accelerate rte_memcpy.  Be sure to run unit test to determine the
+# best threshold in code.  Refer to notes in source file
+# (lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h) for more
+# info.
+CONFIG_RTE_ARCH_ARM64_MEMCPY=n
+
 CONFIG_RTE_LIBRTE_FM10K_PMD=n
 CONFIG_RTE_LIBRTE_SFC_EFX_PMD=n
 CONFIG_RTE_LIBRTE_AVP_PMD=n
diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
index b80d8ba..1ea275d 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
@@ -42,6 +42,296 @@ 
 
 #include "generic/rte_memcpy.h"
 
+#ifdef RTE_ARCH_ARM64_MEMCPY
+#include <rte_common.h>
+#include <rte_branch_prediction.h>
+
+/*
+ * The memory copy performance differs on different AArch64 micro-architectures.
+ * And the most recent glibc (e.g. 2.23 or later) can provide a better memcpy()
+ * performance compared to old glibc versions. It's always suggested to use a
+ * more recent glibc if possible, from which the entire system can get benefit.
+ *
+ * This implementation improves memory copy on some aarch64 micro-architectures,
+ * when an old glibc (e.g. 2.19, 2.17...) is being used. It is disabled by
+ * default and needs "RTE_ARCH_ARM64_MEMCPY" defined to activate. It's not
+ * always providing better performance than memcpy() so users need to run unit
+ * test "memcpy_perf_autotest" and customize parameters in customization section
+ * below for best performance.
+ *
+ * Compiler version will also impact the rte_memcpy() performance. It's observed
+ * on some platforms and with the same code, GCC 7.2.0 compiled binaries can
+ * provide better performance than GCC 4.8.5 compiled binaries.
+ */
+
+/**************************************
+ * Beginning of customization section
+ **************************************/
+#define ALIGNMENT_MASK 0x0F
+#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
+/* Only src unalignment will be treated as unaligned copy */
+#define IS_UNALIGNED_COPY(dst, src) ((uintptr_t)(dst) & ALIGNMENT_MASK)
+#else
+/* Both dst and src unalignment will be treated as unaligned copy */
+#define IS_UNALIGNED_COPY(dst, src) \
+		(((uintptr_t)(dst) | (uintptr_t)(src)) & ALIGNMENT_MASK)
+#endif
+
+
+/*
+ * If copy size is larger than threshold, memcpy() will be used.
+ * Run "memcpy_perf_autotest" to determine the proper threshold.
+ */
+#define ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
+#define UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))
+
+/**************************************
+ * End of customization section
+ **************************************/
+#ifdef RTE_TOOLCHAIN_GCC
+#if (GCC_VERSION < 50400)
+#warning "The GCC version is quite old, which may result in sub-optimal \
+performance of the compiled code. It is suggested that at least GCC 5.4.0 \
+be used."
+#endif
+#endif
+
+static __rte_always_inline void
+rte_mov16(uint8_t *restrict dst, const uint8_t *restrict src)
+{
+	__uint128_t *restrict dst128 = (__uint128_t *restrict)dst;
+	const __uint128_t *restrict src128 = (const __uint128_t *restrict)src;
+	*dst128 = *src128;
+}
+
+static __rte_always_inline void
+rte_mov32(uint8_t *restrict dst, const uint8_t *restrict src)
+{
+	__uint128_t *restrict dst128 = (__uint128_t *restrict)dst;
+	const __uint128_t *restrict src128 = (const __uint128_t *restrict)src;
+	dst128[0] = src128[0];
+	dst128[1] = src128[1];
+}
+
+static __rte_always_inline void
+rte_mov48(uint8_t *restrict dst, const uint8_t *restrict src)
+{
+	__uint128_t *restrict dst128 = (__uint128_t *restrict)dst;
+	const __uint128_t *restrict src128 = (const __uint128_t *restrict)src;
+	dst128[0] = src128[0];
+	dst128[1] = src128[1];
+	dst128[2] = src128[2];
+}
+
+static __rte_always_inline void
+rte_mov64(uint8_t *restrict dst, const uint8_t *restrict src)
+{
+	__uint128_t *restrict dst128 = (__uint128_t *restrict)dst;
+	const __uint128_t *restrict src128 = (const __uint128_t *restrict)src;
+	dst128[0] = src128[0];
+	dst128[1] = src128[1];
+	dst128[2] = src128[2];
+	dst128[3] = src128[3];
+}
+
+static __rte_always_inline void
+rte_mov128(uint8_t *restrict dst, const uint8_t *restrict src)
+{
+	/*
+	 * GCC generated code does not perform well in the 128-byte case
+	 * on some platforms.  Using ASM code can avoid such a downgrade.
+	 */
+	register uint64_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8,
+			tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
+	__asm__  __volatile__ (
+		"ldp %2, %3, [%0]\n\t"
+		"ldp %4, %5, [%0, 16]\n\t"
+		"ldp %6, %7, [%0, 32]\n\t"
+		"ldp %8, %9, [%0, 48]\n\t"
+		"ldp %10, %11, [%0, 64]\n\t"
+		"ldp %12, %13, [%0, 80]\n\t"
+		"ldp %14, %15, [%0, 96]\n\t"
+		"ldp %16, %17, [%0, 112]\n\t"
+		"stp %2, %3, [%1]\n\t"
+		"stp %4, %5, [%1, 16]\n\t"
+		"stp %6, %7, [%1, 32]\n\t"
+		"stp %8, %9, [%1, 48]\n\t"
+		"stp %10, %11, [%1, 64]\n\t"
+		"stp %12, %13, [%1, 80]\n\t"
+		"stp %14, %15, [%1, 96]\n\t"
+		"stp %16, %17, [%1, 112]\n\t"
+		: "+r" (src), "+r" (dst),
+		  "=&r" (tmp1), "=&r" (tmp2), "=&r" (tmp3), "=&r" (tmp4),
+		  "=&r" (tmp5), "=&r" (tmp6), "=&r" (tmp7), "=&r" (tmp8),
+		  "=&r" (tmp9), "=&r" (tmp10), "=&r" (tmp11), "=&r" (tmp12),
+		  "=&r" (tmp13), "=&r" (tmp14), "=&r" (tmp15), "=&r" (tmp16)
+		:
+		: "memory");
+}
+
+static __rte_always_inline void
+rte_mov256(uint8_t *restrict dst, const uint8_t *restrict src)
+{
+	rte_mov128(dst, src);
+	rte_mov128(dst + 128, src + 128);
+}
+
+static __rte_always_inline void
+rte_memcpy_lt16(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
+{
+	if (n & 0x08) {
+		/* copy 8 ~ 15 bytes */
+		*(uint64_t *)dst = *(const uint64_t *)src;
+		*(uint64_t *)(dst - 8 + n) = *(const uint64_t *)(src - 8 + n);
+	} else if (n & 0x04) {
+		/* copy 4 ~ 7 bytes */
+		*(uint32_t *)dst = *(const uint32_t *)src;
+		*(uint32_t *)(dst - 4 + n) = *(const uint32_t *)(src - 4 + n);
+	} else if (n & 0x02) {
+		/* copy 2 ~ 3 bytes */
+		*(uint16_t *)dst = *(const uint16_t *)src;
+		*(uint16_t *)(dst - 2 + n) = *(const uint16_t *)(src - 2 + n);
+	} else if (n & 0x01) {
+		/* copy 1 byte */
+		*dst = *src;
+	}
+}
+
+
+#if RTE_CACHE_LINE_SIZE >= 128
+static __rte_always_inline void
+rte_memcpy_ge16_lt128
+(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
+{
+	if (n < 64) {
+		if (n == 16) {
+			rte_mov16(dst, src);
+		} else if (n <= 32) {
+			rte_mov16(dst, src);
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		} else if (n <= 48) {
+			rte_mov32(dst, src);
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		} else {
+			rte_mov48(dst, src);
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		}
+	} else {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		if (n > 48 + 64)
+			rte_mov64(dst - 64 + n, src - 64 + n);
+		else if (n > 32 + 64)
+			rte_mov48(dst - 48 + n, src - 48 + n);
+		else if (n > 16 + 64)
+			rte_mov32(dst - 32 + n, src - 32 + n);
+		else if (n > 64)
+			rte_mov16(dst - 16 + n, src - 16 + n);
+	}
+}
+
+static __rte_always_inline void
+rte_memcpy_ge128(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
+{
+	do {
+		rte_mov128(dst, src);
+		src += 128;
+		dst += 128;
+		n -= 128;
+	} while (likely(n >= 128));
+
+	if (likely(n)) {
+		if (n <= 16)
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		else if (n <= 32)
+			rte_mov32(dst - 32 + n, src - 32 + n);
+		else if (n <= 48)
+			rte_mov48(dst - 48 + n, src - 48 + n);
+		else if (n <= 64)
+			rte_mov64(dst - 64 + n, src - 64 + n);
+		else
+			rte_memcpy_ge16_lt128(dst, src, n);
+	}
+}
+
+#else
+static __rte_always_inline void
+rte_memcpy_ge16_lt64
+(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
+{
+	if (n == 16) {
+		rte_mov16(dst, src);
+	} else if (n <= 32) {
+		rte_mov16(dst, src);
+		rte_mov16(dst - 16 + n, src - 16 + n);
+	} else if (n <= 48) {
+		rte_mov32(dst, src);
+		rte_mov16(dst - 16 + n, src - 16 + n);
+	} else {
+		rte_mov48(dst, src);
+		rte_mov16(dst - 16 + n, src - 16 + n);
+	}
+}
+
+static __rte_always_inline void
+rte_memcpy_ge64(uint8_t *restrict dst, const uint8_t *restrict src, size_t n)
+{
+	do {
+		rte_mov64(dst, src);
+		src += 64;
+		dst += 64;
+		n -= 64;
+	} while (likely(n >= 64));
+
+	if (likely(n)) {
+		if (n <= 16)
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		else if (n <= 32)
+			rte_mov32(dst - 32 + n, src - 32 + n);
+		else if (n <= 48)
+			rte_mov48(dst - 48 + n, src - 48 + n);
+		else
+			rte_mov64(dst - 64 + n, src - 64 + n);
+	}
+}
+
+#endif
+
+static __rte_always_inline void *
+rte_memcpy(void *restrict dst, const void *restrict src, size_t n)
+{
+	if (n < 16) {
+		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	}
+#if RTE_CACHE_LINE_SIZE >= 128
+	if (n < 128) {
+		rte_memcpy_ge16_lt128((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	}
+#else
+	if (n < 64) {
+		rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	}
+#endif
+	__builtin_prefetch(src, 0, 0);
+	__builtin_prefetch(dst, 1, 0);
+	if (likely(
+		  (!IS_UNALIGNED_COPY(dst, src) && n <= ALIGNED_THRESHOLD)
+		   || (IS_UNALIGNED_COPY(dst, src) && n <= UNALIGNED_THRESHOLD)
+		  )) {
+#if RTE_CACHE_LINE_SIZE >= 128
+		rte_memcpy_ge128((uint8_t *)dst, (const uint8_t *)src, n);
+#else
+		rte_memcpy_ge64((uint8_t *)dst, (const uint8_t *)src, n);
+#endif
+		return dst;
+	} else
+		return memcpy(dst, src, n);
+}
+
+
+#else
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
@@ -80,6 +370,8 @@ 
 
 #define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
 
+#endif
+
 #ifdef __cplusplus
 }
 #endif