[dpdk-dev,v5] arch/arm: optimization for memcpy on AArch64

Message ID 1515061208-27252-1-git-send-email-herbert.guan@arm.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation success Compilation OK

Commit Message

Herbert Guan Jan. 4, 2018, 10:20 a.m. UTC
  This patch provides an option to implement rte_memcpy() using the
'restrict' qualifier, which can induce GCC to optimize with more
efficient instructions, providing some performance gain over memcpy()
on some AArch64 platforms/environments.
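
As a rough illustration of the idea (a hypothetical helper, not the
patch code): declaring both pointers 'restrict' lets GCC assume they
never alias, so it can turn a plain copy loop into wider load/store
sequences (e.g. LDP/STP pairs) without runtime overlap checks.

#include <stddef.h>
#include <stdint.h>

static inline void
copy_words(uint64_t *restrict dst, const uint64_t *restrict src, size_t n)
{
	size_t i;

	/* With non-aliasing pointers, GCC is free to unroll and use
	 * paired 16-byte accesses on AArch64 for this loop. */
	for (i = 0; i < n; i++)
		dst[i] = src[i];
}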

The memory copy performance differs between AArch64 platforms, and a
more recent glibc (e.g. 2.23 or later) can provide better memcpy()
performance than old glibc versions. Using a more recent glibc is
always suggested where possible, as the entire system benefits from
it. If for some reason an old glibc has to be used, this patch is
provided as an alternative.

This implementation can improve memory copy on some AArch64 platforms
when an old glibc (e.g. 2.19, 2.17...) is being used. It is disabled
by default and is activated by defining "RTE_ARCH_ARM64_MEMCPY". It
does not always provide better performance than memcpy(), so users
need to run the DPDK unit test "memcpy_perf_autotest" and customize
the parameters in the "customization section" of rte_memcpy_64.h for
best performance.

Compiler version also impacts rte_memcpy() performance. On some
platforms, binaries compiled from the same code with GCC 7.2.0 have
been observed to outperform those compiled with GCC 4.8.5. Using
GCC 5.4.0 or later is suggested.

Signed-off-by: Herbert Guan <herbert.guan@arm.com>
Acked-by: Jerin Jacob <jerin.jacob@caviumnetworks.com>
---
 config/common_armv8a_linuxapp                      |   6 +
 .../common/include/arch/arm/rte_memcpy_64.h        | 296 +++++++++++++++++++++
 2 files changed, 302 insertions(+)
  

Comments

Thomas Monjalon Jan. 12, 2018, 5:03 p.m. UTC | #1
Hi,

All the code is using ARM64, but the title suggests AArch64.
What is the difference between AArch64 and ARM64 (and ARMv8)?

04/01/2018 11:20, Herbert Guan:
> +/**************************************
> + * Beginning of customization section
> + **************************************/
> +#define RTE_ARM64_MEMCPY_ALIGN_MASK 0x0F
> +#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
> +/* Only src unalignment will be treaed as unaligned copy */

typo: treaed

> +#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
> +	((uintptr_t)(dst) & RTE_ARM64_MEMCPY_ALIGN_MASK)
> +#else
> +/* Both dst and src unalignment will be treated as unaligned copy */
> +#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
> +	(((uintptr_t)(dst) | (uintptr_t)(src)) & RTE_ARM64_MEMCPY_ALIGN_MASK)
> +#endif
> +
> +
> +/*
> + * If copy size is larger than threshold, memcpy() will be used.
> + * Run "memcpy_perf_autotest" to determine the proper threshold.
> + */
> +#define RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
> +#define RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))
> +
> +/*
> + * The logic of USE_RTE_MEMCPY() can also be modified to best fit platform.
> + */
> +#define USE_RTE_MEMCPY(dst, src, n) \
> +((!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
> +n <= RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD) \
> +|| (RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
> +n <= RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD))
> +
> +/**************************************
> + * End of customization section
> + **************************************/

Modifying the code to adjust to the platform is not easy for deployment.
Can we move some customization variables inside the configuration file?
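
For example, something along these lines in config/common_armv8a_linuxapp
(hypothetical entries, just to sketch the idea -- not part of this patch):

# Hypothetical build-time overrides for the rte_memcpy thresholds
CONFIG_RTE_ARCH_ARM64_MEMCPY=y
CONFIG_RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD=2048
CONFIG_RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD=512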
  
Herbert Guan Jan. 15, 2018, 10:57 a.m. UTC | #2
Hi Thomas,


> -----Original Message-----
> From: Thomas Monjalon [mailto:thomas@monjalon.net]
> Sent: Saturday, January 13, 2018 1:04
> To: Herbert Guan <Herbert.Guan@arm.com>
> Cc: dev@dpdk.org; jerin.jacob@caviumnetworks.com
> Subject: Re: [dpdk-dev] [PATCH v5] arch/arm: optimization for memcpy on
> AArch64
> 
> Hi,
> 
> All the code is using ARM64, but the title suggests AArch64.
> What is the difference between AArch64 and ARM64 (and ARMv8)?

AArch64 and ARM64 refer to the same thing.  AArch64 refers to the 64-bit architecture introduced with ARMv8-A, but the Linux kernel community calls it ARM64.  As for DPDK, most existing compile flags use ARM64, so this patch keeps the ARM64 naming in the newly added compile options.

> 
> 04/01/2018 11:20, Herbert Guan:
> > +/**************************************
> > + * Beginning of customization section
> > + **************************************/
> > +#define RTE_ARM64_MEMCPY_ALIGN_MASK 0x0F
> > +#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
> > +/* Only src unalignment will be treaed as unaligned copy */
> 
> typo: treaed

It should be 'treated'.  Will correct it in the next version.

> 
> > +#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
> > +	((uintptr_t)(dst) & RTE_ARM64_MEMCPY_ALIGN_MASK)
> > +#else
> > +/* Both dst and src unalignment will be treated as unaligned copy */
> > +#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
> > +	(((uintptr_t)(dst) | (uintptr_t)(src)) & RTE_ARM64_MEMCPY_ALIGN_MASK)
> > +#endif
> > +
> > +
> > +/*
> > + * If copy size is larger than threshold, memcpy() will be used.
> > + * Run "memcpy_perf_autotest" to determine the proper threshold.
> > + */
> > +#define RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
> > +#define RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))
> > +
> > +/*
> > + * The logic of USE_RTE_MEMCPY() can also be modified to best fit platform.
> > + */
> > +#define USE_RTE_MEMCPY(dst, src, n) \
> > +((!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
> > +n <= RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD) \
> > +|| (RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
> > +n <= RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD))
> > +
> > +/**************************************
> > + * End of customization section
> > + **************************************/
> 
> Modifying the code to adjust to the platform is not easy for deployment.
> Can we move some customization variables inside the configuration file?

RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD and RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD are the 2 parameters that can be configured at build time.  They can be set to the best values for the target platform.  Usually it's not necessary to change the expression; the comment added in the code is just a hint that this code piece can be modified.
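
For example, the defaults in rte_memcpy_64.h could be guarded so that
values supplied at build time take effect (a minimal sketch, not the
v5 code):

/* Sketch only: fall back to the defaults if not set at build time */
#ifndef RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD
#define RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD     ((size_t)(0xffffffff))
#endif
#ifndef RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD
#define RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD   ((size_t)(0xffffffff))
#endif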


Best regards,
Herbert
  
Thomas Monjalon Jan. 15, 2018, 11:37 a.m. UTC | #3
15/01/2018 11:57, Herbert Guan:
> Hi Thomas,
> 
> From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > Hi,
> > 
> > All the code is using ARM64, but the title suggests AArch64.
> > What is the difference between AArch64 and ARM64 (and ARMv8)?
> 
> AArch64 and ARM64 refer to the same thing.  AArch64 refers to the 64-bit architecture introduced with ARMv8-A, but the Linux kernel community calls it ARM64.  As for DPDK, most existing compile flags use ARM64, so this patch keeps the ARM64 naming in the newly added compile options.

So please let's continue to call it ARM64.

> > 04/01/2018 11:20, Herbert Guan:
> > > +#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
> > > +	((uintptr_t)(dst) & RTE_ARM64_MEMCPY_ALIGN_MASK)
> > > +#else
> > > +/* Both dst and src unalignment will be treated as unaligned copy */
> > > +#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
> > > +	(((uintptr_t)(dst) | (uintptr_t)(src)) & RTE_ARM64_MEMCPY_ALIGN_MASK)
> > > +#endif
> > > +
> > > +
> > > +/*
> > > + * If copy size is larger than threshold, memcpy() will be used.
> > > + * Run "memcpy_perf_autotest" to determine the proper threshold.
> > > + */
> > > +#define RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
> > > +#define RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))
> > > +
> > > +/*
> > > + * The logic of USE_RTE_MEMCPY() can also be modified to best fit platform.
> > > + */
> > > +#define USE_RTE_MEMCPY(dst, src, n) \
> > > +((!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
> > > +n <= RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD) \
> > > +|| (RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
> > > +n <= RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD))
> > > +
> > > +/**************************************
> > > + * End of customization section
> > > + **************************************/
> > 
> > > Modifying the code to adjust to the platform is not easy for deployment.
> > Can we move some customization variables inside the configuration file?
> 
> RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD and RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD are the 2 parameters that can be configured at build time.  They can be set to the best values for the target platform.  Usually it's not necessary to change the expression; the comment added in the code is just a hint that this code piece can be modified.

The build time configuration must be set in the config file
(config/common_armv8a_linuxapp).
v6 please?
  
Thomas Monjalon Jan. 18, 2018, 11:54 p.m. UTC | #4
Ping
Are we targeting to integrate this optimization in DPDK 18.02?
I am expecting a v6, thanks.

15/01/2018 12:37, Thomas Monjalon:
> 15/01/2018 11:57, Herbert Guan:
> > Hi Thomas,
> > 
> > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > Hi,
> > > 
> > > All the code is using ARM64, but the title suggests AArch64.
> > > What is the difference between AArch64 and ARM64 (and ARMv8)?
> > 
> > AArch64 and ARM64 refer to the same thing.  AArch64 refers to the 64-bit architecture introduced with ARMv8-A, but the Linux kernel community calls it ARM64.  As for DPDK, most existing compile flags use ARM64, so this patch keeps the ARM64 naming in the newly added compile options.
> 
> So please let's continue to call it ARM64.
> 
> > > 04/01/2018 11:20, Herbert Guan:
> > > > +#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
> > > > +	((uintptr_t)(dst) & RTE_ARM64_MEMCPY_ALIGN_MASK)
> > > > +#else
> > > > +/* Both dst and src unalignment will be treated as unaligned copy */
> > > > +#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
> > > > +	(((uintptr_t)(dst) | (uintptr_t)(src)) & RTE_ARM64_MEMCPY_ALIGN_MASK)
> > > > +#endif
> > > > +
> > > > +
> > > > +/*
> > > > + * If copy size is larger than threshold, memcpy() will be used.
> > > > + * Run "memcpy_perf_autotest" to determine the proper threshold.
> > > > + */
> > > > +#define RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
> > > > +#define RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))
> > > > +
> > > > +/*
> > > > + * The logic of USE_RTE_MEMCPY() can also be modified to best fit
> > > platform.
> > > > + */
> > > > +#define USE_RTE_MEMCPY(dst, src, n) \
> > > > +((!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \ n <=
> > > > +RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD) \
> > > > +|| (RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
> > > > +n <= RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD))
> > > > +
> > > > +/**************************************
> > > > + * End of customization section
> > > > + **************************************/
> > > 
> > > Modifying the code to adjust to the platform is not easy for deployment.
> > > Can we move some customization variables inside the configuration file?
> > 
> > RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD and RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD are the 2 parameters that can be configured at build time.  They can be set to the best values for the target platform.  Usually it's not necessary to change the expression; the comment added in the code is just a hint that this code piece can be modified.
> 
> The build time configuration must be set in the config file
> (config/common_armv8a_linuxapp).
> v6 please?
>
  
Herbert Guan Jan. 19, 2018, 6:16 a.m. UTC | #5
Yes, this is the target.  I was in a 3-day meeting this week, and had limited time to update the patch.  

A new patch v6 was sent out just now.  It was actually sent twice -- I forgot to add the version info and "--in-reply-to" in the first one.  Please just ignore that one; I'm sorry for the disturbance.


Best regards,
Herbert

-----Original Message-----
From: Thomas Monjalon [mailto:thomas@monjalon.net]
Sent: January 19, 2018 7:54
To: Herbert Guan <Herbert.Guan@arm.com>
Cc: dev@dpdk.org; jerin.jacob@caviumnetworks.com; nd <nd@arm.com>
Subject: Re: [dpdk-dev] [PATCH v5] arch/arm: optimization for memcpy on AArch64

Ping
Are we targeting to integrate this optimization in DPDK 18.02?
I am expecting a v6, thanks.

15/01/2018 12:37, Thomas Monjalon:
> 15/01/2018 11:57, Herbert Guan:
> > Hi Thomas,
> > 
> > From: Thomas Monjalon [mailto:thomas@monjalon.net]
> > > Hi,
> > > 
> > > All the code is using ARM64, but the title suggests AArch64.
> > > What is the difference between AArch64 and ARM64 (and ARMv8)?
> > 
> > AArch64 and ARM64 refer to the same thing.  AArch64 refers to the 64-bit architecture introduced with ARMv8-A, but the Linux kernel community calls it ARM64.  As for DPDK, most existing compile flags use ARM64, so this patch keeps the ARM64 naming in the newly added compile options.
> 
> So please let's continue to call it ARM64.
> 
> > > 04/01/2018 11:20, Herbert Guan:
> > > > +#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
> > > > +	((uintptr_t)(dst) & RTE_ARM64_MEMCPY_ALIGN_MASK)
> > > > +#else
> > > > +/* Both dst and src unalignment will be treated as unaligned copy */
> > > > +#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
> > > > +	(((uintptr_t)(dst) | (uintptr_t)(src)) & RTE_ARM64_MEMCPY_ALIGN_MASK)
> > > > +#endif
> > > > +
> > > > +
> > > > +/*
> > > > + * If copy size is larger than threshold, memcpy() will be used.
> > > > + * Run "memcpy_perf_autotest" to determine the proper threshold.
> > > > + */
> > > > +#define RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
> > > > +#define RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))
> > > > +
> > > > +/*
> > > > + * The logic of USE_RTE_MEMCPY() can also be modified to best fit platform.
> > > > + */
> > > > +#define USE_RTE_MEMCPY(dst, src, n) \
> > > > +((!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
> > > > +n <= RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD) \
> > > > +|| (RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
> > > > +n <= RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD))
> > > > +
> > > > +/**************************************
> > > > + * End of customization section
> > > > + **************************************/
> > > 
> > > Modifying the code to adjust to the platform is not easy for deployment.
> > > Can we move some customization variables inside the configuration file?
> > 
> > RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD and RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD are the 2 parameters that can be configured at build time.  They can be set to the best values for the target platform.  Usually it's not necessary to change the expression; the comment added in the code is just a hint that this code piece can be modified.
> 
> The build time configuration must be set in the config file
> (config/common_armv8a_linuxapp).
> v6 please?
>
  

Patch

diff --git a/config/common_armv8a_linuxapp b/config/common_armv8a_linuxapp
index 6732d1e..8f0cbed 100644
--- a/config/common_armv8a_linuxapp
+++ b/config/common_armv8a_linuxapp
@@ -44,6 +44,12 @@  CONFIG_RTE_FORCE_INTRINSICS=y
 # to address minimum DMA alignment across all arm64 implementations.
 CONFIG_RTE_CACHE_LINE_SIZE=128
 
+# Accelerate rte_memcpy.  Be sure to run unit test to determine the
+# best threshold in code.  Refer to notes in source file
+# (lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h) for more
+# info.
+CONFIG_RTE_ARCH_ARM64_MEMCPY=n
+
 CONFIG_RTE_LIBRTE_FM10K_PMD=n
 CONFIG_RTE_LIBRTE_SFC_EFX_PMD=n
 CONFIG_RTE_LIBRTE_AVP_PMD=n
diff --git a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
index b80d8ba..786c9cc 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_memcpy_64.h
@@ -42,6 +42,300 @@ 
 
 #include "generic/rte_memcpy.h"
 
+#ifdef RTE_ARCH_ARM64_MEMCPY
+#include <rte_common.h>
+#include <rte_branch_prediction.h>
+
+/*
+ * The memory copy performance differs on different AArch64 micro-architectures.
+ * And the most recent glibc (e.g. 2.23 or later) can provide a better memcpy()
+ * performance compared to old glibc versions. It's always suggested to use a
+ * more recent glibc if possible, from which the entire system can get benefit.
+ *
+ * This implementation improves memory copy on some aarch64 micro-architectures,
+ * when an old glibc (e.g. 2.19, 2.17...) is being used. It is disabled by
+ * default and needs "RTE_ARCH_ARM64_MEMCPY" defined to activate. It's not
+ * always providing better performance than memcpy() so users need to run unit
+ * test "memcpy_perf_autotest" and customize parameters in customization section
+ * below for best performance.
+ *
+ * Compiler version will also impact the rte_memcpy() performance. It's observed
+ * on some platforms and with the same code, GCC 7.2.0 compiled binaries can
+ * provide better performance than GCC 4.8.5 compiled binaries.
+ */
+
+/**************************************
+ * Beginning of customization section
+ **************************************/
+#define RTE_ARM64_MEMCPY_ALIGN_MASK 0x0F
+#ifndef RTE_ARCH_ARM64_MEMCPY_STRICT_ALIGN
+/* Only src unalignment will be treaed as unaligned copy */
+#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
+	((uintptr_t)(dst) & RTE_ARM64_MEMCPY_ALIGN_MASK)
+#else
+/* Both dst and src unalignment will be treated as unaligned copy */
+#define RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) \
+	(((uintptr_t)(dst) | (uintptr_t)(src)) & RTE_ARM64_MEMCPY_ALIGN_MASK)
+#endif
+
+
+/*
+ * If copy size is larger than threshold, memcpy() will be used.
+ * Run "memcpy_perf_autotest" to determine the proper threshold.
+ */
+#define RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD       ((size_t)(0xffffffff))
+#define RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD     ((size_t)(0xffffffff))
+
+/*
+ * The logic of USE_RTE_MEMCPY() can also be modified to best fit platform.
+ */
+#define USE_RTE_MEMCPY(dst, src, n) \
+((!RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
+n <= RTE_ARM64_MEMCPY_ALIGNED_THRESHOLD) \
+|| (RTE_ARM64_MEMCPY_IS_UNALIGNED_COPY(dst, src) && \
+n <= RTE_ARM64_MEMCPY_UNALIGNED_THRESHOLD))
+
+/**************************************
+ * End of customization section
+ **************************************/
+
+
+#if defined(RTE_TOOLCHAIN_GCC) && !defined(RTE_ARM64_MEMCPY_SKIP_GCC_VER_CHECK)
+#if (GCC_VERSION < 50400)
+#warning "The GCC version is quite old, which may result in sub-optimal \
+performance of the compiled code. It is suggested that at least GCC 5.4.0 \
+be used."
+#endif
+#endif
+
+static __rte_always_inline
+void rte_mov16(uint8_t *dst, const uint8_t *src)
+{
+	__uint128_t *dst128 = (__uint128_t *)dst;
+	const __uint128_t *src128 = (const __uint128_t *)src;
+	*dst128 = *src128;
+}
+
+static __rte_always_inline
+void rte_mov32(uint8_t *dst, const uint8_t *src)
+{
+	__uint128_t *dst128 = (__uint128_t *)dst;
+	const __uint128_t *src128 = (const __uint128_t *)src;
+	const __uint128_t x0 = src128[0], x1 = src128[1];
+	dst128[0] = x0;
+	dst128[1] = x1;
+}
+
+static __rte_always_inline
+void rte_mov48(uint8_t *dst, const uint8_t *src)
+{
+	__uint128_t *dst128 = (__uint128_t *)dst;
+	const __uint128_t *src128 = (const __uint128_t *)src;
+	const __uint128_t x0 = src128[0], x1 = src128[1], x2 = src128[2];
+	dst128[0] = x0;
+	dst128[1] = x1;
+	dst128[2] = x2;
+}
+
+static __rte_always_inline
+void rte_mov64(uint8_t *dst, const uint8_t *src)
+{
+	__uint128_t *dst128 = (__uint128_t *)dst;
+	const __uint128_t *src128 = (const __uint128_t *)src;
+	const __uint128_t
+		x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3];
+	dst128[0] = x0;
+	dst128[1] = x1;
+	dst128[2] = x2;
+	dst128[3] = x3;
+}
+
+static __rte_always_inline
+void rte_mov128(uint8_t *dst, const uint8_t *src)
+{
+	__uint128_t *dst128 = (__uint128_t *)dst;
+	const __uint128_t *src128 = (const __uint128_t *)src;
+	/* Keep below declaration & copy sequence for optimized instructions */
+	const __uint128_t
+		x0 = src128[0], x1 = src128[1], x2 = src128[2], x3 = src128[3];
+	dst128[0] = x0;
+	__uint128_t x4 = src128[4];
+	dst128[1] = x1;
+	__uint128_t x5 = src128[5];
+	dst128[2] = x2;
+	__uint128_t x6 = src128[6];
+	dst128[3] = x3;
+	__uint128_t x7 = src128[7];
+	dst128[4] = x4;
+	dst128[5] = x5;
+	dst128[6] = x6;
+	dst128[7] = x7;
+}
+
+static __rte_always_inline
+void rte_mov256(uint8_t *dst, const uint8_t *src)
+{
+	rte_mov128(dst, src);
+	rte_mov128(dst + 128, src + 128);
+}
+
+static __rte_always_inline void
+rte_memcpy_lt16(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	if (n & 0x08) {
+		/* copy 8 ~ 15 bytes */
+		*(uint64_t *)dst = *(const uint64_t *)src;
+		*(uint64_t *)(dst - 8 + n) = *(const uint64_t *)(src - 8 + n);
+	} else if (n & 0x04) {
+		/* copy 4 ~ 7 bytes */
+		*(uint32_t *)dst = *(const uint32_t *)src;
+		*(uint32_t *)(dst - 4 + n) = *(const uint32_t *)(src - 4 + n);
+	} else if (n & 0x02) {
+		/* copy 2 ~ 3 bytes */
+		*(uint16_t *)dst = *(const uint16_t *)src;
+		*(uint16_t *)(dst - 2 + n) = *(const uint16_t *)(src - 2 + n);
+	} else if (n & 0x01) {
+		/* copy 1 byte */
+		*dst = *src;
+	}
+}
+
+static __rte_always_inline
+void rte_memcpy_ge16_lt128(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	if (n < 64) {
+		if (n == 16) {
+			rte_mov16(dst, src);
+		} else if (n <= 32) {
+			rte_mov16(dst, src);
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		} else if (n <= 48) {
+			rte_mov32(dst, src);
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		} else {
+			rte_mov48(dst, src);
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		}
+	} else {
+		rte_mov64((uint8_t *)dst, (const uint8_t *)src);
+		if (n > 48 + 64)
+			rte_mov64(dst - 64 + n, src - 64 + n);
+		else if (n > 32 + 64)
+			rte_mov48(dst - 48 + n, src - 48 + n);
+		else if (n > 16 + 64)
+			rte_mov32(dst - 32 + n, src - 32 + n);
+		else if (n > 64)
+			rte_mov16(dst - 16 + n, src - 16 + n);
+	}
+}
+
+static __rte_always_inline
+void rte_memcpy_ge128(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	do {
+		rte_mov128(dst, src);
+		src += 128;
+		dst += 128;
+		n -= 128;
+	} while (likely(n >= 128));
+
+	if (likely(n)) {
+		if (n <= 16)
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		else if (n <= 32)
+			rte_mov32(dst - 32 + n, src - 32 + n);
+		else if (n <= 48)
+			rte_mov48(dst - 48 + n, src - 48 + n);
+		else if (n <= 64)
+			rte_mov64(dst - 64 + n, src - 64 + n);
+		else
+			rte_memcpy_ge16_lt128(dst, src, n);
+	}
+}
+
+static __rte_always_inline
+void rte_memcpy_ge16_lt64(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	if (n == 16) {
+		rte_mov16(dst, src);
+	} else if (n <= 32) {
+		rte_mov16(dst, src);
+		rte_mov16(dst - 16 + n, src - 16 + n);
+	} else if (n <= 48) {
+		rte_mov32(dst, src);
+		rte_mov16(dst - 16 + n, src - 16 + n);
+	} else {
+		rte_mov48(dst, src);
+		rte_mov16(dst - 16 + n, src - 16 + n);
+	}
+}
+
+static __rte_always_inline
+void rte_memcpy_ge64(uint8_t *dst, const uint8_t *src, size_t n)
+{
+	do {
+		rte_mov64(dst, src);
+		src += 64;
+		dst += 64;
+		n -= 64;
+	} while (likely(n >= 64));
+
+	if (likely(n)) {
+		if (n <= 16)
+			rte_mov16(dst - 16 + n, src - 16 + n);
+		else if (n <= 32)
+			rte_mov32(dst - 32 + n, src - 32 + n);
+		else if (n <= 48)
+			rte_mov48(dst - 48 + n, src - 48 + n);
+		else
+			rte_mov64(dst - 64 + n, src - 64 + n);
+	}
+}
+
+#if RTE_CACHE_LINE_SIZE >= 128
+static __rte_always_inline
+void *rte_memcpy(void *dst, const void *src, size_t n)
+{
+	if (n < 16) {
+		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	}
+	if (n < 128) {
+		rte_memcpy_ge16_lt128((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	}
+	__builtin_prefetch(src, 0, 0);
+	__builtin_prefetch(dst, 1, 0);
+	if (likely(USE_RTE_MEMCPY(dst, src, n))) {
+		rte_memcpy_ge128((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	} else
+		return memcpy(dst, src, n);
+}
+
+#else
+static __rte_always_inline
+void *rte_memcpy(void *dst, const void *src, size_t n)
+{
+	if (n < 16) {
+		rte_memcpy_lt16((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	}
+	if (n < 64) {
+		rte_memcpy_ge16_lt64((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	}
+	__builtin_prefetch(src, 0, 0);
+	__builtin_prefetch(dst, 1, 0);
+	if (likely(USE_RTE_MEMCPY(dst, src, n))) {
+		rte_memcpy_ge64((uint8_t *)dst, (const uint8_t *)src, n);
+		return dst;
+	} else
+		return memcpy(dst, src, n);
+}
+#endif /* RTE_CACHE_LINE_SIZE >= 128 */
+
+#else
 static inline void
 rte_mov16(uint8_t *dst, const uint8_t *src)
 {
@@ -80,6 +374,8 @@ 
 
 #define rte_memcpy(d, s, n)	memcpy((d), (s), (n))
 
+#endif /* RTE_ARCH_ARM64_MEMCPY */
+
 #ifdef __cplusplus
 }
 #endif