[PATCH v3] eal: fix unaligned loads/stores in rte_memcpy_generic

Luc Pelletier lucp.at.work at gmail.com
Sun Jan 16 15:13:05 CET 2022
Previous message (by thread): [PATCH v2] eal: fix unaligned loads/stores in rte_memcpy_generic
Next message (by thread): [PATCH v3] eal: fix unaligned loads/stores in rte_memcpy_generic
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]
Calls to rte_memcpy_generic could result in unaligned loads/stores for
1 < n < 16. This is undefined behavior according to the C standard,
and it gets flagged by the clang undefined behavior sanitizer.

rte_memcpy_generic is called with unaligned src and dst addresses.
When 1 < n < 16, the code would cast both src and dst to a qword,
dword or word pointer, without verifying the alignment of src/dst. The
code was changed to use inline assembly for the load/store operations.
Unaligned load/store operations are permitted in x86/x64 assembly.

Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time")
Cc: Xiaoyun Li <xiaoyun.li at intel.com>
Cc: stable at dpdk.org

Signed-off-by: Luc Pelletier <lucp.at.work at gmail.com>
---

Please note that I didn't write the entire function in inline assembly.
The reason why I kept the bitwise ands as C code is so the optimizer can
remove the branches when n is known at compile-time.

 lib/eal/x86/include/rte_memcpy.h | 134 +++++++++++++++++--------------
 1 file changed, 72 insertions(+), 62 deletions(-)

diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 1b6c6e585f..b99c1b2ca5 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -45,6 +45,75 @@ extern "C" {
 static __rte_always_inline void *
 rte_memcpy(void *dst, const void *src, size_t n);
 
+#if defined(__i386__)
+	#define RTE_ACCUMULATOR_REGISTER_NAME "eax"
+#elif defined(__x86_64__)
+	#define RTE_ACCUMULATOR_REGISTER_NAME "rax"
+#endif
+
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with unaligned src/dst, and n <= 15.
+ */
+static __rte_always_inline void *
+rte_mov15_or_less_unaligned(void *dst, const void *src, size_t n)
+{
+	void *ret = dst;
+	if (n & 8) {
+		asm (
+#if defined(__i386__)
+		"movl (%[src]), %%eax\n"
+		"movl %%eax, (%[dst])\n"
+		"add $4, %[src]\n"
+		"add $4, %[dst]\n"
+		"movl (%[src]), %%eax\n"
+		"movl %%eax, (%[dst])\n"
+		"add $4, %[src]\n"
+		"add $4, %[dst]\n"
+#elif defined(__x86_64__)
+		"movq (%[src]), %%rax\n"
+		"movq %%rax, (%[dst])\n"
+		"add $8, %[src]\n"
+		"add $8, %[dst]\n"
+#else
+		#error Unsupported architecture
+#endif
+		: [dst] "+r" (dst), [src] "+r" (src)
+		:
+		: RTE_ACCUMULATOR_REGISTER_NAME, "memory");
+	}
+	if (n & 4) {
+		asm (
+		"movl (%[src]), %%eax\n"
+		"movl %%eax, (%[dst])\n"
+		"add $4, %[src]\n"
+		"add $4, %[dst]\n"
+		: [dst] "+r" (dst), [src] "+r" (src)
+		:
+		: RTE_ACCUMULATOR_REGISTER_NAME, "memory");
+	}
+	if (n & 2) {
+		asm (
+		"movw (%[src]), %%ax\n"
+		"movw %%ax, (%[dst])\n"
+		"add $2, %[src]\n"
+		"add $2, %[dst]\n"
+		: [dst] "+r" (dst), [src] "+r" (src)
+		:
+		: RTE_ACCUMULATOR_REGISTER_NAME, "memory");
+	}
+	if (n & 1) {
+		asm (
+		"movb (%[src]), %%al\n"
+		"movb %%al, (%[dst])\n"
+		: [dst] "+r" (dst), [src] "+r" (src)
+		:
+		: RTE_ACCUMULATOR_REGISTER_NAME, "memory");
+	}
+	return ret;
+}
+
 #if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
 
 #define ALIGNMENT_MASK 0x3F
@@ -171,8 +240,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
@@ -181,24 +248,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08)
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		return ret;
+		return rte_mov15_or_less_unaligned(dst, src, n);
 	}
 
 	/**
@@ -379,8 +429,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
 static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t bits;
@@ -389,25 +437,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08) {
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		}
-		return ret;
+		return rte_mov15_or_less_unaligned(dst, src, n);
 	}
 
 	/**
@@ -672,8 +702,6 @@ static __rte_always_inline void *
 rte_memcpy_generic(void *dst, const void *src, size_t n)
 {
 	__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
-	uintptr_t dstu = (uintptr_t)dst;
-	uintptr_t srcu = (uintptr_t)src;
 	void *ret = dst;
 	size_t dstofss;
 	size_t srcofs;
@@ -682,25 +710,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
 	 * Copy less than 16 bytes
 	 */
 	if (n < 16) {
-		if (n & 0x01) {
-			*(uint8_t *)dstu = *(const uint8_t *)srcu;
-			srcu = (uintptr_t)((const uint8_t *)srcu + 1);
-			dstu = (uintptr_t)((uint8_t *)dstu + 1);
-		}
-		if (n & 0x02) {
-			*(uint16_t *)dstu = *(const uint16_t *)srcu;
-			srcu = (uintptr_t)((const uint16_t *)srcu + 1);
-			dstu = (uintptr_t)((uint16_t *)dstu + 1);
-		}
-		if (n & 0x04) {
-			*(uint32_t *)dstu = *(const uint32_t *)srcu;
-			srcu = (uintptr_t)((const uint32_t *)srcu + 1);
-			dstu = (uintptr_t)((uint32_t *)dstu + 1);
-		}
-		if (n & 0x08) {
-			*(uint64_t *)dstu = *(const uint64_t *)srcu;
-		}
-		return ret;
+		return rte_mov15_or_less_unaligned(dst, src, n);
 	}
 
 	/**
-- 
2.25.1
Previous message (by thread): [PATCH v2] eal: fix unaligned loads/stores in rte_memcpy_generic
Next message (by thread): [PATCH v3] eal: fix unaligned loads/stores in rte_memcpy_generic
Messages sorted by: [ date ] [ thread ] [ subject ] [ author ]
More information about the stable mailing list