[PATCH v5] eal: fix unaligned loads/stores in rte_memcpy_generic
Luc Pelletier
lucp.at.work at gmail.com
Mon Jan 17 16:37:12 CET 2022
Calls to rte_memcpy_generic could result in unaligned loads/stores for
1 < n < 16. This is undefined behavior according to the C standard,
and it gets flagged by the clang undefined behavior sanitizer.
rte_memcpy_generic is called with unaligned src and dst addresses.
When 1 < n < 16, the code would cast both src and dst to a qword,
dword or word pointer, without verifying the alignment of src/dst. The
code was changed to use a packed structure to perform the unaligned
load/store operations. This makes the unaligned load/store operations
C standards-compliant.
Fixes: d35cc1fe6a7a ("eal/x86: revert select optimized memcpy at run-time")
Cc: Xiaoyun Li <xiaoyun.li at intel.com>
Cc: stable at dpdk.org
Signed-off-by: Luc Pelletier <lucp.at.work at gmail.com>
---
Thanks to Stephen's pointer to look at the Linux kernel, I was able to
find a way to perform the unaligned load/store using pure C code. The
new functions added to perform the load/store could likely be moved to a
different file and the code duplication could likely be eliminated by
using a macro. However, I will hold off on making these changes until I
get confirmation from maintainers that this technique is acceptable and
this is what we want to move forward with.
lib/eal/x86/include/rte_memcpy.h | 142 +++++++++++++++++--------------
1 file changed, 80 insertions(+), 62 deletions(-)
diff --git a/lib/eal/x86/include/rte_memcpy.h b/lib/eal/x86/include/rte_memcpy.h
index 1b6c6e585f..4e876d39eb 100644
--- a/lib/eal/x86/include/rte_memcpy.h
+++ b/lib/eal/x86/include/rte_memcpy.h
@@ -45,6 +45,83 @@ extern "C" {
static __rte_always_inline void *
rte_memcpy(void *dst, const void *src, size_t n);
+static __rte_always_inline uint64_t
+rte_load_unaligned_uint64(const void *ptr)
+{
+ struct unaligned_uint64 { uint64_t val; } __rte_packed;
+ return ((const struct unaligned_uint64 *)ptr)->val;
+}
+
+static __rte_always_inline uint32_t
+rte_load_unaligned_uint32(const void *ptr)
+{
+ struct unaligned_uint32 { uint32_t val; } __rte_packed;
+ return ((const struct unaligned_uint32 *)ptr)->val;
+}
+
+static __rte_always_inline uint16_t
+rte_load_unaligned_uint16(const void *ptr)
+{
+ struct unaligned_uint16 { uint16_t val; } __rte_packed;
+ return ((const struct unaligned_uint16 *)ptr)->val;
+}
+
+static __rte_always_inline void
+rte_store_unaligned_uint64(void *ptr, uint64_t val)
+{
+ struct unaligned_uint64 { uint64_t val; } __rte_packed;
+ ((struct unaligned_uint64 *)ptr)->val = val;
+}
+
+static __rte_always_inline void
+rte_store_unaligned_uint32(void *ptr, uint32_t val)
+{
+ struct unaligned_uint32 { uint32_t val; } __rte_packed;
+ ((struct unaligned_uint32 *)ptr)->val = val;
+}
+
+static __rte_always_inline void
+rte_store_unaligned_uint16(void *ptr, uint16_t val)
+{
+ struct unaligned_uint16 { uint16_t val; } __rte_packed;
+ ((struct unaligned_uint16 *)ptr)->val = val;
+}
+
+/**
+ * Copy bytes from one location to another,
+ * locations should not overlap.
+ * Use with unaligned src/dst, and n <= 15.
+ */
+static __rte_always_inline void *
+rte_mov15_or_less_unaligned(void *dst, const void *src, size_t n)
+{
+ void *ret = dst;
+ if (n & 8) {
+ rte_store_unaligned_uint64(
+ dst,
+ rte_load_unaligned_uint64(src));
+ src = ((const uint64_t *)src + 1);
+ dst = ((uint64_t *)dst + 1);
+ }
+ if (n & 4) {
+ rte_store_unaligned_uint32(
+ dst,
+ rte_load_unaligned_uint32(src));
+ src = ((const uint32_t *)src + 1);
+ dst = ((uint32_t *)dst + 1);
+ }
+ if (n & 2) {
+ rte_store_unaligned_uint16(
+ dst,
+ rte_load_unaligned_uint16(src));
+ src = ((const uint16_t *)src + 1);
+ dst = ((uint16_t *)dst + 1);
+ }
+ if (n & 1)
+ *(uint8_t *)dst = *(const uint8_t *)src;
+ return ret;
+}
+
#if defined __AVX512F__ && defined RTE_MEMCPY_AVX512
#define ALIGNMENT_MASK 0x3F
@@ -171,8 +248,6 @@ rte_mov512blocks(uint8_t *dst, const uint8_t *src, size_t n)
static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
- uintptr_t dstu = (uintptr_t)dst;
- uintptr_t srcu = (uintptr_t)src;
void *ret = dst;
size_t dstofss;
size_t bits;
@@ -181,24 +256,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
* Copy less than 16 bytes
*/
if (n < 16) {
- if (n & 0x01) {
- *(uint8_t *)dstu = *(const uint8_t *)srcu;
- srcu = (uintptr_t)((const uint8_t *)srcu + 1);
- dstu = (uintptr_t)((uint8_t *)dstu + 1);
- }
- if (n & 0x02) {
- *(uint16_t *)dstu = *(const uint16_t *)srcu;
- srcu = (uintptr_t)((const uint16_t *)srcu + 1);
- dstu = (uintptr_t)((uint16_t *)dstu + 1);
- }
- if (n & 0x04) {
- *(uint32_t *)dstu = *(const uint32_t *)srcu;
- srcu = (uintptr_t)((const uint32_t *)srcu + 1);
- dstu = (uintptr_t)((uint32_t *)dstu + 1);
- }
- if (n & 0x08)
- *(uint64_t *)dstu = *(const uint64_t *)srcu;
- return ret;
+ return rte_mov15_or_less_unaligned(dst, src, n);
}
/**
@@ -379,8 +437,6 @@ rte_mov128blocks(uint8_t *dst, const uint8_t *src, size_t n)
static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
- uintptr_t dstu = (uintptr_t)dst;
- uintptr_t srcu = (uintptr_t)src;
void *ret = dst;
size_t dstofss;
size_t bits;
@@ -389,25 +445,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
* Copy less than 16 bytes
*/
if (n < 16) {
- if (n & 0x01) {
- *(uint8_t *)dstu = *(const uint8_t *)srcu;
- srcu = (uintptr_t)((const uint8_t *)srcu + 1);
- dstu = (uintptr_t)((uint8_t *)dstu + 1);
- }
- if (n & 0x02) {
- *(uint16_t *)dstu = *(const uint16_t *)srcu;
- srcu = (uintptr_t)((const uint16_t *)srcu + 1);
- dstu = (uintptr_t)((uint16_t *)dstu + 1);
- }
- if (n & 0x04) {
- *(uint32_t *)dstu = *(const uint32_t *)srcu;
- srcu = (uintptr_t)((const uint32_t *)srcu + 1);
- dstu = (uintptr_t)((uint32_t *)dstu + 1);
- }
- if (n & 0x08) {
- *(uint64_t *)dstu = *(const uint64_t *)srcu;
- }
- return ret;
+ return rte_mov15_or_less_unaligned(dst, src, n);
}
/**
@@ -672,8 +710,6 @@ static __rte_always_inline void *
rte_memcpy_generic(void *dst, const void *src, size_t n)
{
__m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8;
- uintptr_t dstu = (uintptr_t)dst;
- uintptr_t srcu = (uintptr_t)src;
void *ret = dst;
size_t dstofss;
size_t srcofs;
@@ -682,25 +718,7 @@ rte_memcpy_generic(void *dst, const void *src, size_t n)
* Copy less than 16 bytes
*/
if (n < 16) {
- if (n & 0x01) {
- *(uint8_t *)dstu = *(const uint8_t *)srcu;
- srcu = (uintptr_t)((const uint8_t *)srcu + 1);
- dstu = (uintptr_t)((uint8_t *)dstu + 1);
- }
- if (n & 0x02) {
- *(uint16_t *)dstu = *(const uint16_t *)srcu;
- srcu = (uintptr_t)((const uint16_t *)srcu + 1);
- dstu = (uintptr_t)((uint16_t *)dstu + 1);
- }
- if (n & 0x04) {
- *(uint32_t *)dstu = *(const uint32_t *)srcu;
- srcu = (uintptr_t)((const uint32_t *)srcu + 1);
- dstu = (uintptr_t)((uint32_t *)dstu + 1);
- }
- if (n & 0x08) {
- *(uint64_t *)dstu = *(const uint64_t *)srcu;
- }
- return ret;
+ return rte_mov15_or_less_unaligned(dst, src, n);
}
/**
--
2.25.1
More information about the stable mailing list