[v3,1/5] eal: add the APIs to wait until equal

Message ID 1563896626-44862-2-git-send-email-gavin.hu@arm.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series use WFE for locks and ring on aarch64

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation fail Compilation issues
ci/intel-Performance-Testing success Performance Testing PASS
ci/mellanox-Performance-Testing success Performance Testing PASS

Commit Message

Gavin Hu July 23, 2019, 3:43 p.m. UTC
  The rte_wait_until_equalxx APIs abstract the functionality of 'polling
for a memory location to become equal to a given value'.

Signed-off-by: Gavin Hu <gavin.hu@arm.com>
Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
Reviewed-by: Steve Capper <steve.capper@arm.com>
Reviewed-by: Ola Liljedahl <ola.liljedahl@arm.com>
Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
Acked-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 .../common/include/arch/arm/rte_atomic_64.h        |   4 +
 .../common/include/arch/arm/rte_pause_64.h         | 106 +++++++++++++++++++++
 lib/librte_eal/common/include/generic/rte_pause.h  |  39 +++++++-
 3 files changed, 148 insertions(+), 1 deletion(-)
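For context, a minimal sketch of how a caller might use the proposed API. The flag variable and the choice of memory order below are illustrative usage, not part of the patch:

#include <rte_pause.h>

/* Illustrative usage only: spin (or sleep in WFE on aarch64 when
 * RTE_USE_WFE is enabled) until another thread publishes done_flag = 1.
 * Acquire ordering makes the waiter observe the writes made before the
 * flag was set. */
static volatile uint32_t done_flag;

static void
wait_for_worker(void)
{
	rte_wait_until_equal32(&done_flag, 1, __ATOMIC_ACQUIRE);
}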
  

Comments

Jerin Jacob Kollanukkaran July 24, 2019, 11:52 a.m. UTC | #1
> -----Original Message-----
> From: Gavin Hu <gavin.hu@arm.com>
> Sent: Tuesday, July 23, 2019 9:14 PM
> To: dev@dpdk.org
> Cc: nd@arm.com; thomas@monjalon.net; stephen@networkplumber.org;
> Jerin Jacob Kollanukkaran <jerinj@marvell.com>; Pavan Nikhilesh
> Bhagavatula <pbhagavatula@marvell.com>;
> Honnappa.Nagarahalli@arm.com; gavin.hu@arm.com
> Subject: [EXT] [PATCH v3 1/5] eal: add the APIs to wait until equal
> 
> The rte_wait_until_equalxx APIs abstract the functionality of 'polling for a
> memory location to become equal to a given value'.
> 
> Signed-off-by: Gavin Hu <gavin.hu@arm.com>
> Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
> Reviewed-by: Steve Capper <steve.capper@arm.com>
> Reviewed-by: Ola Liljedahl <ola.liljedahl@arm.com>
> Reviewed-by: Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
> Acked-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> ---
>  .../common/include/arch/arm/rte_atomic_64.h        |   4 +
>  .../common/include/arch/arm/rte_pause_64.h         | 106
> +++++++++++++++++++++
>  lib/librte_eal/common/include/generic/rte_pause.h  |  39 +++++++-
>  3 files changed, 148 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> index 97060e4..8d742c6 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
> @@ -15,8 +15,12 @@ extern "C" {
> 
>  #include "generic/rte_atomic.h"
> 
> +#ifndef dsb
>  #define dsb(opt) asm volatile("dsb " #opt : : : "memory")
> +#endif
> +#ifndef dmb
>  #define dmb(opt) asm volatile("dmb " #opt : : : "memory")
> +#endif

Is this change required? Please fix the root cause.
I do see build errors too.

In file included from /home/jerin/dpdk.org/build/include/rte_pause_64.h:13,
                 from /home/jerin/dpdk.org/build/include/rte_pause.h:13,
                 from /home/jerin/dpdk.org/build/include/generic/rte_spinlock.h:25,
                 from /home/jerin/dpdk.org/build/include/rte_spinlock.h:17,
                 from /home/jerin/dpdk.org/drivers/bus/fslmc/mc/mc_sys.c:10:
/home/jerin/dpdk.org/build/include/generic/rte_pause.h: In function 'rte_wait_until_equal16':
/home/jerin/dpdk.org/build/include/generic/rte_pause.h:44:49: error: macro "dmb" passed 1 arguments, but takes just 0
   44 |  __rte_wait_until_equal(addr, expected, memorder);

Command to reproduce(gcc 9.1)

rm -rf build && unset RTE_KERNELDIR && make -j  T=arm64-thunderx-linux-gcc  CROSS=aarch64-linux-gnu- && sed -ri    's,(CONFIG_RTE_KNI_KMOD=)y,\1n,' build/.config && sed -ri  's,(CONFIG_RTE_LIBRTE_VHOST_NUMA=)y,\1n,' build/.config &&  sed -ri  's,(CONFIG_RTE_EAL_NUMA_AWARE_HUGEPAGES=)y,\1n,' build/.config && sed -ri  's,(CONFIG_RTE_EAL_IGB_UIO=)y,\1n,' build/.config && CC="ccache gcc" make -j  test-build CROSS=aarch64-linux-gnu-
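For illustration, the kind of clash behind this error might look as follows. This is a hypothetical sketch: the zero-argument dmb() below stands in for whatever the including driver defines, it is not the actual fslmc code.

/* Hypothetical sketch: a driver header, included before the EAL headers,
 * defines its own zero-argument barrier macro. */
#define dmb() __asm__ __volatile__("" ::: "memory")

#include <rte_spinlock.h>	/* -> rte_pause.h -> generic/rte_pause.h */

/* With the #ifndef guard added by this patch, the EAL's dmb(opt) definition
 * is skipped, so rte_smp_rmb() inside generic/rte_pause.h expands against
 * the zero-argument macro above and GCC reports "macro 'dmb' passed 1
 * arguments, but takes just 0".  The guard only changes which definition
 * wins; it does not remove the name clash itself. */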


> 
>  #define rte_mb() dsb(sy)
> 
> diff --git a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> index 93895d3..1f7be0a 100644
> --- a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> +++ b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
> @@ -17,6 +17,112 @@ static inline void rte_pause(void)
>  	asm volatile("yield" ::: "memory");
>  }
> 
> +#ifdef RTE_USE_WFE

Does this need to be under RTE_USE_WFE? If there is no harm, avoid the
conditional compilation; building the code unconditionally helps catch build
errors, especially since this config option is disabled by default.

> +/* Wait for *addr to be updated with expected value */ static
> +__rte_always_inline void rte_wait_until_equal16(volatile uint16_t
> +*addr, uint16_t expected, int memorder) {
> +	uint16_t tmp;
> +	if (memorder == __ATOMIC_RELAXED)
> +		asm volatile(
> +			"ldxrh	%w[tmp], %w[addr]\n"
> +			"cmp	%w[tmp], %w[expected]\n"
> +			"b.eq	2f\n"
> +			"sevl\n"
> +			"1:	wfe\n"
> +			"ldxrh	%w[tmp], %w[addr]\n"
> +			"cmp	%w[tmp], %w[expected]\n"
> +			"bne	1b\n"
> +			"2:\n"
> +			: [tmp] "=&r" (tmp)
> +			: [addr] "Q"(*addr), [expected] "r"(expected)
> +			: "cc", "memory");
> +	else
> +		asm volatile(
> +			"ldaxrh %w[tmp], %w[addr]\n"
> +			"cmp	%w[tmp], %w[expected]\n"
> +			"b.eq	2f\n"
> +			"sevl\n"
> +			"1:	wfe\n"
> +			"ldaxrh	%w[tmp], %w[addr]\n"
> +			"cmp	%w[tmp], %w[expected]\n"
> +			"bne	1b\n"
> +			"2:\n"
> +			: [tmp] "=&r" (tmp)
> +			: [addr] "Q"(*addr), [expected] "r"(expected)
> +			: "cc", "memory");
> +}
> +
> +static __rte_always_inline void
> +rte_wait_until_equal32(volatile uint32_t *addr, uint32_t expected, int
> +memorder) {
> +	uint32_t tmp;
> +	if (memorder == __ATOMIC_RELAXED)
> +		asm volatile(
> +			"ldxr	%w[tmp], %w[addr]\n"
> +			"cmp	%w[tmp], %w[expected]\n"
> +			"b.eq	2f\n"
> +			"sevl\n"
> +			"1:	wfe\n"
> +			"ldxr	%w[tmp], %w[addr]\n"
> +			"cmp	%w[tmp], %w[expected]\n"
> +			"bne	1b\n"
> +			"2:\n"
> +			: [tmp] "=&r" (tmp)
> +			: [addr] "Q"(*addr), [expected] "r"(expected)
> +			: "cc", "memory");
> +	else
> +		asm volatile(
> +			"ldaxr  %w[tmp], %w[addr]\n"
> +			"cmp	%w[tmp], %w[expected]\n"
> +			"b.eq	2f\n"
> +			"sevl\n"
> +			"1:	wfe\n"
> +			"ldaxr  %w[tmp], %w[addr]\n"
> +			"cmp	%w[tmp], %w[expected]\n"
> +			"bne	1b\n"
> +			"2:\n"
> +			: [tmp] "=&r" (tmp)
> +			: [addr] "Q"(*addr), [expected] "r"(expected)
> +			: "cc", "memory");
> +}
> +
> +static __rte_always_inline void
> +rte_wait_until_equal64(volatile uint64_t *addr, uint64_t expected, int
> +memorder) {
> +	uint64_t tmp;
> +	if (memorder == __ATOMIC_RELAXED)
> +		asm volatile(
> +			"ldxr	%x[tmp], %x[addr]\n"
> +			"cmp	%x[tmp], %x[expected]\n"
> +			"b.eq	2f\n"
> +			"sevl\n"
> +			"1:	wfe\n"
> +			"ldxr	%x[tmp], %x[addr]\n"
> +			"cmp	%x[tmp], %x[expected]\n"
> +			"bne	1b\n"
> +			"2:\n"
> +			: [tmp] "=&r" (tmp)
> +			: [addr] "Q"(*addr), [expected] "r"(expected)
> +			: "cc", "memory");
> +	else
> +		asm volatile(
> +			"ldaxr  %x[tmp], %x[addr]\n"
> +			"cmp	%x[tmp], %x[expected]\n"
> +			"b.eq	2f\n"
> +			"sevl\n"
> +			"1:	wfe\n"
> +			"ldaxr  %x[tmp], %x[addr]\n"
> +			"cmp	%x[tmp], %x[expected]\n"
> +			"bne	1b\n"
> +			"2:\n"
> +			: [tmp] "=&r" (tmp)
> +			: [addr] "Q"(*addr), [expected] "r"(expected)
> +			: "cc", "memory");

This duplicates code. Please introduce a macro for the assembly skeleton,
something like:

http://patches.dpdk.org/patch/56949/
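A rough sketch of what such a skeleton macro could look like; the macro and parameter names here are made up for illustration and are not taken from the referenced patch (assumed to live in the rte_pause_64.h context, so __rte_always_inline and the types are already available):

/* Illustrative only: one asm skeleton parameterized by the load-exclusive
 * instruction and the register-width modifier, so every width and memory
 * order expands the same body.  16-bit would pass "ldxrh"/"ldaxrh" with
 * "w", 64-bit "ldxr"/"ldaxr" with "x". */
#define __WFE_WAIT_ASM(ld_insn, wide, tmp, addr, expected)		\
	asm volatile(							\
		ld_insn "\t%" wide "[__tmp], %[__addr]\n"		\
		"cmp\t%" wide "[__tmp], %" wide "[__expected]\n"	\
		"b.eq\t2f\n"						\
		"sevl\n"						\
		"1:\twfe\n"						\
		ld_insn "\t%" wide "[__tmp], %[__addr]\n"		\
		"cmp\t%" wide "[__tmp], %" wide "[__expected]\n"	\
		"bne\t1b\n"						\
		"2:\n"							\
		: [__tmp] "=&r" (tmp)					\
		: [__addr] "Q"(*(addr)), [__expected] "r"(expected)	\
		: "cc", "memory")

static __rte_always_inline void
rte_wait_until_equal32(volatile uint32_t *addr, uint32_t expected, int memorder)
{
	uint32_t tmp;

	if (memorder == __ATOMIC_RELAXED)
		__WFE_WAIT_ASM("ldxr", "w", tmp, addr, expected);
	else
		__WFE_WAIT_ASM("ldaxr", "w", tmp, addr, expected);
}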

> +}
> +
> +#endif
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/common/include/generic/rte_pause.h
> b/lib/librte_eal/common/include/generic/rte_pause.h
> index 52bd4db..8f5f025 100644
> --- a/lib/librte_eal/common/include/generic/rte_pause.h
> +++ b/lib/librte_eal/common/include/generic/rte_pause.h
> @@ -4,7 +4,6 @@
> 
>  #ifndef _RTE_PAUSE_H_
>  #define _RTE_PAUSE_H_
> -
>  /**
>   * @file
>   *
> @@ -12,6 +11,10 @@
>   *
>   */
> 
> +#include <stdint.h>
> +#include <rte_common.h>
> +#include <rte_atomic.h>
> +
>  /**
>   * Pause CPU execution for a short while
>   *
> @@ -20,4 +23,38 @@
>   */
>  static inline void rte_pause(void);
> 
> +#if !defined(RTE_USE_WFE)
> +#ifdef RTE_USE_C11_MEM_MODEL
> +#define __rte_wait_until_equal(addr, expected, memorder) do {\
> +	while (__atomic_load_n(addr, memorder) != expected) \
> +		rte_pause();\
> +} while (0)
> +#else
> +#define __rte_wait_until_equal(addr, expected, memorder) do {\
> +	while (*addr != expected)\
> +		rte_pause();\
> +	if (memorder != __ATOMIC_RELAXED)\
> +		rte_smp_rmb();\

Is this correct for all memory orders?
If there is no specific gain from the non-C11 path, let's keep only the C11
memory model, i.e. remove RTE_USE_C11_MEM_MODEL.
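For reference, keeping only the C11 branch would reduce the fallback to something like the sketch below; this is essentially the patch's existing __atomic_load_n() variant kept as the sole definition:

#define __rte_wait_until_equal(addr, expected, memorder) do {	\
	while (__atomic_load_n(addr, memorder) != (expected))	\
		rte_pause();					\
} while (0)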

> +} while (0)
> +#endif
> +

These are public APIs. Let's add prototypes with thorough documentation of
the API details.
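For instance, a documented prototype along these lines; the wording is an illustrative sketch of the requested documentation, not final text:

/**
 * Wait for *addr to become equal to an expected value.
 *
 * The waiting thread calls rte_pause() between polls (or, on aarch64 with
 * RTE_USE_WFE enabled, sleeps in WFE until the location is updated).
 *
 * @param addr
 *   Address of the 32-bit word to poll.
 * @param expected
 *   Value the word is expected to become.
 * @param memorder
 *   Memory ordering used for the load, e.g. __ATOMIC_ACQUIRE or
 *   __ATOMIC_RELAXED.
 */
static __rte_always_inline void
rte_wait_until_equal32(volatile uint32_t *addr, uint32_t expected,
		int memorder);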


> +static __rte_always_inline void
> +rte_wait_until_equal16(volatile uint16_t *addr, uint16_t expected, int
> +memorder) {
> +	__rte_wait_until_equal(addr, expected, memorder); }
> +
> +static __rte_always_inline void
> +rte_wait_until_equal32(volatile uint32_t *addr, uint32_t expected, int
> +memorder) {
> +	__rte_wait_until_equal(addr, expected, memorder); }
> +
> +static __rte_always_inline void
> +rte_wait_until_equal64(volatile uint64_t *addr, uint64_t expected, int
> +memorder) {
> +	__rte_wait_until_equal(addr, expected, memorder); } #endif /*
> +RTE_USE_WFE */
> +
>  #endif /* _RTE_PAUSE_H_ */
> --
> 2.7.4
  
Gavin Hu July 24, 2019, 6:10 p.m. UTC | #2
Hi Jerin,
> -----Original Message-----
> From: Jerin Jacob Kollanukkaran <jerinj@marvell.com>
> Sent: Wednesday, July 24, 2019 7:53 PM
> To: Gavin Hu (Arm Technology China) <Gavin.Hu@arm.com>; dev@dpdk.org
> Cc: nd <nd@arm.com>; thomas@monjalon.net;
> stephen@networkplumber.org; Pavan Nikhilesh Bhagavatula
> <pbhagavatula@marvell.com>; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>
> Subject: RE: [EXT] [PATCH v3 1/5] eal: add the APIs to wait until equal
> 
> [...]

I am looking into all your comments (thanks!) and will submit a new version.

  

Patch

diff --git a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
index 97060e4..8d742c6 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_atomic_64.h
@@ -15,8 +15,12 @@  extern "C" {
 
 #include "generic/rte_atomic.h"
 
+#ifndef dsb
 #define dsb(opt) asm volatile("dsb " #opt : : : "memory")
+#endif
+#ifndef dmb
 #define dmb(opt) asm volatile("dmb " #opt : : : "memory")
+#endif
 
 #define rte_mb() dsb(sy)
 
diff --git a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
index 93895d3..1f7be0a 100644
--- a/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
+++ b/lib/librte_eal/common/include/arch/arm/rte_pause_64.h
@@ -17,6 +17,112 @@  static inline void rte_pause(void)
 	asm volatile("yield" ::: "memory");
 }
 
+#ifdef RTE_USE_WFE
+/* Wait for *addr to be updated with expected value */
+static __rte_always_inline void
+rte_wait_until_equal16(volatile uint16_t *addr, uint16_t expected, int memorder)
+{
+	uint16_t tmp;
+	if (memorder == __ATOMIC_RELAXED)
+		asm volatile(
+			"ldxrh	%w[tmp], %w[addr]\n"
+			"cmp	%w[tmp], %w[expected]\n"
+			"b.eq	2f\n"
+			"sevl\n"
+			"1:	wfe\n"
+			"ldxrh	%w[tmp], %w[addr]\n"
+			"cmp	%w[tmp], %w[expected]\n"
+			"bne	1b\n"
+			"2:\n"
+			: [tmp] "=&r" (tmp)
+			: [addr] "Q"(*addr), [expected] "r"(expected)
+			: "cc", "memory");
+	else
+		asm volatile(
+			"ldaxrh %w[tmp], %w[addr]\n"
+			"cmp	%w[tmp], %w[expected]\n"
+			"b.eq	2f\n"
+			"sevl\n"
+			"1:	wfe\n"
+			"ldaxrh	%w[tmp], %w[addr]\n"
+			"cmp	%w[tmp], %w[expected]\n"
+			"bne	1b\n"
+			"2:\n"
+			: [tmp] "=&r" (tmp)
+			: [addr] "Q"(*addr), [expected] "r"(expected)
+			: "cc", "memory");
+}
+
+static __rte_always_inline void
+rte_wait_until_equal32(volatile uint32_t *addr, uint32_t expected, int memorder)
+{
+	uint32_t tmp;
+	if (memorder == __ATOMIC_RELAXED)
+		asm volatile(
+			"ldxr	%w[tmp], %w[addr]\n"
+			"cmp	%w[tmp], %w[expected]\n"
+			"b.eq	2f\n"
+			"sevl\n"
+			"1:	wfe\n"
+			"ldxr	%w[tmp], %w[addr]\n"
+			"cmp	%w[tmp], %w[expected]\n"
+			"bne	1b\n"
+			"2:\n"
+			: [tmp] "=&r" (tmp)
+			: [addr] "Q"(*addr), [expected] "r"(expected)
+			: "cc", "memory");
+	else
+		asm volatile(
+			"ldaxr  %w[tmp], %w[addr]\n"
+			"cmp	%w[tmp], %w[expected]\n"
+			"b.eq	2f\n"
+			"sevl\n"
+			"1:	wfe\n"
+			"ldaxr  %w[tmp], %w[addr]\n"
+			"cmp	%w[tmp], %w[expected]\n"
+			"bne	1b\n"
+			"2:\n"
+			: [tmp] "=&r" (tmp)
+			: [addr] "Q"(*addr), [expected] "r"(expected)
+			: "cc", "memory");
+}
+
+static __rte_always_inline void
+rte_wait_until_equal64(volatile uint64_t *addr, uint64_t expected, int memorder)
+{
+	uint64_t tmp;
+	if (memorder == __ATOMIC_RELAXED)
+		asm volatile(
+			"ldxr	%x[tmp], %x[addr]\n"
+			"cmp	%x[tmp], %x[expected]\n"
+			"b.eq	2f\n"
+			"sevl\n"
+			"1:	wfe\n"
+			"ldxr	%x[tmp], %x[addr]\n"
+			"cmp	%x[tmp], %x[expected]\n"
+			"bne	1b\n"
+			"2:\n"
+			: [tmp] "=&r" (tmp)
+			: [addr] "Q"(*addr), [expected] "r"(expected)
+			: "cc", "memory");
+	else
+		asm volatile(
+			"ldaxr  %x[tmp], %x[addr]\n"
+			"cmp	%x[tmp], %x[expected]\n"
+			"b.eq	2f\n"
+			"sevl\n"
+			"1:	wfe\n"
+			"ldaxr  %x[tmp], %x[addr]\n"
+			"cmp	%x[tmp], %x[expected]\n"
+			"bne	1b\n"
+			"2:\n"
+			: [tmp] "=&r" (tmp)
+			: [addr] "Q"(*addr), [expected] "r"(expected)
+			: "cc", "memory");
+}
+
+#endif
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/common/include/generic/rte_pause.h b/lib/librte_eal/common/include/generic/rte_pause.h
index 52bd4db..8f5f025 100644
--- a/lib/librte_eal/common/include/generic/rte_pause.h
+++ b/lib/librte_eal/common/include/generic/rte_pause.h
@@ -4,7 +4,6 @@ 
 
 #ifndef _RTE_PAUSE_H_
 #define _RTE_PAUSE_H_
-
 /**
  * @file
  *
@@ -12,6 +11,10 @@ 
  *
  */
 
+#include <stdint.h>
+#include <rte_common.h>
+#include <rte_atomic.h>
+
 /**
  * Pause CPU execution for a short while
  *
@@ -20,4 +23,38 @@ 
  */
 static inline void rte_pause(void);
 
+#if !defined(RTE_USE_WFE)
+#ifdef RTE_USE_C11_MEM_MODEL
+#define __rte_wait_until_equal(addr, expected, memorder) do {\
+	while (__atomic_load_n(addr, memorder) != expected) \
+		rte_pause();\
+} while (0)
+#else
+#define __rte_wait_until_equal(addr, expected, memorder) do {\
+	while (*addr != expected)\
+		rte_pause();\
+	if (memorder != __ATOMIC_RELAXED)\
+		rte_smp_rmb();\
+} while (0)
+#endif
+
+static __rte_always_inline void
+rte_wait_until_equal16(volatile uint16_t *addr, uint16_t expected, int memorder)
+{
+	__rte_wait_until_equal(addr, expected, memorder);
+}
+
+static __rte_always_inline void
+rte_wait_until_equal32(volatile uint32_t *addr, uint32_t expected, int memorder)
+{
+	__rte_wait_until_equal(addr, expected, memorder);
+}
+
+static __rte_always_inline void
+rte_wait_until_equal64(volatile uint64_t *addr, uint64_t expected, int memorder)
+{
+	__rte_wait_until_equal(addr, expected, memorder);
+}
+#endif /* RTE_USE_WFE */
+
 #endif /* _RTE_PAUSE_H_ */