[v7] eal: add cache-line demote support

Message ID 1602582191-23807-2-git-send-email-omkar.maslekar@intel.com (mailing list archive)
State Superseded, archived
Delegated to: David Marchand
Headers
Series [v7] eal: add cache-line demote support |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/iol-broadcom-Functional success Functional Testing PASS
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-testing success Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/Intel-compilation success Compilation OK
ci/iol-mellanox-Performance success Performance Testing PASS

Commit Message

Omkar Maslekar Oct. 13, 2020, 9:43 a.m. UTC
  rte_cldemote is similar to a prefetch hint, but in reverse. cldemote(addr)
enables software to hint to the hardware that a cache line is likely to be
shared. This is useful in core-to-core communication, where a cache line is
likely to be accessed by another core. The ARM and PPC implementations are
provided as NOPs; real implementations can be added later if equivalent
instructions are available on those architectures.

Signed-off-by: Omkar Maslekar <omkar.maslekar@intel.com>
Acked-by: Bruce Richardson <bruce.richardson@intel.com>
Acked-by: David Christensen <drc@linux.vnet.ibm.com>
Acked-by: Jerin Jacob <jerinj@marvell.com>

---
v7: fixed experimental tag

v6: marked rte_cldemote as experimental
    added rte_cldemote call in existing app/test_prefetch.c

v5: documentation updated
    fixed formatting issue in release notes
    added Acked-by: Bruce Richardson <bruce.richardson@intel.com>
*
v4: updated bold text for title and fixed margin in release notes
*
v3: fixed warning regarding whitespace
*
v2: documentation updated
---
---
 app/test/test_prefetch.c                      |  4 ++++
 doc/guides/rel_notes/release_20_11.rst        |  7 +++++++
 lib/librte_eal/arm/include/rte_prefetch_32.h  |  7 +++++++
 lib/librte_eal/arm/include/rte_prefetch_64.h  |  7 +++++++
 lib/librte_eal/include/generic/rte_prefetch.h | 15 +++++++++++++++
 lib/librte_eal/ppc/include/rte_prefetch.h     |  7 +++++++
 lib/librte_eal/x86/include/rte_prefetch.h     | 11 +++++++++++
 7 files changed, 58 insertions(+)
  

Comments

Ruifeng Wang Oct. 14, 2020, 7:24 a.m. UTC | #1
> -----Original Message-----
> From: Omkar Maslekar <omkar.maslekar@intel.com>
> Sent: Tuesday, October 13, 2020 5:43 PM
> To: dev@dpdk.org
> Cc: bruce.richardson@intel.com; ciara.loftus@intel.com;
> omkar.maslekar@intel.com; drc@linux.vnet.ibm.com; jerinj@marvell.com;
> Ruifeng Wang <Ruifeng.Wang@arm.com>; Honnappa Nagarahalli
> <Honnappa.Nagarahalli@arm.com>
> Subject: [PATCH v7] eal: add cache-line demote support
> 
> rte_cldemote is similar to a prefetch hint - in reverse. cldemote(addr)
> enables software to hint to hardware that line is likely to be shared.
> Useful in core-to-core communications where cache-line is likely to be
> shared. ARM and PPC implementation is provided with NOP and can be
> added if any equivalent instructions could be used for implementation on
> those architectures.
> 
> Signed-off-by: Omkar Maslekar <omkar.maslekar@intel.com>
> Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> Acked-by: David Christensen <drc@linux.vnet.ibm.com>
> Acked-by: Jerin Jacob <jerinj@marvell.com>
> 
> ---
> v7: fixed experimental tag
> 
> v6: marked rte_cldemote as experimental
>     added rte_cldemote call in existing app/test_prefetch.c
> 
> v5: documentation updated
>     fixed formatting issue in release notes
>     added Acked-by: Bruce Richardson <bruce.richardson@intel.com>
> *
> v4: updated bold text for title and fixed margin in release notes
> *
> v3: fixed warning regarding whitespace
> *
> v2: documentation updated
> ---
> ---
>  app/test/test_prefetch.c                      |  4 ++++
>  doc/guides/rel_notes/release_20_11.rst        |  7 +++++++
>  lib/librte_eal/arm/include/rte_prefetch_32.h  |  7 +++++++
> lib/librte_eal/arm/include/rte_prefetch_64.h  |  7 +++++++
> lib/librte_eal/include/generic/rte_prefetch.h | 15 +++++++++++++++
>  lib/librte_eal/ppc/include/rte_prefetch.h     |  7 +++++++
>  lib/librte_eal/x86/include/rte_prefetch.h     | 11 +++++++++++
>  7 files changed, 58 insertions(+)
> 
> diff --git a/app/test/test_prefetch.c b/app/test/test_prefetch.c index
> 41f219a..5c58d0c 100644
> --- a/app/test/test_prefetch.c
> +++ b/app/test/test_prefetch.c
> @@ -26,7 +26,11 @@
>  	rte_prefetch1(&a);
>  	rte_prefetch2(&a);
> 
> +/* test for marking a line as shared to test cldemote functionality */
> +	rte_cldemote(&a);
> +
>  	return 0;
>  }
> 
> +
>  REGISTER_TEST_COMMAND(prefetch_autotest, test_prefetch); diff --git
> a/doc/guides/rel_notes/release_20_11.rst
> b/doc/guides/rel_notes/release_20_11.rst
> index b7881f2..8a1ed01 100644
> --- a/doc/guides/rel_notes/release_20_11.rst
> +++ b/doc/guides/rel_notes/release_20_11.rst
> @@ -171,6 +171,13 @@ New Features
>    * Extern objects and functions can be plugged into the pipeline.
>    * Transaction-oriented table updates.
> 
> +* **Added new function rte_cldemote in rte_prefetch.h.**
> +
> +  Added a hardware hint CLDEMOTE, which is similar to prefetch in reverse.
> +  CLDEMOTE moves the cache line to the more remote cache, where it
> + expects  sharing to be efficient. Moving the cache line to a level
> + more distant from  the processor helps to accelerate core-to-core
> communication.
> +
> 
>  Removed Items
>  -------------
> diff --git a/lib/librte_eal/arm/include/rte_prefetch_32.h
> b/lib/librte_eal/arm/include/rte_prefetch_32.h
> index e53420a..28b3d48 100644
> --- a/lib/librte_eal/arm/include/rte_prefetch_32.h
> +++ b/lib/librte_eal/arm/include/rte_prefetch_32.h
> @@ -10,6 +10,7 @@
>  #endif
> 
>  #include <rte_common.h>
> +#include <rte_compat.h>
>  #include "generic/rte_prefetch.h"
> 
>  static inline void rte_prefetch0(const volatile void *p) @@ -33,6 +34,12 @@
> static inline void rte_prefetch_non_temporal(const volatile void *p)
>  	rte_prefetch0(p);
>  }
> 
> +__rte_experimental
> +static inline void rte_cldemote(const volatile void *p) {
> +	RTE_SET_USED(p);
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/arm/include/rte_prefetch_64.h
> b/lib/librte_eal/arm/include/rte_prefetch_64.h
> index fc2b391..1c722eb 100644
> --- a/lib/librte_eal/arm/include/rte_prefetch_64.h
> +++ b/lib/librte_eal/arm/include/rte_prefetch_64.h
> @@ -10,6 +10,7 @@
>  #endif
> 
>  #include <rte_common.h>
> +#include <rte_compat.h>
>  #include "generic/rte_prefetch.h"
> 
>  static inline void rte_prefetch0(const volatile void *p) @@ -32,6 +33,12 @@
> static inline void rte_prefetch_non_temporal(const volatile void *p)
>  	asm volatile ("PRFM PLDL1STRM, [%0]" : : "r" (p));  }
> 
> +__rte_experimental
> +static inline void rte_cldemote(const volatile void *p) {
> +	RTE_SET_USED(p);
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/include/generic/rte_prefetch.h
> b/lib/librte_eal/include/generic/rte_prefetch.h
> index 6e47bdf..ad9844c 100644
> --- a/lib/librte_eal/include/generic/rte_prefetch.h
> +++ b/lib/librte_eal/include/generic/rte_prefetch.h
> @@ -51,4 +51,19 @@
>   */
>  static inline void rte_prefetch_non_temporal(const volatile void *p);
> 
> +/**
> + * Demote a cache line to a more distant level of cache from the processor.
> + *
> + * CLDEMOTE hints to hardware to move (demote) a cache line from the
> +closest to
> + * the processor to a level more distant from the processor. It is a
> +hint and
> + * not guarantee. rte_cldemote is intended to move the cache line to
> +the more
> + * remote cache, where it expects sharing to be efficient and to
> +indicate that a
> + * line may be accessed by a different core in the future.
> + *
> + * @param p
> + *   Address to demote
> + */
> +__rte_experimental
> +static inline void rte_cldemote(const volatile void *p);
> +
>  #endif /* _RTE_PREFETCH_H_ */
> diff --git a/lib/librte_eal/ppc/include/rte_prefetch.h
> b/lib/librte_eal/ppc/include/rte_prefetch.h
> index 9ba07c8..b55cac4 100644
> --- a/lib/librte_eal/ppc/include/rte_prefetch.h
> +++ b/lib/librte_eal/ppc/include/rte_prefetch.h
> @@ -11,6 +11,7 @@
>  #endif
> 
>  #include <rte_common.h>
> +#include <rte_compat.h>
>  #include "generic/rte_prefetch.h"
> 
>  static inline void rte_prefetch0(const volatile void *p) @@ -34,6 +35,12 @@
> static inline void rte_prefetch_non_temporal(const volatile void *p)
>  	rte_prefetch0(p);
>  }
> 
> +__rte_experimental
> +static inline void rte_cldemote(const volatile void *p) {
> +	RTE_SET_USED(p);
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/x86/include/rte_prefetch.h
> b/lib/librte_eal/x86/include/rte_prefetch.h
> index 384c6b3..92ba05a 100644
> --- a/lib/librte_eal/x86/include/rte_prefetch.h
> +++ b/lib/librte_eal/x86/include/rte_prefetch.h
> @@ -10,6 +10,7 @@
>  #endif
> 
>  #include <rte_common.h>
> +#include <rte_compat.h>
>  #include "generic/rte_prefetch.h"
> 
>  static inline void rte_prefetch0(const volatile void *p) @@ -32,6 +33,16 @@
> static inline void rte_prefetch_non_temporal(const volatile void *p)
>  	asm volatile ("prefetchnta %[p]" : : [p] "m" (*(const volatile char
> *)p));  }
> 
> +/*
> + * we're using raw byte codes for now as only the newest compiler
> + * versions support this instruction natively.
> + */
> +__rte_experimental
> +static inline void rte_cldemote(const volatile void *p) {
> +	asm volatile(".byte 0x0f, 0x1c, 0x06" :: "S" (p)); }
> +
>  #ifdef __cplusplus
>  }
>  #endif
> --
> 1.8.3.1

Reviewed-by: Ruifeng Wang <ruifeng.wang@arm.com>
  
David Marchand Oct. 15, 2020, 8:01 a.m. UTC | #2
Repeating my questions:
- would there be a point in hinting at where the "demoted" line goes?
- is this instruction available on all x86 CPUs?


See comments:

On Tue, Oct 13, 2020 at 6:47 PM Omkar Maslekar <omkar.maslekar@intel.com> wrote:
> diff --git a/app/test/test_prefetch.c b/app/test/test_prefetch.c
> index 41f219a..5c58d0c 100644
> --- a/app/test/test_prefetch.c
> +++ b/app/test/test_prefetch.c
> @@ -26,7 +26,11 @@
>         rte_prefetch1(&a);
>         rte_prefetch2(&a);
>
> +/* test for marking a line as shared to test cldemote functionality */

This comment is not indented and gives no more info than the call itself.
Please remove.

> +       rte_cldemote(&a);
> +
>         return 0;
>  }
>
> +

Please remove this empty line.


>  REGISTER_TEST_COMMAND(prefetch_autotest, test_prefetch);
> diff --git a/doc/guides/rel_notes/release_20_11.rst b/doc/guides/rel_notes/release_20_11.rst
> index b7881f2..8a1ed01 100644
> --- a/doc/guides/rel_notes/release_20_11.rst
> +++ b/doc/guides/rel_notes/release_20_11.rst
> @@ -171,6 +171,13 @@ New Features
>    * Extern objects and functions can be plugged into the pipeline.
>    * Transaction-oriented table updates.
>
> +* **Added new function rte_cldemote in rte_prefetch.h.**
> +
> +  Added a hardware hint CLDEMOTE, which is similar to prefetch in reverse.

This should come at the top of the features list (but after "write
combining store" entry that got in first).

Please add a mention that it only concerns x86.


> +  CLDEMOTE moves the cache line to the more remote cache, where it expects
> +  sharing to be efficient. Moving the cache line to a level more distant from
> +  the processor helps to accelerate core-to-core communication.
> +
>
>  Removed Items
>  -------------
> diff --git a/lib/librte_eal/arm/include/rte_prefetch_32.h b/lib/librte_eal/arm/include/rte_prefetch_32.h
> index e53420a..28b3d48 100644
> --- a/lib/librte_eal/arm/include/rte_prefetch_32.h
> +++ b/lib/librte_eal/arm/include/rte_prefetch_32.h
> @@ -10,6 +10,7 @@
>  #endif
>
>  #include <rte_common.h>
> +#include <rte_compat.h>

Move rte_compat.h inclusion from the arch headers to the
generic/rte_prefetch.h header only.


>  #include "generic/rte_prefetch.h"
>
>  static inline void rte_prefetch0(const volatile void *p)
> @@ -33,6 +34,12 @@ static inline void rte_prefetch_non_temporal(const volatile void *p)
>         rte_prefetch0(p);
>  }
>
> +__rte_experimental
> +static inline void rte_cldemote(const volatile void *p)
> +{
> +       RTE_SET_USED(p);
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/arm/include/rte_prefetch_64.h b/lib/librte_eal/arm/include/rte_prefetch_64.h
> index fc2b391..1c722eb 100644
> --- a/lib/librte_eal/arm/include/rte_prefetch_64.h
> +++ b/lib/librte_eal/arm/include/rte_prefetch_64.h
> @@ -10,6 +10,7 @@
>  #endif
>
>  #include <rte_common.h>
> +#include <rte_compat.h>
>  #include "generic/rte_prefetch.h"
>
>  static inline void rte_prefetch0(const volatile void *p)
> @@ -32,6 +33,12 @@ static inline void rte_prefetch_non_temporal(const volatile void *p)
>         asm volatile ("PRFM PLDL1STRM, [%0]" : : "r" (p));
>  }
>
> +__rte_experimental
> +static inline void rte_cldemote(const volatile void *p)
> +{
> +       RTE_SET_USED(p);
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/include/generic/rte_prefetch.h b/lib/librte_eal/include/generic/rte_prefetch.h
> index 6e47bdf..ad9844c 100644
> --- a/lib/librte_eal/include/generic/rte_prefetch.h
> +++ b/lib/librte_eal/include/generic/rte_prefetch.h
> @@ -51,4 +51,19 @@
>   */
>  static inline void rte_prefetch_non_temporal(const volatile void *p);
>
> +/**
> + * Demote a cache line to a more distant level of cache from the processor.
> + *
> + * CLDEMOTE hints to hardware to move (demote) a cache line from the closest to
> + * the processor to a level more distant from the processor. It is a hint and
> + * not guarantee. rte_cldemote is intended to move the cache line to the more

guaranteed*


> + * remote cache, where it expects sharing to be efficient and to indicate that a
> + * line may be accessed by a different core in the future.
> + *
> + * @param p
> + *   Address to demote
> + */
> +__rte_experimental
> +static inline void rte_cldemote(const volatile void *p);
> +
>  #endif /* _RTE_PREFETCH_H_ */
> diff --git a/lib/librte_eal/ppc/include/rte_prefetch.h b/lib/librte_eal/ppc/include/rte_prefetch.h
> index 9ba07c8..b55cac4 100644
> --- a/lib/librte_eal/ppc/include/rte_prefetch.h
> +++ b/lib/librte_eal/ppc/include/rte_prefetch.h
> @@ -11,6 +11,7 @@
>  #endif
>
>  #include <rte_common.h>
> +#include <rte_compat.h>
>  #include "generic/rte_prefetch.h"
>
>  static inline void rte_prefetch0(const volatile void *p)
> @@ -34,6 +35,12 @@ static inline void rte_prefetch_non_temporal(const volatile void *p)
>         rte_prefetch0(p);
>  }
>
> +__rte_experimental
> +static inline void rte_cldemote(const volatile void *p)
> +{
> +       RTE_SET_USED(p);
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/x86/include/rte_prefetch.h b/lib/librte_eal/x86/include/rte_prefetch.h
> index 384c6b3..92ba05a 100644
> --- a/lib/librte_eal/x86/include/rte_prefetch.h
> +++ b/lib/librte_eal/x86/include/rte_prefetch.h
> @@ -10,6 +10,7 @@
>  #endif
>
>  #include <rte_common.h>
> +#include <rte_compat.h>
>  #include "generic/rte_prefetch.h"
>
>  static inline void rte_prefetch0(const volatile void *p)
> @@ -32,6 +33,16 @@ static inline void rte_prefetch_non_temporal(const volatile void *p)
>         asm volatile ("prefetchnta %[p]" : : [p] "m" (*(const volatile char *)p));
>  }
>
> +/*
> + * we're using raw byte codes for now as only the newest compiler

We use

> + * versions support this instruction natively.
> + */
> +__rte_experimental
> +static inline void rte_cldemote(const volatile void *p)
> +{
> +       asm volatile(".byte 0x0f, 0x1c, 0x06" :: "S" (p));
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> --
> 1.8.3.1
>
  
Omkar Maslekar Oct. 15, 2020, 2:41 p.m. UTC | #3
Hi David,

 >-----Original Message-----
 >From: David Marchand <david.marchand@redhat.com>
 >Sent: Thursday, October 15, 2020 1:01 AM
 >To: Maslekar, Omkar <omkar.maslekar@intel.com>
 >Cc: dev <dev@dpdk.org>; Richardson, Bruce <bruce.richardson@intel.com>;
 >Loftus, Ciara <ciara.loftus@intel.com>; David Christensen
 ><drc@linux.vnet.ibm.com>; Jerin Jacob Kollanukkaran <jerinj@marvell.com>;
 >Ruifeng Wang (Arm Technology China) <ruifeng.wang@arm.com>;
 >Honnappa Nagarahalli <honnappa.nagarahalli@arm.com>
 >Subject: Re: [dpdk-dev] [PATCH v7] eal: add cache-line demote support
 >
 >Repeating my questions:
 >- would there be a point in hinting at where the "demoted" line goes?
Yes, it is worth mentioning that the demoted line goes to the last shared level of the cache hierarchy. Demotion to a specific, desired cache level is not possible.
 >- is this instruction available on all x86 CPUs?
Yes, this instruction can be executed on all x86 CPUs: it takes effect on the latest CPUs and is treated as a NOP on older generations.
 >
 >
 >See comments:
 >
 >On Tue, Oct 13, 2020 at 6:47 PM Omkar Maslekar
 ><omkar.maslekar@intel.com> wrote:
 >> diff --git a/app/test/test_prefetch.c b/app/test/test_prefetch.c index
 >> 41f219a..5c58d0c 100644
 >> --- a/app/test/test_prefetch.c
 >> +++ b/app/test/test_prefetch.c
 >> @@ -26,7 +26,11 @@
 >>         rte_prefetch1(&a);
 >>         rte_prefetch2(&a);
 >>
 >> +/* test for marking a line as shared to test cldemote functionality
 >> +*/
 >
 >Non indented comment that gives no more info than the call itself.
 >Please remove.
I will fix it
 >
 >> +       rte_cldemote(&a);
 >> +
 >>         return 0;
 >>  }
 >>
 >> +
 >
 >Please remove this empty line.
 I will fix it
 >
 >>  REGISTER_TEST_COMMAND(prefetch_autotest, test_prefetch); diff --git
 >> a/doc/guides/rel_notes/release_20_11.rst
 >> b/doc/guides/rel_notes/release_20_11.rst
 >> index b7881f2..8a1ed01 100644
 >> --- a/doc/guides/rel_notes/release_20_11.rst
 >> +++ b/doc/guides/rel_notes/release_20_11.rst
 >> @@ -171,6 +171,13 @@ New Features
 >>    * Extern objects and functions can be plugged into the pipeline.
 >>    * Transaction-oriented table updates.
 >>
 >> +* **Added new function rte_cldemote in rte_prefetch.h.**
 >> +
 >> +  Added a hardware hint CLDEMOTE, which is similar to prefetch in
 >reverse.
 >
 >This should come at the top of the features list (but after "write combining
 >store" entry that got in first).
 >
 >Please add a mention that it only concerns x86.
I will modify the sequence in the release notes
 >
 >
 >> +  CLDEMOTE moves the cache line to the more remote cache, where it
 >> + expects  sharing to be efficient. Moving the cache line to a level
 >> + more distant from  the processor helps to accelerate core-to-core
 >communication.
 >> +
 >>
 >>  Removed Items
 >>  -------------
 >> diff --git a/lib/librte_eal/arm/include/rte_prefetch_32.h
 >> b/lib/librte_eal/arm/include/rte_prefetch_32.h
 >> index e53420a..28b3d48 100644
 >> --- a/lib/librte_eal/arm/include/rte_prefetch_32.h
 >> +++ b/lib/librte_eal/arm/include/rte_prefetch_32.h
 >> @@ -10,6 +10,7 @@
 >>  #endif
 >>
 >>  #include <rte_common.h>
 >> +#include <rte_compat.h>
 >
 >Move rte_compat.h inclusion from the arch headers to the
 >generic/rte_prefetch.h header only.
I got the build error below when I moved the rte_compat.h inclusion from the arch headers to the generic/rte_prefetch.h header only. I will remove it and send out a new patch v8.
In file included from ../lib/librte_eal/x86/include/rte_prefetch.h:14:0,
                 from ../lib/librte_table/rte_swx_table_em.c:10:
../lib/librte_eal/include/generic/rte_prefetch.h:67:1: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’ before ‘static’
 static inline void rte_cldemote(const volatile void *p);
 >
 >
 >>  #include "generic/rte_prefetch.h"
 >>
 >>  static inline void rte_prefetch0(const volatile void *p) @@ -33,6
 >> +34,12 @@ static inline void rte_prefetch_non_temporal(const volatile
 >void *p)
 >>         rte_prefetch0(p);
 >>  }
 >>
 >> +__rte_experimental
 >> +static inline void rte_cldemote(const volatile void *p) {
 >> +       RTE_SET_USED(p);
 >> +}
 >> +
 >>  #ifdef __cplusplus
 >>  }
 >>  #endif
 >> diff --git a/lib/librte_eal/arm/include/rte_prefetch_64.h
 >> b/lib/librte_eal/arm/include/rte_prefetch_64.h
 >> index fc2b391..1c722eb 100644
 >> --- a/lib/librte_eal/arm/include/rte_prefetch_64.h
 >> +++ b/lib/librte_eal/arm/include/rte_prefetch_64.h
 >> @@ -10,6 +10,7 @@
 >>  #endif
 >>
 >>  #include <rte_common.h>
 >> +#include <rte_compat.h>
 >>  #include "generic/rte_prefetch.h"
 >>
 >>  static inline void rte_prefetch0(const volatile void *p) @@ -32,6
 >> +33,12 @@ static inline void rte_prefetch_non_temporal(const volatile
 >void *p)
 >>         asm volatile ("PRFM PLDL1STRM, [%0]" : : "r" (p));  }
 >>
 >> +__rte_experimental
 >> +static inline void rte_cldemote(const volatile void *p) {
 >> +       RTE_SET_USED(p);
 >> +}
 >> +
 >>  #ifdef __cplusplus
 >>  }
 >>  #endif
 >> diff --git a/lib/librte_eal/include/generic/rte_prefetch.h
 >> b/lib/librte_eal/include/generic/rte_prefetch.h
 >> index 6e47bdf..ad9844c 100644
 >> --- a/lib/librte_eal/include/generic/rte_prefetch.h
 >> +++ b/lib/librte_eal/include/generic/rte_prefetch.h
 >> @@ -51,4 +51,19 @@
 >>   */
 >>  static inline void rte_prefetch_non_temporal(const volatile void *p);
 >>
 >> +/**
 >> + * Demote a cache line to a more distant level of cache from the
 >processor.
 >> + *
 >> + * CLDEMOTE hints to hardware to move (demote) a cache line from the
 >> +closest to
 >> + * the processor to a level more distant from the processor. It is a
 >> +hint and
 >> + * not guarantee. rte_cldemote is intended to move the cache line to
 >> +the more
 >
 >guaranteed*
I will fix this
 >
 >
 >> + * remote cache, where it expects sharing to be efficient and to
 >> +indicate that a
 >> + * line may be accessed by a different core in the future.
 >> + *
 >> + * @param p
 >> + *   Address to demote
 >> + */
 >> +__rte_experimental
 >> +static inline void rte_cldemote(const volatile void *p);
 >> +
 >>  #endif /* _RTE_PREFETCH_H_ */
 >> diff --git a/lib/librte_eal/ppc/include/rte_prefetch.h
 >> b/lib/librte_eal/ppc/include/rte_prefetch.h
 >> index 9ba07c8..b55cac4 100644
 >> --- a/lib/librte_eal/ppc/include/rte_prefetch.h
 >> +++ b/lib/librte_eal/ppc/include/rte_prefetch.h
 >> @@ -11,6 +11,7 @@
 >>  #endif
 >>
 >>  #include <rte_common.h>
 >> +#include <rte_compat.h>
 >>  #include "generic/rte_prefetch.h"
 >>
 >>  static inline void rte_prefetch0(const volatile void *p) @@ -34,6
 >> +35,12 @@ static inline void rte_prefetch_non_temporal(const volatile
 >void *p)
 >>         rte_prefetch0(p);
 >>  }
 >>
 >> +__rte_experimental
 >> +static inline void rte_cldemote(const volatile void *p) {
 >> +       RTE_SET_USED(p);
 >> +}
 >> +
 >>  #ifdef __cplusplus
 >>  }
 >>  #endif
 >> diff --git a/lib/librte_eal/x86/include/rte_prefetch.h
 >> b/lib/librte_eal/x86/include/rte_prefetch.h
 >> index 384c6b3..92ba05a 100644
 >> --- a/lib/librte_eal/x86/include/rte_prefetch.h
 >> +++ b/lib/librte_eal/x86/include/rte_prefetch.h
 >> @@ -10,6 +10,7 @@
 >>  #endif
 >>
 >>  #include <rte_common.h>
 >> +#include <rte_compat.h>
 >>  #include "generic/rte_prefetch.h"
 >>
 >>  static inline void rte_prefetch0(const volatile void *p) @@ -32,6
 >> +33,16 @@ static inline void rte_prefetch_non_temporal(const volatile
 >void *p)
 >>         asm volatile ("prefetchnta %[p]" : : [p] "m" (*(const volatile
 >> char *)p));  }
 >>
 >> +/*
 >> + * we're using raw byte codes for now as only the newest compiler
 >
 >We use
I will fix this
 >
 >> + * versions support this instruction natively.
 >> + */
 >> +__rte_experimental
 >> +static inline void rte_cldemote(const volatile void *p) {
 >> +       asm volatile(".byte 0x0f, 0x1c, 0x06" :: "S" (p)); }
 >> +
 >>  #ifdef __cplusplus
 >>  }
 >>  #endif
 >> --
 >> 1.8.3.1
 >>
 >
 >
 >--
 >David Marchand
  
David Marchand Oct. 15, 2020, 8:32 p.m. UTC | #4
On Thu, Oct 15, 2020 at 4:41 PM Maslekar, Omkar
<omkar.maslekar@intel.com> wrote:
>  >> diff --git a/lib/librte_eal/arm/include/rte_prefetch_32.h
>  >> b/lib/librte_eal/arm/include/rte_prefetch_32.h
>  >> index e53420a..28b3d48 100644
>  >> --- a/lib/librte_eal/arm/include/rte_prefetch_32.h
>  >> +++ b/lib/librte_eal/arm/include/rte_prefetch_32.h
>  >> @@ -10,6 +10,7 @@
>  >>  #endif
>  >>
>  >>  #include <rte_common.h>
>  >> +#include <rte_compat.h>
>  >
>  >Move rte_compat.h inclusion from the arch headers to the
>  >generic/rte_prefetch.h header only.
> I got below build error if I move rte_compat.h inclusion from the arch headers to the generic/rte_prefetch.h header only. I will remove it and send out a new patch v8.
> In file included from ../lib/librte_eal/x86/include/rte_prefetch.h:14:0,
>                  from ../lib/librte_table/rte_swx_table_em.c:10:
> ../lib/librte_eal/include/generic/rte_prefetch.h:67:1: error: expected ‘=’, ‘,’, ‘;’, ‘asm’ or ‘__attribute__’ before ‘static’

Please rebase on main, as I took Harry's patch, which was ready.
Thanks.
  

Patch

diff --git a/app/test/test_prefetch.c b/app/test/test_prefetch.c
index 41f219a..5c58d0c 100644
--- a/app/test/test_prefetch.c
+++ b/app/test/test_prefetch.c
@@ -26,7 +26,11 @@ 
 	rte_prefetch1(&a);
 	rte_prefetch2(&a);
 
+/* test for marking a line as shared to test cldemote functionality */
+	rte_cldemote(&a);
+
 	return 0;
 }
 
+
 REGISTER_TEST_COMMAND(prefetch_autotest, test_prefetch);
diff --git a/doc/guides/rel_notes/release_20_11.rst b/doc/guides/rel_notes/release_20_11.rst
index b7881f2..8a1ed01 100644
--- a/doc/guides/rel_notes/release_20_11.rst
+++ b/doc/guides/rel_notes/release_20_11.rst
@@ -171,6 +171,13 @@  New Features
   * Extern objects and functions can be plugged into the pipeline.
   * Transaction-oriented table updates.
 
+* **Added new function rte_cldemote in rte_prefetch.h.**
+
+  Added a hardware hint CLDEMOTE, which is similar to prefetch in reverse.
+  CLDEMOTE moves the cache line to the more remote cache, where it expects
+  sharing to be efficient. Moving the cache line to a level more distant from
+  the processor helps to accelerate core-to-core communication.
+
 
 Removed Items
 -------------
diff --git a/lib/librte_eal/arm/include/rte_prefetch_32.h b/lib/librte_eal/arm/include/rte_prefetch_32.h
index e53420a..28b3d48 100644
--- a/lib/librte_eal/arm/include/rte_prefetch_32.h
+++ b/lib/librte_eal/arm/include/rte_prefetch_32.h
@@ -10,6 +10,7 @@ 
 #endif
 
 #include <rte_common.h>
+#include <rte_compat.h>
 #include "generic/rte_prefetch.h"
 
 static inline void rte_prefetch0(const volatile void *p)
@@ -33,6 +34,12 @@  static inline void rte_prefetch_non_temporal(const volatile void *p)
 	rte_prefetch0(p);
 }
 
+__rte_experimental
+static inline void rte_cldemote(const volatile void *p)
+{
+	RTE_SET_USED(p);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/arm/include/rte_prefetch_64.h b/lib/librte_eal/arm/include/rte_prefetch_64.h
index fc2b391..1c722eb 100644
--- a/lib/librte_eal/arm/include/rte_prefetch_64.h
+++ b/lib/librte_eal/arm/include/rte_prefetch_64.h
@@ -10,6 +10,7 @@ 
 #endif
 
 #include <rte_common.h>
+#include <rte_compat.h>
 #include "generic/rte_prefetch.h"
 
 static inline void rte_prefetch0(const volatile void *p)
@@ -32,6 +33,12 @@  static inline void rte_prefetch_non_temporal(const volatile void *p)
 	asm volatile ("PRFM PLDL1STRM, [%0]" : : "r" (p));
 }
 
+__rte_experimental
+static inline void rte_cldemote(const volatile void *p)
+{
+	RTE_SET_USED(p);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/include/generic/rte_prefetch.h b/lib/librte_eal/include/generic/rte_prefetch.h
index 6e47bdf..ad9844c 100644
--- a/lib/librte_eal/include/generic/rte_prefetch.h
+++ b/lib/librte_eal/include/generic/rte_prefetch.h
@@ -51,4 +51,19 @@ 
  */
 static inline void rte_prefetch_non_temporal(const volatile void *p);
 
+/**
+ * Demote a cache line to a more distant level of cache from the processor.
+ *
+ * CLDEMOTE hints to hardware to move (demote) a cache line from the closest to
+ * the processor to a level more distant from the processor. It is a hint and
+ * not guarantee. rte_cldemote is intended to move the cache line to the more
+ * remote cache, where it expects sharing to be efficient and to indicate that a
+ * line may be accessed by a different core in the future.
+ *
+ * @param p
+ *   Address to demote
+ */
+__rte_experimental
+static inline void rte_cldemote(const volatile void *p);
+
 #endif /* _RTE_PREFETCH_H_ */
diff --git a/lib/librte_eal/ppc/include/rte_prefetch.h b/lib/librte_eal/ppc/include/rte_prefetch.h
index 9ba07c8..b55cac4 100644
--- a/lib/librte_eal/ppc/include/rte_prefetch.h
+++ b/lib/librte_eal/ppc/include/rte_prefetch.h
@@ -11,6 +11,7 @@ 
 #endif
 
 #include <rte_common.h>
+#include <rte_compat.h>
 #include "generic/rte_prefetch.h"
 
 static inline void rte_prefetch0(const volatile void *p)
@@ -34,6 +35,12 @@  static inline void rte_prefetch_non_temporal(const volatile void *p)
 	rte_prefetch0(p);
 }
 
+__rte_experimental
+static inline void rte_cldemote(const volatile void *p)
+{
+	RTE_SET_USED(p);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/x86/include/rte_prefetch.h b/lib/librte_eal/x86/include/rte_prefetch.h
index 384c6b3..92ba05a 100644
--- a/lib/librte_eal/x86/include/rte_prefetch.h
+++ b/lib/librte_eal/x86/include/rte_prefetch.h
@@ -10,6 +10,7 @@ 
 #endif
 
 #include <rte_common.h>
+#include <rte_compat.h>
 #include "generic/rte_prefetch.h"
 
 static inline void rte_prefetch0(const volatile void *p)
@@ -32,6 +33,16 @@  static inline void rte_prefetch_non_temporal(const volatile void *p)
 	asm volatile ("prefetchnta %[p]" : : [p] "m" (*(const volatile char *)p));
 }
 
+/*
+ * we're using raw byte codes for now as only the newest compiler
+ * versions support this instruction natively.
+ */
+__rte_experimental
+static inline void rte_cldemote(const volatile void *p)
+{
+	asm volatile(".byte 0x0f, 0x1c, 0x06" :: "S" (p));
+}
+
 #ifdef __cplusplus
 }
 #endif