[v4] eal: add cache-line demote support

Message ID 1600739967-6499-2-git-send-email-omkar.maslekar@intel.com (mailing list archive)
State Superseded, archived
Delegated to: David Marchand
Headers
Series [v4] eal: add cache-line demote support |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK
ci/iol-broadcom-Performance success Performance Testing PASS
ci/iol-testing success Testing PASS
ci/iol-intel-Functional success Functional Testing PASS
ci/iol-intel-Performance success Performance Testing PASS
ci/travis-robot success Travis build: passed
ci/iol-broadcom-Functional success Functional Testing PASS

Commit Message

Omkar Maslekar Sept. 22, 2020, 1:59 a.m. UTC
rte_cldemote is similar to a prefetch hint, but in reverse. cldemote(addr)
enables software to hint to hardware that a cache line is likely to be
shared. It is useful in core-to-core communication, where a cache line is
likely to be accessed by another core. The ARM and PPC implementations are
provided as NOPs; real implementations can be added if equivalent
instructions exist on those architectures.

Signed-off-by: Omkar Maslekar <omkar.maslekar@intel.com>

---
v4: updated bold text for title and fixed margin in release notes
*
v3: fixed warning regarding whitespace
*
v2: documentation updated
---
---
 doc/guides/rel_notes/release_20_11.rst        |  6 ++++++
 lib/librte_eal/arm/include/rte_prefetch_32.h  |  5 +++++
 lib/librte_eal/arm/include/rte_prefetch_64.h  |  5 +++++
 lib/librte_eal/include/generic/rte_prefetch.h | 13 +++++++++++++
 lib/librte_eal/ppc/include/rte_prefetch.h     |  5 +++++
 lib/librte_eal/x86/include/rte_prefetch.h     |  9 +++++++++
 6 files changed, 43 insertions(+)
  

Comments

Bruce Richardson Sept. 22, 2020, 8:28 a.m. UTC | #1
On Mon, Sep 21, 2020 at 06:59:27PM -0700, Omkar Maslekar wrote:
> rte_cldemote is similar to a prefetch hint - in reverse. cldemote(addr)
> enables software to hint to hardware that line is likely to be shared.
> Useful in core-to-core communications where cache-line is likely to be
> shared. ARM and PPC implementation is provided with NOP and can be added
> if any equivalent instructions could be used for implementation on those
> architectures.
> 
> Signed-off-by: Omkar Maslekar <omkar.maslekar@intel.com>
>
A few minor suggestions below. With those fixed, feel free to add my ack to
future versions of this patch.

Acked-by: Bruce Richardson <bruce.richardson@intel.com>
 
> ---
> v4: updated bold text for title and fixed margin in release notes
> *
> v3: fixed warning regarding whitespace
> *
> v2: documentation updated
> ---
> ---
>  doc/guides/rel_notes/release_20_11.rst        |  6 ++++++
>  lib/librte_eal/arm/include/rte_prefetch_32.h  |  5 +++++
>  lib/librte_eal/arm/include/rte_prefetch_64.h  |  5 +++++
>  lib/librte_eal/include/generic/rte_prefetch.h | 13 +++++++++++++
>  lib/librte_eal/ppc/include/rte_prefetch.h     |  5 +++++
>  lib/librte_eal/x86/include/rte_prefetch.h     |  9 +++++++++
>  6 files changed, 43 insertions(+)
> 
> diff --git a/doc/guides/rel_notes/release_20_11.rst b/doc/guides/rel_notes/release_20_11.rst
> index df227a1..b844b96 100644
> --- a/doc/guides/rel_notes/release_20_11.rst
> +++ b/doc/guides/rel_notes/release_20_11.rst
> @@ -55,6 +55,12 @@ New Features
>       Also, make sure to start the actual text at the margin.
>       =======================================================
>  
> +* **Added new function rte_cldemote in rte_prefetch.h.**
> +
> +  Added a hardware hint CLDEMOTE, which is similar to prefetch in reverse.
> +  CLDEMOTE moves the cache line to the more remote cache, where it expects
> +  sharing to be efficient. Moving the cache line to a level more distant from
> +  the processor helps to accelerate core-to-core communication.
>  

I think you need two blank lines between sections here, not just one.

>  Removed Items
>  -------------
> diff --git a/lib/librte_eal/arm/include/rte_prefetch_32.h b/lib/librte_eal/arm/include/rte_prefetch_32.h
> index e53420a..ad91edd 100644
> --- a/lib/librte_eal/arm/include/rte_prefetch_32.h
> +++ b/lib/librte_eal/arm/include/rte_prefetch_32.h
> @@ -33,6 +33,11 @@ static inline void rte_prefetch_non_temporal(const volatile void *p)
>  	rte_prefetch0(p);
>  }
>  
> +static inline void rte_cldemote(const volatile void *p)
> +{
> +	RTE_SET_USED(p);
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/arm/include/rte_prefetch_64.h b/lib/librte_eal/arm/include/rte_prefetch_64.h
> index fc2b391..35d278a 100644
> --- a/lib/librte_eal/arm/include/rte_prefetch_64.h
> +++ b/lib/librte_eal/arm/include/rte_prefetch_64.h
> @@ -32,6 +32,11 @@ static inline void rte_prefetch_non_temporal(const volatile void *p)
>  	asm volatile ("PRFM PLDL1STRM, [%0]" : : "r" (p));
>  }
>  
> +static inline void rte_cldemote(const volatile void *p)
> +{
> +	RTE_SET_USED(p);
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/include/generic/rte_prefetch.h b/lib/librte_eal/include/generic/rte_prefetch.h
> index 6e47bdf..8742412 100644
> --- a/lib/librte_eal/include/generic/rte_prefetch.h
> +++ b/lib/librte_eal/include/generic/rte_prefetch.h
> @@ -51,4 +51,17 @@
>   */
>  static inline void rte_prefetch_non_temporal(const volatile void *p);
>  
> +/**
> + * Demote a cache line to a more distant level of cache from the processor.
> + *
> + * CLDEMOTE hints to hardware to move (demote) a cache line from the closest to
> + * the processor to a level more distant from the processor. It is a hint and
> + * not guarantee. rte_cldemote is intended to speed up things at the producer,
> + * in the producer-consumer case.
> + *

Two thoughts here:
1. Is it not the consumer who benefits more, since they are the ones
receiving the demoted value, while the producer pays a higher cost since
they have to demote the value on send?
2. Rather than talking about the producer-consumer case specifically, I think
it would be good to replace the last sentence with what you have in the
cover letter about it being for sharing, and to indicate that a line may be
accessed by a different core in the future.

> + * @param p
> + *   Address to demote
> + */
> +static inline void rte_cldemote(const volatile void *p);
> +
>  #endif /* _RTE_PREFETCH_H_ */
> diff --git a/lib/librte_eal/ppc/include/rte_prefetch.h b/lib/librte_eal/ppc/include/rte_prefetch.h
> index 9ba07c8..3fe9655 100644
> --- a/lib/librte_eal/ppc/include/rte_prefetch.h
> +++ b/lib/librte_eal/ppc/include/rte_prefetch.h
> @@ -34,6 +34,11 @@ static inline void rte_prefetch_non_temporal(const volatile void *p)
>  	rte_prefetch0(p);
>  }
>  
> +static inline void rte_cldemote(const volatile void *p)
> +{
> +	RTE_SET_USED(p);
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/x86/include/rte_prefetch.h b/lib/librte_eal/x86/include/rte_prefetch.h
> index 384c6b3..029d06e 100644
> --- a/lib/librte_eal/x86/include/rte_prefetch.h
> +++ b/lib/librte_eal/x86/include/rte_prefetch.h
> @@ -32,6 +32,15 @@ static inline void rte_prefetch_non_temporal(const volatile void *p)
>  	asm volatile ("prefetchnta %[p]" : : [p] "m" (*(const volatile char *)p));
>  }
>  
> +/*
> + * we're using raw byte codes for now as only the newest compiler
> + * versions support this instruction natively.
> + */
> +static inline void rte_cldemote(const volatile void *p)
> +{
> +	asm volatile(".byte 0x0f, 0x1c, 0x06" :: "S" (p));
> +}
> +
>  #ifdef __cplusplus
>  }
>  #endif
> -- 
> 1.8.3.1
>
  
Omkar Maslekar Sept. 22, 2020, 9:53 p.m. UTC | #2
Hi Bruce,

My comments are inline

 >-----Original Message-----
 >From: Bruce Richardson <bruce.richardson@intel.com>
 >Sent: Tuesday, September 22, 2020 1:28 AM
 >To: Maslekar, Omkar <omkar.maslekar@intel.com>
 >Cc: dev@dpdk.org; Loftus, Ciara <ciara.loftus@intel.com>
 >Subject: Re: [PATCH v4] eal: add cache-line demote support
 >
 >On Mon, Sep 21, 2020 at 06:59:27PM -0700, Omkar Maslekar wrote:
 >> rte_cldemote is similar to a prefetch hint - in reverse.
 >> cldemote(addr) enables software to hint to hardware that line is likely to be
 >shared.
 >> Useful in core-to-core communications where cache-line is likely to be
 >> shared. ARM and PPC implementation is provided with NOP and can be
 >> added if any equivalent instructions could be used for implementation
 >> on those architectures.
 >>
 >> Signed-off-by: Omkar Maslekar <omkar.maslekar@intel.com>
 >>
 >Few minor suggestions below. With those fixed, feel free to add my ack to
 >future versions of this patch.
 >
 >Acked-by: Bruce Richardson <bruce.richardson@intel.com>
 >
 >> ---
 >> v4: updated bold text for title and fixed margin in release notes
 >> *
 >> v3: fixed warning regarding whitespace
 >> *
 >> v2: documentation updated
 >> ---
 >> ---
 >>  doc/guides/rel_notes/release_20_11.rst        |  6 ++++++
 >>  lib/librte_eal/arm/include/rte_prefetch_32.h  |  5 +++++
 >> lib/librte_eal/arm/include/rte_prefetch_64.h  |  5 +++++
 >> lib/librte_eal/include/generic/rte_prefetch.h | 13 +++++++++++++
 >>  lib/librte_eal/ppc/include/rte_prefetch.h     |  5 +++++
 >>  lib/librte_eal/x86/include/rte_prefetch.h     |  9 +++++++++
 >>  6 files changed, 43 insertions(+)
 >>
 >> diff --git a/doc/guides/rel_notes/release_20_11.rst
 >> b/doc/guides/rel_notes/release_20_11.rst
 >> index df227a1..b844b96 100644
 >> --- a/doc/guides/rel_notes/release_20_11.rst
 >> +++ b/doc/guides/rel_notes/release_20_11.rst
 >> @@ -55,6 +55,12 @@ New Features
 >>       Also, make sure to start the actual text at the margin.
 >>       =======================================================
 >>
 >> +* **Added new function rte_cldemote in rte_prefetch.h.**
 >> +
 >> +  Added a hardware hint CLDEMOTE, which is similar to prefetch in
 >reverse.
 >> +  CLDEMOTE moves the cache line to the more remote cache, where it
 >> + expects  sharing to be efficient. Moving the cache line to a level
 >> + more distant from  the processor helps to accelerate core-to-core
 >communication.
 >>
 >
 >I think you need two blank lines between sections here, not just one.
[OM] You are right; I will fix this in v5.
 >
 >>  Removed Items
 >>  -------------
 >> diff --git a/lib/librte_eal/arm/include/rte_prefetch_32.h
 >> b/lib/librte_eal/arm/include/rte_prefetch_32.h
 >> index e53420a..ad91edd 100644
 >> --- a/lib/librte_eal/arm/include/rte_prefetch_32.h
 >> +++ b/lib/librte_eal/arm/include/rte_prefetch_32.h
 >> @@ -33,6 +33,11 @@ static inline void rte_prefetch_non_temporal(const
 >volatile void *p)
 >>  	rte_prefetch0(p);
 >>  }
 >>
 >> +static inline void rte_cldemote(const volatile void *p) {
 >> +	RTE_SET_USED(p);
 >> +}
 >> +
 >>  #ifdef __cplusplus
 >>  }
 >>  #endif
 >> diff --git a/lib/librte_eal/arm/include/rte_prefetch_64.h
 >> b/lib/librte_eal/arm/include/rte_prefetch_64.h
 >> index fc2b391..35d278a 100644
 >> --- a/lib/librte_eal/arm/include/rte_prefetch_64.h
 >> +++ b/lib/librte_eal/arm/include/rte_prefetch_64.h
 >> @@ -32,6 +32,11 @@ static inline void rte_prefetch_non_temporal(const
 >volatile void *p)
 >>  	asm volatile ("PRFM PLDL1STRM, [%0]" : : "r" (p));  }
 >>
 >> +static inline void rte_cldemote(const volatile void *p) {
 >> +	RTE_SET_USED(p);
 >> +}
 >> +
 >>  #ifdef __cplusplus
 >>  }
 >>  #endif
 >> diff --git a/lib/librte_eal/include/generic/rte_prefetch.h
 >> b/lib/librte_eal/include/generic/rte_prefetch.h
 >> index 6e47bdf..8742412 100644
 >> --- a/lib/librte_eal/include/generic/rte_prefetch.h
 >> +++ b/lib/librte_eal/include/generic/rte_prefetch.h
 >> @@ -51,4 +51,17 @@
 >>   */
 >>  static inline void rte_prefetch_non_temporal(const volatile void *p);
 >>
 >> +/**
 >> + * Demote a cache line to a more distant level of cache from the
 >processor.
 >> + *
 >> + * CLDEMOTE hints to hardware to move (demote) a cache line from the
 >> +closest to
 >> + * the processor to a level more distant from the processor. It is a
 >> +hint and
 >> + * not guarantee. rte_cldemote is intended to speed up things at the
 >> +producer,
 >> + * in the producer-consumer case.
 >> + *
 >
 >Two thoughts here:
 >1. Is it not more the consumer who benefits more since they are the ones
 >receiving the demoted value, while the producer pays a higher cost since
 >they have to demote the value on send?

[OM] CLDEMOTE benefits the consumer. My statement "speed up things at the producer" indicates proximity, i.e. that the distance is reduced.
However, I will reword it to be simpler and more readable.

 >2. Rather than talking about producer consumer case specifically, I think it
 >would be good to replace the last sentence with what you have in the cover
 >letter about it being for sharing, and to indicate that a line may be accessed
 >by a different core in the future.

 
[OM] Good point; many other cores could benefit, not just a single consumer. I will update this.

 >
 >> + * @param p
 >> + *   Address to demote
 >> + */
 >> +static inline void rte_cldemote(const volatile void *p);
 >> +
 >>  #endif /* _RTE_PREFETCH_H_ */
 >> diff --git a/lib/librte_eal/ppc/include/rte_prefetch.h
 >> b/lib/librte_eal/ppc/include/rte_prefetch.h
 >> index 9ba07c8..3fe9655 100644
 >> --- a/lib/librte_eal/ppc/include/rte_prefetch.h
 >> +++ b/lib/librte_eal/ppc/include/rte_prefetch.h
 >> @@ -34,6 +34,11 @@ static inline void rte_prefetch_non_temporal(const
 >volatile void *p)
 >>  	rte_prefetch0(p);
 >>  }
 >>
 >> +static inline void rte_cldemote(const volatile void *p) {
 >> +	RTE_SET_USED(p);
 >> +}
 >> +
 >>  #ifdef __cplusplus
 >>  }
 >>  #endif
 >> diff --git a/lib/librte_eal/x86/include/rte_prefetch.h
 >> b/lib/librte_eal/x86/include/rte_prefetch.h
 >> index 384c6b3..029d06e 100644
 >> --- a/lib/librte_eal/x86/include/rte_prefetch.h
 >> +++ b/lib/librte_eal/x86/include/rte_prefetch.h
 >> @@ -32,6 +32,15 @@ static inline void rte_prefetch_non_temporal(const
 >volatile void *p)
 >>  	asm volatile ("prefetchnta %[p]" : : [p] "m" (*(const volatile char
 >> *)p));  }
 >>
 >> +/*
 >> + * we're using raw byte codes for now as only the newest compiler
 >> + * versions support this instruction natively.
 >> + */
 >> +static inline void rte_cldemote(const volatile void *p) {
 >> +	asm volatile(".byte 0x0f, 0x1c, 0x06" :: "S" (p)); }
 >> +
 >>  #ifdef __cplusplus
 >>  }
 >>  #endif
 >> --
 >> 1.8.3.1
 >>
  

Patch

diff --git a/doc/guides/rel_notes/release_20_11.rst b/doc/guides/rel_notes/release_20_11.rst
index df227a1..b844b96 100644
--- a/doc/guides/rel_notes/release_20_11.rst
+++ b/doc/guides/rel_notes/release_20_11.rst
@@ -55,6 +55,12 @@  New Features
      Also, make sure to start the actual text at the margin.
      =======================================================
 
+* **Added new function rte_cldemote in rte_prefetch.h.**
+
+  Added a hardware hint CLDEMOTE, which is similar to prefetch in reverse.
+  CLDEMOTE moves the cache line to a more remote cache level, where sharing
+  is expected to be efficient. Moving the cache line to a level more distant
+  from the processor helps to accelerate core-to-core communication.
 
 Removed Items
 -------------
diff --git a/lib/librte_eal/arm/include/rte_prefetch_32.h b/lib/librte_eal/arm/include/rte_prefetch_32.h
index e53420a..ad91edd 100644
--- a/lib/librte_eal/arm/include/rte_prefetch_32.h
+++ b/lib/librte_eal/arm/include/rte_prefetch_32.h
@@ -33,6 +33,11 @@  static inline void rte_prefetch_non_temporal(const volatile void *p)
 	rte_prefetch0(p);
 }
 
+static inline void rte_cldemote(const volatile void *p)
+{
+	RTE_SET_USED(p);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/arm/include/rte_prefetch_64.h b/lib/librte_eal/arm/include/rte_prefetch_64.h
index fc2b391..35d278a 100644
--- a/lib/librte_eal/arm/include/rte_prefetch_64.h
+++ b/lib/librte_eal/arm/include/rte_prefetch_64.h
@@ -32,6 +32,11 @@  static inline void rte_prefetch_non_temporal(const volatile void *p)
 	asm volatile ("PRFM PLDL1STRM, [%0]" : : "r" (p));
 }
 
+static inline void rte_cldemote(const volatile void *p)
+{
+	RTE_SET_USED(p);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/include/generic/rte_prefetch.h b/lib/librte_eal/include/generic/rte_prefetch.h
index 6e47bdf..8742412 100644
--- a/lib/librte_eal/include/generic/rte_prefetch.h
+++ b/lib/librte_eal/include/generic/rte_prefetch.h
@@ -51,4 +51,17 @@ 
  */
 static inline void rte_prefetch_non_temporal(const volatile void *p);
 
+/**
+ * Demote a cache line to a more distant level of cache from the processor.
+ *
+ * CLDEMOTE hints to hardware to move (demote) a cache line from the cache
+ * level closest to the processor to a level more distant from the processor.
+ * It is a hint and not a guarantee. rte_cldemote is intended for cache lines
+ * that are likely to be accessed by a different core in the future.
+ *
+ * @param p
+ *   Address to demote
+ */
+static inline void rte_cldemote(const volatile void *p);
+
 #endif /* _RTE_PREFETCH_H_ */
diff --git a/lib/librte_eal/ppc/include/rte_prefetch.h b/lib/librte_eal/ppc/include/rte_prefetch.h
index 9ba07c8..3fe9655 100644
--- a/lib/librte_eal/ppc/include/rte_prefetch.h
+++ b/lib/librte_eal/ppc/include/rte_prefetch.h
@@ -34,6 +34,11 @@  static inline void rte_prefetch_non_temporal(const volatile void *p)
 	rte_prefetch0(p);
 }
 
+static inline void rte_cldemote(const volatile void *p)
+{
+	RTE_SET_USED(p);
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/x86/include/rte_prefetch.h b/lib/librte_eal/x86/include/rte_prefetch.h
index 384c6b3..029d06e 100644
--- a/lib/librte_eal/x86/include/rte_prefetch.h
+++ b/lib/librte_eal/x86/include/rte_prefetch.h
@@ -32,6 +32,15 @@  static inline void rte_prefetch_non_temporal(const volatile void *p)
 	asm volatile ("prefetchnta %[p]" : : [p] "m" (*(const volatile char *)p));
 }
 
+/*
+ * we're using raw byte codes for now as only the newest compiler
+ * versions support this instruction natively.
+ */
+static inline void rte_cldemote(const volatile void *p)
+{
+	asm volatile(".byte 0x0f, 0x1c, 0x06" :: "S" (p));
+}
+
 #ifdef __cplusplus
 }
 #endif