[dpdk-dev,v1,01/14] ring: remove split cacheline build setting

Message ID 20170223172407.27664-2-bruce.richardson@intel.com (mailing list archive)
State Superseded, archived
Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Bruce Richardson Feb. 23, 2017, 5:23 p.m. UTC
  Users compiling DPDK should not need to know or care about the arrangement
of cachelines in the rte_ring structure. Therefore just remove the build
option and set the structures to be always split. For improved
performance use 128B rather than 64B alignment since it stops the producer
and consumer data being on adjacent cachelines.

Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
---
 config/common_base                     | 1 -
 doc/guides/rel_notes/release_17_05.rst | 6 ++++++
 lib/librte_ring/rte_ring.c             | 2 --
 lib/librte_ring/rte_ring.h             | 8 ++------
 4 files changed, 8 insertions(+), 9 deletions(-)
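
As a rough illustration of the resulting layout (a sketch, not part of the patch; it assumes a 64B-cacheline target, GCC-style attributes and a cut-down stand-in for struct rte_ring), the 128B alignment leaves a full spare cacheline between the producer and consumer data, which can be verified at compile time:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define CACHE_LINE 64	/* assumed target cacheline size */

struct ring_sketch {	/* stand-in, not the real struct rte_ring */
	char meta[32];	/* name, flags, etc. */
	struct {
		volatile uint32_t head, tail;
	} prod __attribute__((aligned(CACHE_LINE * 2)));
	struct {
		volatile uint32_t head, tail;
	} cons __attribute__((aligned(CACHE_LINE * 2)));
};

/* prod and cons start 128B apart, so the 64B line after prod stays
 * empty and neither line of the prod pair is adjacent to cons. */
static_assert(offsetof(struct ring_sketch, cons) -
	      offsetof(struct ring_sketch, prod) == 2 * CACHE_LINE,
	      "prod/cons not separated by a spare cacheline");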
  

Comments

Jerin Jacob Feb. 28, 2017, 11:35 a.m. UTC | #1
On Thu, Feb 23, 2017 at 05:23:54PM +0000, Bruce Richardson wrote:
> Users compiling DPDK should not need to know or care about the arrangement
> of cachelines in the rte_ring structure. Therefore just remove the build
> option and set the structures to be always split. For improved
> performance use 128B rather than 64B alignment since it stops the producer
> and consumer data being on adjacent cachelines.
> 
> Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
> [...]
> diff --git a/lib/librte_ring/rte_ring.h b/lib/librte_ring/rte_ring.h
> index 72ccca5..04fe667 100644
> --- a/lib/librte_ring/rte_ring.h
> +++ b/lib/librte_ring/rte_ring.h
> @@ -168,7 +168,7 @@ struct rte_ring {
>  		uint32_t mask;           /**< Mask (size-1) of ring. */
>  		volatile uint32_t head;  /**< Producer head. */
>  		volatile uint32_t tail;  /**< Producer tail. */
> -	} prod __rte_cache_aligned;
> +	} prod __rte_aligned(RTE_CACHE_LINE_SIZE * 2);

I think we need to use RTE_CACHE_LINE_MIN_SIZE instead of
RTE_CACHE_LINE_SIZE for alignment here. PPC and ThunderX1 targets have a
cache line size of 128B.

>  
>  	/** Ring consumer status. */
>  	struct cons {
> @@ -177,11 +177,7 @@ struct rte_ring {
>  		uint32_t mask;           /**< Mask (size-1) of ring. */
>  		volatile uint32_t head;  /**< Consumer head. */
>  		volatile uint32_t tail;  /**< Consumer tail. */
> -#ifdef RTE_RING_SPLIT_PROD_CONS
> -	} cons __rte_cache_aligned;
> -#else
> -	} cons;
> -#endif
> +	} cons __rte_aligned(RTE_CACHE_LINE_SIZE * 2);
>  
>  #ifdef RTE_LIBRTE_RING_DEBUG
>  	struct rte_ring_debug_stats stats[RTE_MAX_LCORE];
> -- 
> 2.9.3
>
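
For background on the two macros in Jerin's comment (a sketch with assumed values: in DPDK, RTE_CACHE_LINE_SIZE is set per target while RTE_CACHE_LINE_MIN_SIZE is a fixed 64B lower bound), the two alignment expressions diverge exactly on the 128B-cacheline targets he mentions:

/* Assumed values for a 128B-cacheline build such as ThunderX1 or PPC. */
#define RTE_CACHE_LINE_SIZE	128	/* per-target setting */
#define RTE_CACHE_LINE_MIN_SIZE	 64	/* fixed across targets */

/* (RTE_CACHE_LINE_SIZE * 2) = 256B: one spare 128B line per section. */
struct s1 { int x; } __attribute__((aligned(RTE_CACHE_LINE_SIZE * 2)));
/* (RTE_CACHE_LINE_MIN_SIZE * 2) = 128B: exactly one line, no spare. */
struct s2 { int x; } __attribute__((aligned(RTE_CACHE_LINE_MIN_SIZE * 2)));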
  
Bruce Richardson Feb. 28, 2017, 11:57 a.m. UTC | #2
On Tue, Feb 28, 2017 at 05:05:13PM +0530, Jerin Jacob wrote:
> On Thu, Feb 23, 2017 at 05:23:54PM +0000, Bruce Richardson wrote:
> > [...]
> > -	} prod __rte_cache_aligned;
> > +	} prod __rte_aligned(RTE_CACHE_LINE_SIZE * 2);
> 
> I think we need to use RTE_CACHE_LINE_MIN_SIZE instead of
> RTE_CACHE_LINE_SIZE for alignment here. PPC and ThunderX1 targets have a
> cache line size of 128B.
> 
Sure.

However, can you perhaps try a performance test and check to see if
there is a performance difference between the two values before I change
it? In my tests I see improved performance by having an extra blank
cache-line between the producer and consumer data.

/Bruce
  
Jerin Jacob Feb. 28, 2017, 12:08 p.m. UTC | #3
On Tue, Feb 28, 2017 at 11:57:03AM +0000, Bruce Richardson wrote:
> On Tue, Feb 28, 2017 at 05:05:13PM +0530, Jerin Jacob wrote:
> > On Thu, Feb 23, 2017 at 05:23:54PM +0000, Bruce Richardson wrote:
> > [...]
> 
> Sure.
> 
> However, can you perhaps try a performance test and check to see if
> there is a performance difference between the two values before I change
> it? In my tests I see improved performance by having an extra blank
> cache-line between the producer and consumer data.

Sure. Which test are you running to measure the performance difference?
Is it app/test/test_ring_perf.c?

> 
> /Bruce
  
Bruce Richardson Feb. 28, 2017, 1:52 p.m. UTC | #4
On Tue, Feb 28, 2017 at 05:38:34PM +0530, Jerin Jacob wrote:
> On Tue, Feb 28, 2017 at 11:57:03AM +0000, Bruce Richardson wrote:
> > On Tue, Feb 28, 2017 at 05:05:13PM +0530, Jerin Jacob wrote:
> > > On Thu, Feb 23, 2017 at 05:23:54PM +0000, Bruce Richardson wrote:
> > > [...]
> > 
> > Sure.
> > 
> > However, can you perhaps try a performance test and check to see if
> > there is a performance difference between the two values before I change
> > it? In my tests I see improved performance by having an extra blank
> > cache-line between the producer and consumer data.
> 
> Sure. Which test are you running to measure the performance difference?
> Is it app/test/test_ring_perf.c?
> 
> > 
Yep, just the basic ring perf test. I look mostly at the core-to-core
numbers, since hyperthread-to-hyperthread or NUMA socket to NUMA socket
would be far less common use cases IMHO.

/Bruce
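
For reference, the core-to-core numbers in this test come from timing bulk enqueue/dequeue loops on two lcores sharing one ring; a cut-down sketch in that style (not the actual app/test code, which also varies bulk sizes and core pairings) looks like:

#include <stdio.h>
#include <rte_ring.h>
#include <rte_cycles.h>

#define ITERS 1000000
#define BULK  32

/* Launched on one lcore via rte_eal_remote_launch(); a matching
 * deq_lcore() runs on a second core. Pre-17.05, the bulk calls
 * return 0 on success and -ENOBUFS/-ENOENT on failure. */
static int
enq_lcore(void *arg)
{
	struct rte_ring *r = arg;
	void *burst[BULK] = { NULL };
	uint64_t start = rte_rdtsc();
	for (unsigned int i = 0; i < ITERS; i++)
		while (rte_ring_sp_enqueue_bulk(r, burst, BULK) != 0)
			; /* ring full: wait for the dequeuing core */
	printf("SP enq: %.2f cycles/op\n",
	       (double)(rte_rdtsc() - start) / ((double)ITERS * BULK));
	return 0;
}

static int
deq_lcore(void *arg)
{
	struct rte_ring *r = arg;
	void *burst[BULK];
	for (unsigned int i = 0; i < ITERS; i++)
		while (rte_ring_sc_dequeue_bulk(r, burst, BULK) != 0)
			; /* ring empty: wait for the enqueuing core */
	return 0;
}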
  
Jerin Jacob Feb. 28, 2017, 5:54 p.m. UTC | #5
On Tue, Feb 28, 2017 at 01:52:26PM +0000, Bruce Richardson wrote:
> On Tue, Feb 28, 2017 at 05:38:34PM +0530, Jerin Jacob wrote:
> > On Tue, Feb 28, 2017 at 11:57:03AM +0000, Bruce Richardson wrote:
> > > On Tue, Feb 28, 2017 at 05:05:13PM +0530, Jerin Jacob wrote:
> > > > On Thu, Feb 23, 2017 at 05:23:54PM +0000, Bruce Richardson wrote:
> > > > [...]
> > > 
> > > Sure.
> > > 
> > > However, can you perhaps try a performance test and check to see if
> > > there is a performance difference between the two values before I change
> > > it? In my tests I see improved performance by having an extra blank
> > > cache-line between the producer and consumer data.
> > 
> > Sure. Which test are you running to measure the performance difference?
> > Is it app/test/test_ring_perf.c?
> > 
> > > 
> Yep, just the basic ring perf test. I look mostly at the core-to-core
> numbers, since hyperthread-to-hyperthread or NUMA socket to NUMA socket
> would be far less common use cases IMHO.

Performance test results show a regression with the RTE_CACHE_LINE_MIN_SIZE
scheme in some use cases, and higher performance in others (testing using
two physical cores).


# base code
RTE>>ring_perf_autotest
### Testing single element and burst enq/deq ###
SP/SC single enq/dequeue: 84
MP/MC single enq/dequeue: 301
SP/SC burst enq/dequeue (size: 8): 20
MP/MC burst enq/dequeue (size: 8): 46
SP/SC burst enq/dequeue (size: 32): 12
MP/MC burst enq/dequeue (size: 32): 18

### Testing empty dequeue ###
SC empty dequeue: 7.11
MC empty dequeue: 12.15

### Testing using a single lcore ###
SP/SC bulk enq/dequeue (size: 8): 19.08
MP/MC bulk enq/dequeue (size: 8): 46.28
SP/SC bulk enq/dequeue (size: 32): 11.89
MP/MC bulk enq/dequeue (size: 32): 18.84

### Testing using two physical cores ###
SP/SC bulk enq/dequeue (size: 8): 37.42
MP/MC bulk enq/dequeue (size: 8): 73.32
SP/SC bulk enq/dequeue (size: 32): 18.69
MP/MC bulk enq/dequeue (size: 32): 24.59
Test OK

# with ring rework patch
RTE>>ring_perf_autotest
### Testing single element and burst enq/deq ###
SP/SC single enq/dequeue: 84
MP/MC single enq/dequeue: 301
SP/SC burst enq/dequeue (size: 8): 19
MP/MC burst enq/dequeue (size: 8): 45
SP/SC burst enq/dequeue (size: 32): 11
MP/MC burst enq/dequeue (size: 32): 18

### Testing empty dequeue ###
SC empty dequeue: 7.10
MC empty dequeue: 12.15

### Testing using a single lcore ###
SP/SC bulk enq/dequeue (size: 8): 18.59
MP/MC bulk enq/dequeue (size: 8): 45.49
SP/SC bulk enq/dequeue (size: 32): 11.67
MP/MC bulk enq/dequeue (size: 32): 18.65

### Testing using two physical cores ###
SP/SC bulk enq/dequeue (size: 8): 37.41
MP/MC bulk enq/dequeue (size: 8): 72.98
SP/SC bulk enq/dequeue (size: 32): 18.69
MP/MC bulk enq/dequeue (size: 32): 24.59
Test OK
RTE>>

# with ring rework patch + alignment changed to one cache line on the 128B CL target
RTE>>ring_perf_autotest
### Testing single element and burst enq/deq ###
SP/SC single enq/dequeue: 90
MP/MC single enq/dequeue: 317
SP/SC burst enq/dequeue (size: 8): 20
MP/MC burst enq/dequeue (size: 8): 48
SP/SC burst enq/dequeue (size: 32): 11
MP/MC burst enq/dequeue (size: 32): 18

### Testing empty dequeue ###
SC empty dequeue: 8.10
MC empty dequeue: 11.15

### Testing using a single lcore ###
SP/SC bulk enq/dequeue (size: 8): 20.24
MP/MC bulk enq/dequeue (size: 8): 48.43
SP/SC bulk enq/dequeue (size: 32): 11.01
MP/MC bulk enq/dequeue (size: 32): 18.43

### Testing using two physical cores ###
SP/SC bulk enq/dequeue (size: 8): 25.92
MP/MC bulk enq/dequeue (size: 8): 69.76
SP/SC bulk enq/dequeue (size: 32): 14.27
MP/MC bulk enq/dequeue (size: 32): 22.94
Test OK
RTE>>
  
Bruce Richardson March 1, 2017, 9:47 a.m. UTC | #6
On Tue, Feb 28, 2017 at 11:24:25PM +0530, Jerin Jacob wrote:
> On Tue, Feb 28, 2017 at 01:52:26PM +0000, Bruce Richardson wrote:
> > On Tue, Feb 28, 2017 at 05:38:34PM +0530, Jerin Jacob wrote:
> > > On Tue, Feb 28, 2017 at 11:57:03AM +0000, Bruce Richardson wrote:
> > > > On Tue, Feb 28, 2017 at 05:05:13PM +0530, Jerin Jacob wrote:
> > > > > On Thu, Feb 23, 2017 at 05:23:54PM +0000, Bruce Richardson wrote:
> > > > > [...]
> > > > Sure.
> > > > 
> > > > However, can you perhaps try a performance test and check to see if
> > > > there is a performance difference between the two values before I change
> > > > it? In my tests I see improved performance by having an extra blank
> > > > cache-line between the producer and consumer data.
> > > 
> > > Sure. Which test are you running to measure the performance difference?
> > > Is it app/test/test_ring_perf.c?
> > > 
> > > > 
> > Yep, just the basic ring perf test. I look mostly at the core-to-core
> > numbers, since hyperthread-to-hyperthread or NUMA socket to NUMA socket
> > would be far less common use cases IMHO.
> 
> Performance test results show a regression with the RTE_CACHE_LINE_MIN_SIZE
> scheme in some use cases, and higher performance in others (testing using
> two physical cores).
> 
> 
> [...]

So given that there is not much difference here, is the MIN_SIZE i.e.
forced 64B, your preference, rather than actual cacheline-size?

/Bruce
  
Olivier Matz March 1, 2017, 10:17 a.m. UTC | #7
Hi Bruce,

On Wed, 1 Mar 2017 09:47:03 +0000, Bruce Richardson
<bruce.richardson@intel.com> wrote:
> On Tue, Feb 28, 2017 at 11:24:25PM +0530, Jerin Jacob wrote:
> > On Tue, Feb 28, 2017 at 01:52:26PM +0000, Bruce Richardson wrote:  
> > > On Tue, Feb 28, 2017 at 05:38:34PM +0530, Jerin Jacob wrote:  
> > > > On Tue, Feb 28, 2017 at 11:57:03AM +0000, Bruce Richardson
> > > > wrote:  
> > > > > On Tue, Feb 28, 2017 at 05:05:13PM +0530, Jerin Jacob wrote:  
> > > > > > On Thu, Feb 23, 2017 at 05:23:54PM +0000, Bruce Richardson
> > > > > > wrote:  
> > > > > > > Users compiling DPDK should not need to know or care
> > > > > > > about the arrangement of cachelines in the rte_ring
> > > > > > > structure. Therefore just remove the build option and set
> > > > > > > the structures to be always split. For improved
> > > > > > > performance use 128B rather than 64B alignment since it
> > > > > > > stops the producer and consumer data being on adjacent


You say you see an improved performance on Intel by having an extra
blank cache-line between the producer and consumer data. Do you have an
idea why it behaves like this? Do you think it is related to the
hardware adjacent cache line prefetcher?



> [...]
> 
> So given that there is not much difference here, is the MIN_SIZE i.e.
> forced 64B, your preference, rather than actual cacheline-size?
> 

I don't quite like this macro CACHE_LINE_MIN_SIZE. For me, it does not
mean anything. The reasons for aligning on a cache line size are
straightforward, but when should we need to align on the minimum
cache line size supported by dpdk? For instance, in mbuf structure,
aligning on 64 would make more sense to me.

So, I would prefer using (RTE_CACHE_LINE_SIZE * 2) here. If we don't
want it on some architectures, or if this optimization is only for Intel
(or all archs that need this optim), I think we could have something
like:

/* bla bla */
#ifdef INTEL
#define __rte_ring_aligned __rte_aligned(RTE_CACHE_LINE_SIZE * 2)
#else
#define __rte_ring_aligned __rte_aligned(RTE_CACHE_LINE_SIZE)
#endif


Olivier
  
Bruce Richardson March 1, 2017, 10:42 a.m. UTC | #8
On Wed, Mar 01, 2017 at 11:17:53AM +0100, Olivier Matz wrote:
> Hi Bruce,
> 
> [...]
> 
> You say you see an improved performance on Intel by having an extra
> blank cache-line between the producer and consumer data. Do you have an
> idea why it behaves like this? Do you think it is related to the
> hardware adjacent cache line prefetcher?
>

That is a likely candidate, but I haven't done a full analysis on the
details to know for sure what all the factors are. We see a similar
effect with the packet distributor library, which uses similar padding.

> 
> 
> > [...]
> > 
> > So given that there is not much difference here, is the MIN_SIZE i.e.
> > forced 64B, your preference, rather than actual cacheline-size?
> > 
> 
> I don't quite like this macro CACHE_LINE_MIN_SIZE. For me, it does not
> mean anything. The reasons for aligning on a cache line size are
> straightforward, but when should we need to align on the minimum
> cache line size supported by dpdk? For instance, in mbuf structure,
> aligning on 64 would make more sense to me.
> 
> So, I would prefer using (RTE_CACHE_LINE_SIZE * 2) here. If we don't
> want it on some architectures, or if this optimization is only for Intel
> (or all archs that need this optim), I think we could have something
> like:
> 
> /* bla bla */
> #ifdef INTEL
> #define __rte_ring_aligned __rte_aligned(RTE_CACHE_LINE_SIZE * 2)
> #else
> #define __rte_ring_aligned __rte_aligned(RTE_CACHE_LINE_SIZE)
> #endif
> 
I would agree that CACHE_LINE_MIN_SIZE probably doesn't make any sense
here, but I'm happy to put in any suitable scheme that others are happy
with. The options are:

* Keep as-is: 
    adv: simplest option, disadv: wastes 128B * 2 on some platforms
* Change to MIN_SIZE: 
    adv: no ifdefs, disadv: doesn't make much sense logically here
* Use ifdefs:
    adv: each platform gets what's best for it, disadv: untidy code, may
    be harder to maintain
* Use hard-coded 128:
    adv: short and simple, disadv: misses any logical reason why 128 is
    used, i.e. magic number

I'm ok with any option above.

/Bruce
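
The prefetcher point above can be pictured with two hypothetical layouts (a sketch assuming a 64B line and an x86-style adjacent cache line prefetcher, which fetches lines in aligned 128B pairs):

#include <stdint.h>

/* 64B alignment: prod and cons sit on back-to-back lines, so they fall
 * into the same aligned 128B pair and a prefetch triggered by one side
 * can pull in the other side's hot line as well. */
struct adjacent {
	uint64_t prod __attribute__((aligned(64)));	/* line N   */
	uint64_t cons __attribute__((aligned(64)));	/* line N+1 */
};

/* 128B alignment: the line after prod is left empty, so prod and cons
 * live in different 128B pairs and the paired prefetch only ever pulls
 * in the spare line, never the other side's data. */
struct spaced {
	uint64_t prod __attribute__((aligned(128)));	/* lines N, N+1 (spare) */
	uint64_t cons __attribute__((aligned(128)));	/* lines N+2, N+3       */
};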
  
Olivier Matz March 1, 2017, 11:06 a.m. UTC | #9
On Wed, 1 Mar 2017 10:42:58 +0000, Bruce Richardson <bruce.richardson@intel.com> wrote:
> On Wed, Mar 01, 2017 at 11:17:53AM +0100, Olivier Matz wrote:
> > On Wed, 1 Mar 2017 09:47:03 +0000, Bruce Richardson
> > <bruce.richardson@intel.com> wrote:  
> > > So given that there is not much difference here, is the MIN_SIZE i.e.
> > > forced 64B, your preference, rather than actual cacheline-size?
>
> [...]
>
> > I don't quite like this macro CACHE_LINE_MIN_SIZE. For me, it does not
> > mean anything. The reasons for aligning on a cache line size are
> > straightforward, but when should we need to align on the minimum
> > cache line size supported by dpdk? For instance, in mbuf structure,
> > aligning on 64 would make more sense to me.
> > 
> > So, I would prefer using (RTE_CACHE_LINE_SIZE * 2) here. If we don't
> > want it on some architectures, or if this optimization is only for Intel
> > (or all archs that need this optim), I think we could have something
> > like:
> > 
> > /* bla bla */
> > #ifdef INTEL
> > #define __rte_ring_aligned __rte_aligned(RTE_CACHE_LINE_SIZE * 2)
> > #else
> > #define __rte_ring_aligned __rte_aligned(RTE_CACHE_LINE_SIZE)
> > #endif
> >   
> I would agree that CACHE_LINE_MIN_SIZE probably doesn't make any sense
> here, but I'm happy to put in any suitable scheme that others are happy
> with. The options are:
> 
> * Keep as-is: 
>     adv: simplest option, disadv: wastes 128B * 2 on some platforms
> * Change to MIN_SIZE: 
>     adv: no ifdefs, disadv: doesn't make much sense logically here
> * Use ifdefs:
>     adv: each platform gets what's best for it, disadv: untidy code, may
>     be harder to maintain
> * Use hard-coded 128:
>     adv: short and simple, disadv: misses any logical reason why 128 is
>     used, i.e. magic number
> 
> I'm ok with any option above.

I'd vote for "Keep as-is" or "Use ifdefs".

Olivier
  
Jerin Jacob March 1, 2017, 11:19 a.m. UTC | #10
On Wed, Mar 01, 2017 at 12:06:33PM +0100, Olivier Matz wrote:
> On Wed, 1 Mar 2017 10:42:58 +0000, Bruce Richardson <bruce.richardson@intel.com> wrote:
> > [...]
> > I'm ok with any option above.
> 
> I'd vote for "Keep as-is" or "Use ifdefs".

I'd vote for "Use ifdefs"; the default configuration can be "RTE_CACHE_LINE_SIZE
* 2", but a target can override it if required.

> 
> Olivier
  
Bruce Richardson March 1, 2017, 12:12 p.m. UTC | #11
On Wed, Mar 01, 2017 at 04:49:56PM +0530, Jerin Jacob wrote:
> On Wed, Mar 01, 2017 at 12:06:33PM +0100, Olivier Matz wrote:
> > On Wed, 1 Mar 2017 10:42:58 +0000, Bruce Richardson <bruce.richardson@intel.com> wrote:
> > > [...]
> > > I'm ok with any option above.
> > 
> > I'd vote for "Keep as-is" or "Use ifdefs".
> 
> I'd vote for "Use ifdefs"; the default configuration can be "RTE_CACHE_LINE_SIZE
> * 2", but a target can override it if required.
> 
> > 
Ok. Will update with some #ifdefs in v2.

/Bruce
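
For reference, one shape such a v2 per-arch override could take (a hypothetical sketch, assuming the RTE_ARCH_X86 build macro and DPDK's __rte_aligned(); the actual v2 patch may differ):

/* Hypothetical per-architecture ring alignment, following the vote above. */
#ifdef RTE_ARCH_X86
/* x86: keep a spare line so the adjacent-line prefetcher never pairs
 * producer and consumer data (two 64B lines = 128B). */
#define RTE_RING_SECTION_ALIGN	(RTE_CACHE_LINE_SIZE * 2)
#else
/* 128B-line targets (PPC, ThunderX1): one line already separates the
 * sections; Jerin's numbers above were mixed between the two schemes. */
#define RTE_RING_SECTION_ALIGN	RTE_CACHE_LINE_SIZE
#endif

/* usage in struct rte_ring:
 *	} prod __rte_aligned(RTE_RING_SECTION_ALIGN);
 *	} cons __rte_aligned(RTE_RING_SECTION_ALIGN);
 */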
  

Patch

diff --git a/config/common_base b/config/common_base
index aeee13e..099ffda 100644
--- a/config/common_base
+++ b/config/common_base
@@ -448,7 +448,6 @@  CONFIG_RTE_LIBRTE_PMD_NULL_CRYPTO=y
 #
 CONFIG_RTE_LIBRTE_RING=y
 CONFIG_RTE_LIBRTE_RING_DEBUG=n
-CONFIG_RTE_RING_SPLIT_PROD_CONS=n
 CONFIG_RTE_RING_PAUSE_REP_COUNT=0
 
 #
diff --git a/doc/guides/rel_notes/release_17_05.rst b/doc/guides/rel_notes/release_17_05.rst
index e25ea9f..ea45e0c 100644
--- a/doc/guides/rel_notes/release_17_05.rst
+++ b/doc/guides/rel_notes/release_17_05.rst
@@ -110,6 +110,12 @@  API Changes
    Also, make sure to start the actual text at the margin.
    =========================================================
 
+* **Reworked rte_ring library**
+
+  The rte_ring library has been reworked and updated. The following changes
+  have been made to it:
+
+  * removed the build-time setting ``CONFIG_RTE_RING_SPLIT_PROD_CONS``
 
 ABI Changes
 -----------
diff --git a/lib/librte_ring/rte_ring.c b/lib/librte_ring/rte_ring.c
index ca0a108..4bc6da1 100644
--- a/lib/librte_ring/rte_ring.c
+++ b/lib/librte_ring/rte_ring.c
@@ -127,10 +127,8 @@  rte_ring_init(struct rte_ring *r, const char *name, unsigned count,
 	/* compilation-time checks */
 	RTE_BUILD_BUG_ON((sizeof(struct rte_ring) &
 			  RTE_CACHE_LINE_MASK) != 0);
-#ifdef RTE_RING_SPLIT_PROD_CONS
 	RTE_BUILD_BUG_ON((offsetof(struct rte_ring, cons) &
 			  RTE_CACHE_LINE_MASK) != 0);
-#endif
 	RTE_BUILD_BUG_ON((offsetof(struct rte_ring, prod) &
 			  RTE_CACHE_LINE_MASK) != 0);
 #ifdef RTE_LIBRTE_RING_DEBUG
diff --git a/lib/librte_ring/rte_ring.h b/lib/librte_ring/rte_ring.h
index 72ccca5..04fe667 100644
--- a/lib/librte_ring/rte_ring.h
+++ b/lib/librte_ring/rte_ring.h
@@ -168,7 +168,7 @@  struct rte_ring {
 		uint32_t mask;           /**< Mask (size-1) of ring. */
 		volatile uint32_t head;  /**< Producer head. */
 		volatile uint32_t tail;  /**< Producer tail. */
-	} prod __rte_cache_aligned;
+	} prod __rte_aligned(RTE_CACHE_LINE_SIZE * 2);
 
 	/** Ring consumer status. */
 	struct cons {
@@ -177,11 +177,7 @@  struct rte_ring {
 		uint32_t mask;           /**< Mask (size-1) of ring. */
 		volatile uint32_t head;  /**< Consumer head. */
 		volatile uint32_t tail;  /**< Consumer tail. */
-#ifdef RTE_RING_SPLIT_PROD_CONS
-	} cons __rte_cache_aligned;
-#else
-	} cons;
-#endif
+	} cons __rte_aligned(RTE_CACHE_LINE_SIZE * 2);
 
 #ifdef RTE_LIBRTE_RING_DEBUG
 	struct rte_ring_debug_stats stats[RTE_MAX_LCORE];