[dpdk-dev,v1,12/14] ring: separate out head index manipulation for enq/deq

Message ID 20170223172407.27664-13-bruce.richardson@intel.com (mailing list archive)
State Superseded, archived
Headers

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation fail apply patch file failure

Commit Message

Bruce Richardson Feb. 23, 2017, 5:24 p.m. UTC
  We can write a single common function for head manipulation for enq
and a common one for deq, allowing us to have a single worker function
for enq and deq, rather than two of each. Update all other inline
functions to use the new functions.

Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
---
 lib/librte_ring/rte_ring.c |   4 +-
 lib/librte_ring/rte_ring.h | 328 ++++++++++++++++++++-------------------------
 2 files changed, 149 insertions(+), 183 deletions(-)
  

Comments

Olivier Matz March 8, 2017, 10:49 a.m. UTC | #1
On Thu, 23 Feb 2017 17:24:05 +0000, Bruce Richardson <bruce.richardson@intel.com> wrote:
> We can write a single common function for head manipulation for enq
> and a common one for deq, allowing us to have a single worker function
> for enq and deq, rather than two of each. Update all other inline
> functions to use the new functions.
> 
> Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
> ---
>  lib/librte_ring/rte_ring.c |   4 +-
>  lib/librte_ring/rte_ring.h | 328 ++++++++++++++++++++-------------------------
>  2 files changed, 149 insertions(+), 183 deletions(-)
> 

[...]

> +static inline __attribute__((always_inline)) unsigned int
> +__rte_ring_do_enqueue(struct rte_ring *r, void * const *obj_table,
> +		 unsigned int n, enum rte_ring_queue_behavior behavior,
> +		 int is_sp, unsigned int *free_space)
>  {
> -	uint32_t prod_head, cons_tail;
> -	uint32_t prod_next, free_entries;
> -	uint32_t mask = r->mask;
> -
> -	prod_head = r->prod.head;
> -	cons_tail = r->cons.tail;
> -	/* The subtraction is done between two unsigned 32bits value
> -	 * (the result is always modulo 32 bits even if we have
> -	 * prod_head > cons_tail). So 'free_entries' is always between 0
> -	 * and size(ring)-1. */
> -	free_entries = mask + cons_tail - prod_head;
> -
> -	/* check that we have enough room in ring */
> -	if (unlikely(n > free_entries))
> -		n = (behavior == RTE_RING_QUEUE_FIXED) ? 0 : free_entries;
> +	uint32_t prod_head, prod_next;
> +	uint32_t free_entries;
>  
> +	n = __rte_ring_move_prod_head(r, is_sp, n, behavior,
> +			&prod_head, &prod_next, &free_entries);
>  	if (n == 0)
>  		goto end;
>  
> -
> -	prod_next = prod_head + n;
> -	r->prod.head = prod_next;
> -
> -	/* write entries in ring */
>  	ENQUEUE_PTRS();
>  	rte_smp_wmb();
>  
> +	/*
> +	 * If there are other enqueues in progress that preceded us,
> +	 * we need to wait for them to complete
> +	 */
> +	while (unlikely(r->prod.tail != prod_head))
> +		rte_pause();
> +

I'd say this part should not be done in case is_sp == 1.
Since it is sometimes a constant arg in an inline func, it may be better
to add the if (is_sp == 0).

[...]

> +static inline __attribute__((always_inline)) unsigned int
> +__rte_ring_do_dequeue(struct rte_ring *r, void **obj_table,
>  		 unsigned int n, enum rte_ring_queue_behavior behavior,
> -		 unsigned int *available)
> +		 int is_mp, unsigned int *available)
>  {
> -	uint32_t cons_head, prod_tail;
> -	uint32_t cons_next, entries;
> -	uint32_t mask = r->mask;
> -
> -	cons_head = r->cons.head;
> -	prod_tail = r->prod.tail;
> -	/* The subtraction is done between two unsigned 32bits value
> -	 * (the result is always modulo 32 bits even if we have
> -	 * cons_head > prod_tail). So 'entries' is always between 0
> -	 * and size(ring)-1. */
> -	entries = prod_tail - cons_head;
> -
> -	if (n > entries)
> -		n = (behavior == RTE_RING_QUEUE_FIXED) ? 0 : entries;
> -
> -	if (unlikely(entries == 0))
> -		goto end;
> +	uint32_t cons_head, cons_next;
> +	uint32_t entries;
>  
> -	cons_next = cons_head + n;
> -	r->cons.head = cons_next;
> +	n = __rte_ring_move_cons_head(r, is_mp, n, behavior,
> +			&cons_head, &cons_next, &entries);
> +	if (n == 0)
> +		goto end;
>  
> -	/* copy in table */
>  	DEQUEUE_PTRS();
>  	rte_smp_rmb();
>  
> +	/*
> +	 * If there are other enqueues in progress that preceded us,
> +	 * we need to wait for them to complete
> +	 */
> +	while (unlikely(r->cons.tail != cons_head))
> +		rte_pause();
> +
>  	r->cons.tail = cons_next;

Same here.
  
Bruce Richardson March 8, 2017, 12:06 p.m. UTC | #2
On Wed, Mar 08, 2017 at 11:49:06AM +0100, Olivier MATZ wrote:
> On Thu, 23 Feb 2017 17:24:05 +0000, Bruce Richardson <bruce.richardson@intel.com> wrote:
> > We can write a single common function for head manipulation for enq
> > and a common one for deq, allowing us to have a single worker function
> > for enq and deq, rather than two of each. Update all other inline
> > functions to use the new functions.
> > 
> > Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
> > ---
> >  lib/librte_ring/rte_ring.c |   4 +-
> >  lib/librte_ring/rte_ring.h | 328 ++++++++++++++++++++-------------------------
> >  2 files changed, 149 insertions(+), 183 deletions(-)
> > 
> 
> [...]
> 
> > +static inline __attribute__((always_inline)) unsigned int
> > +__rte_ring_do_enqueue(struct rte_ring *r, void * const *obj_table,
> > +		 unsigned int n, enum rte_ring_queue_behavior behavior,
> > +		 int is_sp, unsigned int *free_space)
> >  {
> > -	uint32_t prod_head, cons_tail;
> > -	uint32_t prod_next, free_entries;
> > -	uint32_t mask = r->mask;
> > -
> > -	prod_head = r->prod.head;
> > -	cons_tail = r->cons.tail;
> > -	/* The subtraction is done between two unsigned 32bits value
> > -	 * (the result is always modulo 32 bits even if we have
> > -	 * prod_head > cons_tail). So 'free_entries' is always between 0
> > -	 * and size(ring)-1. */
> > -	free_entries = mask + cons_tail - prod_head;
> > -
> > -	/* check that we have enough room in ring */
> > -	if (unlikely(n > free_entries))
> > -		n = (behavior == RTE_RING_QUEUE_FIXED) ? 0 : free_entries;
> > +	uint32_t prod_head, prod_next;
> > +	uint32_t free_entries;
> >  
> > +	n = __rte_ring_move_prod_head(r, is_sp, n, behavior,
> > +			&prod_head, &prod_next, &free_entries);
> >  	if (n == 0)
> >  		goto end;
> >  
> > -
> > -	prod_next = prod_head + n;
> > -	r->prod.head = prod_next;
> > -
> > -	/* write entries in ring */
> >  	ENQUEUE_PTRS();
> >  	rte_smp_wmb();
> >  
> > +	/*
> > +	 * If there are other enqueues in progress that preceded us,
> > +	 * we need to wait for them to complete
> > +	 */
> > +	while (unlikely(r->prod.tail != prod_head))
> > +		rte_pause();
> > +
> 
> I'd say this part should not be done in case is_sp == 1.
> Since it is sometimes a constant arg in an inline func, it may be better
> to add the if (is_sp == 0).
> 
> [...]
> 

Yes, it's an unnecessary check. However, having it in place for the sp
case made no performance difference in my test, so I decided to keep
the code shorter by avoiding an additional branch. If there is a
performance hit I'll remove it, but I would rather not add more branches
to the code in the absence of a real impact to not having them.

Regards,
/Bruce
  
Olivier Matz March 14, 2017, 8:56 a.m. UTC | #3
On Wed, 8 Mar 2017 12:06:54 +0000, Bruce Richardson <bruce.richardson@intel.com> wrote:
> On Wed, Mar 08, 2017 at 11:49:06AM +0100, Olivier MATZ wrote:
> > On Thu, 23 Feb 2017 17:24:05 +0000, Bruce Richardson <bruce.richardson@intel.com> wrote:  
> > > We can write a single common function for head manipulation for enq
> > > and a common one for deq, allowing us to have a single worker function
> > > for enq and deq, rather than two of each. Update all other inline
> > > functions to use the new functions.
> > > 
> > > Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
> > > ---
> > >  lib/librte_ring/rte_ring.c |   4 +-
> > >  lib/librte_ring/rte_ring.h | 328 ++++++++++++++++++++-------------------------
> > >  2 files changed, 149 insertions(+), 183 deletions(-)
> > >   
> > 
> > [...]
> >   
> > > +static inline __attribute__((always_inline)) unsigned int
> > > +__rte_ring_do_enqueue(struct rte_ring *r, void * const *obj_table,
> > > +		 unsigned int n, enum rte_ring_queue_behavior behavior,
> > > +		 int is_sp, unsigned int *free_space)
> > >  {
> > > -	uint32_t prod_head, cons_tail;
> > > -	uint32_t prod_next, free_entries;
> > > -	uint32_t mask = r->mask;
> > > -
> > > -	prod_head = r->prod.head;
> > > -	cons_tail = r->cons.tail;
> > > -	/* The subtraction is done between two unsigned 32bits value
> > > -	 * (the result is always modulo 32 bits even if we have
> > > -	 * prod_head > cons_tail). So 'free_entries' is always between 0
> > > -	 * and size(ring)-1. */
> > > -	free_entries = mask + cons_tail - prod_head;
> > > -
> > > -	/* check that we have enough room in ring */
> > > -	if (unlikely(n > free_entries))
> > > -		n = (behavior == RTE_RING_QUEUE_FIXED) ? 0 : free_entries;
> > > +	uint32_t prod_head, prod_next;
> > > +	uint32_t free_entries;
> > >  
> > > +	n = __rte_ring_move_prod_head(r, is_sp, n, behavior,
> > > +			&prod_head, &prod_next, &free_entries);
> > >  	if (n == 0)
> > >  		goto end;
> > >  
> > > -
> > > -	prod_next = prod_head + n;
> > > -	r->prod.head = prod_next;
> > > -
> > > -	/* write entries in ring */
> > >  	ENQUEUE_PTRS();
> > >  	rte_smp_wmb();
> > >  
> > > +	/*
> > > +	 * If there are other enqueues in progress that preceded us,
> > > +	 * we need to wait for them to complete
> > > +	 */
> > > +	while (unlikely(r->prod.tail != prod_head))
> > > +		rte_pause();
> > > +  
> > 
> > I'd say this part should not be done in case is_sp == 1.
> > Since it is sometimes a constant arg in an inline func, it may be better
> > to add the if (is_sp == 0).
> > 
> > [...]
> >   
> 
> Yes, it's an unnecessary check. However, having it in place for the sp
> case made no performance difference in my test, so I decided to keep
> the code shorter by avoiding an additional branch. If there is a
> performance hit I'll remove it, but I would rather not add more branches
> to the code in the absence of a real impact to not having them.

Ok.
Maybe it's worth checking the numbers given by the unit test.

Olivier
  

Patch

diff --git a/lib/librte_ring/rte_ring.c b/lib/librte_ring/rte_ring.c
index 18fb644..4776079 100644
--- a/lib/librte_ring/rte_ring.c
+++ b/lib/librte_ring/rte_ring.c
@@ -138,8 +138,8 @@  rte_ring_init(struct rte_ring *r, const char *name, unsigned count,
 	if (ret < 0 || ret >= (int)sizeof(r->name))
 		return -ENAMETOOLONG;
 	r->flags = flags;
-	r->prod.sp_enqueue = !!(flags & RING_F_SP_ENQ);
-	r->cons.sc_dequeue = !!(flags & RING_F_SC_DEQ);
+	r->prod.sp_enqueue = (flags & RING_F_SP_ENQ) ? __IS_SP : __IS_MP;
+	r->cons.sc_dequeue = (flags & RING_F_SC_DEQ) ? __IS_SC : __IS_MC;
 	r->size = count;
 	r->mask = count - 1;
 	r->prod.head = r->cons.head = 0;
diff --git a/lib/librte_ring/rte_ring.h b/lib/librte_ring/rte_ring.h
index db50ce9..d10b7d1 100644
--- a/lib/librte_ring/rte_ring.h
+++ b/lib/librte_ring/rte_ring.h
@@ -164,6 +164,12 @@  struct rte_ring {
 #define RING_F_SC_DEQ 0x0002 /**< The default dequeue is "single-consumer". */
 #define RTE_RING_SZ_MASK  (unsigned)(0x0fffffff) /**< Ring size mask */
 
+/* @internal defines for passing to the enqueue dequeue worker functions */
+#define __IS_SP 1
+#define __IS_MP 0
+#define __IS_SC 1
+#define __IS_MC 0
+
 /**
  * Calculate the memory size needed for a ring
  *
@@ -282,7 +288,7 @@  void rte_ring_dump(FILE *f, const struct rte_ring *r);
 #define ENQUEUE_PTRS() do { \
 	unsigned int i; \
 	const uint32_t size = r->size; \
-	uint32_t idx = prod_head & mask; \
+	uint32_t idx = prod_head & r->mask; \
 	if (likely(idx + n < size)) { \
 		for (i = 0; i < (n & ((~(unsigned)0x3))); i+=4, idx+=4) { \
 			r->ring[idx] = obj_table[i]; \
@@ -308,7 +314,7 @@  void rte_ring_dump(FILE *f, const struct rte_ring *r);
  * single and multi consumer dequeue functions */
 #define DEQUEUE_PTRS() do { \
 	unsigned int i; \
-	uint32_t idx = cons_head & mask; \
+	uint32_t idx = cons_head & r->mask; \
 	const uint32_t size = r->size; \
 	if (likely(idx + n < size)) { \
 		for (i = 0; i < (n & (~(unsigned)0x3)); i+=4, idx+=4) {\
@@ -331,83 +337,72 @@  void rte_ring_dump(FILE *f, const struct rte_ring *r);
 } while (0)
 
 /**
- * @internal Enqueue several objects on the ring (multi-producers safe).
- *
- * This function uses a "compare and set" instruction to move the
- * producer index atomically.
+ * @internal This function updates the producer head for enqueue
  *
  * @param r
- *   A pointer to the ring structure.
- * @param obj_table
- *   A pointer to a table of void * pointers (objects).
+ *   A pointer to the ring structure
+ * @param is_sp
+ *   Indicates whether multi-producer path is needed or not
  * @param n
- *   The number of objects to add in the ring from the obj_table.
+ *   The number of elements we will want to enqueue, i.e. how far should the
+ *   head be moved
  * @param behavior
  *   RTE_RING_QUEUE_FIXED:    Enqueue a fixed number of items from a ring
- *   RTE_RING_QUEUE_VARIABLE: Enqueue as many items a possible from ring
+ *   RTE_RING_QUEUE_VARIABLE: Enqueue as many items as possible from ring
+ * @param old_head
+ *   Returns head value as it was before the move, i.e. where enqueue starts
+ * @param new_head
+ *   Returns the current/new head value i.e. where enqueue finishes
+ * @param free_entries
+ *   Returns the amount of free space in the ring BEFORE head was moved
  * @return
  *   Actual number of objects enqueued.
  *   If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only.
  */
-static inline unsigned int __attribute__((always_inline))
-__rte_ring_mp_do_enqueue(struct rte_ring *r, void * const *obj_table,
-			 unsigned int n, enum rte_ring_queue_behavior behavior,
-			 unsigned int *free_space)
+static inline __attribute__((always_inline)) unsigned int
+__rte_ring_move_prod_head(struct rte_ring *r, int is_sp,
+		unsigned int n, enum rte_ring_queue_behavior behavior,
+		uint32_t *old_head, uint32_t *new_head,
+		uint32_t *free_entries)
 {
-	uint32_t prod_head, prod_next;
-	uint32_t cons_tail, free_entries;
-	const unsigned int max = n;
+	const uint32_t mask = r->mask;
+	unsigned int max = n;
 	int success;
-	uint32_t mask = r->mask;
 
-	/* move prod.head atomically */
 	do {
 		/* Reset n to the initial burst count */
 		n = max;
 
-		prod_head = r->prod.head;
-		cons_tail = r->cons.tail;
+		*old_head = r->prod.head;
+		const uint32_t cons_tail = r->cons.tail;
 		/* The subtraction is done between two unsigned 32bits value
 		 * (the result is always modulo 32 bits even if we have
-		 * prod_head > cons_tail). So 'free_entries' is always between 0
+		 * *old_head > cons_tail). So 'free_entries' is always between 0
 		 * and size(ring)-1. */
-		free_entries = (mask + cons_tail - prod_head);
+		*free_entries = (mask + cons_tail - *old_head);
 
 		/* check that we have enough room in ring */
-		if (unlikely(n > free_entries))
+		if (unlikely(n > *free_entries))
 			n = (behavior == RTE_RING_QUEUE_FIXED) ?
-					0 : free_entries;
+					0 : *free_entries;
 
 		if (n == 0)
-			goto end;
-
-		prod_next = prod_head + n;
-		success = rte_atomic32_cmpset(&r->prod.head, prod_head,
-					      prod_next);
+			return 0;
+
+		*new_head = *old_head + n;
+		if (is_sp)
+			r->prod.head = *new_head, success = 1;
+		else
+			success = rte_atomic32_cmpset(&r->prod.head,
+					*old_head, *new_head);
 	} while (unlikely(success == 0));
-
-	/* write entries in ring */
-	ENQUEUE_PTRS();
-	rte_smp_wmb();
-
-	/*
-	 * If there are other enqueues in progress that preceded us,
-	 * we need to wait for them to complete
-	 */
-	while (unlikely(r->prod.tail != prod_head))
-		rte_pause();
-
-	r->prod.tail = prod_next;
-end:
-	if (free_space != NULL)
-		*free_space = free_entries - n;
 	return n;
 }
 
 /**
- * @internal Enqueue several objects on a ring (NOT multi-producers safe).
+ * @internal Enqueue several objects on the ring
  *
- * @param r
+  * @param r
  *   A pointer to the ring structure.
  * @param obj_table
  *   A pointer to a table of void * pointers (objects).
@@ -415,44 +410,40 @@  __rte_ring_mp_do_enqueue(struct rte_ring *r, void * const *obj_table,
  *   The number of objects to add in the ring from the obj_table.
  * @param behavior
  *   RTE_RING_QUEUE_FIXED:    Enqueue a fixed number of items from a ring
- *   RTE_RING_QUEUE_VARIABLE: Enqueue as many items a possible from ring
+ *   RTE_RING_QUEUE_VARIABLE: Enqueue as many items as possible from ring
+ * @param is_sp
+ *   Indicates whether to use single producer or multi-producer head update
+ * @param free_space
+ *   returns the amount of space after the enqueue operation has finished
  * @return
  *   Actual number of objects enqueued.
  *   If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only.
  */
-static inline unsigned int __attribute__((always_inline))
-__rte_ring_sp_do_enqueue(struct rte_ring *r, void * const *obj_table,
-			 unsigned int n, enum rte_ring_queue_behavior behavior,
-			 unsigned int *free_space)
+static inline __attribute__((always_inline)) unsigned int
+__rte_ring_do_enqueue(struct rte_ring *r, void * const *obj_table,
+		 unsigned int n, enum rte_ring_queue_behavior behavior,
+		 int is_sp, unsigned int *free_space)
 {
-	uint32_t prod_head, cons_tail;
-	uint32_t prod_next, free_entries;
-	uint32_t mask = r->mask;
-
-	prod_head = r->prod.head;
-	cons_tail = r->cons.tail;
-	/* The subtraction is done between two unsigned 32bits value
-	 * (the result is always modulo 32 bits even if we have
-	 * prod_head > cons_tail). So 'free_entries' is always between 0
-	 * and size(ring)-1. */
-	free_entries = mask + cons_tail - prod_head;
-
-	/* check that we have enough room in ring */
-	if (unlikely(n > free_entries))
-		n = (behavior == RTE_RING_QUEUE_FIXED) ? 0 : free_entries;
+	uint32_t prod_head, prod_next;
+	uint32_t free_entries;
 
+	n = __rte_ring_move_prod_head(r, is_sp, n, behavior,
+			&prod_head, &prod_next, &free_entries);
 	if (n == 0)
 		goto end;
 
-
-	prod_next = prod_head + n;
-	r->prod.head = prod_next;
-
-	/* write entries in ring */
 	ENQUEUE_PTRS();
 	rte_smp_wmb();
 
+	/*
+	 * If there are other enqueues in progress that preceded us,
+	 * we need to wait for them to complete
+	 */
+	while (unlikely(r->prod.tail != prod_head))
+		rte_pause();
+
 	r->prod.tail = prod_next;
+
 end:
 	if (free_space != NULL)
 		*free_space = free_entries - n;
@@ -460,130 +451,112 @@  __rte_ring_sp_do_enqueue(struct rte_ring *r, void * const *obj_table,
 }
 
 /**
- * @internal Dequeue several objects from a ring (multi-consumers safe). When
- * the request objects are more than the available objects, only dequeue the
- * actual number of objects
- *
- * This function uses a "compare and set" instruction to move the
- * consumer index atomically.
+ * @internal This function updates the consumer head for dequeue
  *
  * @param r
- *   A pointer to the ring structure.
- * @param obj_table
- *   A pointer to a table of void * pointers (objects) that will be filled.
+ *   A pointer to the ring structure
+ * @param is_sc
+ *   Indicates whether multi-consumer path is needed or not
  * @param n
- *   The number of objects to dequeue from the ring to the obj_table.
+ *   The number of elements we will want to dequeue, i.e. how far should the
+ *   head be moved
  * @param behavior
  *   RTE_RING_QUEUE_FIXED:    Dequeue a fixed number of items from a ring
- *   RTE_RING_QUEUE_VARIABLE: Dequeue as many items a possible from ring
+ *   RTE_RING_QUEUE_VARIABLE: Dequeue as many items as possible from ring
+ * @param old_head
+ *   Returns head value as it was before the move, i.e. where dequeue starts
+ * @param new_head
+ *   Returns the current/new head value i.e. where dequeue finishes
+ * @param entries
+ *   Returns the number of entries in the ring BEFORE head was moved
  * @return
  *   - Actual number of objects dequeued.
  *     If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only.
  */
-
-static inline unsigned int __attribute__((always_inline))
-__rte_ring_mc_do_dequeue(struct rte_ring *r, void **obj_table,
-		 unsigned int n, enum rte_ring_queue_behavior behavior,
-		 unsigned int *available)
+static inline __attribute__((always_inline)) unsigned int
+__rte_ring_move_cons_head(struct rte_ring *r, int is_sc,
+		unsigned int n, enum rte_ring_queue_behavior behavior,
+		uint32_t *old_head, uint32_t *new_head,
+		uint32_t *entries)
 {
-	uint32_t cons_head, prod_tail;
-	uint32_t cons_next, entries;
-	const unsigned max = n;
+	unsigned int max = n;
 	int success;
-	uint32_t mask = r->mask;
 
 	/* move cons.head atomically */
 	do {
 		/* Restore n as it may change every loop */
 		n = max;
 
-		cons_head = r->cons.head;
-		prod_tail = r->prod.tail;
+		*old_head = r->cons.head;
+		const uint32_t prod_tail = r->prod.tail;
 		/* The subtraction is done between two unsigned 32bits value
 		 * (the result is always modulo 32 bits even if we have
 		 * cons_head > prod_tail). So 'entries' is always between 0
 		 * and size(ring)-1. */
-		entries = (prod_tail - cons_head);
+		*entries = (prod_tail - *old_head);
 
 		/* Set the actual entries for dequeue */
-		if (n > entries)
-			n = (behavior == RTE_RING_QUEUE_FIXED) ? 0 : entries;
+		if (n > *entries)
+			n = (behavior == RTE_RING_QUEUE_FIXED) ? 0 : *entries;
 
 		if (unlikely(n == 0))
-			goto end;
-
-		cons_next = cons_head + n;
-		success = rte_atomic32_cmpset(&r->cons.head, cons_head,
-					      cons_next);
+			return 0;
+
+		*new_head = *old_head + n;
+		if (is_sc)
+			r->cons.head = *new_head, success = 1;
+		else
+			success = rte_atomic32_cmpset(&r->cons.head, *old_head,
+					*new_head);
 	} while (unlikely(success == 0));
-
-	/* copy in table */
-	DEQUEUE_PTRS();
-	rte_smp_rmb();
-
-	/*
-	 * If there are other dequeues in progress that preceded us,
-	 * we need to wait for them to complete
-	 */
-	while (unlikely(r->cons.tail != cons_head))
-		rte_pause();
-
-	r->cons.tail = cons_next;
-end:
-	if (available != NULL)
-		*available = entries - n;
 	return n;
 }
 
 /**
- * @internal Dequeue several objects from a ring (NOT multi-consumers safe).
- * When the request objects are more than the available objects, only dequeue
- * the actual number of objects
+ * @internal Dequeue several objects from the ring
  *
  * @param r
  *   A pointer to the ring structure.
  * @param obj_table
- *   A pointer to a table of void * pointers (objects) that will be filled.
+ *   A pointer to a table of void * pointers (objects).
  * @param n
- *   The number of objects to dequeue from the ring to the obj_table.
+ *   The number of objects to pull from the ring.
  * @param behavior
  *   RTE_RING_QUEUE_FIXED:    Dequeue a fixed number of items from a ring
- *   RTE_RING_QUEUE_VARIABLE: Dequeue as many items a possible from ring
+ *   RTE_RING_QUEUE_VARIABLE: Dequeue as many items as possible from ring
+ * @param is_sc
+ *   Indicates whether to use single consumer or multi-consumer head update
+ * @param available
+ *   returns the number of remaining ring entries after the dequeue has finished
  * @return
  *   - Actual number of objects dequeued.
  *     If behavior == RTE_RING_QUEUE_FIXED, this will be 0 or n only.
  */
-static inline unsigned int __attribute__((always_inline))
-__rte_ring_sc_do_dequeue(struct rte_ring *r, void **obj_table,
+static inline __attribute__((always_inline)) unsigned int
+__rte_ring_do_dequeue(struct rte_ring *r, void **obj_table,
 		 unsigned int n, enum rte_ring_queue_behavior behavior,
-		 unsigned int *available)
+		 int is_mp, unsigned int *available)
 {
-	uint32_t cons_head, prod_tail;
-	uint32_t cons_next, entries;
-	uint32_t mask = r->mask;
-
-	cons_head = r->cons.head;
-	prod_tail = r->prod.tail;
-	/* The subtraction is done between two unsigned 32bits value
-	 * (the result is always modulo 32 bits even if we have
-	 * cons_head > prod_tail). So 'entries' is always between 0
-	 * and size(ring)-1. */
-	entries = prod_tail - cons_head;
-
-	if (n > entries)
-		n = (behavior == RTE_RING_QUEUE_FIXED) ? 0 : entries;
-
-	if (unlikely(entries == 0))
-		goto end;
+	uint32_t cons_head, cons_next;
+	uint32_t entries;
 
-	cons_next = cons_head + n;
-	r->cons.head = cons_next;
+	n = __rte_ring_move_cons_head(r, is_mp, n, behavior,
+			&cons_head, &cons_next, &entries);
+	if (n == 0)
+		goto end;
 
-	/* copy in table */
 	DEQUEUE_PTRS();
 	rte_smp_rmb();
 
+	/*
+	 * If there are other dequeues in progress that preceded us,
+	 * we need to wait for them to complete
+	 */
+	while (unlikely(r->cons.tail != cons_head))
+		rte_pause();
+
 	r->cons.tail = cons_next;
+
 end:
 	if (available != NULL)
 		*available = entries - n;
@@ -609,8 +582,8 @@  static inline unsigned int __attribute__((always_inline))
 rte_ring_mp_enqueue_bulk(struct rte_ring *r, void * const *obj_table,
 			 unsigned int n, unsigned int *free_space)
 {
-	return __rte_ring_mp_do_enqueue(r, obj_table, n, RTE_RING_QUEUE_FIXED,
-			free_space);
+	return __rte_ring_do_enqueue(r, obj_table, n, RTE_RING_QUEUE_FIXED,
+			__IS_MP, free_space);
 }
 
 /**
@@ -629,8 +602,8 @@  static inline unsigned int __attribute__((always_inline))
 rte_ring_sp_enqueue_bulk(struct rte_ring *r, void * const *obj_table,
 			 unsigned int n, unsigned int *free_space)
 {
-	return __rte_ring_sp_do_enqueue(r, obj_table, n, RTE_RING_QUEUE_FIXED,
-			free_space);
+	return __rte_ring_do_enqueue(r, obj_table, n, RTE_RING_QUEUE_FIXED,
+			__IS_SP, free_space);
 }
 
 /**
@@ -653,10 +626,8 @@  static inline unsigned int __attribute__((always_inline))
 rte_ring_enqueue_bulk(struct rte_ring *r, void * const *obj_table,
 		      unsigned int n, unsigned int *free_space)
 {
-	if (r->prod.sp_enqueue)
-		return rte_ring_sp_enqueue_bulk(r, obj_table, n, free_space);
-	else
-		return rte_ring_mp_enqueue_bulk(r, obj_table, n, free_space);
+	return __rte_ring_do_enqueue(r, obj_table, n, RTE_RING_QUEUE_FIXED,
+			r->prod.sp_enqueue, free_space);
 }
 
 /**
@@ -736,8 +707,8 @@  static inline unsigned int __attribute__((always_inline))
 rte_ring_mc_dequeue_bulk(struct rte_ring *r, void **obj_table,
 		unsigned int n, unsigned int *available)
 {
-	return __rte_ring_mc_do_dequeue(r, obj_table, n, RTE_RING_QUEUE_FIXED,
-			available);
+	return __rte_ring_do_dequeue(r, obj_table, n, RTE_RING_QUEUE_FIXED,
+			__IS_MC, available);
 }
 
 /**
@@ -757,8 +728,8 @@  static inline unsigned int __attribute__((always_inline))
 rte_ring_sc_dequeue_bulk(struct rte_ring *r, void **obj_table,
 		unsigned int n, unsigned int *available)
 {
-	return __rte_ring_sc_do_dequeue(r, obj_table, n, RTE_RING_QUEUE_FIXED,
-			available);
+	return __rte_ring_do_dequeue(r, obj_table, n, RTE_RING_QUEUE_FIXED,
+			__IS_SC, available);
 }
 
 /**
@@ -781,10 +752,8 @@  static inline unsigned int __attribute__((always_inline))
 rte_ring_dequeue_bulk(struct rte_ring *r, void **obj_table, unsigned int n,
 		unsigned int *available)
 {
-	if (r->cons.sc_dequeue)
-		return rte_ring_sc_dequeue_bulk(r, obj_table, n, available);
-	else
-		return rte_ring_mc_dequeue_bulk(r, obj_table, n, available);
+	return __rte_ring_do_dequeue(r, obj_table, n, RTE_RING_QUEUE_FIXED,
+				r->cons.sc_dequeue, available);
 }
 
 /**
@@ -967,8 +936,8 @@  static inline unsigned __attribute__((always_inline))
 rte_ring_mp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
 			 unsigned int n, unsigned int *free_space)
 {
-	return __rte_ring_mp_do_enqueue(r, obj_table, n,
-			RTE_RING_QUEUE_VARIABLE, free_space);
+	return __rte_ring_do_enqueue(r, obj_table, n,
+			RTE_RING_QUEUE_VARIABLE, __IS_MP, free_space);
 }
 
 /**
@@ -987,8 +956,8 @@  static inline unsigned __attribute__((always_inline))
 rte_ring_sp_enqueue_burst(struct rte_ring *r, void * const *obj_table,
 			 unsigned int n, unsigned int *free_space)
 {
-	return __rte_ring_sp_do_enqueue(r, obj_table, n,
-			RTE_RING_QUEUE_VARIABLE, free_space);
+	return __rte_ring_do_enqueue(r, obj_table, n,
+			RTE_RING_QUEUE_VARIABLE, __IS_SP, free_space);
 }
 
 /**
@@ -1011,10 +980,8 @@  static inline unsigned __attribute__((always_inline))
 rte_ring_enqueue_burst(struct rte_ring *r, void * const *obj_table,
 		      unsigned int n, unsigned int *free_space)
 {
-	if (r->prod.sp_enqueue)
-		return rte_ring_sp_enqueue_burst(r, obj_table, n, free_space);
-	else
-		return rte_ring_mp_enqueue_burst(r, obj_table, n, free_space);
+	return __rte_ring_do_enqueue(r, obj_table, n, RTE_RING_QUEUE_VARIABLE,
+			r->prod.sp_enqueue, free_space);
 }
 
 /**
@@ -1038,8 +1005,8 @@  static inline unsigned __attribute__((always_inline))
 rte_ring_mc_dequeue_burst(struct rte_ring *r, void **obj_table,
 		unsigned int n, unsigned int *available)
 {
-	return __rte_ring_mc_do_dequeue(r, obj_table, n,
-			RTE_RING_QUEUE_VARIABLE, available);
+	return __rte_ring_do_dequeue(r, obj_table, n,
+			RTE_RING_QUEUE_VARIABLE, __IS_MC, available);
 }
 
 /**
@@ -1060,8 +1027,8 @@  static inline unsigned __attribute__((always_inline))
 rte_ring_sc_dequeue_burst(struct rte_ring *r, void **obj_table,
 		unsigned int n, unsigned int *available)
 {
-	return __rte_ring_sc_do_dequeue(r, obj_table, n,
-			RTE_RING_QUEUE_VARIABLE, available);
+	return __rte_ring_do_dequeue(r, obj_table, n,
+			RTE_RING_QUEUE_VARIABLE, __IS_SC, available);
 }
 
 /**
@@ -1084,10 +1051,9 @@  static inline unsigned __attribute__((always_inline))
 rte_ring_dequeue_burst(struct rte_ring *r, void **obj_table,
 		unsigned int n, unsigned int *available)
 {
-	if (r->cons.sc_dequeue)
-		return rte_ring_sc_dequeue_burst(r, obj_table, n, available);
-	else
-		return rte_ring_mc_dequeue_burst(r, obj_table, n, available);
+	return __rte_ring_do_dequeue(r, obj_table, n,
+				RTE_RING_QUEUE_VARIABLE,
+				r->cons.sc_dequeue, available);
 }
 
 #ifdef __cplusplus