[dpdk-dev,v2] sched: fix overflow errors in WRR weighting code

Message ID 1512032696-30765-1-git-send-email-alan.dewar@att.com (mailing list archive)
State Rejected, archived
Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Alan Dewar Nov. 30, 2017, 9:04 a.m. UTC
  From: Alan Dewar <alan.dewar@att.com>

Revised patch - this version fixes an issue where a small wrr_cost is
shifted so far right that its value becomes zero.

The WRR code calculates the lowest common denominator (LCD) of the four
WRR weights as a uint32_t value, divides the LCD by each of the WRR
weights, and casts the results to uint8_t.  This casting can cause the
ratios of the computed WRR costs to be wrong.  For example, with WRR
weights of 3, 5, 7 and 11, the LCD is computed to be 1155.  The WRR
costs get computed as:

  1155/3 = 385, 1155/5 = 231, 1155/7 = 165, 1155/11 = 105.

When the value 385 is cast to a uint8_t it ends up as 129 (385 mod 256).
Rather than casting straight to a uint8_t, this patch shifts the
computed WRR costs right so that the largest value is at most eight bits
wide.
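
A minimal standalone sketch of that idea (illustration only, not the patch
code; the real helper in the diff below is rte_sched_reduce_to_byte()):

  #include <stdint.h>
  #include <stdio.h>

  /* Count how many right shifts are needed before "value" fits in 8 bits. */
  static uint32_t reduce_to_byte(uint32_t value)
  {
  	uint32_t shift = 0;

  	while (value & 0xFFFFFF00) {
  		value >>= 1;
  		shift++;
  	}
  	return shift;
  }

  int main(void)
  {
  	/* LCD 1155 divided by weights 3, 5, 7, 11 */
  	uint32_t cost[4] = { 385, 231, 165, 105 };
  	uint32_t shift = reduce_to_byte(cost[0]);	/* 385 is the largest cost */
  	uint32_t i;

  	for (i = 0; i < 4; i++)
  		printf("cast: %3u   shifted: %3u\n",
  		       (uint32_t)(uint8_t)cost[i],	/* 129, 231, 165, 105 - ratios broken   */
  		       cost[i] >> shift);		/* 192, 115,  82,  52 - ratios preserved */
  	return 0;
  }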

In grinder_schedule, the packet length is multiplied by the WRR cost
and added to the grinder's wrr_tokens value.  The grinder's wrr_tokens
field is a uint16_t, so the combination of a packet length of 1500 bytes
and a WRR cost of 44 overflows this field on the first packet
(1500 * 44 = 66000, which exceeds the uint16_t maximum of 65535).

This patch increases the width of the grinder's wrr_tokens and
wrr_mask fields from uint16_t to uint32_t.

In grinder_wrr_store, the remaining tokens in the grinder's wrr_tokens
array are copied back to the appropriate pipe's wrr_tokens array.  However,
the pipe's wrr_tokens array is only a uint8_t array, so unused tokens are
quite frequently lost, which upsets the balance of traffic across the
four WRR queues.

This patch increases the width of the pipe's wrr_tokens array from
uint8_t to uint32_t.
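
For illustration only (a toy example, not the library code), this is the kind
of silent truncation that occurs when a wide token count is stored back into a
uint8_t slot:

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
  	uint32_t grinder_tokens = 3000;			/* e.g. tokens left over after scheduling */
  	uint8_t pipe_tokens = (uint8_t)grinder_tokens;	/* only the low 8 bits survive */

  	printf("%u -> %u\n", grinder_tokens, (uint32_t)pipe_tokens);	/* 3000 -> 184 */
  	return 0;
  }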

Signed-off-by: Alan Dewar <alan.dewar@att.com>
---
v2 - fixed bug in the wrr_cost calculation code that could result
in a zero wrr_cost

 lib/librte_sched/rte_sched.c        | 59 +++++++++++++++++++++++++++++--------
 lib/librte_sched/rte_sched_common.h | 15 ++++++++++
 2 files changed, 61 insertions(+), 13 deletions(-)
  

Comments

Luca Boccassi Nov. 30, 2017, 10:47 a.m. UTC | #1
On Thu, 2017-11-30 at 09:04 +0000, alangordondewar@gmail.com wrote:
> From: Alan Dewar <alan.dewar@att.com>
> 
> Revised patch - this version fixes an issue when a small wrr_cost is
> shifted so far right that its value becomes zero.
> 
> The WRR code calculates the lowest common denominator between the
> four
> WRR weights as a uint32_t value and divides the LCD by each of the
> WRR
> weights and casts the results as a uint8_t.  This casting can cause
> the ratios of the computed wrr costs to be wrong.  For example with
> WRR weights of 3, 5, 7 and 11, the LCD is computed to be
> 1155.  The WRR costs get computed as:
> 
>   1155/3 = 385, 1155/5 = 231, 1155/7 = 165, 1155/11 = 105.
> 
> When the value 385 is cast into an uint8_t it ends up as 129.
> Rather than casting straight into a uint8_t, this patch shifts the
> computed WRR costs right so that the largest value is only eight bits
> wide.
> 
> In grinder_schedule, the packet length is multiplied by the WRR cost
> and added to the grinder's wrr_tokens value.  The grinder's
> wrr_tokens
> field is a uint16_t, so combination of a packet length of 1500 bytes
> and a wrr cost of 44 will overflow this field on the first packet.
> 
> This patch increases the width of the grinder's wrr_tokens and
> wrr_mask fields from uint16_t to uint32_t.
> 
> In grinder_wrr_store, the remaining tokens in the grinder's
> wrr_tokens
> array are copied to the appropriate pipe's wrr_tokens array.  However
> the pipe's wrr_tokens array is only a uint8_t array so unused tokens
> were quite frequently lost which upsets the balance of traffic across
> the four WRR queues.
> 
> This patch increases the width of the pipe's wrr_tokens array from
> a uint8_t to uint32_t.
> 
> Signed-off-by: Alan Dewar <alan.dewar@att.com>
> ---
> v2 - fixed bug in the wrr_cost calculation code that could result
> in a zero wrr_cost
> 
>  lib/librte_sched/rte_sched.c        | 59
> +++++++++++++++++++++++++++++--------
>  lib/librte_sched/rte_sched_common.h | 15 ++++++++++
>  2 files changed, 61 insertions(+), 13 deletions(-)
> 
> diff --git a/lib/librte_sched/rte_sched.c
> b/lib/librte_sched/rte_sched.c
> index 7252f85..324743d 100644
> --- a/lib/librte_sched/rte_sched.c
> +++ b/lib/librte_sched/rte_sched.c
> @@ -130,7 +130,7 @@ struct rte_sched_pipe {
>  	uint32_t tc_credits[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
>  
>  	/* Weighted Round Robin (WRR) */
> -	uint8_t wrr_tokens[RTE_SCHED_QUEUES_PER_PIPE];
> +	uint32_t wrr_tokens[RTE_SCHED_QUEUES_PER_PIPE];
>  
>  	/* TC oversubscription */
>  	uint32_t tc_ov_credits;
> @@ -205,8 +205,8 @@ struct rte_sched_grinder {
>  	struct rte_mbuf *pkt;
>  
>  	/* WRR */
> -	uint16_t wrr_tokens[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
> -	uint16_t wrr_mask[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
> +	uint32_t wrr_tokens[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
> +	uint32_t wrr_mask[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
>  	uint8_t wrr_cost[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
>  };
>  
> @@ -542,6 +542,17 @@ rte_sched_time_ms_to_bytes(uint32_t time_ms,
> uint32_t rate)
>  	return time;
>  }
>  
> +static uint32_t rte_sched_reduce_to_byte(uint32_t value)
> +{
> +	uint32_t shift = 0;
> +
> +	while (value & 0xFFFFFF00) {
> +		value >>= 1;
> +		shift++;
> +	}
> +	return shift;
> +}
> +
>  static void
>  rte_sched_port_config_pipe_profile_table(struct rte_sched_port
> *port, struct rte_sched_port_params *params)
>  {
> @@ -583,6 +594,8 @@ rte_sched_port_config_pipe_profile_table(struct
> rte_sched_port *port, struct rte
>  			uint32_t
> wrr_cost[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
>  			uint32_t lcd, lcd1, lcd2;
>  			uint32_t qindex;
> +			uint32_t low_pos;
> +			uint32_t shift;
>  
>  			qindex = j *
> RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS;
>  
> @@ -594,12 +607,28 @@ rte_sched_port_config_pipe_profile_table(struct
> rte_sched_port *port, struct rte
>  			lcd1 = rte_get_lcd(wrr_cost[0],
> wrr_cost[1]);
>  			lcd2 = rte_get_lcd(wrr_cost[2],
> wrr_cost[3]);
>  			lcd = rte_get_lcd(lcd1, lcd2);
> +			low_pos = rte_min_pos_4_u32(wrr_cost);
>  
>  			wrr_cost[0] = lcd / wrr_cost[0];
>  			wrr_cost[1] = lcd / wrr_cost[1];
>  			wrr_cost[2] = lcd / wrr_cost[2];
>  			wrr_cost[3] = lcd / wrr_cost[3];
>  
> +			shift =
> rte_sched_reduce_to_byte(wrr_cost[low_pos]);
> +			wrr_cost[0] >>= shift;
> +			wrr_cost[1] >>= shift;
> +			wrr_cost[2] >>= shift;
> +			wrr_cost[3] >>= shift;
> +
> +			if (wrr_cost[0] == 0)
> +				wrr_cost[0]++;
> +			if (wrr_cost[1] == 0)
> +				wrr_cost[1]++;
> +			if (wrr_cost[2] == 0)
> +				wrr_cost[2]++;
> +			if (wrr_cost[3] == 0)
> +				wrr_cost[3]++;
> +
>  			dst->wrr_cost[qindex] = (uint8_t)
> wrr_cost[0];
>  			dst->wrr_cost[qindex + 1] = (uint8_t)
> wrr_cost[1];
>  			dst->wrr_cost[qindex + 2] = (uint8_t)
> wrr_cost[2];
> @@ -1941,15 +1970,19 @@ grinder_wrr_load(struct rte_sched_port *port,
> uint32_t pos)
>  
>  	qindex = tc_index * 4;
>  
> -	grinder->wrr_tokens[0] = ((uint16_t) pipe-
> >wrr_tokens[qindex]) << RTE_SCHED_WRR_SHIFT;
> -	grinder->wrr_tokens[1] = ((uint16_t) pipe->wrr_tokens[qindex 
> + 1]) << RTE_SCHED_WRR_SHIFT;
> -	grinder->wrr_tokens[2] = ((uint16_t) pipe->wrr_tokens[qindex 
> + 2]) << RTE_SCHED_WRR_SHIFT;
> -	grinder->wrr_tokens[3] = ((uint16_t) pipe->wrr_tokens[qindex 
> + 3]) << RTE_SCHED_WRR_SHIFT;
> +	grinder->wrr_tokens[0] = pipe->wrr_tokens[qindex] <<
> +		RTE_SCHED_WRR_SHIFT;
> +	grinder->wrr_tokens[1] = pipe->wrr_tokens[qindex + 1] <<
> +		RTE_SCHED_WRR_SHIFT;
> +	grinder->wrr_tokens[2] = pipe->wrr_tokens[qindex + 2] <<
> +		RTE_SCHED_WRR_SHIFT;
> +	grinder->wrr_tokens[3] = pipe->wrr_tokens[qindex + 3] <<
> +		RTE_SCHED_WRR_SHIFT;
>  
> -	grinder->wrr_mask[0] = (qmask & 0x1) * 0xFFFF;
> -	grinder->wrr_mask[1] = ((qmask >> 1) & 0x1) * 0xFFFF;
> -	grinder->wrr_mask[2] = ((qmask >> 2) & 0x1) * 0xFFFF;
> -	grinder->wrr_mask[3] = ((qmask >> 3) & 0x1) * 0xFFFF;
> +	grinder->wrr_mask[0] = (qmask & 0x1) * 0xFFFFFFFF;
> +	grinder->wrr_mask[1] = ((qmask >> 1) & 0x1) * 0xFFFFFFFF;
> +	grinder->wrr_mask[2] = ((qmask >> 2) & 0x1) * 0xFFFFFFFF;
> +	grinder->wrr_mask[3] = ((qmask >> 3) & 0x1) * 0xFFFFFFFF;
>  
>  	grinder->wrr_cost[0] = pipe_params->wrr_cost[qindex];
>  	grinder->wrr_cost[1] = pipe_params->wrr_cost[qindex + 1];
> @@ -1981,14 +2014,14 @@ static inline void
>  grinder_wrr(struct rte_sched_port *port, uint32_t pos)
>  {
>  	struct rte_sched_grinder *grinder = port->grinder + pos;
> -	uint16_t wrr_tokens_min;
> +	uint32_t wrr_tokens_min;
>  
>  	grinder->wrr_tokens[0] |= ~grinder->wrr_mask[0];
>  	grinder->wrr_tokens[1] |= ~grinder->wrr_mask[1];
>  	grinder->wrr_tokens[2] |= ~grinder->wrr_mask[2];
>  	grinder->wrr_tokens[3] |= ~grinder->wrr_mask[3];
>  
> -	grinder->qpos = rte_min_pos_4_u16(grinder->wrr_tokens);
> +	grinder->qpos = rte_min_pos_4_u32(grinder->wrr_tokens);
>  	wrr_tokens_min = grinder->wrr_tokens[grinder->qpos];
>  
>  	grinder->wrr_tokens[0] -= wrr_tokens_min;
> diff --git a/lib/librte_sched/rte_sched_common.h
> b/lib/librte_sched/rte_sched_common.h
> index aed144b..a3c6bc2 100644
> --- a/lib/librte_sched/rte_sched_common.h
> +++ b/lib/librte_sched/rte_sched_common.h
> @@ -77,6 +77,21 @@ rte_min_pos_4_u16(uint16_t *x)
>  	return pos0;
>  }
>  
> +static inline uint32_t
> +rte_min_pos_4_u32(uint32_t *x)
> +{
> +	uint32_t pos0 = 0;
> +	uint32_t pos1 = 2;
> +
> +	if (x[1] <= x[0])
> +		pos0 = 1;
> +	if (x[3] <= x[2])
> +		pos1 = 3;
> +	if (x[pos1] <= x[pos0])
> +		pos0 = pos1;
> +
> +	return pos0;
> +}
>  #endif
>  
>  /*

Reviewed-by: Luca Boccassi <bluca@debian.org>

LGTM
  
Cristian Dumitrescu Jan. 2, 2018, 4:13 p.m. UTC | #2
Hi Alan,

NAK for now.

There is a good reason for truncating the WRR cost to an 8-bit value, which is keeping the size of the rte_sched_pipe structure to a single cache line (64 bytes). This is done for performance reasons, at the expense of some accuracy loss in the scheduling of the 4x queues per traffic class.

Is there a way to make the improvement while working with 8-bit WRR cost values in the pipe structure?
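
For context, a toy illustration of the arithmetic behind that constraint (a
sketch assuming the 16 queues per pipe and the 64-byte cache line mentioned
above; it is not the real rte_sched_pipe layout, and it uses C11
_Static_assert rather than DPDK's RTE_BUILD_BUG_ON):

  #include <stdint.h>

  /* A pipe has 16 queues, so widening the per-queue token counter from
   * uint8_t to uint32_t grows that one array from 16 to 64 bytes - by itself
   * it would fill the whole 64-byte cache line, leaving no room for the
   * credits, counters and other per-pipe state. */
  struct tokens_u8  { uint8_t  wrr_tokens[16]; };
  struct tokens_u32 { uint32_t wrr_tokens[16]; };

  _Static_assert(sizeof(struct tokens_u8)  == 16, "8-bit tokens: 16 bytes");
  _Static_assert(sizeof(struct tokens_u32) == 64, "32-bit tokens: 64 bytes");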

> -----Original Message-----
> From: alangordondewar@gmail.com [mailto:alangordondewar@gmail.com]
> Sent: Thursday, November 30, 2017 9:05 AM
> To: Dumitrescu, Cristian <cristian.dumitrescu@intel.com>
> Cc: dev@dpdk.org; Alan Dewar <alan.dewar@att.com>
> Subject: [PATCH v2] sched: fix overflow errors in WRR weighting code
> 
> From: Alan Dewar <alan.dewar@att.com>
> 
> Revised patch - this version fixes an issue when a small wrr_cost is
> shifted so far right that its value becomes zero.
> 
> The WRR code calculates the lowest common denominator between the
> four
> WRR weights as a uint32_t value and divides the LCD by each of the WRR
> weights and casts the results as a uint8_t.  This casting can cause
> the ratios of the computed wrr costs to be wrong.  For example with
> WRR weights of 3, 5, 7 and 11, the LCD is computed to be
> 1155.  The WRR costs get computed as:

Picking prime numbers for the weights is generally a bad idea. If you pick e.g. 4, 6, 8 and 12 rather than 3, 5, 7 and 11, you avoid any issues due to the 8-bit truncation.
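
As a quick check of this suggestion (worked arithmetic, not from the original
thread):

  lcm(4, 6, 8, 12) = 24    ->  24/4 = 6,  24/6 = 4,  24/8 = 3,  24/12 = 2    (all fit in a uint8_t)
  lcm(3, 5, 7, 11) = 1155  ->  1155/3 = 385, 1155/5 = 231, 1155/7 = 165, 1155/11 = 105    (385 does not)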

> 
>   1155/3 = 385, 1155/5 = 231, 1155/7 = 165, 1155/11 = 105.
> 
> When the value 385 is cast into an uint8_t it ends up as 129.
> Rather than casting straight into a uint8_t, this patch shifts the
> computed WRR costs right so that the largest value is only eight bits
> wide.
> 
> In grinder_schedule, the packet length is multiplied by the WRR cost
> and added to the grinder's wrr_tokens value.  The grinder's wrr_tokens
> field is a uint16_t, so combination of a packet length of 1500 bytes
> and a wrr cost of 44 will overflow this field on the first packet.
> 
> This patch increases the width of the grinder's wrr_tokens and
> wrr_mask fields from uint16_t to uint32_t.
> 

Increasing the size of the grinder fields is OK, but I am not sure whether it is really helpful, as the values saved in the pipe structure are 8-bit.


> In grinder_wrr_store, the remaining tokens in the grinder's wrr_tokens
> array are copied to the appropriate pipe's wrr_tokens array.  However
> the pipe's wrr_tokens array is only a uint8_t array so unused tokens
> were quite frequently lost which upsets the balance of traffic across
> the four WRR queues.
> 
> This patch increases the width of the pipe's wrr_tokens array from
> a uint8_t to uint32_t.

This is not allowed for performance reasons, as having 16-bit or 32-bit WRR values in the pipe structure would increase the size of the pipe structure from one cache line to two cache lines.

> 
> Signed-off-by: Alan Dewar <alan.dewar@att.com>
> ---
> v2 - fixed bug in the wrr_cost calculation code that could result
> in a zero wrr_cost
> 
>  lib/librte_sched/rte_sched.c        | 59 +++++++++++++++++++++++++++++---
> -----
>  lib/librte_sched/rte_sched_common.h | 15 ++++++++++
>  2 files changed, 61 insertions(+), 13 deletions(-)
> 
> diff --git a/lib/librte_sched/rte_sched.c b/lib/librte_sched/rte_sched.c
> index 7252f85..324743d 100644
> --- a/lib/librte_sched/rte_sched.c
> +++ b/lib/librte_sched/rte_sched.c
> @@ -130,7 +130,7 @@ struct rte_sched_pipe {
>  	uint32_t tc_credits[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
> 
>  	/* Weighted Round Robin (WRR) */
> -	uint8_t wrr_tokens[RTE_SCHED_QUEUES_PER_PIPE];
> +	uint32_t wrr_tokens[RTE_SCHED_QUEUES_PER_PIPE];
> 
>  	/* TC oversubscription */
>  	uint32_t tc_ov_credits;
> @@ -205,8 +205,8 @@ struct rte_sched_grinder {
>  	struct rte_mbuf *pkt;
> 
>  	/* WRR */
> -	uint16_t wrr_tokens[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
> -	uint16_t wrr_mask[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
> +	uint32_t wrr_tokens[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
> +	uint32_t wrr_mask[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
>  	uint8_t wrr_cost[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
>  };
> 
> @@ -542,6 +542,17 @@ rte_sched_time_ms_to_bytes(uint32_t time_ms,
> uint32_t rate)
>  	return time;
>  }
> 
> +static uint32_t rte_sched_reduce_to_byte(uint32_t value)
> +{
> +	uint32_t shift = 0;
> +
> +	while (value & 0xFFFFFF00) {
> +		value >>= 1;
> +		shift++;
> +	}
> +	return shift;
> +}
> +
>  static void
>  rte_sched_port_config_pipe_profile_table(struct rte_sched_port *port,
> struct rte_sched_port_params *params)
>  {
> @@ -583,6 +594,8 @@ rte_sched_port_config_pipe_profile_table(struct
> rte_sched_port *port, struct rte
>  			uint32_t
> wrr_cost[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
>  			uint32_t lcd, lcd1, lcd2;
>  			uint32_t qindex;
> +			uint32_t low_pos;
> +			uint32_t shift;
> 
>  			qindex = j *
> RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS;
> 
> @@ -594,12 +607,28 @@ rte_sched_port_config_pipe_profile_table(struct
> rte_sched_port *port, struct rte
>  			lcd1 = rte_get_lcd(wrr_cost[0], wrr_cost[1]);
>  			lcd2 = rte_get_lcd(wrr_cost[2], wrr_cost[3]);
>  			lcd = rte_get_lcd(lcd1, lcd2);
> +			low_pos = rte_min_pos_4_u32(wrr_cost);
> 
>  			wrr_cost[0] = lcd / wrr_cost[0];
>  			wrr_cost[1] = lcd / wrr_cost[1];
>  			wrr_cost[2] = lcd / wrr_cost[2];
>  			wrr_cost[3] = lcd / wrr_cost[3];
> 
> +			shift =
> rte_sched_reduce_to_byte(wrr_cost[low_pos]);
> +			wrr_cost[0] >>= shift;
> +			wrr_cost[1] >>= shift;
> +			wrr_cost[2] >>= shift;
> +			wrr_cost[3] >>= shift;
> +
> +			if (wrr_cost[0] == 0)
> +				wrr_cost[0]++;
> +			if (wrr_cost[1] == 0)
> +				wrr_cost[1]++;
> +			if (wrr_cost[2] == 0)
> +				wrr_cost[2]++;
> +			if (wrr_cost[3] == 0)
> +				wrr_cost[3]++;
> +
>  			dst->wrr_cost[qindex] = (uint8_t) wrr_cost[0];
>  			dst->wrr_cost[qindex + 1] = (uint8_t) wrr_cost[1];
>  			dst->wrr_cost[qindex + 2] = (uint8_t) wrr_cost[2];
> @@ -1941,15 +1970,19 @@ grinder_wrr_load(struct rte_sched_port *port,
> uint32_t pos)
> 
>  	qindex = tc_index * 4;
> 
> -	grinder->wrr_tokens[0] = ((uint16_t) pipe->wrr_tokens[qindex]) <<
> RTE_SCHED_WRR_SHIFT;
> -	grinder->wrr_tokens[1] = ((uint16_t) pipe->wrr_tokens[qindex + 1])
> << RTE_SCHED_WRR_SHIFT;
> -	grinder->wrr_tokens[2] = ((uint16_t) pipe->wrr_tokens[qindex + 2])
> << RTE_SCHED_WRR_SHIFT;
> -	grinder->wrr_tokens[3] = ((uint16_t) pipe->wrr_tokens[qindex + 3])
> << RTE_SCHED_WRR_SHIFT;
> +	grinder->wrr_tokens[0] = pipe->wrr_tokens[qindex] <<
> +		RTE_SCHED_WRR_SHIFT;
> +	grinder->wrr_tokens[1] = pipe->wrr_tokens[qindex + 1] <<
> +		RTE_SCHED_WRR_SHIFT;
> +	grinder->wrr_tokens[2] = pipe->wrr_tokens[qindex + 2] <<
> +		RTE_SCHED_WRR_SHIFT;
> +	grinder->wrr_tokens[3] = pipe->wrr_tokens[qindex + 3] <<
> +		RTE_SCHED_WRR_SHIFT;
> 
> -	grinder->wrr_mask[0] = (qmask & 0x1) * 0xFFFF;
> -	grinder->wrr_mask[1] = ((qmask >> 1) & 0x1) * 0xFFFF;
> -	grinder->wrr_mask[2] = ((qmask >> 2) & 0x1) * 0xFFFF;
> -	grinder->wrr_mask[3] = ((qmask >> 3) & 0x1) * 0xFFFF;
> +	grinder->wrr_mask[0] = (qmask & 0x1) * 0xFFFFFFFF;
> +	grinder->wrr_mask[1] = ((qmask >> 1) & 0x1) * 0xFFFFFFFF;
> +	grinder->wrr_mask[2] = ((qmask >> 2) & 0x1) * 0xFFFFFFFF;
> +	grinder->wrr_mask[3] = ((qmask >> 3) & 0x1) * 0xFFFFFFFF;
> 
>  	grinder->wrr_cost[0] = pipe_params->wrr_cost[qindex];
>  	grinder->wrr_cost[1] = pipe_params->wrr_cost[qindex + 1];
> @@ -1981,14 +2014,14 @@ static inline void
>  grinder_wrr(struct rte_sched_port *port, uint32_t pos)
>  {
>  	struct rte_sched_grinder *grinder = port->grinder + pos;
> -	uint16_t wrr_tokens_min;
> +	uint32_t wrr_tokens_min;
> 
>  	grinder->wrr_tokens[0] |= ~grinder->wrr_mask[0];
>  	grinder->wrr_tokens[1] |= ~grinder->wrr_mask[1];
>  	grinder->wrr_tokens[2] |= ~grinder->wrr_mask[2];
>  	grinder->wrr_tokens[3] |= ~grinder->wrr_mask[3];
> 
> -	grinder->qpos = rte_min_pos_4_u16(grinder->wrr_tokens);
> +	grinder->qpos = rte_min_pos_4_u32(grinder->wrr_tokens);
>  	wrr_tokens_min = grinder->wrr_tokens[grinder->qpos];
> 
>  	grinder->wrr_tokens[0] -= wrr_tokens_min;
> diff --git a/lib/librte_sched/rte_sched_common.h
> b/lib/librte_sched/rte_sched_common.h
> index aed144b..a3c6bc2 100644
> --- a/lib/librte_sched/rte_sched_common.h
> +++ b/lib/librte_sched/rte_sched_common.h
> @@ -77,6 +77,21 @@ rte_min_pos_4_u16(uint16_t *x)
>  	return pos0;
>  }
> 
> +static inline uint32_t
> +rte_min_pos_4_u32(uint32_t *x)
> +{
> +	uint32_t pos0 = 0;
> +	uint32_t pos1 = 2;
> +
> +	if (x[1] <= x[0])
> +		pos0 = 1;
> +	if (x[3] <= x[2])
> +		pos1 = 3;
> +	if (x[pos1] <= x[pos0])
> +		pos0 = pos1;
> +
> +	return pos0;
> +}
>  #endif
> 
>  /*
> --
> 2.1.4

Regards,
Cristian
  
Dewar, Alan Jan. 3, 2018, 1:45 p.m. UTC | #3
Hi Cristian,

Responses in-line below...

> -----Original Message-----
> From: Dumitrescu, Cristian [mailto:cristian.dumitrescu@intel.com] 
> Sent: Tuesday, January 02, 2018 4:14 PM
> To: alangordondewar@gmail.com
> Cc: dev@dpdk.org; Alan Dewar <alan.dewar@att.com>
> Subject: RE: [PATCH v2] sched: fix overflow errors in WRR weighting code
>
> Hi Alan,
>
> NAK for now.
>
> There is a good reason for truncating the WRR cost to 8-bit value, which is keeping the size of the rte_sched_pipe structure to single cache line (64 bytes). This is done for performance reasons at the expense of some accuracy loss for the scheduling of the 4x queues per traffic class.
>

Ah, I wasn't aware of this restriction, but I fully understand why you don't want to increase the size of the rte_sched_pipe structure.

> Is there a way to make the improvement while working with 8-bit WRR cost values in the pipe structure?

Off the top of my head, I think that we might be able to improve the behavior of the WRR queues a little, but not as well as we could with a 32-bit wrr_cost array in the rte_sched_pipe structure.  I'll need to play around with it and see what happens.

>
> > -----Original Message-----
> > From: alangordondewar@gmail.com [mailto:alangordondewar@gmail.com]
> > Sent: Thursday, November 30, 2017 9:05 AM
> > To: Dumitrescu, Cristian <cristian.dumitrescu@intel.com>
> > Cc: dev@dpdk.org; Alan Dewar <alan.dewar@att.com>
> > Subject: [PATCH v2] sched: fix overflow errors in WRR weighting code
> > 
> > From: Alan Dewar <alan.dewar@att.com>
> > 
> > Revised patch - this version fixes an issue when a small wrr_cost is 
> > shifted so far right that its value becomes zero.
> > 
> > The WRR code calculates the lowest common denominator between the four 
> > WRR weights as a uint32_t value and divides the LCD by each of the WRR 
> > weights and casts the results as a uint8_t.  This casting can cause
> > the ratios of the computed wrr costs to be wrong.  For example with 
> > WRR weights of 3, 5, 7 and 11, the LCD is computed to be 1155.  The 
> > WRR costs get computed as:
>
> Picking prime numbers for the weights is generally a bad idea. If you would pick e.g. 4, 6, 8 and 12 rather than 3, 5, 7 and 11 you would avoid any issues due to the 8-bit truncation.
>

I selected 3, 5, 7 and 11 as a pathological case to illustrate part of the problem; I think it is unlikely that customers will actually use these exact numbers.
However, the DPDK doesn't impose any restrictions on the values that wrr_weights can take: any value between 1 and 255 is allowed.  In the product where we are using the DPDK as part of a software dataplane, we currently restrict wrr_weights to values between 1 and 100.  However, we have no control over the values that our customers select within that range.

Imposing any further restriction would not be backwards compatible with previous versions of our software.  That is not acceptable to customers when they upgrade the software on their routers and find that their previously accepted configuration is rejected by the new version.


> > 
> >   1155/3 = 385, 1155/5 = 231, 1155/7 = 165, 1155/11 = 105.
> > 
> > When the value 385 is cast into an uint8_t it ends up as 129.
> > Rather than casting straight into a uint8_t, this patch shifts the 
> > computed WRR costs right so that the largest value is only eight bits 
> > wide.
> > 
> > In grinder_schedule, the packet length is multiplied by the WRR cost 
> > and added to the grinder's wrr_tokens value.  The grinder's wrr_tokens 
> > field is a uint16_t, so combination of a packet length of 1500 bytes 
> > and a wrr cost of 44 will overflow this field on the first packet.
> > 
> > This patch increases the width of the grinder's wrr_tokens and 
> > wrr_mask fields from uint16_t to uint32_t.
> > 
>
> Increasing the size of the grinder fields is OK, but I am not sure whether it is really helpful, as the values saved in the pipe structure are 8-bit.
>

I will investigate this further, but I don't think that we will see much benefit.

>
> > In grinder_wrr_store, the remaining tokens in the grinder's wrr_tokens 
> > array are copied to the appropriate pipe's wrr_tokens array.  However 
> > the pipe's wrr_tokens array is only a uint8_t array so unused tokens 
> > were quite frequently lost which upsets the balance of traffic across 
> > the four WRR queues.
> > 
> > This patch increases the width of the pipe's wrr_tokens array from a 
> > uint8_t to uint32_t.
>
> This is not allowed for performance reasons, as having 16-bit or 32-bit WRR cost values in pipe structure would increase the size of the pipe structure from one cache line to two cache lines.

Understood; it might be interesting to try to determine what the performance hit actually is.

Regards
Alan

>
> > 
> > Signed-off-by: Alan Dewar <alan.dewar@att.com>
> > ---
> > v2 - fixed bug in the wrr_cost calculation code that could result in a 
> > zero wrr_cost
> > 
> >  lib/librte_sched/rte_sched.c        | 59 +++++++++++++++++++++++++++++---
> > -----
> >  lib/librte_sched/rte_sched_common.h | 15 ++++++++++
> >  2 files changed, 61 insertions(+), 13 deletions(-)
> > 
> > diff --git a/lib/librte_sched/rte_sched.c 
> > b/lib/librte_sched/rte_sched.c index 7252f85..324743d 100644
> > --- a/lib/librte_sched/rte_sched.c
> > +++ b/lib/librte_sched/rte_sched.c
> > @@ -130,7 +130,7 @@ struct rte_sched_pipe {
> >  	uint32_t tc_credits[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
> > 
> >  	/* Weighted Round Robin (WRR) */
> > -	uint8_t wrr_tokens[RTE_SCHED_QUEUES_PER_PIPE];
> > +	uint32_t wrr_tokens[RTE_SCHED_QUEUES_PER_PIPE];
> > 
> >  	/* TC oversubscription */
> >  	uint32_t tc_ov_credits;
> > @@ -205,8 +205,8 @@ struct rte_sched_grinder {
> >  	struct rte_mbuf *pkt;
> > 
> >  	/* WRR */
> > -	uint16_t wrr_tokens[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
> > -	uint16_t wrr_mask[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
> > +	uint32_t wrr_tokens[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
> > +	uint32_t wrr_mask[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
> >  	uint8_t wrr_cost[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
> >  };
> > 
> > @@ -542,6 +542,17 @@ rte_sched_time_ms_to_bytes(uint32_t time_ms, 
> > uint32_t rate)
> >  	return time;
> >  }
> > 
> > +static uint32_t rte_sched_reduce_to_byte(uint32_t value) {
> > +	uint32_t shift = 0;
> > +
> > +	while (value & 0xFFFFFF00) {
> > +		value >>= 1;
> > +		shift++;
> > +	}
> > +	return shift;
> > +}
> > +
> >  static void
> >  rte_sched_port_config_pipe_profile_table(struct rte_sched_port *port, 
> > struct rte_sched_port_params *params)  { @@ -583,6 +594,8 @@ 
> > rte_sched_port_config_pipe_profile_table(struct
> > rte_sched_port *port, struct rte
> >  			uint32_t
> > wrr_cost[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
> >  			uint32_t lcd, lcd1, lcd2;
> >  			uint32_t qindex;
> > +			uint32_t low_pos;
> > +			uint32_t shift;
> > 
> >  			qindex = j *
> > RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS;
> > 
> > @@ -594,12 +607,28 @@ rte_sched_port_config_pipe_profile_table(struct
> > rte_sched_port *port, struct rte
> >  			lcd1 = rte_get_lcd(wrr_cost[0], wrr_cost[1]);
> >  			lcd2 = rte_get_lcd(wrr_cost[2], wrr_cost[3]);
> >  			lcd = rte_get_lcd(lcd1, lcd2);
> > +			low_pos = rte_min_pos_4_u32(wrr_cost);
> > 
> >  			wrr_cost[0] = lcd / wrr_cost[0];
> >  			wrr_cost[1] = lcd / wrr_cost[1];
> >  			wrr_cost[2] = lcd / wrr_cost[2];
> >  			wrr_cost[3] = lcd / wrr_cost[3];
> > 
> > +			shift =
> > rte_sched_reduce_to_byte(wrr_cost[low_pos]);
> > +			wrr_cost[0] >>= shift;
> > +			wrr_cost[1] >>= shift;
> > +			wrr_cost[2] >>= shift;
> > +			wrr_cost[3] >>= shift;
> > +
> > +			if (wrr_cost[0] == 0)
> > +				wrr_cost[0]++;
> > +			if (wrr_cost[1] == 0)
> > +				wrr_cost[1]++;
> > +			if (wrr_cost[2] == 0)
> > +				wrr_cost[2]++;
> > +			if (wrr_cost[3] == 0)
> > +				wrr_cost[3]++;
> > +
> >  			dst->wrr_cost[qindex] = (uint8_t) wrr_cost[0];
> >  			dst->wrr_cost[qindex + 1] = (uint8_t) wrr_cost[1];
> >  			dst->wrr_cost[qindex + 2] = (uint8_t) wrr_cost[2]; @@ -1941,15 
> > +1970,19 @@ grinder_wrr_load(struct rte_sched_port *port, uint32_t 
> > pos)
> > 
> >  	qindex = tc_index * 4;
> > 
> > -	grinder->wrr_tokens[0] = ((uint16_t) pipe->wrr_tokens[qindex]) <<
> > RTE_SCHED_WRR_SHIFT;
> > -	grinder->wrr_tokens[1] = ((uint16_t) pipe->wrr_tokens[qindex + 1])
> > << RTE_SCHED_WRR_SHIFT;
> > -	grinder->wrr_tokens[2] = ((uint16_t) pipe->wrr_tokens[qindex + 2])
> > << RTE_SCHED_WRR_SHIFT;
> > -	grinder->wrr_tokens[3] = ((uint16_t) pipe->wrr_tokens[qindex + 3])
> > << RTE_SCHED_WRR_SHIFT;
> > +	grinder->wrr_tokens[0] = pipe->wrr_tokens[qindex] <<
> > +		RTE_SCHED_WRR_SHIFT;
> > +	grinder->wrr_tokens[1] = pipe->wrr_tokens[qindex + 1] <<
> > +		RTE_SCHED_WRR_SHIFT;
> > +	grinder->wrr_tokens[2] = pipe->wrr_tokens[qindex + 2] <<
> > +		RTE_SCHED_WRR_SHIFT;
> > +	grinder->wrr_tokens[3] = pipe->wrr_tokens[qindex + 3] <<
> > +		RTE_SCHED_WRR_SHIFT;
> > 
> > -	grinder->wrr_mask[0] = (qmask & 0x1) * 0xFFFF;
> > -	grinder->wrr_mask[1] = ((qmask >> 1) & 0x1) * 0xFFFF;
> > -	grinder->wrr_mask[2] = ((qmask >> 2) & 0x1) * 0xFFFF;
> > -	grinder->wrr_mask[3] = ((qmask >> 3) & 0x1) * 0xFFFF;
> > +	grinder->wrr_mask[0] = (qmask & 0x1) * 0xFFFFFFFF;
> > +	grinder->wrr_mask[1] = ((qmask >> 1) & 0x1) * 0xFFFFFFFF;
> > +	grinder->wrr_mask[2] = ((qmask >> 2) & 0x1) * 0xFFFFFFFF;
> > +	grinder->wrr_mask[3] = ((qmask >> 3) & 0x1) * 0xFFFFFFFF;
> > 
> >  	grinder->wrr_cost[0] = pipe_params->wrr_cost[qindex];
> >  	grinder->wrr_cost[1] = pipe_params->wrr_cost[qindex + 1]; @@ 
> > -1981,14 +2014,14 @@ static inline void  grinder_wrr(struct 
> > rte_sched_port *port, uint32_t pos)  {
> >  	struct rte_sched_grinder *grinder = port->grinder + pos;
> > -	uint16_t wrr_tokens_min;
> > +	uint32_t wrr_tokens_min;
> > 
> >  	grinder->wrr_tokens[0] |= ~grinder->wrr_mask[0];
> >  	grinder->wrr_tokens[1] |= ~grinder->wrr_mask[1];
> >  	grinder->wrr_tokens[2] |= ~grinder->wrr_mask[2];
> >  	grinder->wrr_tokens[3] |= ~grinder->wrr_mask[3];
> > 
> > -	grinder->qpos = rte_min_pos_4_u16(grinder->wrr_tokens);
> > +	grinder->qpos = rte_min_pos_4_u32(grinder->wrr_tokens);
> >  	wrr_tokens_min = grinder->wrr_tokens[grinder->qpos];
> > 
> >  	grinder->wrr_tokens[0] -= wrr_tokens_min; diff --git 
> > a/lib/librte_sched/rte_sched_common.h
> > b/lib/librte_sched/rte_sched_common.h
> > index aed144b..a3c6bc2 100644
> > --- a/lib/librte_sched/rte_sched_common.h
> > +++ b/lib/librte_sched/rte_sched_common.h
> > @@ -77,6 +77,21 @@ rte_min_pos_4_u16(uint16_t *x)
> >  	return pos0;
> >  }
> > 
> > +static inline uint32_t
> > +rte_min_pos_4_u32(uint32_t *x)
> > +{
> > +	uint32_t pos0 = 0;
> > +	uint32_t pos1 = 2;
> > +
> > +	if (x[1] <= x[0])
> > +		pos0 = 1;
> > +	if (x[3] <= x[2])
> > +		pos1 = 3;
> > +	if (x[pos1] <= x[pos0])
> > +		pos0 = pos1;
> > +
> > +	return pos0;
> > +}
> >  #endif
> > 
> >  /*
> > --
> > 2.1.4
>
> Regards,
> Cristian
  

Patch

diff --git a/lib/librte_sched/rte_sched.c b/lib/librte_sched/rte_sched.c
index 7252f85..324743d 100644
--- a/lib/librte_sched/rte_sched.c
+++ b/lib/librte_sched/rte_sched.c
@@ -130,7 +130,7 @@  struct rte_sched_pipe {
 	uint32_t tc_credits[RTE_SCHED_TRAFFIC_CLASSES_PER_PIPE];
 
 	/* Weighted Round Robin (WRR) */
-	uint8_t wrr_tokens[RTE_SCHED_QUEUES_PER_PIPE];
+	uint32_t wrr_tokens[RTE_SCHED_QUEUES_PER_PIPE];
 
 	/* TC oversubscription */
 	uint32_t tc_ov_credits;
@@ -205,8 +205,8 @@  struct rte_sched_grinder {
 	struct rte_mbuf *pkt;
 
 	/* WRR */
-	uint16_t wrr_tokens[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
-	uint16_t wrr_mask[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
+	uint32_t wrr_tokens[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
+	uint32_t wrr_mask[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
 	uint8_t wrr_cost[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
 };
 
@@ -542,6 +542,17 @@  rte_sched_time_ms_to_bytes(uint32_t time_ms, uint32_t rate)
 	return time;
 }
 
+static uint32_t rte_sched_reduce_to_byte(uint32_t value)
+{
+	uint32_t shift = 0;
+
+	while (value & 0xFFFFFF00) {
+		value >>= 1;
+		shift++;
+	}
+	return shift;
+}
+
 static void
 rte_sched_port_config_pipe_profile_table(struct rte_sched_port *port, struct rte_sched_port_params *params)
 {
@@ -583,6 +594,8 @@  rte_sched_port_config_pipe_profile_table(struct rte_sched_port *port, struct rte
 			uint32_t wrr_cost[RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS];
 			uint32_t lcd, lcd1, lcd2;
 			uint32_t qindex;
+			uint32_t low_pos;
+			uint32_t shift;
 
 			qindex = j * RTE_SCHED_QUEUES_PER_TRAFFIC_CLASS;
 
@@ -594,12 +607,28 @@  rte_sched_port_config_pipe_profile_table(struct rte_sched_port *port, struct rte
 			lcd1 = rte_get_lcd(wrr_cost[0], wrr_cost[1]);
 			lcd2 = rte_get_lcd(wrr_cost[2], wrr_cost[3]);
 			lcd = rte_get_lcd(lcd1, lcd2);
+			low_pos = rte_min_pos_4_u32(wrr_cost);
 
 			wrr_cost[0] = lcd / wrr_cost[0];
 			wrr_cost[1] = lcd / wrr_cost[1];
 			wrr_cost[2] = lcd / wrr_cost[2];
 			wrr_cost[3] = lcd / wrr_cost[3];
 
+			shift = rte_sched_reduce_to_byte(wrr_cost[low_pos]);
+			wrr_cost[0] >>= shift;
+			wrr_cost[1] >>= shift;
+			wrr_cost[2] >>= shift;
+			wrr_cost[3] >>= shift;
+
+			if (wrr_cost[0] == 0)
+				wrr_cost[0]++;
+			if (wrr_cost[1] == 0)
+				wrr_cost[1]++;
+			if (wrr_cost[2] == 0)
+				wrr_cost[2]++;
+			if (wrr_cost[3] == 0)
+				wrr_cost[3]++;
+
 			dst->wrr_cost[qindex] = (uint8_t) wrr_cost[0];
 			dst->wrr_cost[qindex + 1] = (uint8_t) wrr_cost[1];
 			dst->wrr_cost[qindex + 2] = (uint8_t) wrr_cost[2];
@@ -1941,15 +1970,19 @@  grinder_wrr_load(struct rte_sched_port *port, uint32_t pos)
 
 	qindex = tc_index * 4;
 
-	grinder->wrr_tokens[0] = ((uint16_t) pipe->wrr_tokens[qindex]) << RTE_SCHED_WRR_SHIFT;
-	grinder->wrr_tokens[1] = ((uint16_t) pipe->wrr_tokens[qindex + 1]) << RTE_SCHED_WRR_SHIFT;
-	grinder->wrr_tokens[2] = ((uint16_t) pipe->wrr_tokens[qindex + 2]) << RTE_SCHED_WRR_SHIFT;
-	grinder->wrr_tokens[3] = ((uint16_t) pipe->wrr_tokens[qindex + 3]) << RTE_SCHED_WRR_SHIFT;
+	grinder->wrr_tokens[0] = pipe->wrr_tokens[qindex] <<
+		RTE_SCHED_WRR_SHIFT;
+	grinder->wrr_tokens[1] = pipe->wrr_tokens[qindex + 1] <<
+		RTE_SCHED_WRR_SHIFT;
+	grinder->wrr_tokens[2] = pipe->wrr_tokens[qindex + 2] <<
+		RTE_SCHED_WRR_SHIFT;
+	grinder->wrr_tokens[3] = pipe->wrr_tokens[qindex + 3] <<
+		RTE_SCHED_WRR_SHIFT;
 
-	grinder->wrr_mask[0] = (qmask & 0x1) * 0xFFFF;
-	grinder->wrr_mask[1] = ((qmask >> 1) & 0x1) * 0xFFFF;
-	grinder->wrr_mask[2] = ((qmask >> 2) & 0x1) * 0xFFFF;
-	grinder->wrr_mask[3] = ((qmask >> 3) & 0x1) * 0xFFFF;
+	grinder->wrr_mask[0] = (qmask & 0x1) * 0xFFFFFFFF;
+	grinder->wrr_mask[1] = ((qmask >> 1) & 0x1) * 0xFFFFFFFF;
+	grinder->wrr_mask[2] = ((qmask >> 2) & 0x1) * 0xFFFFFFFF;
+	grinder->wrr_mask[3] = ((qmask >> 3) & 0x1) * 0xFFFFFFFF;
 
 	grinder->wrr_cost[0] = pipe_params->wrr_cost[qindex];
 	grinder->wrr_cost[1] = pipe_params->wrr_cost[qindex + 1];
@@ -1981,14 +2014,14 @@  static inline void
 grinder_wrr(struct rte_sched_port *port, uint32_t pos)
 {
 	struct rte_sched_grinder *grinder = port->grinder + pos;
-	uint16_t wrr_tokens_min;
+	uint32_t wrr_tokens_min;
 
 	grinder->wrr_tokens[0] |= ~grinder->wrr_mask[0];
 	grinder->wrr_tokens[1] |= ~grinder->wrr_mask[1];
 	grinder->wrr_tokens[2] |= ~grinder->wrr_mask[2];
 	grinder->wrr_tokens[3] |= ~grinder->wrr_mask[3];
 
-	grinder->qpos = rte_min_pos_4_u16(grinder->wrr_tokens);
+	grinder->qpos = rte_min_pos_4_u32(grinder->wrr_tokens);
 	wrr_tokens_min = grinder->wrr_tokens[grinder->qpos];
 
 	grinder->wrr_tokens[0] -= wrr_tokens_min;
diff --git a/lib/librte_sched/rte_sched_common.h b/lib/librte_sched/rte_sched_common.h
index aed144b..a3c6bc2 100644
--- a/lib/librte_sched/rte_sched_common.h
+++ b/lib/librte_sched/rte_sched_common.h
@@ -77,6 +77,21 @@  rte_min_pos_4_u16(uint16_t *x)
 	return pos0;
 }
 
+static inline uint32_t
+rte_min_pos_4_u32(uint32_t *x)
+{
+	uint32_t pos0 = 0;
+	uint32_t pos1 = 2;
+
+	if (x[1] <= x[0])
+		pos0 = 1;
+	if (x[3] <= x[2])
+		pos1 = 3;
+	if (x[pos1] <= x[pos0])
+		pos0 = pos1;
+
+	return pos0;
+}
 #endif
 
 /*