[dpdk-dev,v4,4/4] net/mlx5: enforce Tx num of segments limitation

Message ID d8111a1597b60f77b60ffb284441f01c5f9bed1a.1505299539.git.shahafs@mellanox.com (mailing list archive)
State Superseded, archived
Headers

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Shahaf Shuler Sept. 13, 2017, 10:50 a.m. UTC
  Mellanox NICs have a limitation on the number of mbuf segments a
multi-segment mbuf can have. The maximum number depends on the Tx offloads requested.

The current code does not enforce this limitation, which might cause
malformed work requests to be written to the device.

This commit adds verification for the number of mbuf segments posted
to the device. In case of overflow the packet will not be sent.

In addition, update the NIC documentation with the limitation.
Considering that the device limitation is 63 data segments in a work request,
the maximum number of segments in an mbuf was calculated taking TSO as the worst
case:

max_nb_segs = 63 - (control_segment + ethernet segment +
		    TSO headers inline + inline segment +
		    extra inline to align to cacheline)

Cc: stable@dpdk.org

Signed-off-by: Shahaf Shuler <shahafs@mellanox.com>
---
 doc/guides/nics/mlx5.rst             |  2 ++
 drivers/net/mlx5/mlx5_defs.h         |  3 ++-
 drivers/net/mlx5/mlx5_prm.h          |  3 +++
 drivers/net/mlx5/mlx5_rxtx.c         |  4 ++++
 drivers/net/mlx5/mlx5_rxtx_vec_sse.c |  5 +++++
 drivers/net/mlx5/mlx5_txq.c          | 27 +++++++++++++++++++++++++++
 6 files changed, 43 insertions(+), 1 deletion(-)
  

Comments

Yongseok Koh Sept. 13, 2017, 7:51 p.m. UTC | #1
On Wed, Sep 13, 2017 at 01:50:39PM +0300, Shahaf Shuler wrote:
> Mellanox NICs has a limitation on the number of mbuf segments a multi
> segment mbuf can have. The max number depends on the Tx offloads requested.
> 
> The current code not enforce such limitation, which might cause
> malformed work requests to be written to the device.
> 
> This commit adds verification for the number of mbuf segments posted
> to the device. In case of overflow the packet will not be sent.
> 
> In addition update the nic documentation with the limitation.
> Considering device limitation is 63 data segments in a work request, the
> maximum number of segment in mbuf was calculated taking TSO as the worst
> case:
> 
> max_nb_segs = 63 - (control_segment + ethernet segment +
> 		    TSO headers inline + inline segment +
> 		    extra inline to align to cacheline)
> 
> Cc: stable@dpdk.org
> 
> Signed-off-by: Shahaf Shuler <shahafs@mellanox.com>
> ---
>  doc/guides/nics/mlx5.rst             |  2 ++
>  drivers/net/mlx5/mlx5_defs.h         |  3 ++-
>  drivers/net/mlx5/mlx5_prm.h          |  3 +++
>  drivers/net/mlx5/mlx5_rxtx.c         |  4 ++++
>  drivers/net/mlx5/mlx5_rxtx_vec_sse.c |  5 +++++
>  drivers/net/mlx5/mlx5_txq.c          | 27 +++++++++++++++++++++++++++
>  6 files changed, 43 insertions(+), 1 deletion(-)
> 
> diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
> index f4cb18bca..d8244de97 100644
> --- a/doc/guides/nics/mlx5.rst
> +++ b/doc/guides/nics/mlx5.rst
> @@ -124,6 +124,8 @@ Limitations
>  
>    Will match any ipv4 packet (VLAN included).
>  
> +- A multi segment mbuf must have less than 50 segments. That means mbuf->nb_segs < 50.
Isn't it better to use either "multiple segment packet" or "multi-segment
packet"? Also, more information might be needed here. If MPW/eMPW is enabled,
the code restricts the maximum number of segments to MLX5_MPW_DSEG_MAX (5).

> +
>  Configuration
>  -------------
>  
> diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
> index a76bc6f65..3de0e5d81 100644
> --- a/drivers/net/mlx5/mlx5_defs.h
> +++ b/drivers/net/mlx5/mlx5_defs.h
> @@ -100,7 +100,8 @@
>  
>  /*
>   * Maximum size of burst for vectorized Tx. This is related to the maximum size
> - * of Enhaned MPW (eMPW) WQE as vectorized Tx is supported with eMPW.
> + * of Enhanced MPW (eMPW) WQE as vectorized Tx is supported with eMPW.
> + * Careful when changing, large value can cause wqe DS to overlap.
wqe -> WQE.

>   */
>  #define MLX5_VPMD_TX_MAX_BURST        32U
>  
> diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
> index 608072f7e..bc2b72333 100644
> --- a/drivers/net/mlx5/mlx5_prm.h
> +++ b/drivers/net/mlx5/mlx5_prm.h
> @@ -154,6 +154,9 @@
>  /* Default mark value used when none is provided. */
>  #define MLX5_FLOW_MARK_DEFAULT 0xffffff
>  
> +/* Maximum number of DS in WQE. */
> +#define MLX5_MAX_DS 63
How about making it consistent with MLX5_MPW_DSEG_MAX by naming it MLX5_DSEG_MAX?

> +
>  /* Subset of struct mlx5_wqe_eth_seg. */
>  struct mlx5_wqe_eth_seg_small {
>  	uint32_t rsvd0;
> diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
> index 7567f2329..fdd7067da 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.c
> +++ b/drivers/net/mlx5/mlx5_rxtx.c
> @@ -661,6 +661,10 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
>  		else
>  			j += sg;
>  next_pkt:
> +		if (ds > MLX5_MAX_DS) {
> +			txq->stats.oerrors++;
> +			break;
> +		}
>  		++elts_head;
>  		++pkts;
>  		++i;
> diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.c b/drivers/net/mlx5/mlx5_rxtx_vec_sse.c
> index f89762ff8..3583e6780 100644
> --- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.c
> +++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.c
> @@ -248,6 +248,10 @@ txq_scatter_v(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n)
>  		if (segs_n == 1 ||
>  		    max_elts < segs_n || max_wqe < 2)
>  			break;
> +		if (segs_n > MLX5_MPW_DSEG_MAX) {
> +			txq->stats.oerrors++;
> +			break;
> +		}
>  		wqe = &((volatile struct mlx5_wqe64 *)
>  			 txq->wqes)[wqe_ci & wq_mask].hdr;
>  		if (buf->ol_flags &
> @@ -365,6 +369,7 @@ txq_burst_v(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
>  	max_elts = (elts_n - (elts_head - txq->elts_tail));
>  	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
>  	pkts_n = RTE_MIN((unsigned int)RTE_MIN(pkts_n, max_wqe), max_elts);
> +	assert(pkts_n <= MLX5_MAX_DS - nb_dword_in_hdr);
>  	if (unlikely(!pkts_n))
>  		return 0;
>  	elts = &(*txq->elts)[elts_head & elts_m];
> diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
> index 4b0b532b1..091b1a93d 100644
> --- a/drivers/net/mlx5/mlx5_txq.c
> +++ b/drivers/net/mlx5/mlx5_txq.c
> @@ -288,6 +288,8 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
>  		.comp_mask = IBV_EXP_QP_INIT_ATTR_PD,
>  	};
>  	if (priv->txq_inline && (priv->txqs_n >= priv->txqs_inline)) {
> +		unsigned int ds_cnt;
> +
>  		tmpl.txq.max_inline =
>  			((priv->txq_inline + (RTE_CACHE_LINE_SIZE - 1)) /
>  			 RTE_CACHE_LINE_SIZE);
> @@ -320,6 +322,31 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
>  			attr.init.cap.max_inline_data =
>  				tmpl.txq.max_inline * RTE_CACHE_LINE_SIZE;
>  		}
> +		/*
> +		 * Check if the inline size is too large in a way which
> +		 * can make the wqe DS to overflow.
wqe -> WQE.

> +		 * Considering in calculation:
> +		 *	WQE CTRL (1 DS)
> +		 *	WQE ETH  (1 DS)
> +		 *	inline part (N DS)
inline -> Inline ?

> +		 */
> +		ds_cnt = 2 +
> +			(attr.init.cap.max_inline_data / MLX5_WQE_DWORD_SIZE);
> +		if (ds_cnt > MLX5_MAX_DS) {
> +			unsigned int max_inline = (MLX5_MAX_DS - 2) *
> +						   MLX5_WQE_DWORD_SIZE;
> +
> +			/* Ceil down*/
Missing space and period. Rather, this comment could be unnecessary as the
following code is so obvious. Or, you might want to explain why you make it
aligned.

> +			max_inline = max_inline - (max_inline %
> +						   RTE_CACHE_LINE_SIZE);
> +			WARN("txq inline is too large (%d) setting it to "
> +			     "the maximum possible: %d\n",
> +			     priv->txq_inline, max_inline);
> +			tmpl.txq.max_inline = max_inline / RTE_CACHE_LINE_SIZE;
> +			attr.init.cap.max_inline_data = max_inline;
> +			if (priv->mps == MLX5_MPW_ENHANCED)
> +				tmpl.txq.inline_max_packet_sz = max_inline;
No need to set inline_max_packet_sz. inline_max_packet_sz is to limit the max
size of a packet which can be inlined in eMPW mode. As long as txq->max_inline
is correctly set, txq->inline_max_packet_sz doesn't affect the total number of
DSEGs in a WQE.


Thanks,
Yongseok
  
Shahaf Shuler Sept. 14, 2017, 5:23 a.m. UTC | #2
Hi Yongseok,

Wednesday, September 13, 2017 10:52 PM, Yongseok Koh:
> >
> > +/* Maximum number of DS in WQE. */
> > +#define MLX5_MAX_DS 63
> How about make it consistent with MLX5_MPW_DSEG_MAX by naming
> MLX5_DSEG_MAX?
> 

It doesn't have the same meaning. 

MLX5_MPW_DSEG_MAX limits the number of mbuf segments (buf->nb_segs) for a multi-packet WQE. The inline part is taken into account differently.
MLX5_MAX_DS limits the number of data segments (i.e. units of MLX5_WQE_DWORD_SIZE) that can be placed in a WQE. This includes everything (inline, ctrl seg, eth seg, pointers).
For the regular Tx burst there are many possible inline sizes, which affect the maximum possible number of mbuf segments.

BTW - I am still not sure why we have the MLX5_MPW_DSEG_MAX limitation.
  
Yongseok Koh Sept. 14, 2017, 8:05 a.m. UTC | #3
Thanks!
Yongseok
> On Sep 13, 2017, at 10:23 PM, Shahaf Shuler <shahafs@mellanox.com> wrote:
> 
> Hi Yongseok,
> 
> Wednesday, September 13, 2017 10:52 PM, Yongseok Koh:
>>> 
>>> +/* Maximum number of DS in WQE. */
>>> +#define MLX5_MAX_DS 63
>> How about make it consistent with MLX5_MPW_DSEG_MAX by naming
>> MLX5_DSEG_MAX?
> 
> It doesn't have the same meaning. 
> 
> The MLX5_MPW_DSEG_MAX is to limit the number of mbuf segments (buf->nb_segs) for multi packet wqe.  The inline part  is taken into account differently. 
> The MLX_MAX_DS is to limit the number data segments (i.e. MLX5_WQE_DWORD_SIZE) that could be set into a WQE. This includes everything (inline, ctrl seg, eth seg, pointers).
> For the regular Tx burst there are many options for different inline sizes which impact on the max number of mbuf segments possible.
I should've been clearer. I just suggested a small change from DS to DSEG as I thought DS and DSEG were the same. But if it just comes from the field name in the WQE Ctrl segment, MLX5_WQE_DS_MAX or MLX5_WQE_CTRL_DS_MAX could be good. I hoped it could be a little more explanatory. I'm not good at naming, so I will defer to you. :-)

> BTW - am still not sure why we have the MLX5_MPW_DSEG_MAX limitation.
I don't know either. We should discuss it with chip design.

Thanks
Yongseok
  

Patch

diff --git a/doc/guides/nics/mlx5.rst b/doc/guides/nics/mlx5.rst
index f4cb18bca..d8244de97 100644
--- a/doc/guides/nics/mlx5.rst
+++ b/doc/guides/nics/mlx5.rst
@@ -124,6 +124,8 @@  Limitations
 
   Will match any ipv4 packet (VLAN included).
 
+- A multi segment mbuf must have less than 50 segments. That means mbuf->nb_segs < 50.
+
 Configuration
 -------------
 
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index a76bc6f65..3de0e5d81 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -100,7 +100,8 @@ 
 
 /*
  * Maximum size of burst for vectorized Tx. This is related to the maximum size
- * of Enhaned MPW (eMPW) WQE as vectorized Tx is supported with eMPW.
+ * of Enhanced MPW (eMPW) WQE as vectorized Tx is supported with eMPW.
+ * Careful when changing, large value can cause wqe DS to overlap.
  */
 #define MLX5_VPMD_TX_MAX_BURST        32U
 
diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
index 608072f7e..bc2b72333 100644
--- a/drivers/net/mlx5/mlx5_prm.h
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -154,6 +154,9 @@ 
 /* Default mark value used when none is provided. */
 #define MLX5_FLOW_MARK_DEFAULT 0xffffff
 
+/* Maximum number of DS in WQE. */
+#define MLX5_MAX_DS 63
+
 /* Subset of struct mlx5_wqe_eth_seg. */
 struct mlx5_wqe_eth_seg_small {
 	uint32_t rsvd0;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 7567f2329..fdd7067da 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -661,6 +661,10 @@  mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		else
 			j += sg;
 next_pkt:
+		if (ds > MLX5_MAX_DS) {
+			txq->stats.oerrors++;
+			break;
+		}
 		++elts_head;
 		++pkts;
 		++i;
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec_sse.c b/drivers/net/mlx5/mlx5_rxtx_vec_sse.c
index f89762ff8..3583e6780 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec_sse.c
+++ b/drivers/net/mlx5/mlx5_rxtx_vec_sse.c
@@ -248,6 +248,10 @@  txq_scatter_v(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		if (segs_n == 1 ||
 		    max_elts < segs_n || max_wqe < 2)
 			break;
+		if (segs_n > MLX5_MPW_DSEG_MAX) {
+			txq->stats.oerrors++;
+			break;
+		}
 		wqe = &((volatile struct mlx5_wqe64 *)
 			 txq->wqes)[wqe_ci & wq_mask].hdr;
 		if (buf->ol_flags &
@@ -365,6 +369,7 @@  txq_burst_v(struct txq *txq, struct rte_mbuf **pkts, uint16_t pkts_n,
 	max_elts = (elts_n - (elts_head - txq->elts_tail));
 	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
 	pkts_n = RTE_MIN((unsigned int)RTE_MIN(pkts_n, max_wqe), max_elts);
+	assert(pkts_n <= MLX5_MAX_DS - nb_dword_in_hdr);
 	if (unlikely(!pkts_n))
 		return 0;
 	elts = &(*txq->elts)[elts_head & elts_m];
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 4b0b532b1..091b1a93d 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -288,6 +288,8 @@  txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
 		.comp_mask = IBV_EXP_QP_INIT_ATTR_PD,
 	};
 	if (priv->txq_inline && (priv->txqs_n >= priv->txqs_inline)) {
+		unsigned int ds_cnt;
+
 		tmpl.txq.max_inline =
 			((priv->txq_inline + (RTE_CACHE_LINE_SIZE - 1)) /
 			 RTE_CACHE_LINE_SIZE);
@@ -320,6 +322,31 @@  txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
 			attr.init.cap.max_inline_data =
 				tmpl.txq.max_inline * RTE_CACHE_LINE_SIZE;
 		}
+		/*
+		 * Check if the inline size is too large in a way which
+		 * can make the wqe DS to overflow.
+		 * Considering in calculation:
+		 *	WQE CTRL (1 DS)
+		 *	WQE ETH  (1 DS)
+		 *	inline part (N DS)
+		 */
+		ds_cnt = 2 +
+			(attr.init.cap.max_inline_data / MLX5_WQE_DWORD_SIZE);
+		if (ds_cnt > MLX5_MAX_DS) {
+			unsigned int max_inline = (MLX5_MAX_DS - 2) *
+						   MLX5_WQE_DWORD_SIZE;
+
+			/* Ceil down*/
+			max_inline = max_inline - (max_inline %
+						   RTE_CACHE_LINE_SIZE);
+			WARN("txq inline is too large (%d) setting it to "
+			     "the maximum possible: %d\n",
+			     priv->txq_inline, max_inline);
+			tmpl.txq.max_inline = max_inline / RTE_CACHE_LINE_SIZE;
+			attr.init.cap.max_inline_data = max_inline;
+			if (priv->mps == MLX5_MPW_ENHANCED)
+				tmpl.txq.inline_max_packet_sz = max_inline;
+		}
 	}
 	if (priv->tso) {
 		attr.init.max_tso_header =