[dpdk-dev] [PATCH 2/3] net/mlx5: fix Tx WQE corruption caused by starvation

Nelio Laranjeiro nelio.laranjeiro at 6wind.com
Thu Feb 2 11:34:12 CET 2017


Fixes an issue which may occurs with the inline feature activated and a
packet greater than the max_inline requested.

In such situation, more work request elements can be consumed and in the
worst case override some still handled by the NIC, this can result in
sending garbage on the network or putting the work queue in error.

Fixes: 2a66cf378954 ("net/mlx5: support inline send")

Cc: stable at dpdk.org
Reported-by: Elad Persiko <eladpe at mellanox.com>
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro at 6wind.com>
Acked-by: Yongseok Koh <yskoh at mellanox.com>
---
 drivers/net/mlx5/mlx5_rxtx.c | 91 +++++++++++++++++++++++++++++++++++++-------
 drivers/net/mlx5/mlx5_rxtx.h |  1 +
 2 files changed, 78 insertions(+), 14 deletions(-)

diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index ffca21c..a0e15ac 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -238,8 +238,9 @@ txq_complete(struct txq *txq)
 	} while (1);
 	if (unlikely(cqe == NULL))
 		return;
+	txq->wqe_pi = ntohs(cqe->wqe_counter);
 	ctrl = (volatile struct mlx5_wqe_ctrl *)
-		tx_mlx5_wqe(txq, ntohs(cqe->wqe_counter));
+		tx_mlx5_wqe(txq, txq->wqe_pi);
 	elts_tail = ctrl->ctrl3;
 	assert(elts_tail < (1 << txq->wqe_n));
 	/* Free buffers. */
@@ -365,6 +366,7 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	unsigned int i = 0;
 	unsigned int j = 0;
 	unsigned int max;
+	uint16_t max_wqe;
 	unsigned int comp;
 	volatile struct mlx5_wqe_v *wqe = NULL;
 	unsigned int segs_n = 0;
@@ -380,6 +382,9 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	max = (elts_n - (elts_head - txq->elts_tail));
 	if (max > elts_n)
 		max -= elts_n;
+	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
+	if (unlikely(!max_wqe))
+		return 0;
 	do {
 		volatile rte_v128u32_t *dseg = NULL;
 		uint32_t length;
@@ -407,6 +412,8 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		--segs_n;
 		if (!segs_n)
 			--pkts_n;
+		if (unlikely(--max_wqe == 0))
+			break;
 		wqe = (volatile struct mlx5_wqe_v *)
 			tx_mlx5_wqe(txq, txq->wqe_ci);
 		rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
@@ -476,10 +483,19 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			if (room > max_inline) {
 				uintptr_t addr_end = (addr + max_inline) &
 					~(RTE_CACHE_LINE_SIZE - 1);
-				uint16_t copy_b = ((addr_end - addr) > length) ?
-						  length :
-						  (addr_end - addr);
+				unsigned int copy_b =
+					RTE_MIN((addr_end - addr), length);
+				uint16_t n;
 
+				/*
+				 * One Dseg remains in the current WQE.  To
+				 * keep the computation positive, it is
+				 * removed after the bytes to Dseg conversion.
+				 */
+				n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
+				if (unlikely(max_wqe < n))
+					break;
+				max_wqe -= n;
 				rte_memcpy((void *)raw, (void *)addr, copy_b);
 				addr += copy_b;
 				length -= copy_b;
@@ -493,12 +509,18 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			 */
 			ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
 			if (length > 0) {
-				dseg = (volatile rte_v128u32_t *)
-					((uintptr_t)wqe +
-					 (ds * MLX5_WQE_DWORD_SIZE));
-				if ((uintptr_t)dseg >= end)
+				if (ds % (MLX5_WQE_SIZE /
+					  MLX5_WQE_DWORD_SIZE) == 0) {
+					if (unlikely(--max_wqe == 0))
+						break;
+					dseg = (volatile rte_v128u32_t *)
+					       tx_mlx5_wqe(txq, txq->wqe_ci +
+							   ds / 4);
+				} else {
 					dseg = (volatile rte_v128u32_t *)
-					       txq->wqes;
+						((uintptr_t)wqe +
+						 (ds * MLX5_WQE_DWORD_SIZE));
+				}
 				goto use_dseg;
 			} else if (!segs_n) {
 				goto next_pkt;
@@ -541,12 +563,12 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		 */
 		assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
 		if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) {
-			unsigned int n = (txq->wqe_ci + ((ds + 3) / 4)) &
-				((1 << txq->wqe_n) - 1);
-
+			if (unlikely(--max_wqe == 0))
+				break;
 			dseg = (volatile rte_v128u32_t *)
-			       tx_mlx5_wqe(txq, n);
-			rte_prefetch0(tx_mlx5_wqe(txq, n + 1));
+			       tx_mlx5_wqe(txq, txq->wqe_ci + ds / 4);
+			rte_prefetch0(tx_mlx5_wqe(txq,
+						  txq->wqe_ci + ds / 4 + 1));
 		} else {
 			++dseg;
 		}
@@ -711,6 +733,7 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	unsigned int i = 0;
 	unsigned int j = 0;
 	unsigned int max;
+	uint16_t max_wqe;
 	unsigned int comp;
 	struct mlx5_mpw mpw = {
 		.state = MLX5_MPW_STATE_CLOSED,
@@ -726,6 +749,9 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	max = (elts_n - (elts_head - txq->elts_tail));
 	if (max > elts_n)
 		max -= elts_n;
+	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
+	if (unlikely(!max_wqe))
+		return 0;
 	do {
 		struct rte_mbuf *buf = *(pkts++);
 		unsigned int elts_head_next;
@@ -759,6 +785,14 @@ mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		     (mpw.wqe->eseg.cs_flags != cs_flags)))
 			mlx5_mpw_close(txq, &mpw);
 		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
+			/*
+			 * Multi-Packet WQE consumes at most two WQE.
+			 * mlx5_mpw_new() expects to be able to use such
+			 * resources.
+			 */
+			if (unlikely(max_wqe < 2))
+				break;
+			max_wqe -= 2;
 			mlx5_mpw_new(txq, &mpw, length);
 			mpw.wqe->eseg.cs_flags = cs_flags;
 		}
@@ -914,11 +948,24 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 	unsigned int i = 0;
 	unsigned int j = 0;
 	unsigned int max;
+	uint16_t max_wqe;
 	unsigned int comp;
 	unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE;
 	struct mlx5_mpw mpw = {
 		.state = MLX5_MPW_STATE_CLOSED,
 	};
+	/*
+	 * Compute the maximum number of WQE which can be consumed by inline
+	 * code.
+	 * - 2 DSEG for:
+	 *   - 1 control segment,
+	 *   - 1 Ethernet segment,
+	 * - N Dseg from the inline request.
+	 */
+	const unsigned int wqe_inl_n =
+		((2 * MLX5_WQE_DWORD_SIZE +
+		  txq->max_inline * RTE_CACHE_LINE_SIZE) +
+		 RTE_CACHE_LINE_SIZE - 1) / RTE_CACHE_LINE_SIZE;
 
 	if (unlikely(!pkts_n))
 		return 0;
@@ -950,6 +997,11 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 			break;
 		max -= segs_n;
 		--pkts_n;
+		/*
+		 * Compute max_wqe in case less WQE were consumed in previous
+		 * iteration.
+		 */
+		max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
 		/* Should we enable HW CKSUM offload */
 		if (buf->ol_flags &
 		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
@@ -975,9 +1027,20 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
 			if ((segs_n != 1) ||
 			    (length > inline_room)) {
+				/*
+				 * Multi-Packet WQE consumes at most two WQE.
+				 * mlx5_mpw_new() expects to be able to use
+				 * such resources.
+				 */
+				if (unlikely(max_wqe < 2))
+					break;
+				max_wqe -= 2;
 				mlx5_mpw_new(txq, &mpw, length);
 				mpw.wqe->eseg.cs_flags = cs_flags;
 			} else {
+				if (unlikely(max_wqe < wqe_inl_n))
+					break;
+				max_wqe -= wqe_inl_n;
 				mlx5_mpw_inline_new(txq, &mpw, length);
 				mpw.wqe->eseg.cs_flags = cs_flags;
 			}
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 302ca49..41a34d7 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -249,6 +249,7 @@ struct txq {
 	uint16_t elts_comp; /* Counter since last completion request. */
 	uint16_t cq_ci; /* Consumer index for completion queue. */
 	uint16_t wqe_ci; /* Consumer index for work queue. */
+	uint16_t wqe_pi; /* Producer index for work queue. */
 	uint16_t elts_n:4; /* (*elts)[] length (in log2). */
 	uint16_t cqe_n:4; /* Number of CQ elements (in log2). */
 	uint16_t wqe_n:4; /* Number of of WQ elements (in log2). */
-- 
2.1.4



More information about the dev mailing list