[dpdk-dev] [PATCH 1/2] net/mlx5: add enhanced multi-packet send for ConnectX-5

Yongseok Koh yskoh at mellanox.com
Wed Mar 1 06:02:24 CET 2017


ConnectX-5 supports an enhanced version of multi-packet send (MPS). An MPS Tx
descriptor can carry multiple packets either by including pointers to
packets or by inlining packets. Inlining packet data can be helpful to
better utilize PCIe bandwidth. In addition, Enhanced MPS supports hybrid
mode - mixing inlined packets and pointers in a descriptor. This feature is
enabled by default if supported by HW.

Signed-off-by: Yongseok Koh <yskoh at mellanox.com>
---
 drivers/net/mlx5/mlx5.c        |  34 +++-
 drivers/net/mlx5/mlx5.h        |   4 +-
 drivers/net/mlx5/mlx5_ethdev.c |   6 +-
 drivers/net/mlx5/mlx5_prm.h    |  23 +++
 drivers/net/mlx5/mlx5_rxtx.c   | 405 +++++++++++++++++++++++++++++++++++++++++
 drivers/net/mlx5/mlx5_rxtx.h   |   5 +
 drivers/net/mlx5/mlx5_txq.c    |  18 +-
 7 files changed, 486 insertions(+), 9 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index d4bd4696c..24e3865f0 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -84,6 +84,12 @@
 /* Device parameter to enable multi-packet send WQEs. */
 #define MLX5_TXQ_MPW_EN "txq_mpw_en"
 
+/* Device parameter to enable dsegs in the title WQEBB before inlining. */
+#define MLX5_TXQ_MPW_HDR_DSEG_EN "txq_mpw_hdr_dseg_en"
+
+/* Device parameter to limit the length of packets to be inlined. */
+#define MLX5_TXQ_MAX_INLINE_LEN "txq_max_inline_len"
+
 /**
  * Retrieve integer value from environment variable.
  *
@@ -289,7 +295,11 @@ mlx5_args_check(const char *key, const char *val, void *opaque)
 	} else if (strcmp(MLX5_TXQS_MIN_INLINE, key) == 0) {
 		priv->txqs_inline = tmp;
 	} else if (strcmp(MLX5_TXQ_MPW_EN, key) == 0) {
-		priv->mps &= !!tmp; /* Enable MPW only if HW supports */
+		priv->mps = !!tmp ? priv->mps : MLX5_MPW_DISABLED;
+	} else if (strcmp(MLX5_TXQ_MPW_HDR_DSEG_EN, key) == 0) {
+		priv->mpw_hdr_dseg = !!tmp;
+	} else if (strcmp(MLX5_TXQ_MAX_INLINE_LEN, key) == 0) {
+		priv->txq_max_inline_len = tmp;
 	} else {
 		WARN("%s: unknown parameter", key);
 		return -EINVAL;
@@ -316,6 +326,8 @@ mlx5_args(struct priv *priv, struct rte_devargs *devargs)
 		MLX5_TXQ_INLINE,
 		MLX5_TXQS_MIN_INLINE,
 		MLX5_TXQ_MPW_EN,
+		MLX5_TXQ_MPW_HDR_DSEG_EN,
+		MLX5_TXQ_MAX_INLINE_LEN,
 		NULL,
 	};
 	struct rte_kvargs *kvlist;
@@ -424,20 +436,23 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 		 */
 		switch (pci_dev->id.device_id) {
 		case PCI_DEVICE_ID_MELLANOX_CONNECTX4LX:
+			mps = MLX5_MPW;
+			break;
 		case PCI_DEVICE_ID_MELLANOX_CONNECTX5:
 		case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
 		case PCI_DEVICE_ID_MELLANOX_CONNECTX5EX:
 		case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
-			mps = 1;
+			mps = MLX5_MPW_ENHANCED;
 			break;
 		default:
-			mps = 0;
+			mps = MLX5_MPW_DISABLED;
 		}
 		INFO("PCI information matches, using device \"%s\""
-		     " (SR-IOV: %s, MPS: %s)",
+		     " (SR-IOV: %s, %sMPS: %s)",
 		     list[i]->name,
 		     sriov ? "true" : "false",
-		     mps ? "true" : "false");
+		     mps == MLX5_MPW_ENHANCED ? "Enhanced " : "",
+		     mps != MLX5_MPW_DISABLED ? "true" : "false");
 		attr_ctx = ibv_open_device(list[i]);
 		err = errno;
 		break;
@@ -531,6 +546,12 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 		priv->pd = pd;
 		priv->mtu = ETHER_MTU;
 		priv->mps = mps; /* Enable MPW by default if supported. */
+		/* Set default values for Enhanced MPW, a.k.a MPWv2 */
+		if (mps == MLX5_MPW_ENHANCED) {
+			priv->mpw_hdr_dseg = 0;
+			priv->txqs_inline = MLX5_EMPW_MIN_TXQS;
+			priv->txq_max_inline_len = MLX5_EMPW_MAX_INLINE_LEN;
+		}
 		priv->cqe_comp = 1; /* Enable compression by default. */
 		err = mlx5_args(priv, pci_dev->device.devargs);
 		if (err) {
@@ -586,6 +607,9 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv, struct rte_pci_device *pci_dev)
 			err = ENOTSUP;
 			goto port_error;
 		}
+		INFO("%sMPS is %s",
+		     priv->mps == MLX5_MPW_ENHANCED ? "Enhanced " : "",
+		     priv->mps != MLX5_MPW_DISABLED ? "enabled" : "disabled");
 		/* Allocate and register default RSS hash keys. */
 		priv->rss_conf = rte_calloc(__func__, hash_rxq_init_n,
 					    sizeof((*priv->rss_conf)[0]), 0);
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 2b4345a69..4076eb4d5 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -123,11 +123,13 @@ struct priv {
 	unsigned int hw_fcs_strip:1; /* FCS stripping is supported. */
 	unsigned int hw_padding:1; /* End alignment padding is supported. */
 	unsigned int sriov:1; /* This is a VF or PF with VF devices. */
-	unsigned int mps:1; /* Whether multi-packet send is supported. */
+	unsigned int mps:2; /* Multi-packet send mode (0: disabled). */
+	unsigned int mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB */
 	unsigned int cqe_comp:1; /* Whether CQE compression is enabled. */
 	unsigned int pending_alarm:1; /* An alarm is pending. */
 	unsigned int txq_inline; /* Maximum packet size for inlining. */
 	unsigned int txqs_inline; /* Queue number threshold for inlining. */
+	unsigned int txq_max_inline_len; /* Max packet length for inlining */
 	/* RX/TX queues. */
 	unsigned int rxqs_n; /* RX queues array size. */
 	unsigned int txqs_n; /* TX queues array size. */
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 5677f03c9..20d3563e4 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -1584,7 +1584,11 @@ priv_select_tx_function(struct priv *priv)
 {
 	priv->dev->tx_pkt_burst = mlx5_tx_burst;
 	/* Select appropriate TX function. */
-	if (priv->mps && priv->txq_inline) {
+	if (priv->mps == MLX5_MPW_ENHANCED) {
+		priv->dev->tx_pkt_burst =
+			mlx5_tx_burst_empw;
+		DEBUG("selected Enhanced MPW TX function");
+	} else if (priv->mps && priv->txq_inline) {
 		priv->dev->tx_pkt_burst = mlx5_tx_burst_mpw_inline;
 		DEBUG("selected MPW inline TX function");
 	} else if (priv->mps) {
diff --git a/drivers/net/mlx5/mlx5_prm.h b/drivers/net/mlx5/mlx5_prm.h
index 755b5d779..139a54f0d 100644
--- a/drivers/net/mlx5/mlx5_prm.h
+++ b/drivers/net/mlx5/mlx5_prm.h
@@ -73,6 +73,9 @@
 /* WQE size */
 #define MLX5_WQE_SIZE (4 * MLX5_WQE_DWORD_SIZE)
 
+/* Max size of a WQE session */
+#define MLX5_WQE_SIZE_MAX 960U
+
 /* Compute the number of DS. */
 #define MLX5_WQE_DS(n) \
 	(((n) + MLX5_WQE_DWORD_SIZE - 1) / MLX5_WQE_DWORD_SIZE)
@@ -80,10 +83,22 @@
 /* Room for inline data in multi-packet WQE. */
 #define MLX5_MWQE64_INL_DATA 28
 
+/* Ratio (1/N) of inline quota in an Enhanced MPS WQE */
+#define MLX5_EMPW_INL_QUOTA_DIV 1
+
+/* Default number of Tx queues for inlining packets */
+#define MLX5_EMPW_MIN_TXQS 12
+
+/* Default max packet length to be inlined */
+#define MLX5_EMPW_MAX_INLINE_LEN (4U * MLX5_WQE_SIZE)
+
 #ifndef HAVE_VERBS_MLX5_OPCODE_TSO
 #define MLX5_OPCODE_TSO MLX5_OPCODE_LSO_MPW /* Compat with OFED 3.3. */
 #endif
 
+#define MLX5_OPC_MOD_ENHANCED_MPSW 0
+#define MLX5_OPCODE_ENHANCED_MPSW 0x29
+
 /* CQE value to inform that VLAN is stripped. */
 #define MLX5_CQE_VLAN_STRIPPED (1u << 0)
 
@@ -170,10 +185,18 @@ struct mlx5_wqe64 {
 	uint8_t raw[32];
 } __rte_aligned(MLX5_WQE_SIZE);
 
+/* MPW mode */
+enum mlx5_mpw_mode {
+	MLX5_MPW_DISABLED,
+	MLX5_MPW,
+	MLX5_MPW_ENHANCED, /* Enhanced Multi-Packet Send WQE, a.k.a MPWv2 */
+};
+
 /* MPW session status. */
 enum mlx5_mpw_state {
 	MLX5_MPW_STATE_OPENED,
 	MLX5_MPW_INL_STATE_OPENED,
+	MLX5_MPW_ENHANCED_STATE_OPENED,
 	MLX5_MPW_STATE_CLOSED,
 };
 
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index b2b722380..9fc3f5016 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -195,6 +195,62 @@ tx_mlx5_wqe(struct txq *txq, uint16_t ci)
 }
 
 /**
+ * Return the size of tailroom of WQ.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param addr
+ *   Pointer to tail of WQ.
+ *
+ * @return
+ *   Size of tailroom.
+ */
+static inline size_t
+tx_mlx5_wqe_tailroom(struct txq *txq, void *addr)
+{
+	size_t tailroom;
+	tailroom = (uintptr_t)(txq->wqes) +
+				(1 << txq->wqe_n) * MLX5_WQE_SIZE -
+				(uintptr_t)addr;
+	return tailroom;
+}
+
+/**
+ * Copy data to tailroom of circular queue.
+ *
+ * @param dst
+ *   Pointer to destination.
+ * @param src
+ *   Pointer to source.
+ * @param n
+ *   Number of bytes to copy.
+ * @param base
+ *   Pointer to head of queue.
+ * @param tailroom
+ *   Size of tailroom from dst.
+ *
+ * @return
+ *   Pointer after copied data.
+ */
+static inline void *
+memcpy_to_tailroom(void *dst, const void *src, size_t n,
+		void *base, size_t tailroom)
+{
+	void *ret;
+
+	if (n > tailroom) {
+		rte_memcpy(dst, src, tailroom);
+		rte_memcpy(base, (void *)((uintptr_t)src + tailroom),
+				n - tailroom);
+		ret = (uint8_t *)base + n - tailroom;
+	} else {
+		rte_memcpy(dst, src, n);
+		ret = (n == tailroom) ? base : (uint8_t *)dst + n;
+	}
+	return ret;
+}
+
+/**
  * Manage TX completions.
  *
  * When sending a burst, mlx5_tx_burst() posts several WRs.
@@ -1155,6 +1211,355 @@ mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
 }
 
 /**
+ * Open an Enhanced MPW session.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param mpw
+ *   Pointer to MPW session structure.
+ * @param padding
+ *   Non-zero to pad the title WQEBB with zero-length inline headers.
+ */
+static inline void
+mlx5_empw_new(struct txq *txq, struct mlx5_mpw *mpw, int padding)
+{
+	uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
+
+	mpw->state = MLX5_MPW_ENHANCED_STATE_OPENED;
+	mpw->pkts_n = 0;
+	mpw->total_len = sizeof(struct mlx5_wqe);
+	mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
+	mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_ENHANCED_MPSW << 24) |
+				  (txq->wqe_ci << 8) |
+				  MLX5_OPCODE_ENHANCED_MPSW);
+	mpw->wqe->ctrl[2] = 0;
+	mpw->wqe->ctrl[3] = 0;
+	memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE);
+	if (unlikely(padding)) {
+		uintptr_t addr = (uintptr_t)(mpw->wqe + 1);
+
+		/* Pad the first 2 DWORDs with zero-length inline header */
+		*(volatile uint32_t *)addr = htonl(MLX5_INLINE_SEG);
+		*(volatile uint32_t *)(addr + MLX5_WQE_DWORD_SIZE)
+			= htonl(MLX5_INLINE_SEG);
+		mpw->total_len += 2 * MLX5_WQE_DWORD_SIZE;
+		/* Start from the next WQEBB */
+		mpw->data.raw = (volatile void *)(tx_mlx5_wqe(txq, idx + 1));
+	} else {
+		mpw->data.raw = (volatile void *)(mpw->wqe + 1);
+	}
+}
+
+/**
+ * Close an Enhanced MPW session.
+ *
+ * @param txq
+ *   Pointer to TX queue structure.
+ * @param mpw
+ *   Pointer to MPW session structure.
+ *
+ * @return
+ *   Number of consumed WQEs.
+ */
+static inline uint16_t
+mlx5_empw_close(struct txq *txq, struct mlx5_mpw *mpw)
+{
+	uint16_t ret;
+
+	/* Store size in multiple of 16 bytes. Control and Ethernet segments
+	 * count as 2.
+	 */
+	mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(mpw->total_len));
+	mpw->state = MLX5_MPW_STATE_CLOSED;
+	ret = (mpw->total_len + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
+	txq->wqe_ci += ret;
+	return ret;
+}
+
+/**
+ * DPDK callback for TX with Enhanced MPW support.
+ *
+ * @param dpdk_txq
+ *   Generic pointer to TX queue structure.
+ * @param[in] pkts
+ *   Packets to transmit.
+ * @param pkts_n
+ *   Number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	struct txq *txq = (struct txq *)dpdk_txq;
+	uint16_t elts_head = txq->elts_head;
+	const unsigned int elts_n = 1 << txq->elts_n;
+	unsigned int i = 0;
+	unsigned int j = 0;
+	unsigned int max_elts;
+	unsigned int mpw_n = 0; /* the number of outstanding WQEs. */
+	uint16_t max_wqe, max_cqe;
+	unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
+	unsigned int mpw_room = 0;
+	unsigned int inl_pad = 0;
+	unsigned int inl_budget = 0;
+	uint32_t inl_hdr;
+	struct mlx5_mpw mpw = {
+		.state = MLX5_MPW_STATE_CLOSED,
+	};
+
+	if (unlikely(!pkts_n))
+		return 0;
+	/* Start processing. */
+	txq_complete(txq);
+	max_elts = (elts_n - (elts_head - txq->elts_tail));
+	if (max_elts > elts_n)
+		max_elts -= elts_n;
+	max_cqe = (1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci);
+	/* One CQE slot is needed at the end */
+	if (!max_cqe)
+		return 0;
+	max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
+	if (unlikely(!max_wqe))
+		return 0;
+	do {
+		struct rte_mbuf *buf = *(pkts++);
+		unsigned int elts_head_next;
+		uintptr_t addr;
+		unsigned int n;
+		uint32_t length;
+		unsigned int segs_n = buf->nb_segs;
+		uint32_t cs_flags = 0;
+
+		/*
+		 * Make sure there is enough room to store this packet and
+		 * that one ring entry remains unused.
+		 */
+		assert(segs_n);
+		if (max_elts - j < segs_n + 1)
+			break;
+		/* Do not bother with large packets MPW cannot handle. */
+		if (segs_n > MLX5_MPW_DSEG_MAX)
+			break;
+		/* Should we enable HW CKSUM offload */
+		if (buf->ol_flags &
+		    (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
+			cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
+		/* Retrieve packet information. */
+		length = PKT_LEN(buf);
+		/* Start new session if:
+		 * - multi-segment packet
+		 * - no space left even for a dseg
+		 * - next packet can be inlined with a new WQE
+		 * - cs_flag differs
+		 * It can't be MLX5_MPW_STATE_OPENED as a legacy session always
+		 * holds one multi-segment packet and is closed right away.
+		 */
+		if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) {
+			if ((segs_n != 1) ||
+			    (inl_pad + sizeof(struct mlx5_wqe_data_seg) >
+				mpw_room) ||
+			    (length <= txq->max_inline_len &&
+			     (length > inl_budget ||
+			      inl_pad + sizeof(inl_hdr) + length >
+				mpw_room)) ||
+			    (mpw.wqe->eseg.cs_flags != cs_flags))
+				max_wqe -= mlx5_empw_close(txq, &mpw);
+		}
+		if (mpw.state == MLX5_MPW_STATE_CLOSED) {
+			if (segs_n != 1) {
+				/* Fall back to legacy MPW.
+				 * A MPW session consumes 2 WQEs at most to
+				 * include MLX5_MPW_DSEG_MAX pointers.
+				 */
+				if (unlikely(max_wqe < 2))
+					break;
+				mlx5_mpw_new(txq, &mpw, length);
+			} else {
+				/* In Enhanced MPW, inline as much as the budget
+				 * allows. The remaining space is to be filled
+				 * with dsegs. If the title WQEBB isn't padded,
+				 * it will have 2 dsegs there.
+				 */
+				mpw_room = RTE_MIN(MLX5_WQE_SIZE_MAX,
+					    (max_inline ? max_inline :
+					     pkts_n * MLX5_WQE_DWORD_SIZE) +
+					    MLX5_WQE_SIZE);
+				if (unlikely((max_wqe * MLX5_WQE_SIZE) <
+						mpw_room))
+					break;
+				/* Do not pad the title WQEBB to not waste WQ */
+				mlx5_empw_new(txq, &mpw, 0);
+				mpw_room -= mpw.total_len;
+				inl_budget = max_inline ?
+					mpw_room / MLX5_EMPW_INL_QUOTA_DIV :
+					0;
+				inl_pad = 0;
+			}
+			mpw.wqe->eseg.cs_flags = cs_flags;
+			++mpw_n;
+		}
+		/* Multi-segment packets must be alone in their MPW. */
+		assert((segs_n == 1) || (mpw.pkts_n == 0));
+		if (mpw.state == MLX5_MPW_STATE_OPENED) {
+#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
+			length = 0;
+#endif
+			do {
+				volatile struct mlx5_wqe_data_seg *dseg;
+
+				elts_head_next =
+					(elts_head + 1) & (elts_n - 1);
+				assert(buf);
+				(*txq->elts)[elts_head] = buf;
+				dseg = mpw.data.dseg[mpw.pkts_n];
+				addr = rte_pktmbuf_mtod(buf, uintptr_t);
+				*dseg = (struct mlx5_wqe_data_seg){
+					.byte_count = htonl(DATA_LEN(buf)),
+					.lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
+					.addr = htonll(addr),
+				};
+				elts_head = elts_head_next;
+#if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
+				length += DATA_LEN(buf);
+#endif
+				buf = buf->next;
+				++j;
+				++mpw.pkts_n;
+			} while (--segs_n);
+			/* A multi-segmented packet takes one MPW session.
+			 * TODO: Pack more multi-segmented packets if possible.
+			 */
+			mlx5_mpw_close(txq, &mpw);
+			if (mpw.pkts_n < 3)
+				max_wqe--;
+			else
+				max_wqe -= 2;
+		} else if (length <= txq->max_inline_len &&
+			   length <= inl_budget &&
+			   (inl_pad + sizeof(inl_hdr) + length) <= mpw_room &&
+			   (!txq->mpw_hdr_dseg ||
+				mpw.total_len >= MLX5_WQE_SIZE)) {
+			/* Inline packet into WQE */
+			unsigned int max;
+
+			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
+			assert(length == DATA_LEN(buf));
+			inl_hdr = htonl(length | MLX5_INLINE_SEG);
+			addr = rte_pktmbuf_mtod(buf, uintptr_t);
+			mpw.data.raw = (volatile void *)
+				((uintptr_t)mpw.data.raw + inl_pad);
+			max = tx_mlx5_wqe_tailroom(txq,
+					(void *)(uintptr_t)mpw.data.raw);
+			/* Copy inline header */
+			mpw.data.raw = (volatile void *)
+				memcpy_to_tailroom(
+					  (void *)(uintptr_t)mpw.data.raw,
+					  &inl_hdr,
+					  sizeof(inl_hdr),
+					  (void *)(uintptr_t)txq->wqes,
+					  max);
+			max = tx_mlx5_wqe_tailroom(txq,
+					(void *)(uintptr_t)mpw.data.raw);
+			/* Copy packet data */
+			mpw.data.raw = (volatile void *)
+				memcpy_to_tailroom(
+					  (void *)(uintptr_t)mpw.data.raw,
+					  (void *)addr,
+					  length,
+					  (void *)(uintptr_t)txq->wqes,
+					  max);
+			++mpw.pkts_n;
+			inl_budget -= length;
+			mpw.total_len += (inl_pad + sizeof(inl_hdr) + length);
+			/* No need to get completion as the entire packet is
+			 * copied to WQ. Free the buf right away.
+			 */
+			elts_head_next = elts_head;
+			rte_pktmbuf_free_seg(buf);
+			mpw_room -= (inl_pad + sizeof(inl_hdr) + length);
+			/* Add pad in the next packet if any */
+			inl_pad = (((uintptr_t)mpw.data.raw +
+					(MLX5_WQE_DWORD_SIZE - 1)) &
+					~(MLX5_WQE_DWORD_SIZE - 1)) -
+				  (uintptr_t)mpw.data.raw;
+		} else {
+			/* No inline. Load a dseg of packet pointer */
+			volatile rte_v128u32_t *dseg;
+
+			assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
+			assert((inl_pad + sizeof(*dseg)) <= mpw_room);
+			assert(length == DATA_LEN(buf));
+			if (!tx_mlx5_wqe_tailroom(txq,
+					(void *)((uintptr_t)mpw.data.raw
+						+ inl_pad)))
+				dseg = (volatile void *)txq->wqes;
+			else
+				dseg = (volatile void *)
+					((uintptr_t)mpw.data.raw +
+					 inl_pad);
+			elts_head_next = (elts_head + 1) & (elts_n - 1);
+			(*txq->elts)[elts_head] = buf;
+			addr = rte_pktmbuf_mtod(buf, uintptr_t);
+			for (n = 0; n * RTE_CACHE_LINE_SIZE < length; n++)
+				rte_prefetch2((void *)(addr +
+						n * RTE_CACHE_LINE_SIZE));
+			addr = htonll(addr);
+			*dseg = (rte_v128u32_t) {
+				htonl(length),
+				txq_mp2mr(txq, txq_mb2mp(buf)),
+				addr,
+				addr >> 32,
+			};
+			mpw.data.raw = (volatile void *)(dseg + 1);
+			mpw.total_len += (inl_pad + sizeof(*dseg));
+			++j;
+			++mpw.pkts_n;
+			mpw_room -= (inl_pad + sizeof(*dseg));
+			inl_pad = 0;
+		}
+		elts_head = elts_head_next;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+		/* Increment sent bytes counter. */
+		txq->stats.obytes += length;
+#endif
+		++i;
+	} while (i < pkts_n);
+	/* Take a shortcut if nothing must be sent. */
+	if (unlikely(i == 0))
+		return 0;
+	/* Check whether completion threshold has been reached. */
+	if (txq->elts_comp + j >= MLX5_TX_COMP_THRESH ||
+			txq->mpw_comp + mpw_n >= MLX5_TX_COMP_THRESH) {
+		volatile struct mlx5_wqe *wqe = mpw.wqe;
+
+		/* Request completion on last WQE. */
+		wqe->ctrl[2] = htonl(8);
+		/* Save elts_head in unused "immediate" field of WQE. */
+		wqe->ctrl[3] = elts_head;
+		txq->elts_comp = 0;
+		txq->mpw_comp = 0;
+		txq->cq_pi++;
+	} else {
+		txq->elts_comp += j;
+		txq->mpw_comp += mpw_n;
+	}
+#ifdef MLX5_PMD_SOFT_COUNTERS
+	/* Increment sent packets counter. */
+	txq->stats.opackets += i;
+#endif
+	if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED)
+		mlx5_empw_close(txq, &mpw);
+	else if (mpw.state == MLX5_MPW_STATE_OPENED)
+		mlx5_mpw_close(txq, &mpw);
+	/* Ring QP doorbell. */
+	mlx5_tx_dbrec(txq, mpw.wqe);
+	txq->elts_head = elts_head;
+	return i;
+}
+
+/**
  * Translate RX completion flags to packet type.
  *
  * @param[in] cqe
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 41a34d7ff..e70d65465 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -247,13 +247,17 @@ struct txq {
 	uint16_t elts_head; /* Current index in (*elts)[]. */
 	uint16_t elts_tail; /* First element awaiting completion. */
 	uint16_t elts_comp; /* Counter since last completion request. */
+	uint16_t mpw_comp; /* Counter of MPW since last completion request. */
 	uint16_t cq_ci; /* Consumer index for completion queue. */
+	uint16_t cq_pi; /* Producer index for completion queue. */
 	uint16_t wqe_ci; /* Consumer index for work queue. */
 	uint16_t wqe_pi; /* Producer index for work queue. */
 	uint16_t elts_n:4; /* (*elts)[] length (in log2). */
 	uint16_t cqe_n:4; /* Number of CQ elements (in log2). */
 	uint16_t wqe_n:4; /* Number of of WQ elements (in log2). */
+	uint16_t mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB */
 	uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
+	uint16_t max_inline_len; /* Max packet length to inline */
 	uint32_t qp_num_8s; /* QP number shifted by 8. */
 	volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
 	volatile void *wqes; /* Work queue (use volatile to write into). */
@@ -320,6 +324,7 @@ uint16_t mlx5_tx_burst_secondary_setup(void *, struct rte_mbuf **, uint16_t);
 uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t);
 uint16_t mlx5_tx_burst_mpw(void *, struct rte_mbuf **, uint16_t);
 uint16_t mlx5_tx_burst_mpw_inline(void *, struct rte_mbuf **, uint16_t);
+uint16_t mlx5_tx_burst_empw(void *, struct rte_mbuf **, uint16_t);
 uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t);
 uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t);
 uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 949035bd4..ef8775382 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -276,6 +276,8 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
 	(void)conf; /* Thresholds configuration (ignored). */
 	assert(desc > MLX5_TX_COMP_THRESH);
 	tmpl.txq.elts_n = log2above(desc);
+	if (priv->mps == MLX5_MPW_ENHANCED)
+		tmpl.txq.mpw_hdr_dseg = priv->mpw_hdr_dseg;
 	/* MRs will be registered in mp2mr[] later. */
 	attr.rd = (struct ibv_exp_res_domain_init_attr){
 		.comp_mask = (IBV_EXP_RES_DOMAIN_THREAD_MODEL |
@@ -340,8 +342,20 @@ txq_ctrl_setup(struct rte_eth_dev *dev, struct txq_ctrl *txq_ctrl,
 		tmpl.txq.max_inline =
 			((priv->txq_inline + (RTE_CACHE_LINE_SIZE - 1)) /
 			 RTE_CACHE_LINE_SIZE);
-		attr.init.cap.max_inline_data =
-			tmpl.txq.max_inline * RTE_CACHE_LINE_SIZE;
+		if (priv->mps == MLX5_MPW_ENHANCED) {
+			tmpl.txq.max_inline_len = priv->txq_max_inline_len;
+			/* To minimize the data set size, avoid requesting
+			 * a too large WQ.
+			 */
+			attr.init.cap.max_inline_data =
+				((RTE_MIN(priv->txq_inline,
+					  priv->txq_max_inline_len) +
+				  (RTE_CACHE_LINE_SIZE - 1)) /
+				 RTE_CACHE_LINE_SIZE) * RTE_CACHE_LINE_SIZE;
+		} else {
+			attr.init.cap.max_inline_data =
+				tmpl.txq.max_inline * RTE_CACHE_LINE_SIZE;
+		}
 	}
 	tmpl.qp = ibv_exp_create_qp(priv->ctx, &attr.init);
 	if (tmpl.qp == NULL) {
-- 
2.11.0



More information about the dev mailing list