[v4,2/4] net/iavf: add offload path for Tx AVX512

Message ID 1617947944-130983-3-git-send-email-wenzhuo.lu@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Qi Zhang
Series: add Rx/Tx offload paths for IAVF AVX512

Checks

Context        Check     Description
ci/checkpatch  success   coding style OK

Commit Message

Wenzhuo Lu April 9, 2021, 5:59 a.m. UTC
  Add a specific offload path for Tx AVX512.
This path supports HW offload features such as
checksum insertion and VLAN insertion.
The path is chosen automatically according to the
configured Tx offloads.
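
As a rough, hypothetical illustration (not part of this patch), an
application would request such offloads at configure time; with offloads
limited to the IAVF_TX_VECTOR_OFFLOAD set and nothing from
IAVF_TX_NO_VECTOR_FLAGS, the offload Tx path is expected to be selected.
The helper name and queue counts below are assumptions for the sketch:

#include <stdlib.h>
#include <rte_debug.h>
#include <rte_ethdev.h>

/* Hypothetical helper, not part of the patch. */
static void
configure_port_for_tx_offload(uint16_t port_id)
{
	struct rte_eth_conf port_conf = {
		.txmode = {
			/* Only offloads from IAVF_TX_VECTOR_OFFLOAD; nothing
			 * from IAVF_TX_NO_VECTOR_FLAGS (TSO, multi-seg), so
			 * the AVX512 offload Tx path should be chosen. */
			.offloads = DEV_TX_OFFLOAD_IPV4_CKSUM |
				    DEV_TX_OFFLOAD_TCP_CKSUM |
				    DEV_TX_OFFLOAD_VLAN_INSERT,
		},
	};

	/* 1 Rx queue and 1 Tx queue, assumed for this example. */
	if (rte_eth_dev_configure(port_id, 1, 1, &port_conf) < 0)
		rte_exit(EXIT_FAILURE, "cannot configure port %u\n", port_id);
}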

'inline' is used, so the duplicated code for the two
paths is generated by the compiler.
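
A minimal sketch of this 'inline' trick (hypothetical names, not the
driver code): the constant 'offload' argument is resolved at compile
time, so each wrapper gets its own specialized copy of the common
routine with no runtime branch:

#include <stdbool.h>
#include <rte_common.h>
#include <rte_mbuf.h>

static __rte_always_inline uint16_t
xmit_common(void *txq, struct rte_mbuf **pkts, uint16_t nb, bool offload)
{
	uint16_t i;

	for (i = 0; i < nb; i++) {
		/* ... write the data descriptor for pkts[i] ... */
		if (offload) {
			/* Fill checksum/VLAN descriptor fields from
			 * pkts[i]->ol_flags; this branch is folded away
			 * when 'offload' is a compile-time constant. */
		}
	}
	RTE_SET_USED(txq);
	RTE_SET_USED(pkts);
	return nb;
}

/* Two thin wrappers: the compiler emits duplicated, specialized code. */
uint16_t
xmit_pkts_plain(void *txq, struct rte_mbuf **pkts, uint16_t nb)
{
	return xmit_common(txq, pkts, nb, false);
}

uint16_t
xmit_pkts_offload(void *txq, struct rte_mbuf **pkts, uint16_t nb)
{
	return xmit_common(txq, pkts, nb, true);
}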

Signed-off-by: Wenzhuo Lu <wenzhuo.lu@intel.com>
---
 drivers/net/iavf/iavf_rxtx.c            |  57 +++++++++++------
 drivers/net/iavf/iavf_rxtx.h            |  14 +++-
 drivers/net/iavf/iavf_rxtx_vec_avx512.c | 110 +++++++++++++++++++-------------
 drivers/net/iavf/iavf_rxtx_vec_common.h |  98 ++++++++++++++++++++++++++--
 4 files changed, 210 insertions(+), 69 deletions(-)
  

Patch

diff --git a/drivers/net/iavf/iavf_rxtx.c b/drivers/net/iavf/iavf_rxtx.c
index bd0b7ee..099ede7 100644
--- a/drivers/net/iavf/iavf_rxtx.c
+++ b/drivers/net/iavf/iavf_rxtx.c
@@ -160,7 +160,7 @@ 
 static inline bool
 check_tx_vec_allow(struct iavf_tx_queue *txq)
 {
-	if (!(txq->offloads & IAVF_NO_VECTOR_FLAGS) &&
+	if (!(txq->offloads & IAVF_TX_NO_VECTOR_FLAGS) &&
 	    txq->rs_thresh >= IAVF_VPMD_TX_MAX_BURST &&
 	    txq->rs_thresh <= IAVF_VPMD_TX_MAX_FREE_BUF) {
 		PMD_INIT_LOG(DEBUG, "Vector tx can be enabled on this txq.");
@@ -2498,17 +2498,23 @@ 
 #ifdef RTE_ARCH_X86
 	struct iavf_tx_queue *txq;
 	int i;
+	int check_ret;
+	bool use_sse = false;
 	bool use_avx2 = false;
-#ifdef CC_AVX512_SUPPORT
 	bool use_avx512 = false;
-#endif
 
-	if (!iavf_tx_vec_dev_check(dev) &&
-			rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
-		if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
-		     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
-				rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
-			use_avx2 = true;
+	check_ret = iavf_tx_vec_dev_check(dev);
+
+	if (check_ret >= 0 &&
+	    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_128) {
+		/* SSE and AVX2 do not support the offload path yet. */
+		if (check_ret == IAVF_VECTOR_PATH) {
+			use_sse = true;
+			if ((rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX2) == 1 ||
+			     rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1) &&
+			    rte_vect_get_max_simd_bitwidth() >= RTE_VECT_SIMD_256)
+				use_avx2 = true;
+		}
 #ifdef CC_AVX512_SUPPORT
 		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512F) == 1 &&
 		    rte_cpu_get_flag_enabled(RTE_CPUFLAG_AVX512BW) == 1 &&
@@ -2516,15 +2522,29 @@ 
 			use_avx512 = true;
 #endif
 
-		PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
-			    use_avx2 ? "avx2 " : "",
-			    dev->data->port_id);
-		dev->tx_pkt_burst = use_avx2 ?
-				    iavf_xmit_pkts_vec_avx2 :
-				    iavf_xmit_pkts_vec;
+		if (!use_sse && !use_avx2 && !use_avx512)
+			goto normal;
+
+		if (!use_avx512) {
+			PMD_DRV_LOG(DEBUG, "Using %sVector Tx (port %d).",
+				    use_avx2 ? "avx2 " : "",
+				    dev->data->port_id);
+			dev->tx_pkt_burst = use_avx2 ?
+					    iavf_xmit_pkts_vec_avx2 :
+					    iavf_xmit_pkts_vec;
+		}
 #ifdef CC_AVX512_SUPPORT
-		if (use_avx512)
-			dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+		if (use_avx512) {
+			if (check_ret == IAVF_VECTOR_PATH) {
+				dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512;
+				PMD_DRV_LOG(DEBUG, "Using AVX512 Vector Tx (port %d).",
+					    dev->data->port_id);
+			} else {
+				dev->tx_pkt_burst = iavf_xmit_pkts_vec_avx512_offload;
+				PMD_DRV_LOG(DEBUG, "Using AVX512 OFFLOAD Vector Tx (port %d).",
+					    dev->data->port_id);
+			}
+		}
 #endif
 		dev->tx_pkt_prepare = NULL;
 
@@ -2544,8 +2564,9 @@ 
 
 		return;
 	}
-#endif
 
+normal:
+#endif
 	PMD_DRV_LOG(DEBUG, "Using Basic Tx callback (port=%d).",
 		    dev->data->port_id);
 	dev->tx_pkt_burst = iavf_xmit_pkts;
diff --git a/drivers/net/iavf/iavf_rxtx.h b/drivers/net/iavf/iavf_rxtx.h
index f56dd74..bead119 100644
--- a/drivers/net/iavf/iavf_rxtx.h
+++ b/drivers/net/iavf/iavf_rxtx.h
@@ -23,14 +23,21 @@ 
 #define IAVF_VPMD_DESCS_PER_LOOP  4
 #define IAVF_VPMD_TX_MAX_FREE_BUF 64
 
-#define IAVF_NO_VECTOR_FLAGS (				 \
+#define IAVF_TX_NO_VECTOR_FLAGS (				 \
 		DEV_TX_OFFLOAD_MULTI_SEGS |		 \
+		DEV_TX_OFFLOAD_TCP_TSO)
+
+#define IAVF_TX_VECTOR_OFFLOAD (				 \
 		DEV_TX_OFFLOAD_VLAN_INSERT |		 \
+		DEV_TX_OFFLOAD_QINQ_INSERT |		 \
+		DEV_TX_OFFLOAD_IPV4_CKSUM |		 \
 		DEV_TX_OFFLOAD_SCTP_CKSUM |		 \
 		DEV_TX_OFFLOAD_UDP_CKSUM |		 \
-		DEV_TX_OFFLOAD_TCP_TSO |		 \
 		DEV_TX_OFFLOAD_TCP_CKSUM)
 
+#define IAVF_VECTOR_PATH 0
+#define IAVF_VECTOR_OFFLOAD_PATH 1
+
 #define DEFAULT_TX_RS_THRESH     32
 #define DEFAULT_TX_FREE_THRESH   32
 
@@ -488,6 +495,9 @@  uint16_t iavf_recv_scattered_pkts_vec_avx512_flex_rxd(void *rx_queue,
 						      uint16_t nb_pkts);
 uint16_t iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
 				   uint16_t nb_pkts);
+uint16_t iavf_xmit_pkts_vec_avx512_offload(void *tx_queue,
+					   struct rte_mbuf **tx_pkts,
+					   uint16_t nb_pkts);
 int iavf_txq_vec_setup_avx512(struct iavf_tx_queue *txq);
 
 uint8_t iavf_proto_xtr_type_to_rxdid(uint8_t xtr_type);
diff --git a/drivers/net/iavf/iavf_rxtx_vec_avx512.c b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
index 2927a7c..fbbf4b9 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_avx512.c
+++ b/drivers/net/iavf/iavf_rxtx_vec_avx512.c
@@ -1518,14 +1518,16 @@ 
 		txep[i].mbuf = tx_pkts[i];
 }
 
-static inline void
+static __rte_always_inline void
 iavf_vtx1(volatile struct iavf_tx_desc *txdp,
-	  struct rte_mbuf *pkt, uint64_t flags)
+	  struct rte_mbuf *pkt, uint64_t flags, bool offload)
 {
 	uint64_t high_qw =
 		(IAVF_TX_DESC_DTYPE_DATA |
 		 ((uint64_t)flags  << IAVF_TXD_QW1_CMD_SHIFT) |
 		 ((uint64_t)pkt->data_len << IAVF_TXD_QW1_TX_BUF_SZ_SHIFT));
+	if (offload)
+		iavf_txd_enable_offload(pkt, &high_qw);
 
 	__m128i descriptor = _mm_set_epi64x(high_qw,
 					    pkt->buf_iova + pkt->data_off);
@@ -1534,62 +1536,70 @@ 
 
 #define IAVF_TX_LEN_MASK 0xAA
 #define IAVF_TX_OFF_MASK 0x55
-static inline void
+static __rte_always_inline void
 iavf_vtx(volatile struct iavf_tx_desc *txdp,
-	 struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags)
+	 struct rte_mbuf **pkt, uint16_t nb_pkts,  uint64_t flags,
+	 bool offload)
 {
 	const uint64_t hi_qw_tmpl = (IAVF_TX_DESC_DTYPE_DATA |
 			((uint64_t)flags  << IAVF_TXD_QW1_CMD_SHIFT));
 
 	/* if unaligned on 32-bit boundary, do one to align */
 	if (((uintptr_t)txdp & 0x1F) != 0 && nb_pkts != 0) {
-		iavf_vtx1(txdp, *pkt, flags);
+		iavf_vtx1(txdp, *pkt, flags, offload);
 		nb_pkts--, txdp++, pkt++;
 	}
 
 	/* do 4 at a time while possible, in bursts */
 	for (; nb_pkts > 3; txdp += 4, pkt += 4, nb_pkts -= 4) {
-		__m512i desc4 =
-			_mm512_set_epi64
-				((uint64_t)pkt[3]->data_len,
-				 pkt[3]->buf_iova,
-				 (uint64_t)pkt[2]->data_len,
-				 pkt[2]->buf_iova,
-				 (uint64_t)pkt[1]->data_len,
-				 pkt[1]->buf_iova,
-				 (uint64_t)pkt[0]->data_len,
-				 pkt[0]->buf_iova);
-		__m512i hi_qw_tmpl_4 = _mm512_set1_epi64(hi_qw_tmpl);
-		__m512i data_off_4 =
+		uint64_t hi_qw3 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[3]->data_len <<
+			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
+		if (offload)
+			iavf_txd_enable_offload(pkt[3], &hi_qw3);
+		uint64_t hi_qw2 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[2]->data_len <<
+			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
+		if (offload)
+			iavf_txd_enable_offload(pkt[2], &hi_qw2);
+		uint64_t hi_qw1 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[1]->data_len <<
+			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
+		if (offload)
+			iavf_txd_enable_offload(pkt[1], &hi_qw1);
+		uint64_t hi_qw0 =
+			hi_qw_tmpl |
+			((uint64_t)pkt[0]->data_len <<
+			 IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
+		if (offload)
+			iavf_txd_enable_offload(pkt[0], &hi_qw0);
+
+		__m512i desc0_3 =
 			_mm512_set_epi64
-				(0,
-				 pkt[3]->data_off,
-				 0,
-				 pkt[2]->data_off,
-				 0,
-				 pkt[1]->data_off,
-				 0,
-				 pkt[0]->data_off);
-
-		desc4 = _mm512_mask_slli_epi64(desc4, IAVF_TX_LEN_MASK, desc4,
-					       IAVF_TXD_QW1_TX_BUF_SZ_SHIFT);
-		desc4 = _mm512_mask_or_epi64(desc4, IAVF_TX_LEN_MASK, desc4,
-					     hi_qw_tmpl_4);
-		desc4 = _mm512_mask_add_epi64(desc4, IAVF_TX_OFF_MASK, desc4,
-					      data_off_4);
-		_mm512_storeu_si512((void *)txdp, desc4);
+				(hi_qw3,
+				 pkt[3]->buf_iova + pkt[3]->data_off,
+				 hi_qw2,
+				 pkt[2]->buf_iova + pkt[2]->data_off,
+				 hi_qw1,
+				 pkt[1]->buf_iova + pkt[1]->data_off,
+				 hi_qw0,
+				 pkt[0]->buf_iova + pkt[0]->data_off);
+		_mm512_storeu_si512((void *)txdp, desc0_3);
 	}
 
 	/* do any last ones */
 	while (nb_pkts) {
-		iavf_vtx1(txdp, *pkt, flags);
+		iavf_vtx1(txdp, *pkt, flags, offload);
 		txdp++, pkt++, nb_pkts--;
 	}
 }
 
-static inline uint16_t
+static __rte_always_inline uint16_t
 iavf_xmit_fixed_burst_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
-				 uint16_t nb_pkts)
+				 uint16_t nb_pkts, bool offload)
 {
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
 	volatile struct iavf_tx_desc *txdp;
@@ -1620,11 +1630,11 @@ 
 	if (nb_commit >= n) {
 		tx_backlog_entry_avx512(txep, tx_pkts, n);
 
-		iavf_vtx(txdp, tx_pkts, n - 1, flags);
+		iavf_vtx(txdp, tx_pkts, n - 1, flags, offload);
 		tx_pkts += (n - 1);
 		txdp += (n - 1);
 
-		iavf_vtx1(txdp, *tx_pkts++, rs);
+		iavf_vtx1(txdp, *tx_pkts++, rs, offload);
 
 		nb_commit = (uint16_t)(nb_commit - n);
 
@@ -1639,7 +1649,7 @@ 
 
 	tx_backlog_entry_avx512(txep, tx_pkts, nb_commit);
 
-	iavf_vtx(txdp, tx_pkts, nb_commit, flags);
+	iavf_vtx(txdp, tx_pkts, nb_commit, flags, offload);
 
 	tx_id = (uint16_t)(tx_id + nb_commit);
 	if (tx_id > txq->next_rs) {
@@ -1657,9 +1667,9 @@ 
 	return nb_pkts;
 }
 
-uint16_t
-iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
-			  uint16_t nb_pkts)
+static __rte_always_inline uint16_t
+iavf_xmit_pkts_vec_avx512_cmn(void *tx_queue, struct rte_mbuf **tx_pkts,
+			      uint16_t nb_pkts, bool offload)
 {
 	uint16_t nb_tx = 0;
 	struct iavf_tx_queue *txq = (struct iavf_tx_queue *)tx_queue;
@@ -1669,7 +1679,7 @@ 
 
 		num = (uint16_t)RTE_MIN(nb_pkts, txq->rs_thresh);
 		ret = iavf_xmit_fixed_burst_vec_avx512(tx_queue, &tx_pkts[nb_tx],
-						       num);
+						       num, offload);
 		nb_tx += ret;
 		nb_pkts -= ret;
 		if (ret < num)
@@ -1679,6 +1689,13 @@ 
 	return nb_tx;
 }
 
+uint16_t
+iavf_xmit_pkts_vec_avx512(void *tx_queue, struct rte_mbuf **tx_pkts,
+			  uint16_t nb_pkts)
+{
+	return iavf_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts, false);
+}
+
 static inline void
 iavf_tx_queue_release_mbufs_avx512(struct iavf_tx_queue *txq)
 {
@@ -1709,3 +1726,10 @@ 
 	txq->ops = &avx512_vec_txq_ops;
 	return 0;
 }
+
+uint16_t
+iavf_xmit_pkts_vec_avx512_offload(void *tx_queue, struct rte_mbuf **tx_pkts,
+				  uint16_t nb_pkts)
+{
+	return iavf_xmit_pkts_vec_avx512_cmn(tx_queue, tx_pkts, nb_pkts, true);
+}
diff --git a/drivers/net/iavf/iavf_rxtx_vec_common.h b/drivers/net/iavf/iavf_rxtx_vec_common.h
index 57b4381..8e96cb5 100644
--- a/drivers/net/iavf/iavf_rxtx_vec_common.h
+++ b/drivers/net/iavf/iavf_rxtx_vec_common.h
@@ -240,14 +240,17 @@ 
 	if (!txq)
 		return -1;
 
-	if (txq->offloads & IAVF_NO_VECTOR_FLAGS)
-		return -1;
-
 	if (txq->rs_thresh < IAVF_VPMD_TX_MAX_BURST ||
 	    txq->rs_thresh > IAVF_VPMD_TX_MAX_FREE_BUF)
 		return -1;
 
-	return 0;
+	if (txq->offloads & IAVF_TX_NO_VECTOR_FLAGS)
+		return -1;
+
+	if (txq->offloads & IAVF_TX_VECTOR_OFFLOAD)
+		return IAVF_VECTOR_OFFLOAD_PATH;
+
+	return IAVF_VECTOR_PATH;
 }
 
 static inline int
@@ -270,14 +273,97 @@ 
 {
 	int i;
 	struct iavf_tx_queue *txq;
+	int ret;
+	int result = 0;
 
 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
 		txq = dev->data->tx_queues[i];
-		if (iavf_tx_vec_queue_default(txq))
+		ret = iavf_tx_vec_queue_default(txq);
+
+		if (ret < 0)
 			return -1;
+		if (ret > result)
+			result = ret;
 	}
 
-	return 0;
+	return result;
+}
+
+/******************************************************************************
+ * If the user knows a specific offload is not enabled by the application,
+ * the corresponding macro can be commented out to save cycles in the fast
+ * path. Currently the below 2 features are supported in the Tx path:
+ * 1. checksum offload
+ * 2. VLAN/QINQ insertion
+ ******************************************************************************/
+#define IAVF_TX_CSUM_OFFLOAD
+#define IAVF_TX_VLAN_QINQ_OFFLOAD
+
+static __rte_always_inline void
+iavf_txd_enable_offload(__rte_unused struct rte_mbuf *tx_pkt,
+			uint64_t *txd_hi)
+{
+#if defined(IAVF_TX_CSUM_OFFLOAD) || defined(IAVF_TX_VLAN_QINQ_OFFLOAD)
+	uint64_t ol_flags = tx_pkt->ol_flags;
+#endif
+	uint32_t td_cmd = 0;
+#ifdef IAVF_TX_CSUM_OFFLOAD
+	uint32_t td_offset = 0;
+#endif
+
+#ifdef IAVF_TX_CSUM_OFFLOAD
+	/* Set MACLEN */
+	td_offset |= (tx_pkt->l2_len >> 1) <<
+		     IAVF_TX_DESC_LENGTH_MACLEN_SHIFT;
+
+	/* Enable L3 checksum offloads */
+	if (ol_flags & PKT_TX_IP_CKSUM) {
+		td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV4_CSUM;
+		td_offset |= (tx_pkt->l3_len >> 2) <<
+			     IAVF_TX_DESC_LENGTH_IPLEN_SHIFT;
+	} else if (ol_flags & PKT_TX_IPV4) {
+		td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV4;
+		td_offset |= (tx_pkt->l3_len >> 2) <<
+			     IAVF_TX_DESC_LENGTH_IPLEN_SHIFT;
+	} else if (ol_flags & PKT_TX_IPV6) {
+		td_cmd |= IAVF_TX_DESC_CMD_IIPT_IPV6;
+		td_offset |= (tx_pkt->l3_len >> 2) <<
+			     IAVF_TX_DESC_LENGTH_IPLEN_SHIFT;
+	}
+
+	/* Enable L4 checksum offloads */
+	switch (ol_flags & PKT_TX_L4_MASK) {
+	case PKT_TX_TCP_CKSUM:
+		td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_TCP;
+		td_offset |= (sizeof(struct rte_tcp_hdr) >> 2) <<
+			     IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
+		break;
+	case PKT_TX_SCTP_CKSUM:
+		td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_SCTP;
+		td_offset |= (sizeof(struct rte_sctp_hdr) >> 2) <<
+			     IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
+		break;
+	case PKT_TX_UDP_CKSUM:
+		td_cmd |= IAVF_TX_DESC_CMD_L4T_EOFT_UDP;
+		td_offset |= (sizeof(struct rte_udp_hdr) >> 2) <<
+			     IAVF_TX_DESC_LENGTH_L4_FC_LEN_SHIFT;
+		break;
+	default:
+		break;
+	}
+
+	*txd_hi |= ((uint64_t)td_offset) << IAVF_TXD_QW1_OFFSET_SHIFT;
+#endif
+
+#ifdef IAVF_TX_VLAN_QINQ_OFFLOAD
+	if (ol_flags & (PKT_TX_VLAN | PKT_TX_QINQ)) {
+		td_cmd |= IAVF_TX_DESC_CMD_IL2TAG1;
+		*txd_hi |= ((uint64_t)tx_pkt->vlan_tci <<
+			    IAVF_TXD_QW1_L2TAG1_SHIFT);
+	}
+#endif
+
+	*txd_hi |= ((uint64_t)td_cmd) << IAVF_TXD_QW1_CMD_SHIFT;
 }
 
 #ifdef RTE_ARCH_X86