[dpdk-dev] [RFC 2/2] net/tap: implement TAP TSO

Ophir Munk ophirmu at mellanox.com
Fri Mar 9 22:10:26 CET 2018


This commit implements TCP segmentation offload (TSO) in the TAP PMD.
The DPDK rte_gso library is used to segment large TCP payloads
(e.g. 64K bytes) into smaller MTU-sized buffers.
By supporting TSO in software, a TAP device can be used as a fail-safe
sub-device and be paired with another PCI device which supports TSO in
hardware.
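
Illustration (not part of the patch): a minimal sketch of filling the
GSO context which rte_gso_segment() consumes. The function name
'example_gso_ctx_init' and the pre-created mempool 'pool' are
hypothetical - pool creation itself is still a TODO in the patch below.

#include <rte_ethdev.h>
#include <rte_ether.h>
#include <rte_gso.h>
#include <rte_mempool.h>

/* Sketch only: 'pool' is an assumed, already-created mempool;
 * a small data room is enough for the copied headers.
 */
static void
example_gso_ctx_init(struct rte_gso_ctx *ctx, struct rte_mempool *pool)
{
	ctx->direct_pool = pool;    /* direct mbufs hold copied headers */
	ctx->indirect_pool = pool;  /* indirect mbufs point at payload */
	ctx->gso_types = DEV_TX_OFFLOAD_TCP_TSO;
	/* max length of each output segment, CRC excluded */
	ctx->gso_size = ETHER_MAX_LEN - ETHER_CRC_LEN;
	ctx->flag = 0;
}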

For more details on the DPDK librte_gso implementation, please refer
to the DPDK documentation.
The number of newly generated TSO segments is limited to 64.
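
For illustration, a sketch of the expected segmentation call; the
wrapper 'example_tso_segment' and its error handling are hypothetical,
only the rte_gso calls are real API:

#include <rte_common.h>
#include <rte_gso.h>
#include <rte_mbuf.h>

#define MAX_GSO_MBUFS 64	/* cap on newly generated segments */

/* Sketch only: split one oversized TCP mbuf (PKT_TX_TCP_SEG set,
 * valid l2/l3/l4 lengths) into at most MAX_GSO_MBUFS segments.
 */
static int
example_tso_segment(struct rte_mbuf *mbuf_in, struct rte_gso_ctx *gso_ctx)
{
	struct rte_mbuf *gso_mbufs[MAX_GSO_MBUFS];
	int ret;

	ret = rte_gso_segment(mbuf_in, gso_ctx, gso_mbufs,
			      RTE_DIM(gso_mbufs));
	if (ret < 0)
		return ret; /* -EINVAL or -ENOMEM */
	/* transmit the 'ret' new segments here, then free them */
	return ret;
}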

Signed-off-by: Ophir Munk <ophirmu at mellanox.com>
---
 drivers/net/tap/Makefile      |   2 +-
 drivers/net/tap/rte_eth_tap.c | 157 ++++++++++++++++++++++++++++++++----------
 drivers/net/tap/rte_eth_tap.h |   4 ++
 3 files changed, 126 insertions(+), 37 deletions(-)

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index ccc5c5f..3243365 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -24,7 +24,7 @@ CFLAGS += -I.
 CFLAGS += $(WERROR_FLAGS)
 LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
 LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash
-LDLIBS += -lrte_bus_vdev
+LDLIBS += -lrte_bus_vdev -lrte_gso
 
 CFLAGS += -DTAP_MAX_QUEUES=$(TAP_MAX_QUEUES)
 
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index f312084..4dda100 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -473,40 +473,37 @@ tap_tx_offload(char *packet, uint64_t ol_flags, unsigned int l2_len,
 	}
 }
 
-/* Callback to handle sending packets from the tap interface
- */
-static uint16_t
-pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+static void
+tap_mbuf_pool_create(struct rte_mempool **mp)
 {
-	struct tx_queue *txq = queue;
-	uint16_t num_tx = 0;
-	unsigned long num_tx_bytes = 0;
-	uint32_t max_size;
-	int i;
+	*mp = NULL; /* TODO - create mp */
+}
 
-	if (unlikely(nb_pkts == 0))
-		return 0;
+static inline void
+tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
+			struct rte_mbuf **pmbufs,
+			uint16_t *num_packets, unsigned long *num_tx_bytes)
+{
+	int i;
 
-	max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
-	for (i = 0; i < nb_pkts; i++) {
-		struct rte_mbuf *mbuf = bufs[num_tx];
-		struct iovec iovecs[mbuf->nb_segs + 1];
+	for (i = 0; i < num_mbufs; i++) {
+		struct rte_mbuf *mbuf = pmbufs[i];
+		struct iovec iovecs[mbuf->nb_segs + 2];
 		struct tun_pi pi = { .flags = 0 };
 		struct rte_mbuf *seg = mbuf;
 		char m_copy[mbuf->data_len];
 		int n;
 		int j;
-		int k; /* first index in iovecs for copying segments */
+		int k; /* current index in iovecs for copying segments */
 		uint16_t l234_len; /* length of layers 2,3,4 headers */
 		uint16_t seg_len; /* length of first segment */
+		uint16_t nb_segs;
 
-		/* stats.errs will be incremented */
-		if (rte_pktmbuf_pkt_len(mbuf) > max_size)
-			break;
-
-		iovecs[0].iov_base = &pi;
-		iovecs[0].iov_len = sizeof(pi);
-		k = 1;
+		k = 0;
+		iovecs[k].iov_base = &pi;
+		iovecs[k].iov_len = sizeof(pi);
+		k++;
+		nb_segs = mbuf->nb_segs;
 		if (txq->csum &&
 		    ((mbuf->ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_IPV4) ||
 		     (mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM ||
@@ -523,39 +520,99 @@ pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 			/* To change checksums, work on a
 			 * copy of l2, l3 l4 headers.
 			 */
-			rte_memcpy(m_copy, rte_pktmbuf_mtod(mbuf, void *),
-					l234_len);
+			rte_memcpy(m_copy,
+				rte_pktmbuf_mtod(mbuf, void *), l234_len);
 			tap_tx_offload(m_copy, mbuf->ol_flags,
 				       mbuf->l2_len, mbuf->l3_len);
-			iovecs[1].iov_base = m_copy;
-			iovecs[1].iov_len = l234_len;
+			iovecs[k].iov_base = m_copy;
+			iovecs[k].iov_len = l234_len;
 			k++;
+
 			/* Adjust data pointer beyond l2, l3, l4 headers.
 			 * If this segment becomes empty - skip it
 			 */
 			if (seg_len > l234_len) {
-				rte_pktmbuf_adj(mbuf, l234_len);
-			} else {
-				seg = seg->next;
-				mbuf->nb_segs--;
+				iovecs[k].iov_len = seg_len - l234_len;
+				iovecs[k].iov_base =
+					rte_pktmbuf_mtod(seg, char *) +
+						l234_len;
+				k++;
+				/* the payload remainder takes an extra iovec */
+				nb_segs++;
 			}
+
+			seg = seg->next;
 		}
-		for (j = k; j <= mbuf->nb_segs; j++) {
+		for (j = k; j <= nb_segs; j++) {
 			iovecs[j].iov_len = rte_pktmbuf_data_len(seg);
 			iovecs[j].iov_base = rte_pktmbuf_mtod(seg, void *);
 			seg = seg->next;
 		}
 		/* copy the tx frame data */
-		n = writev(txq->fd, iovecs, mbuf->nb_segs + 1);
+		n = writev(txq->fd, iovecs, j);
 		if (n <= 0)
 			break;
+		(*num_packets)++;
+		(*num_tx_bytes) += rte_pktmbuf_pkt_len(mbuf);
+	}
+}
 
+/* Callback to handle sending packets from the tap interface
+ */
+static uint16_t
+pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+	struct tx_queue *txq = queue;
+	uint16_t num_tx = 0;
+	uint16_t num_packets = 0;
+	unsigned long num_tx_bytes = 0;
+	uint32_t max_size;
+	int i;
+	uint64_t tso;
+	int ret;
+
+	if (unlikely(nb_pkts == 0))
+		return 0;
+
+	struct rte_mbuf *gso_mbufs[MAX_GSO_MBUFS];
+	max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
+	for (i = 0; i < nb_pkts; i++) {
+		struct rte_mbuf *mbuf_in = bufs[num_tx];
+		struct rte_mbuf **mbuf;
+		uint16_t num_mbufs;
+
+		tso = mbuf_in->ol_flags & PKT_TX_TCP_SEG;
+		if (tso) {
+			struct rte_gso_ctx *gso_ctx = &txq->gso_ctx;
+			/* gso size is calculated without ETHER_CRC_LEN */
+			gso_ctx->gso_size = *txq->mtu + ETHER_HDR_LEN;
+			ret = rte_gso_segment(mbuf_in, /* packet to segment */
+				gso_ctx, /* gso control block */
+				(struct rte_mbuf **)&gso_mbufs, /* out mbufs */
+				RTE_DIM(gso_mbufs)); /* max tso mbufs */
+
+			/* ret contains the number of newly created mbufs */
+			if (ret < 0)
+				break;
+
+			mbuf = gso_mbufs;
+			num_mbufs = ret;
+		} else {
+			/* stats.errs will be incremented */
+			if (rte_pktmbuf_pkt_len(mbuf_in) > max_size)
+				break;
+
+			mbuf = &mbuf_in;
+			num_mbufs = 1;
+		}
+
+		tap_write_mbufs(txq, num_mbufs, mbuf,
+				&num_packets, &num_tx_bytes);
 		num_tx++;
-		num_tx_bytes += mbuf->pkt_len;
-		rte_pktmbuf_free(mbuf);
+		rte_pktmbuf_free(mbuf_in);
 	}
 
-	txq->stats.opackets += num_tx;
+	txq->stats.opackets += num_packets;
 	txq->stats.errs += nb_pkts - num_tx;
 	txq->stats.obytes += num_tx_bytes;
 
@@ -996,11 +1053,35 @@ tap_mac_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
 }
 
 static int
+tap_init_gso_ctx(struct tx_queue *tx)
+{
+	uint32_t gso_types;
+
+	/* Create a private mbuf pool with a 128 byte data room size;
+	 * use this pool for both direct and indirect mbufs
+	 */
+	struct rte_mempool *mp;      /* Mempool for TX/GSO packets */
+	tap_mbuf_pool_create(&mp); /* tx->mp or maybe embedded in gso_ctx */
+
+	/* initialize GSO context */
+	gso_types = DEV_TX_OFFLOAD_TCP_TSO | DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+		DEV_TX_OFFLOAD_GRE_TNL_TSO;
+	tx->gso_ctx.direct_pool = mp;
+	tx->gso_ctx.indirect_pool = mp;
+	tx->gso_ctx.gso_types = gso_types;
+	tx->gso_ctx.gso_size = ETHER_MAX_LEN - ETHER_CRC_LEN;
+	tx->gso_ctx.flag = 0;
+
+	return 0;
+}
+
+static int
 tap_setup_queue(struct rte_eth_dev *dev,
 		struct pmd_internals *internals,
 		uint16_t qid,
 		int is_rx)
 {
+	int ret;
 	int *fd;
 	int *other_fd;
 	const char *dir;
@@ -1048,6 +1129,10 @@ tap_setup_queue(struct rte_eth_dev *dev,
 	tx->mtu = &dev->data->mtu;
 	rx->rxmode = &dev->data->dev_conf.rxmode;
 
+	ret = tap_init_gso_ctx(tx);
+	if (ret)
+		return -1;
+
 	return *fd;
 }
 
diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
index 53a506a..65da5f8 100644
--- a/drivers/net/tap/rte_eth_tap.h
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -15,6 +15,7 @@
 
 #include <rte_ethdev_driver.h>
 #include <rte_ether.h>
+#include <rte_gso.h>
 
 #ifdef IFF_MULTI_QUEUE
 #define RTE_PMD_TAP_MAX_QUEUES	TAP_MAX_QUEUES
@@ -22,6 +23,8 @@
 #define RTE_PMD_TAP_MAX_QUEUES	1
 #endif
 
+#define MAX_GSO_MBUFS 64
+
 struct pkt_stats {
 	uint64_t opackets;              /* Number of output packets */
 	uint64_t ipackets;              /* Number of input packets */
@@ -50,6 +53,7 @@ struct tx_queue {
 	uint16_t *mtu;                  /* Pointer to MTU from dev_data */
 	uint16_t csum:1;                /* Enable checksum offloading */
 	struct pkt_stats stats;         /* Stats for this TX queue */
+	struct rte_gso_ctx gso_ctx;     /* GSO context */
 };
 
 struct pmd_internals {
-- 
2.7.4


