[dpdk-dev,v1,2/2] net/tap: support TSO (TCP Segmentation Offload)

Message ID 1523313192-18048-3-git-send-email-ophirmu@mellanox.com (mailing list archive)
State Superseded, archived
Delegated to: Ferruh Yigit

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Ophir Munk April 9, 2018, 10:33 p.m. UTC
  This commit implements TCP segmentation offload (TSO) in the TAP PMD.
The librte_gso library is used to segment large TCP payloads (e.g. 64KB
packets) into smaller, MTU-sized buffers.
By supporting TSO in software, a TAP device can be used as a fail-safe
sub-device and paired with another PCI device that supports TSO in
hardware.

For more details on the librte_gso implementation, please refer to the
DPDK documentation.
The number of newly generated TCP TSO segments is limited to 64.
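
As background, here is a minimal, hypothetical sketch (not part of this
patch) of what an application sets on an mbuf to request TSO from a PMD;
the helper name request_tso() and the header-length assumptions (untagged
Ethernet/IPv4/TCP) are illustrative only:

#include <rte_ether.h>
#include <rte_ip.h>
#include <rte_mbuf.h>
#include <rte_tcp.h>

/* Hypothetical helper: mark a large TCP mbuf for segmentation by the
 * PMD. Assumes untagged Ethernet/IPv4/TCP headers.
 */
static void
request_tso(struct rte_mbuf *m, uint16_t mss)
{
	m->l2_len = sizeof(struct ether_hdr); /* 14 bytes */
	m->l3_len = sizeof(struct ipv4_hdr);  /* 20 bytes */
	m->l4_len = sizeof(struct tcp_hdr);   /* 20 bytes */
	m->tso_segsz = mss; /* TCP payload bytes per output segment */
	m->ol_flags |= PKT_TX_TCP_SEG | PKT_TX_IPV4 | PKT_TX_IP_CKSUM;
}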

Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
---
 drivers/net/tap/Makefile      |   2 +-
 drivers/net/tap/rte_eth_tap.c | 153 +++++++++++++++++++++++++++++++++++-------
 drivers/net/tap/rte_eth_tap.h |   4 ++
 mk/rte.app.mk                 |   4 +-
 4 files changed, 135 insertions(+), 28 deletions(-)
  

Comments

Ophir Munk April 22, 2018, 11:30 a.m. UTC | #1
v1:
- Initial release
v2:
- Fix checksum errors
- TCP segment size refers to the TCP payload size (not including the
  l2,l3,l4 headers); a short worked example follows below
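
For example, with a 1500-byte MTU and standard 20-byte IPv4 and TCP
headers, tso_segsz is at most 1500 - 20 - 20 = 1460 bytes of TCP payload
per output segment; each generated segment then carries its own copy of
the l2,l3,l4 headers in front of that payload.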

This patch implements TAP TSO (TCP segmentation offload) in SW,
using the DPDK library librte_gso.
The librte_gso library segments large TCP payloads (e.g. 64K bytes)
into smaller-sized buffers.
By supporting TSO in software, a TAP device can be used as a fail-safe
sub-device and paired with another PCI device that supports TSO in
hardware.

This patchset includes 2 commits:
1. Calculation of IP/TCP/UDP checksums for multi-segment packets.
Previously, checksum offload was skipped if the number of packet segments
was greater than 1. This commit removes that limitation. It is required
before supporting TAP TSO, since a TSO-generated packet may be composed
of two segments, where the first segment includes the l2,l3,l4 headers.
2. TAP TSO implementation: calling rte_gso_segment() to segment large TCP
packets (a sketch of this flow follows the list below).
This commit creates a small private mbuf pool in the TAP PMD, as required
by librte_gso. The pool holds 64 buffers, each 128 bytes long.
The TSO segment size refers to the TCP payload size (not including the
l2,l3,l4 headers).
librte_gso supports TCP segmentation over IPv4.
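
For illustration, a minimal sketch (not part of this patch) of the
librte_gso flow described above: a small private mempool feeds
rte_gso_segment(), which splits one large TCP mbuf into MTU-sized output
mbufs. The pool sizing (64 mbufs of 128 bytes plus headroom) mirrors this
patch; names such as tso_segment_example and "gso_pool_0" are
illustrative only:

#include <rte_ethdev.h>
#include <rte_gso.h>
#include <rte_mbuf.h>

/* Assumes big_pkt already has PKT_TX_TCP_SEG, tso_segsz and the
 * l2/l3/l4 header lengths set by the application.
 */
static int
tso_segment_example(struct rte_mbuf *big_pkt,
		    struct rte_mbuf **out, uint16_t nb_out)
{
	static struct rte_mempool *mp;
	struct rte_gso_ctx ctx;

	if (mp == NULL) {
		mp = rte_pktmbuf_pool_create("gso_pool_0", 64, 0, 0,
				RTE_PKTMBUF_HEADROOM + 128, SOCKET_ID_ANY);
		if (mp == NULL)
			return -1;
	}
	ctx.direct_pool = mp;   /* buffers for the copied l2/l3/l4 headers */
	ctx.indirect_pool = mp; /* indirect mbufs referencing the payload */
	ctx.gso_types = DEV_TX_OFFLOAD_TCP_TSO;
	ctx.gso_size = big_pkt->tso_segsz; /* TCP payload bytes per segment */
	ctx.flag = 0;

	/* Returns the number of mbufs written to 'out', or a negative
	 * value on error.
	 */
	return rte_gso_segment(big_pkt, &ctx, out, nb_out);
}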

Ophir Munk (2):
  net/tap: calculate checksums of multi-segment packets
  net/tap: support TSO (TCP Segmentation Offload)

 drivers/net/tap/Makefile      |   2 +-
 drivers/net/tap/rte_eth_tap.c | 305 ++++++++++++++++++++++++++++++++----------
 drivers/net/tap/rte_eth_tap.h |   4 +
 mk/rte.app.mk                 |   4 +-
 4 files changed, 239 insertions(+), 76 deletions(-)
  

Patch

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index ccc5c5f..3243365 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -24,7 +24,7 @@  CFLAGS += -I.
 CFLAGS += $(WERROR_FLAGS)
 LDLIBS += -lrte_eal -lrte_mbuf -lrte_mempool -lrte_ring
 LDLIBS += -lrte_ethdev -lrte_net -lrte_kvargs -lrte_hash
-LDLIBS += -lrte_bus_vdev
+LDLIBS += -lrte_bus_vdev -lrte_gso
 
 CFLAGS += -DTAP_MAX_QUEUES=$(TAP_MAX_QUEUES)
 
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index df23c4d..717a2b1 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -17,6 +17,7 @@ 
 #include <rte_ip.h>
 #include <rte_string_fns.h>
 
+#include <assert.h>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <sys/socket.h>
@@ -408,7 +409,8 @@  tap_tx_offload_get_port_capa(void)
 	return DEV_TX_OFFLOAD_MULTI_SEGS |
 	       DEV_TX_OFFLOAD_IPV4_CKSUM |
 	       DEV_TX_OFFLOAD_UDP_CKSUM |
-	       DEV_TX_OFFLOAD_TCP_CKSUM;
+	       DEV_TX_OFFLOAD_TCP_CKSUM |
+	       DEV_TX_OFFLOAD_TCP_TSO;
 }
 
 static uint64_t
@@ -417,7 +419,8 @@  tap_tx_offload_get_queue_capa(void)
 	return DEV_TX_OFFLOAD_MULTI_SEGS |
 	       DEV_TX_OFFLOAD_IPV4_CKSUM |
 	       DEV_TX_OFFLOAD_UDP_CKSUM |
-	       DEV_TX_OFFLOAD_TCP_CKSUM;
+	       DEV_TX_OFFLOAD_TCP_CKSUM |
+	       DEV_TX_OFFLOAD_TCP_TSO;
 }
 
 static bool
@@ -486,38 +489,26 @@  tap_tx_offload(char *packet, uint64_t ol_flags, unsigned int l2_len,
 	}
 }
 
-/* Callback to handle sending packets from the tap interface
- */
-static uint16_t
-pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+static inline void
+tap_write_mbufs(struct tx_queue *txq, uint16_t num_mbufs,
+			struct rte_mbuf **pmbufs,
+			uint16_t *num_packets, unsigned long *num_tx_bytes)
 {
-	struct tx_queue *txq = queue;
-	uint16_t num_tx = 0;
-	unsigned long num_tx_bytes = 0;
-	uint32_t max_size;
 	int i;
 
-	if (unlikely(nb_pkts == 0))
-		return 0;
-
-	max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
-	for (i = 0; i < nb_pkts; i++) {
-		struct rte_mbuf *mbuf = bufs[num_tx];
-		struct iovec iovecs[mbuf->nb_segs + 1];
+	for (i = 0; i < num_mbufs; i++) {
+		struct rte_mbuf *mbuf = pmbufs[i];
+		struct iovec iovecs[mbuf->nb_segs + 2];
 		struct tun_pi pi = { .flags = 0, .proto = 0x00 };
 		struct rte_mbuf *seg = mbuf;
 		char m_copy[mbuf->data_len];
 		int n;
 		int j;
-		int k; /* first index in iovecs for copying segments */
+		int k; /* current index in iovecs for copying segments */
 		uint16_t l234_len; /* length of layers 2,3,4 headers */
 		uint16_t seg_len; /* length of first segment */
 		uint16_t nb_segs;
 
-		/* stats.errs will be incremented */
-		if (rte_pktmbuf_pkt_len(mbuf) > max_size)
-			break;
-
 		/*
 		 * TUN and TAP are created with IFF_NO_PI disabled.
 		 * For TUN PMD this mandatory as fields are used by
@@ -581,13 +572,75 @@  pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 		n = writev(txq->fd, iovecs, j);
 		if (n <= 0)
 			break;
+		(*num_packets)++;
+		(*num_tx_bytes) += rte_pktmbuf_pkt_len(mbuf);
+	}
+}
+
+/* Callback to handle sending packets from the tap interface
+ */
+static uint16_t
+pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+	struct tx_queue *txq = queue;
+	uint16_t num_tx = 0;
+	uint16_t num_packets = 0;
+	unsigned long num_tx_bytes = 0;
+	uint16_t tso_segsz = 0;
+	uint32_t max_size;
+	int i;
+	uint64_t tso;
+	int ret;
+
+	if (unlikely(nb_pkts == 0))
+		return 0;
+
+	struct rte_mbuf *gso_mbufs[MAX_GSO_MBUFS];
+	max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
+	for (i = 0; i < nb_pkts; i++) {
+		struct rte_mbuf *mbuf_in = bufs[num_tx];
+		struct rte_mbuf **mbuf;
+		uint16_t num_mbufs;
+
+		tso = mbuf_in->ol_flags & PKT_TX_TCP_SEG;
+		if (tso) {
+			struct rte_gso_ctx *gso_ctx = &txq->gso_ctx;
+			assert(gso_ctx != NULL);
+			/* gso size is calculated without ETHER_CRC_LEN */
+			tso_segsz = mbuf_in->tso_segsz;
+			if (unlikely(tso_segsz == 0) ||
+				tso_segsz > max_size) {
+				txq->stats.errs++;
+				break;
+			}
+			gso_ctx->gso_size = tso_segsz;
+			ret = rte_gso_segment(mbuf_in, /* packet to segment */
+				gso_ctx, /* gso control block */
+				(struct rte_mbuf **)&gso_mbufs, /* out mbufs */
+				RTE_DIM(gso_mbufs)); /* max tso mbufs */
+
+			/* ret contains the number of newly created mbufs */
+			if (ret < 0)
+				break;
 
+			mbuf = gso_mbufs;
+			num_mbufs = ret;
+		} else {
+			/* stats.errs will be incremented */
+			if (rte_pktmbuf_pkt_len(mbuf_in) > max_size)
+				break;
+
+			mbuf = &mbuf_in;
+			num_mbufs = 1;
+		}
+
+		tap_write_mbufs(txq, num_mbufs, mbuf,
+				&num_packets, &num_tx_bytes);
 		num_tx++;
-		num_tx_bytes += mbuf->pkt_len;
-		rte_pktmbuf_free(mbuf);
+		rte_pktmbuf_free(mbuf_in);
 	}
 
-	txq->stats.opackets += num_tx;
+	txq->stats.opackets += num_packets;
 	txq->stats.errs += nb_pkts - num_tx;
 	txq->stats.obytes += num_tx_bytes;
 
@@ -1027,32 +1080,77 @@  tap_mac_set(struct rte_eth_dev *dev, struct ether_addr *mac_addr)
 	}
 }
 
+#define TAP_GSO_MBUFS_NUM 64
+#define TAP_GSO_MBUF_SEG_SIZE 128
+
+static int
+tap_gso_ctx_setup(struct rte_gso_ctx *gso_ctx, struct rte_eth_dev *dev)
+{
+	uint32_t gso_types;
+	char pool_name[64];
+
+	/* Create private mbuf pool with 128 bytes size per mbuf
+	 * use this pool for both direct and indirect mbufs
+	 */
+
+	struct rte_mempool *mp;      /* Mempool for GSO packets */
+	/* initialize GSO context */
+	gso_types = DEV_TX_OFFLOAD_TCP_TSO | DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+		DEV_TX_OFFLOAD_GRE_TNL_TSO;
+	snprintf(pool_name, sizeof(pool_name), "mp_%s", dev->device->name);
+	mp = rte_mempool_lookup((const char *)pool_name);
+	if (!mp) {
+		mp = rte_pktmbuf_pool_create(pool_name, TAP_GSO_MBUFS_NUM,
+			0, 0, RTE_PKTMBUF_HEADROOM + TAP_GSO_MBUF_SEG_SIZE,
+			SOCKET_ID_ANY);
+		if (!mp) {
+			struct pmd_internals *pmd = dev->data->dev_private;
+			RTE_LOG(DEBUG, PMD, "%s: failed to create mbuf pool for device %s\n",
+				pmd->name, dev->device->name);
+			return -1;
+		}
+	}
+
+	gso_ctx->direct_pool = mp;
+	gso_ctx->indirect_pool = mp;
+	gso_ctx->gso_types = gso_types;
+	gso_ctx->gso_size = 0; /* gso_size is set in tx_burst() per packet */
+	gso_ctx->flag = 0;
+
+	return 0;
+}
+
 static int
 tap_setup_queue(struct rte_eth_dev *dev,
 		struct pmd_internals *internals,
 		uint16_t qid,
 		int is_rx)
 {
+	int ret;
 	int *fd;
 	int *other_fd;
 	const char *dir;
 	struct pmd_internals *pmd = dev->data->dev_private;
 	struct rx_queue *rx = &internals->rxq[qid];
 	struct tx_queue *tx = &internals->txq[qid];
+	struct rte_gso_ctx *gso_ctx;
 
 	if (is_rx) {
 		fd = &rx->fd;
 		other_fd = &tx->fd;
 		dir = "rx";
+		gso_ctx = NULL;
 	} else {
 		fd = &tx->fd;
 		other_fd = &rx->fd;
 		dir = "tx";
+		gso_ctx = &tx->gso_ctx;
 	}
 	if (*fd != -1) {
 		/* fd for this queue already exists */
 		RTE_LOG(DEBUG, PMD, "%s: fd %d for %s queue qid %d exists\n",
 			pmd->name, *fd, dir, qid);
+		gso_ctx = NULL;
 	} else if (*other_fd != -1) {
 		/* Only other_fd exists. dup it */
 		*fd = dup(*other_fd);
@@ -1079,6 +1177,11 @@  tap_setup_queue(struct rte_eth_dev *dev,
 
 	tx->mtu = &dev->data->mtu;
 	rx->rxmode = &dev->data->dev_conf.rxmode;
+	if (gso_ctx) {
+		ret = tap_gso_ctx_setup(gso_ctx, dev);
+		if (ret)
+			return -1;
+	}
 
 	return *fd;
 }
diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
index 53a506a..65da5f8 100644
--- a/drivers/net/tap/rte_eth_tap.h
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -15,6 +15,7 @@ 
 
 #include <rte_ethdev_driver.h>
 #include <rte_ether.h>
+#include <rte_gso.h>
 
 #ifdef IFF_MULTI_QUEUE
 #define RTE_PMD_TAP_MAX_QUEUES	TAP_MAX_QUEUES
@@ -22,6 +23,8 @@ 
 #define RTE_PMD_TAP_MAX_QUEUES	1
 #endif
 
+#define MAX_GSO_MBUFS 64
+
 struct pkt_stats {
 	uint64_t opackets;              /* Number of output packets */
 	uint64_t ipackets;              /* Number of input packets */
@@ -50,6 +53,7 @@  struct tx_queue {
 	uint16_t *mtu;                  /* Pointer to MTU from dev_data */
 	uint16_t csum:1;                /* Enable checksum offloading */
 	struct pkt_stats stats;         /* Stats for this TX queue */
+	struct rte_gso_ctx gso_ctx;     /* GSO context */
 };
 
 struct pmd_internals {
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index 005803a..62cf545 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -66,8 +66,6 @@  _LDLIBS-$(CONFIG_RTE_LIBRTE_PORT)           += -lrte_port
 _LDLIBS-$(CONFIG_RTE_LIBRTE_PDUMP)          += -lrte_pdump
 _LDLIBS-$(CONFIG_RTE_LIBRTE_DISTRIBUTOR)    += -lrte_distributor
 _LDLIBS-$(CONFIG_RTE_LIBRTE_IP_FRAG)        += -lrte_ip_frag
-_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO)            += -lrte_gro
-_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO)            += -lrte_gso
 _LDLIBS-$(CONFIG_RTE_LIBRTE_METER)          += -lrte_meter
 _LDLIBS-$(CONFIG_RTE_LIBRTE_LPM)            += -lrte_lpm
 # librte_acl needs --whole-archive because of weak functions
@@ -86,6 +84,8 @@  _LDLIBS-$(CONFIG_RTE_LIBRTE_EFD)            += -lrte_efd
 _LDLIBS-y += --whole-archive
 
 _LDLIBS-$(CONFIG_RTE_LIBRTE_CFGFILE)        += -lrte_cfgfile
+_LDLIBS-$(CONFIG_RTE_LIBRTE_GRO)            += -lrte_gro
+_LDLIBS-$(CONFIG_RTE_LIBRTE_GSO)            += -lrte_gso
 _LDLIBS-$(CONFIG_RTE_LIBRTE_HASH)           += -lrte_hash
 _LDLIBS-$(CONFIG_RTE_LIBRTE_MEMBER)         += -lrte_member
 _LDLIBS-$(CONFIG_RTE_LIBRTE_VHOST)          += -lrte_vhost