[dpdk-dev,v2,1/2] net/tap: calculate checksums of multi segs packets

Message ID 1524396611-22391-2-git-send-email-ophirmu@mellanox.com (mailing list archive)
State Superseded, archived
Delegated to: Ferruh Yigit
Checks

Context               Check    Description
ci/checkpatch         success  coding style OK
ci/Intel-compilation  fail     apply patch file failure

Commit Message

Ophir Munk April 22, 2018, 11:30 a.m. UTC
  Prior to this commit, IP/UDP/TCP checksum offload calculations
were skipped for multi-segment packets.
This commit enables TAP checksum calculations for multi-segment
packets.
The only restriction is that the first segment must contain
the layer 3 (IP) and layer 4 (UDP or TCP) headers.
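
For illustration, a minimal caller-side sketch of what this restriction
means, assuming an IPv4/TCP mbuf (field values are examples only, not part
of this patch):

	/* The l2+l3+l4 headers must all reside in the first mbuf segment. */
	mbuf->l2_len = sizeof(struct ether_hdr);
	mbuf->l3_len = sizeof(struct ipv4_hdr);
	mbuf->l4_len = sizeof(struct tcp_hdr);
	mbuf->ol_flags |= PKT_TX_IPV4 | PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM;
	/* i.e. rte_pktmbuf_data_len(mbuf) >= l2_len + l3_len + l4_len */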

Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
---
 drivers/net/tap/rte_eth_tap.c | 154 ++++++++++++++++++++++++++++--------------
 1 file changed, 104 insertions(+), 50 deletions(-)
  

Comments

Ophir Munk May 7, 2018, 9:54 p.m. UTC | #1
v1:
- Initial release
v2:
- Fixed cksum errors
- TCP segment size refers to the TCP payload size (not including l2, l3, l4 headers)
v3:
- Fixed a bug in case the input mbuf is segmented
- Addressed review comments by Raslan Darawsha

This patchset implements TAP TSO (TCP Segmentation Offload) in SW.
It uses the DPDK librte_gso library, which segments large TCP payloads
(e.g. 64K bytes) into smaller buffers.
By supporting the TSO offload capability in software, a TAP device can be
used as a fail-safe sub-device and paired with another PCI device which
supports TSO capability in HW.
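
As a rough usage sketch (not the exact driver code; the pool name and
sizes below are illustrative), librte_gso is driven as follows:

	#include <rte_gso.h>

	struct rte_mempool *mp;	/* the private mbuf pool, see commit 2 */
	struct rte_gso_ctx gso_ctx = {
		.direct_pool = mp,
		.indirect_pool = mp,
		.gso_types = DEV_TX_OFFLOAD_TCP_TSO,
		.gso_size = 1518,	/* max output segment size, headers included */
		.flag = 0,
	};
	struct rte_mbuf *gso_mbufs[64];
	int num;

	/* Splits mbuf_in into segments of at most gso_size bytes each;
	 * returns the number of mbufs written to gso_mbufs, or a
	 * negative value on failure.
	 */
	num = rte_gso_segment(mbuf_in, &gso_ctx, gso_mbufs, RTE_DIM(gso_mbufs));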

This patchset includes 2 commits:
1. Calculation of IP/TCP/UDP checksums for multi-segment packets.
Previously, checksum offload was skipped if the number of packet segments
was greater than 1.
This commit removes that limitation. It is required before supporting TAP TSO,
since a TCP packet generated by TSO may be composed of two segments, where the
first segment includes the l2, l3, l4 headers (see the checksum sketch below
the list).
2. TAP TSO implementation: calling rte_gso_segment() to segment large TCP packets.
This commit creates a small private mbuf pool in the TAP PMD, as required by
librte_gso. The pool contains 64 buffers, each 128 bytes long.
The TSO segment size refers to the TCP payload size (not including the l2, l3, l4 headers).
librte_gso supports TCP segmentation over IPv4.
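
A minimal sketch of the split checksum of commit 1, given a struct rte_mbuf
*mbuf holding an IPv4/TCP chain with all headers in the first segment
(variable names are illustrative; note that __rte_raw_cksum() pads an odd
trailing byte, so chaining it per segment is exact only when intermediate
segments have an even length, a limitation shared with the patch itself):

	struct ipv4_hdr *iph =
		rte_pktmbuf_mtod_offset(mbuf, struct ipv4_hdr *, mbuf->l2_len);
	struct tcp_hdr *th = (struct tcp_hdr *)((char *)iph + mbuf->l3_len);
	uint16_t l4_off = mbuf->l2_len + mbuf->l3_len;
	uint16_t phdr_cksum = rte_ipv4_phdr_cksum(iph, 0); /* pseudo-header part */
	uint32_t raw_cksum, cksum;
	struct rte_mbuf *seg;

	th->cksum = 0; /* the checksum field must be zero while summing */
	/* Raw sum over the l4 header and payload, one segment at a time. */
	raw_cksum = __rte_raw_cksum(rte_pktmbuf_mtod_offset(mbuf, char *, l4_off),
				    rte_pktmbuf_data_len(mbuf) - l4_off, 0);
	for (seg = mbuf->next; seg != NULL; seg = seg->next)
		raw_cksum = __rte_raw_cksum(rte_pktmbuf_mtod(seg, void *),
					    rte_pktmbuf_data_len(seg), raw_cksum);
	/* Fold to 16 bits, add the pseudo-header sum, one's complement. */
	cksum = __rte_raw_cksum_reduce(raw_cksum) + phdr_cksum;
	cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff);
	th->cksum = (~cksum) & 0xffff;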
Ophir Munk (2):
  net/tap: calculate checksums of multi segs packets
  net/tap: support TSO (TCP Segment Offload)

 drivers/net/tap/Makefile      |   2 +-
 drivers/net/tap/rte_eth_tap.c | 308 ++++++++++++++++++++++++++++++++----------
 drivers/net/tap/rte_eth_tap.h |   4 +
 mk/rte.app.mk                 |   4 +-
 4 files changed, 243 insertions(+), 75 deletions(-)
  
Ferruh Yigit May 31, 2018, 1:52 p.m. UTC | #2
On 4/22/2018 12:30 PM, Ophir Munk wrote:
> Prior to this commit, IP/UDP/TCP checksum offload calculations
> were skipped for multi-segment packets.
> This commit enables TAP checksum calculations for multi-segment
> packets.
> The only restriction is that the first segment must contain
> the layer 3 (IP) and layer 4 (UDP or TCP) headers.
> 
> Signed-off-by: Ophir Munk <ophirmu@mellanox.com>

Hi Ophir,

Can you please rebase the patch on top of the latest master? It doesn't apply cleanly.

This is a feature from a previous release; please send updates early so that we
can get it into this release early.

Thanks,
ferruh
  
Ferruh Yigit May 31, 2018, 1:54 p.m. UTC | #3
On 5/31/2018 2:52 PM, Ferruh Yigit wrote:
> On 4/22/2018 12:30 PM, Ophir Munk wrote:
>> Prior to this commit, IP/UDP/TCP checksum offload calculations
>> were skipped for multi-segment packets.
>> This commit enables TAP checksum calculations for multi-segment
>> packets.
>> The only restriction is that the first segment must contain
>> the layer 3 (IP) and layer 4 (UDP or TCP) headers.
>>
>> Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
> 
> Hi Ophir,
> 
> Can you please rebase the patch on top of the latest master? It doesn't apply cleanly.
> 
> This is a feature from a previous release; please send updates early so that we
> can get it into this release early.

Oops, I replied to v2 instead of v3. But I tested the latest version, v3, and a v4 is needed.
  

Patch

diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index 66e026f..d77a64f 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -436,12 +436,43 @@  tap_txq_are_offloads_valid(struct rte_eth_dev *dev, uint64_t offloads)
 	return true;
 }
 
+/* Finalize l4 checksum calculation */
 static void
-tap_tx_offload(char *packet, uint64_t ol_flags, unsigned int l2_len,
-	       unsigned int l3_len)
+tap_tx_l4_cksum(uint16_t *l4_cksum, uint16_t l4_phdr_cksum,
+		uint32_t l4_raw_cksum)
 {
-	void *l3_hdr = packet + l2_len;
+	if (l4_cksum) {
+		uint32_t cksum;
+
+		cksum = __rte_raw_cksum_reduce(l4_raw_cksum);
+		cksum += l4_phdr_cksum;
+
+		cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff);
+		cksum = (~cksum) & 0xffff;
+		if (cksum == 0)
+			cksum = 0xffff;
+		*l4_cksum = cksum;
+	}
+}
 
+/* Accumulate L4 raw checksums */
+static void
+tap_tx_l4_add_rcksum(char *l4_data, unsigned int l4_len, uint16_t *l4_cksum,
+			uint32_t *l4_raw_cksum)
+{
+	if (l4_cksum == NULL)
+		return;
+
+	*l4_raw_cksum = __rte_raw_cksum(l4_data, l4_len, *l4_raw_cksum);
+}
+
+/* L3 checksum and L4 pseudo-header checksum offloads */
+static void
+tap_tx_l3_cksum(char *packet, uint64_t ol_flags, unsigned int l2_len,
+		unsigned int l3_len, unsigned int l4_len, uint16_t **l4_cksum,
+		uint16_t *l4_phdr_cksum, uint32_t *l4_raw_cksum)
+{
+	void *l3_hdr = packet + l2_len;
 	if (ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_IPV4)) {
 		struct ipv4_hdr *iph = l3_hdr;
 		uint16_t cksum;
@@ -451,38 +482,21 @@  tap_tx_offload(char *packet, uint64_t ol_flags, unsigned int l2_len,
 		iph->hdr_checksum = (cksum == 0xffff) ? cksum : ~cksum;
 	}
 	if (ol_flags & PKT_TX_L4_MASK) {
-		uint16_t l4_len;
-		uint32_t cksum;
-		uint16_t *l4_cksum;
 		void *l4_hdr;
 
 		l4_hdr = packet + l2_len + l3_len;
 		if ((ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM)
-			l4_cksum = &((struct udp_hdr *)l4_hdr)->dgram_cksum;
+			*l4_cksum = &((struct udp_hdr *)l4_hdr)->dgram_cksum;
 		else if ((ol_flags & PKT_TX_L4_MASK) == PKT_TX_TCP_CKSUM)
-			l4_cksum = &((struct tcp_hdr *)l4_hdr)->cksum;
+			*l4_cksum = &((struct tcp_hdr *)l4_hdr)->cksum;
 		else
 			return;
-		*l4_cksum = 0;
-		if (ol_flags & PKT_TX_IPV4) {
-			struct ipv4_hdr *iph = l3_hdr;
-
-			l4_len = rte_be_to_cpu_16(iph->total_length) - l3_len;
-			cksum = rte_ipv4_phdr_cksum(l3_hdr, 0);
-		} else {
-			struct ipv6_hdr *ip6h = l3_hdr;
-
-			/* payload_len does not include ext headers */
-			l4_len = rte_be_to_cpu_16(ip6h->payload_len) -
-				l3_len + sizeof(struct ipv6_hdr);
-			cksum = rte_ipv6_phdr_cksum(l3_hdr, 0);
-		}
-		cksum += rte_raw_cksum(l4_hdr, l4_len);
-		cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff);
-		cksum = (~cksum) & 0xffff;
-		if (cksum == 0)
-			cksum = 0xffff;
-		*l4_cksum = cksum;
+		**l4_cksum = 0;
+		if (ol_flags & PKT_TX_IPV4)
+			*l4_phdr_cksum = rte_ipv4_phdr_cksum(l3_hdr, 0);
+		else
+			*l4_phdr_cksum = rte_ipv6_phdr_cksum(l3_hdr, 0);
+		*l4_raw_cksum = __rte_raw_cksum(l4_hdr, l4_len, 0);
 	}
 }
 
@@ -503,17 +517,25 @@  pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	max_size = *txq->mtu + (ETHER_HDR_LEN + ETHER_CRC_LEN + 4);
 	for (i = 0; i < nb_pkts; i++) {
 		struct rte_mbuf *mbuf = bufs[num_tx];
-		struct iovec iovecs[mbuf->nb_segs + 1];
+		struct iovec iovecs[mbuf->nb_segs + 2];
 		struct tun_pi pi = { .flags = 0, .proto = 0x00 };
 		struct rte_mbuf *seg = mbuf;
 		char m_copy[mbuf->data_len];
+		int proto;
 		int n;
 		int j;
+		int k; /* first index in iovecs for copying segments */
+		uint16_t l234_hlen; /* length of layers 2,3,4 headers */
+		uint16_t seg_len; /* length of first segment */
+		uint16_t nb_segs;
+		uint16_t *l4_cksum; /* l4 checksum (pseudo header + payload) */
+		uint32_t l4_raw_cksum = 0; /* TCP/UDP payload raw checksum */
+		uint16_t l4_phdr_cksum = 0; /* TCP/UDP pseudo header checksum */
 
 		/* stats.errs will be incremented */
 		if (rte_pktmbuf_pkt_len(mbuf) > max_size)
 			break;
-
+		l4_cksum = NULL;
 		/*
 		 * TUN and TAP are created with IFF_NO_PI disabled.
 		 * For TUN PMD this mandatory as fields are used by
@@ -525,34 +547,66 @@  pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 		 * value 0x00 is taken for protocol field.
 		 */
 		char *buff_data = rte_pktmbuf_mtod(seg, void *);
-		j = (*buff_data & 0xf0);
-		pi.proto = (j == 0x40) ? 0x0008 :
-				(j == 0x60) ? 0xdd86 : 0x00;
-
-		iovecs[0].iov_base = &pi;
-		iovecs[0].iov_len = sizeof(pi);
-		for (j = 1; j <= mbuf->nb_segs; j++) {
-			iovecs[j].iov_len = rte_pktmbuf_data_len(seg);
-			iovecs[j].iov_base =
-				rte_pktmbuf_mtod(seg, void *);
-			seg = seg->next;
-		}
+		proto = (*buff_data & 0xf0);
+		pi.proto = (proto == 0x40) ? 0x0008 :
+				(proto == 0x60) ? 0xdd86 : 0x00;
+
+		k = 0;
+		iovecs[k].iov_base = &pi;
+		iovecs[k].iov_len = sizeof(pi);
+		k++;
+		nb_segs = mbuf->nb_segs;
 		if (txq->csum &&
 		    ((mbuf->ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_IPV4) ||
 		     (mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM ||
 		     (mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_TCP_CKSUM))) {
-			/* Support only packets with all data in the same seg */
-			if (mbuf->nb_segs > 1)
+			/* Support only packets with at least layer 4
+			 * header included in the first segment
+			 */
+			seg_len = rte_pktmbuf_data_len(mbuf);
+			l234_hlen = mbuf->l2_len + mbuf->l3_len + mbuf->l4_len;
+			if (seg_len < l234_hlen)
 				break;
-			/* To change checksums, work on a copy of data. */
+
+			/* To change checksums, work on a
+			 * copy of the l2, l3, l4 headers.
+			 */
 			rte_memcpy(m_copy, rte_pktmbuf_mtod(mbuf, void *),
-				   rte_pktmbuf_data_len(mbuf));
-			tap_tx_offload(m_copy, mbuf->ol_flags,
-				       mbuf->l2_len, mbuf->l3_len);
-			iovecs[1].iov_base = m_copy;
+					l234_hlen);
+			tap_tx_l3_cksum(m_copy, mbuf->ol_flags,
+				       mbuf->l2_len, mbuf->l3_len, mbuf->l4_len,
+				       &l4_cksum, &l4_phdr_cksum,
+				       &l4_raw_cksum);
+			iovecs[k].iov_base = m_copy;
+			iovecs[k].iov_len = l234_hlen;
+			k++;
+			/* Update next iovecs[] beyond l2, l3, l4 headers */
+			if (seg_len > l234_hlen) {
+				iovecs[k].iov_len = seg_len - l234_hlen;
+				iovecs[k].iov_base =
+					rte_pktmbuf_mtod(seg, char *) +
+						l234_hlen;
+				tap_tx_l4_add_rcksum(iovecs[k].iov_base,
+					iovecs[k].iov_len, l4_cksum,
+					&l4_raw_cksum);
+				k++;
+				nb_segs++;
+			}
+			seg = seg->next;
 		}
+		for (j = k; j <= nb_segs; j++) {
+			iovecs[j].iov_len = rte_pktmbuf_data_len(seg);
+			iovecs[j].iov_base = rte_pktmbuf_mtod(seg, void *);
+			tap_tx_l4_add_rcksum(iovecs[j].iov_base,
+				iovecs[j].iov_len, l4_cksum,
+				&l4_raw_cksum);
+			seg = seg->next;
+		}
+
+		tap_tx_l4_cksum(l4_cksum, l4_phdr_cksum, l4_raw_cksum);
+
 		/* copy the tx frame data */
-		n = writev(txq->fd, iovecs, mbuf->nb_segs + 1);
+		n = writev(txq->fd, iovecs, j);
 		if (n <= 0)
 			break;