[dpdk-dev] net/tap: add Rx/Tx checksum offload support

Message ID 172025985dc6ca989008cc2d93008e38ac3fff39.1495452020.git.pascal.mazon@6wind.com (mailing list archive)
State Accepted, archived
Delegated to: Ferruh Yigit
Headers

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation success Compilation OK

Commit Message

Pascal Mazon May 22, 2017, 11:20 a.m. UTC
  This patch adds basic offloading support, widely expected in a PMD.

Verify IPv4 and UDP/TCP checksums upon packet reception, and set
ol_flags accordingly.

On Tx, set IPv4 and UDP/TCP checksums when required, considering
ol_flags.

Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
---
 doc/guides/nics/features/tap.ini |   2 +
 drivers/net/tap/rte_eth_tap.c    | 130 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 132 insertions(+)
  

Comments

Ferruh Yigit May 22, 2017, 12:24 p.m. UTC | #1
Hi Pascal,

On 5/22/2017 12:20 PM, Pascal Mazon wrote:
> This patch adds basic offloading support, widely expected in a PMD.
> 
> Verify IPv4 and UDP/TCP checksums upon packet reception, and set
> ol_flags accordingly.
> 
> On Tx, set IPv4 and UDP/TCP checksums when required, considering
> ol_flags.

These are not specific to tap and can be used by any virtual PMD, right?

What do you think moving implementation into more generic location (I
don't know where right now) and use from there, and other PMDs also can
benefit from these?

Thanks,
ferruh

> 
> Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>

<...>
  
Pascal Mazon May 23, 2017, 6:51 a.m. UTC | #2
Hi Ferruh,

I'm not sure it would actually be so useful for other vPMDs.
Virtio and vmxnet3 for instance have their own mechanisms for handling
offloading.

Software offload is especially relevant for tap as packets may come from
anywhere (from another process, from a remote netdevice), and the
application would be interested in knowing how good the packet is.

At first, I'd rather keep the code as-is in tap, which is generic enough
to be easily moved later when an actual need arises.

Best regards,

Pascal

On Mon, 22 May 2017 13:24:41 +0100
Ferruh Yigit <ferruh.yigit@intel.com> wrote:

> Hi Pascal,
> 
> On 5/22/2017 12:20 PM, Pascal Mazon wrote:
> > This patch adds basic offloading support, widely expected in a PMD.
> > 
> > Verify IPv4 and UDP/TCP checksums upon packet reception, and set
> > ol_flags accordingly.
> > 
> > On Tx, set IPv4 and UDP/TCP checksums when required, considering
> > ol_flags.
> 
> These are not specific to tap and can be used by any virtual PMD, right?
> 
> What do you think moving implementation into more generic location (I
> don't know where right now) and use from there, and other PMDs also can
> benefit from these?
> 
> Thanks,
> ferruh
> 
> > 
> > Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>
> 
> <...>
>
  
Ferruh Yigit May 26, 2017, 9:23 a.m. UTC | #3
On 5/22/2017 12:20 PM, Pascal Mazon wrote:
> This patch adds basic offloading support, widely expected in a PMD.
> 
> Verify IPv4 and UDP/TCP checksums upon packet reception, and set
> ol_flags accordingly.
> 
> On Tx, set IPv4 and UDP/TCP checksums when required, considering
> ol_flags.
> 
> Signed-off-by: Pascal Mazon <pascal.mazon@6wind.com>

Applied to dpdk-next-net/master, thanks.
  

Patch

diff --git a/doc/guides/nics/features/tap.ini b/doc/guides/nics/features/tap.ini
index 3efae758ccde..7e289e99ef23 100644
--- a/doc/guides/nics/features/tap.ini
+++ b/doc/guides/nics/features/tap.ini
@@ -11,6 +11,8 @@  Promiscuous mode     = Y
 Allmulticast mode    = Y
 Basic stats          = Y
 Flow API             = Y
+L3 checksum offload  = Y
+L4 checksum offload  = Y
 MTU update           = Y
 Multicast MAC filter = Y
 Speed capabilities   = Y
diff --git a/drivers/net/tap/rte_eth_tap.c b/drivers/net/tap/rte_eth_tap.c
index e6261e7a46ab..56208df234d2 100644
--- a/drivers/net/tap/rte_eth_tap.c
+++ b/drivers/net/tap/rte_eth_tap.c
@@ -33,6 +33,7 @@ 
 
 #include <rte_atomic.h>
 #include <rte_branch_prediction.h>
+#include <rte_byteorder.h>
 #include <rte_common.h>
 #include <rte_mbuf.h>
 #include <rte_ethdev.h>
@@ -42,6 +43,7 @@ 
 #include <rte_kvargs.h>
 #include <rte_net.h>
 #include <rte_debug.h>
+#include <rte_ip.h>
 
 #include <sys/types.h>
 #include <sys/stat.h>
@@ -229,6 +231,60 @@  tun_alloc(struct pmd_internals *pmd)
 	return -1;
 }
 
+static void
+tap_verify_csum(struct rte_mbuf *mbuf)
+{
+	uint32_t l2 = mbuf->packet_type & RTE_PTYPE_L2_MASK;
+	uint32_t l3 = mbuf->packet_type & RTE_PTYPE_L3_MASK;
+	uint32_t l4 = mbuf->packet_type & RTE_PTYPE_L4_MASK;
+	unsigned int l2_len = sizeof(struct ether_hdr);
+	unsigned int l3_len;
+	uint16_t cksum = 0;
+	void *l3_hdr;
+	void *l4_hdr;
+
+	if (l2 == RTE_PTYPE_L2_ETHER_VLAN)
+		l2_len += 4;
+	else if (l2 == RTE_PTYPE_L2_ETHER_QINQ)
+		l2_len += 8;
+	/* Don't verify checksum for packets with discontinuous L2 header */
+	if (unlikely(l2_len + sizeof(struct ipv4_hdr) >
+		     rte_pktmbuf_data_len(mbuf)))
+		return;
+	l3_hdr = rte_pktmbuf_mtod_offset(mbuf, void *, l2_len);
+	if (l3 == RTE_PTYPE_L3_IPV4 || l3 == RTE_PTYPE_L3_IPV4_EXT) {
+		struct ipv4_hdr *iph = l3_hdr;
+
+		/* ihl contains the number of 4-byte words in the header */
+		l3_len = 4 * (iph->version_ihl & 0xf);
+		if (unlikely(l2_len + l3_len > rte_pktmbuf_data_len(mbuf)))
+			return;
+
+		cksum = ~rte_raw_cksum(iph, l3_len);
+		mbuf->ol_flags |= cksum ?
+			PKT_RX_IP_CKSUM_BAD :
+			PKT_RX_IP_CKSUM_GOOD;
+	} else if (l3 == RTE_PTYPE_L3_IPV6) {
+		l3_len = sizeof(struct ipv6_hdr);
+	} else {
+		/* IPv6 extensions are not supported */
+		return;
+	}
+	if (l4 == RTE_PTYPE_L4_UDP || l4 == RTE_PTYPE_L4_TCP) {
+		l4_hdr = rte_pktmbuf_mtod_offset(mbuf, void *, l2_len + l3_len);
+		/* Don't verify checksum for multi-segment packets. */
+		if (mbuf->nb_segs > 1)
+			return;
+		if (l3 == RTE_PTYPE_L3_IPV4)
+			cksum = ~rte_ipv4_udptcp_cksum(l3_hdr, l4_hdr);
+		else if (l3 == RTE_PTYPE_L3_IPV6)
+			cksum = ~rte_ipv6_udptcp_cksum(l3_hdr, l4_hdr);
+		mbuf->ol_flags |= cksum ?
+			PKT_RX_L4_CKSUM_BAD :
+			PKT_RX_L4_CKSUM_GOOD;
+	}
+}
+
 /* Callback to handle the rx burst of packets to the correct interface and
  * file descriptor(s) in a multi-queue setup.
  */
@@ -309,6 +365,8 @@  pmd_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 		seg->next = NULL;
 		mbuf->packet_type = rte_net_get_ptype(mbuf, NULL,
 						      RTE_PTYPE_ALL_MASK);
+		if (rxq->rxmode->hw_ip_checksum)
+			tap_verify_csum(mbuf);
 
 		/* account for the receive frame */
 		bufs[num_rx++] = mbuf;
@@ -321,6 +379,57 @@  pmd_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 	return num_rx;
 }
 
+static void
+tap_tx_offload(char *packet, uint64_t ol_flags, unsigned int l2_len,
+	       unsigned int l3_len)
+{
+	void *l3_hdr = packet + l2_len;
+
+	if (ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_IPV4)) {
+		struct ipv4_hdr *iph = l3_hdr;
+		uint16_t cksum;
+
+		iph->hdr_checksum = 0;
+		cksum = rte_raw_cksum(iph, l3_len);
+		iph->hdr_checksum = (cksum == 0xffff) ? cksum : ~cksum;
+	}
+	if (ol_flags & PKT_TX_L4_MASK) {
+		uint16_t l4_len;
+		uint32_t cksum;
+		uint16_t *l4_cksum;
+		void *l4_hdr;
+
+		l4_hdr = packet + l2_len + l3_len;
+		if ((ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM)
+			l4_cksum = &((struct udp_hdr *)l4_hdr)->dgram_cksum;
+		else if ((ol_flags & PKT_TX_L4_MASK) == PKT_TX_TCP_CKSUM)
+			l4_cksum = &((struct tcp_hdr *)l4_hdr)->cksum;
+		else
+			return;
+		*l4_cksum = 0;
+		if (ol_flags & PKT_TX_IPV4) {
+			struct ipv4_hdr *iph = l3_hdr;
+
+			l4_len = rte_be_to_cpu_16(iph->total_length) - l3_len;
+			cksum = rte_ipv4_phdr_cksum(l3_hdr, 0);
+		} else {
+			struct ipv6_hdr *ip6h = l3_hdr;
+
+			/* payload_len does not include ext headers */
+			l4_len = rte_be_to_cpu_16(ip6h->payload_len) -
+				l3_len + sizeof(struct ipv6_hdr);
+			cksum = rte_ipv6_phdr_cksum(l3_hdr, 0);
+		}
+		cksum += rte_raw_cksum(l4_hdr, l4_len);
+		cksum = ((cksum & 0xffff0000) >> 16) + (cksum & 0xffff);
+		cksum = (~cksum) & 0xffff;
+		if (cksum == 0)
+			cksum = 0xffff;
+		*l4_cksum = cksum;
+	}
+	return;
+}
+
 /* Callback to handle sending packets from the tap interface
  */
 static uint16_t
@@ -341,6 +450,7 @@  pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 		struct iovec iovecs[mbuf->nb_segs + 1];
 		struct tun_pi pi = { .flags = 0 };
 		struct rte_mbuf *seg = mbuf;
+		char m_copy[mbuf->data_len];
 		int n;
 		int j;
 
@@ -356,6 +466,19 @@  pmd_tx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 				rte_pktmbuf_mtod(seg, void *);
 			seg = seg->next;
 		}
+		if (mbuf->ol_flags & (PKT_TX_IP_CKSUM | PKT_TX_IPV4) ||
+		    (mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_UDP_CKSUM ||
+		    (mbuf->ol_flags & PKT_TX_L4_MASK) == PKT_TX_TCP_CKSUM) {
+			/* Support only packets with all data in the same seg */
+			if (mbuf->nb_segs > 1)
+				break;
+			/* To change checksums, work on a copy of data. */
+			rte_memcpy(m_copy, rte_pktmbuf_mtod(mbuf, void *),
+				   rte_pktmbuf_data_len(mbuf));
+			tap_tx_offload(m_copy, mbuf->ol_flags,
+				       mbuf->l2_len, mbuf->l3_len);
+			iovecs[1].iov_base = m_copy;
+		}
 		/* copy the tx frame data */
 		n = writev(txq->fd, iovecs, mbuf->nb_segs + 1);
 		if (n <= 0)
@@ -533,6 +656,13 @@  tap_dev_info(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 	dev_info->min_rx_bufsize = 0;
 	dev_info->pci_dev = NULL;
 	dev_info->speed_capa = tap_dev_speed_capa();
+	dev_info->rx_offload_capa = (DEV_RX_OFFLOAD_IPV4_CKSUM |
+				     DEV_RX_OFFLOAD_UDP_CKSUM |
+				     DEV_RX_OFFLOAD_TCP_CKSUM);
+	dev_info->tx_offload_capa =
+		(DEV_TX_OFFLOAD_IPV4_CKSUM |
+		 DEV_TX_OFFLOAD_UDP_CKSUM |
+		 DEV_TX_OFFLOAD_TCP_CKSUM);
 }
 
 static void