[dpdk-dev] [PATCH v12 1/6] ethdev: add Tx preparation

Tomasz Kulasek tomaszx.kulasek at intel.com
Wed Nov 23 18:36:20 CET 2016


Added API for `rte_eth_tx_prepare`

uint16_t rte_eth_tx_prepare(uint8_t port_id, uint16_t queue_id,
	struct rte_mbuf **tx_pkts, uint16_t nb_pkts)

Added fields to the `struct rte_eth_desc_lim`:

	uint16_t nb_seg_max;
		/**< Max number of segments per whole packet. */

	uint16_t nb_mtu_seg_max;
		/**< Max number of segments per one MTU */

Added functions:

int rte_validate_tx_offload(struct rte_mbuf *m)
	to validate general requirements for tx offload set in mbuf of packet
  such a flag completness. In current implementation this function is
  called optionaly when RTE_LIBRTE_ETHDEV_DEBUG is enabled.

int rte_net_intel_cksum_prepare(struct rte_mbuf *m)
	to fix pseudo header checksum for TSO and non-TSO tcp/udp packets
	before hardware tx checksum offload.
	 - for non-TSO tcp/udp packets full pseudo-header checksum is
	   counted and set.
	 - for TSO the IP payload length is not included.

PERFORMANCE TESTS
-----------------

This feature was tested with modified csum engine from test-pmd.

The packet checksum preparation was moved from application to Tx
preparation step placed before burst.

We may expect some overhead costs caused by:
1) using additional callback before burst,
2) rescanning burst,
3) additional condition checking (packet validation),
4) worse optimization (e.g. packet data access, etc.)

We tested it using ixgbe Tx preparation implementation with some parts
disabled to have comparable information about the impact of different
parts of implementation.

IMPACT:

1) For unimplemented Tx preparation callback the performance impact is
   negligible,
2) For packet condition check without checksum modifications (nb_segs,
   available offloads, etc.) is 14626628/14252168 (~2.62% drop),
3) Full support in ixgbe driver (point 2 + packet checksum
   initialization) is 14060924/13588094 (~3.48% drop)

Signed-off-by: Tomasz Kulasek <tomaszx.kulasek at intel.com>
Acked-by: Konstantin Ananyev <konstantin.ananyev at intel.com>
Acked-by: Olivier Matz <olivier.matz at 6wind.com>
---
 config/common_base            |    1 +
 lib/librte_ether/rte_ethdev.h |  106 +++++++++++++++++++++++++++++++++++++++++
 lib/librte_mbuf/rte_mbuf.h    |   64 +++++++++++++++++++++++++
 lib/librte_net/rte_net.h      |   85 +++++++++++++++++++++++++++++++++
 4 files changed, 256 insertions(+)

diff --git a/config/common_base b/config/common_base
index 4bff83a..d609a88 100644
--- a/config/common_base
+++ b/config/common_base
@@ -120,6 +120,7 @@ CONFIG_RTE_MAX_QUEUES_PER_PORT=1024
 CONFIG_RTE_LIBRTE_IEEE1588=n
 CONFIG_RTE_ETHDEV_QUEUE_STAT_CNTRS=16
 CONFIG_RTE_ETHDEV_RXTX_CALLBACKS=y
+CONFIG_RTE_ETHDEV_TX_PREPARE=y
 
 #
 # Support NIC bypass logic
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index 9678179..4ffc1b3 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -182,6 +182,7 @@
 #include <rte_pci.h>
 #include <rte_dev.h>
 #include <rte_devargs.h>
+#include <rte_errno.h>
 #include "rte_ether.h"
 #include "rte_eth_ctrl.h"
 #include "rte_dev_info.h"
@@ -702,6 +703,8 @@ struct rte_eth_desc_lim {
 	uint16_t nb_max;   /**< Max allowed number of descriptors. */
 	uint16_t nb_min;   /**< Min allowed number of descriptors. */
 	uint16_t nb_align; /**< Number of descriptors should be aligned to. */
+	uint16_t nb_seg_max;     /**< Max number of segments per whole packet. */
+	uint16_t nb_mtu_seg_max; /**< Max number of segments per one MTU */
 };
 
 /**
@@ -1191,6 +1194,11 @@ typedef uint16_t (*eth_tx_burst_t)(void *txq,
 				   uint16_t nb_pkts);
 /**< @internal Send output packets on a transmit queue of an Ethernet device. */
 
+typedef uint16_t (*eth_tx_prep_t)(void *txq,
+				   struct rte_mbuf **tx_pkts,
+				   uint16_t nb_pkts);
+/**< @internal Prepare output packets on a transmit queue of an Ethernet device. */
+
 typedef int (*flow_ctrl_get_t)(struct rte_eth_dev *dev,
 			       struct rte_eth_fc_conf *fc_conf);
 /**< @internal Get current flow control parameter on an Ethernet device */
@@ -1625,6 +1633,7 @@ struct rte_eth_rxtx_callback {
 struct rte_eth_dev {
 	eth_rx_burst_t rx_pkt_burst; /**< Pointer to PMD receive function. */
 	eth_tx_burst_t tx_pkt_burst; /**< Pointer to PMD transmit function. */
+	eth_tx_prep_t tx_pkt_prepare; /**< Pointer to PMD transmit prepare function. */
 	struct rte_eth_dev_data *data;  /**< Pointer to device data */
 	const struct eth_driver *driver;/**< Driver for this device */
 	const struct eth_dev_ops *dev_ops; /**< Functions exported by PMD */
@@ -2819,6 +2828,103 @@ int rte_eth_dev_set_vlan_ether_type(uint8_t port_id,
 	return (*dev->tx_pkt_burst)(dev->data->tx_queues[queue_id], tx_pkts, nb_pkts);
 }
 
+/**
+ * Process a burst of output packets on a transmit queue of an Ethernet device.
+ *
+ * The rte_eth_tx_prepare() function is invoked to prepare output packets to be
+ * transmitted on the output queue *queue_id* of the Ethernet device designated
+ * by its *port_id*.
+ * The *nb_pkts* parameter is the number of packets to be prepared which are
+ * supplied in the *tx_pkts* array of *rte_mbuf* structures, each of them
+ * allocated from a pool created with rte_pktmbuf_pool_create().
+ * For each packet to send, the rte_eth_tx_prepare() function performs
+ * the following operations:
+ *
+ * - Check if packet meets devices requirements for tx offloads.
+ *
+ * - Check limitations about number of segments.
+ *
+ * - Check additional requirements when debug is enabled.
+ *
+ * - Update and/or reset required checksums when tx offload is set for packet.
+ *
+ * Since this function can modify packet data, provided mbufs must be safely
+ * writable (e.g. modified data cannot be in shared segment).
+ *
+ * The rte_eth_tx_prepare() function returns the number of packets ready to be
+ * sent. A return value equal to *nb_pkts* means that all packets are valid and
+ * ready to be sent, otherwise stops processing on the first invalid packet and
+ * leaves the rest packets untouched.
+ *
+ * When this functionality is not implemented in the driver, all packets are
+ * are returned untouched.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ *   The value must be a valid port id.
+ * @param queue_id
+ *   The index of the transmit queue through which output packets must be
+ *   sent.
+ *   The value must be in the range [0, nb_tx_queue - 1] previously supplied
+ *   to rte_eth_dev_configure().
+ * @param tx_pkts
+ *   The address of an array of *nb_pkts* pointers to *rte_mbuf* structures
+ *   which contain the output packets.
+ * @param nb_pkts
+ *   The maximum number of packets to process.
+ * @return
+ *   The number of packets correct and ready to be sent. The return value can be
+ *   less than the value of the *tx_pkts* parameter when some packet doesn't
+ *   meet devices requirements with rte_errno set appropriately:
+ *   - -EINVAL: offload flags are not correctly set
+ *   - -ENOTSUP: the offload feature is not supported by the hardware
+ *
+ */
+
+#ifdef RTE_ETHDEV_TX_PREPARE
+
+static inline uint16_t
+rte_eth_tx_prepare(uint8_t port_id, uint16_t queue_id,
+		struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	struct rte_eth_dev *dev;
+
+#ifdef RTE_LIBRTE_ETHDEV_DEBUG
+	if (!rte_eth_dev_is_valid_port(port_id)) {
+		RTE_PMD_DEBUG_TRACE("Invalid TX port_id=%d\n", port_id);
+		rte_errno = -EINVAL;
+		return 0;
+	}
+#endif
+
+	dev = &rte_eth_devices[port_id];
+
+#ifdef RTE_LIBRTE_ETHDEV_DEBUG
+	if (queue_id >= dev->data->nb_tx_queues) {
+		RTE_PMD_DEBUG_TRACE("Invalid TX queue_id=%d\n", queue_id);
+		rte_errno = -EINVAL;
+		return 0;
+	}
+#endif
+
+	if (!dev->tx_pkt_prepare)
+		return nb_pkts;
+
+	return (*dev->tx_pkt_prepare)(dev->data->tx_queues[queue_id],
+			tx_pkts, nb_pkts);
+}
+
+#else
+
+static inline uint16_t
+rte_eth_tx_prepare(__rte_unused uint8_t port_id, __rte_unused uint16_t queue_id,
+		__rte_unused struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
+{
+	return nb_pkts;
+}
+
+#endif
+
 typedef void (*buffer_tx_error_fn)(struct rte_mbuf **unsent, uint16_t count,
 		void *userdata);
 
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index ead7c6e..39ee5ed 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -283,6 +283,19 @@
  */
 #define PKT_TX_OUTER_IPV6    (1ULL << 60)
 
+/**
+ * Bit Mask of all supported packet Tx offload features flags, which can be set
+ * for packet.
+ */
+#define PKT_TX_OFFLOAD_MASK (    \
+		PKT_TX_IP_CKSUM |        \
+		PKT_TX_L4_MASK |         \
+		PKT_TX_OUTER_IP_CKSUM |  \
+		PKT_TX_TCP_SEG |         \
+		PKT_TX_QINQ_PKT |        \
+		PKT_TX_VLAN_PKT |        \
+		PKT_TX_TUNNEL_MASK)
+
 #define __RESERVED           (1ULL << 61) /**< reserved for future mbuf use */
 
 #define IND_ATTACHED_MBUF    (1ULL << 62) /**< Indirect attached mbuf */
@@ -1647,6 +1660,57 @@ static inline int rte_pktmbuf_chain(struct rte_mbuf *head, struct rte_mbuf *tail
 }
 
 /**
+ * Validate general requirements for tx offload in mbuf.
+ *
+ * This function checks correctness and completeness of Tx offload settings.
+ *
+ * @param m
+ *   The packet mbuf to be validated.
+ * @return
+ *   0 if packet is valid
+ */
+static inline int
+rte_validate_tx_offload(const struct rte_mbuf *m)
+{
+	uint64_t ol_flags = m->ol_flags;
+	uint64_t inner_l3_offset = m->l2_len;
+
+	/* Does packet set any of available offloads? */
+	if (!(ol_flags & PKT_TX_OFFLOAD_MASK))
+		return 0;
+
+	if (ol_flags & PKT_TX_OUTER_IP_CKSUM)
+		inner_l3_offset += m->outer_l2_len + m->outer_l3_len;
+
+	/* Headers are fragmented */
+	if (rte_pktmbuf_data_len(m) < inner_l3_offset + m->l3_len + m->l4_len)
+		return -ENOTSUP;
+
+	/* IP checksum can be counted only for IPv4 packet */
+	if ((ol_flags & PKT_TX_IP_CKSUM) && (ol_flags & PKT_TX_IPV6))
+		return -EINVAL;
+
+	/* IP type not set when required */
+	if (ol_flags & (PKT_TX_L4_MASK | PKT_TX_TCP_SEG))
+		if (!(ol_flags & (PKT_TX_IPV4 | PKT_TX_IPV6)))
+			return -EINVAL;
+
+	/* Check requirements for TSO packet */
+	if (ol_flags & PKT_TX_TCP_SEG)
+		if ((m->tso_segsz == 0) ||
+				((ol_flags & PKT_TX_IPV4) &&
+				!(ol_flags & PKT_TX_IP_CKSUM)))
+			return -EINVAL;
+
+	/* PKT_TX_OUTER_IP_CKSUM set for non outer IPv4 packet. */
+	if ((ol_flags & PKT_TX_OUTER_IP_CKSUM) &&
+			!(ol_flags & PKT_TX_OUTER_IPV4))
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
  * Dump an mbuf structure to a file.
  *
  * Dump all fields for the given packet mbuf and all its associated
diff --git a/lib/librte_net/rte_net.h b/lib/librte_net/rte_net.h
index d4156ae..85f356d 100644
--- a/lib/librte_net/rte_net.h
+++ b/lib/librte_net/rte_net.h
@@ -38,6 +38,11 @@
 extern "C" {
 #endif
 
+#include <rte_ip.h>
+#include <rte_udp.h>
+#include <rte_tcp.h>
+#include <rte_sctp.h>
+
 /**
  * Structure containing header lengths associated to a packet, filled
  * by rte_net_get_ptype().
@@ -86,6 +91,86 @@ struct rte_net_hdr_lens {
 uint32_t rte_net_get_ptype(const struct rte_mbuf *m,
 	struct rte_net_hdr_lens *hdr_lens, uint32_t layers);
 
+/**
+ * Prepare pseudo header checksum
+ *
+ * This function prepares pseudo header checksum for TSO and non-TSO tcp/udp in
+ * provided mbufs packet data.
+ *
+ * - for non-TSO tcp/udp packets full pseudo-header checksum is counted and set
+ *   in packet data,
+ * - for TSO the IP payload length is not included in pseudo header.
+ *
+ * This function expects that used headers are in the first data segment of
+ * mbuf, are not fragmented and can be safely modified.
+ *
+ * @param m
+ *   The packet mbuf to be fixed.
+ * @return
+ *   0 if checksum is initialized properly
+ */
+static inline int
+rte_net_intel_cksum_prepare(struct rte_mbuf *m)
+{
+	struct ipv4_hdr *ipv4_hdr;
+	struct ipv6_hdr *ipv6_hdr;
+	struct tcp_hdr *tcp_hdr;
+	struct udp_hdr *udp_hdr;
+	uint64_t ol_flags = m->ol_flags;
+	uint64_t inner_l3_offset = m->l2_len;
+
+	if (ol_flags & PKT_TX_OUTER_IP_CKSUM)
+		inner_l3_offset += m->outer_l2_len + m->outer_l3_len;
+
+	if ((ol_flags & PKT_TX_UDP_CKSUM) == PKT_TX_UDP_CKSUM) {
+		if (ol_flags & PKT_TX_IPV4) {
+			ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *,
+					inner_l3_offset);
+
+			if (ol_flags & PKT_TX_IP_CKSUM)
+				ipv4_hdr->hdr_checksum = 0;
+
+			udp_hdr = (struct udp_hdr *)((char *)ipv4_hdr +
+					m->l3_len);
+			udp_hdr->dgram_cksum = rte_ipv4_phdr_cksum(ipv4_hdr,
+					ol_flags);
+		} else {
+			ipv6_hdr = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr *,
+					inner_l3_offset);
+			/* non-TSO udp */
+			udp_hdr = rte_pktmbuf_mtod_offset(m, struct udp_hdr *,
+					inner_l3_offset + m->l3_len);
+			udp_hdr->dgram_cksum = rte_ipv6_phdr_cksum(ipv6_hdr,
+					ol_flags);
+		}
+	} else if ((ol_flags & PKT_TX_TCP_CKSUM) ||
+			(ol_flags & PKT_TX_TCP_SEG)) {
+		if (ol_flags & PKT_TX_IPV4) {
+			ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *,
+					inner_l3_offset);
+
+			if (ol_flags & PKT_TX_IP_CKSUM)
+				ipv4_hdr->hdr_checksum = 0;
+
+			/* non-TSO tcp or TSO */
+			tcp_hdr = (struct tcp_hdr *)((char *)ipv4_hdr +
+					m->l3_len);
+			tcp_hdr->cksum = rte_ipv4_phdr_cksum(ipv4_hdr,
+					ol_flags);
+		} else {
+			ipv6_hdr = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr *,
+					inner_l3_offset);
+			/* non-TSO tcp or TSO */
+			tcp_hdr = rte_pktmbuf_mtod_offset(m, struct tcp_hdr *,
+					inner_l3_offset + m->l3_len);
+			tcp_hdr->cksum = rte_ipv6_phdr_cksum(ipv6_hdr,
+					ol_flags);
+		}
+	}
+
+	return 0;
+}
+
 #ifdef __cplusplus
 }
 #endif
-- 
1.7.9.5



More information about the dev mailing list