[dpdk-dev,v2,1/6] ethdev: add Tx preparation

Message ID 1473691487-10032-2-git-send-email-tomaszx.kulasek@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Commit Message

Tomasz Kulasek Sept. 12, 2016, 2:44 p.m. UTC
  Added API for `rte_eth_tx_prep`

uint16_t rte_eth_tx_prep(uint8_t port_id, uint16_t queue_id,
	struct rte_mbuf **tx_pkts, uint16_t nb_pkts)

Added fields to the `struct rte_eth_desc_lim`:

	uint16_t nb_seg_max;
		/**< Max number of segments per whole packet. */

	uint16_t nb_mtu_seg_max;
		/**< Max number of segments per one MTU */

Created `rte_pkt.h` header with commonly used functions:

int rte_validate_tx_offload(struct rte_mbuf *m)
	to validate general requirements for tx offload in a packet, such
	as flag completeness. In the current implementation this function
	is called optionally, when RTE_LIBRTE_ETHDEV_DEBUG is enabled.

int rte_phdr_cksum_fix(struct rte_mbuf *m)
	to fix pseudo header checksum for TSO and non-TSO tcp/udp packets
	before hardware tx checksum offload.
	 - for non-TSO tcp/udp packets full pseudo-header checksum is
	   counted and set.
	 - for TSO the IP payload length is not included.
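
A minimal usage sketch of the intended call sequence (illustrative only;
assumes port_id/queue_id are already configured and pkts[] holds nb_pkts
mbufs, e.g. received with rte_eth_rx_burst()):

	uint16_t nb_prep, nb_sent;
	uint16_t i;

	nb_prep = rte_eth_tx_prep(port_id, queue_id, pkts, nb_pkts);
	if (unlikely(nb_prep < nb_pkts)) {
		/* packets from nb_prep onwards did not pass the device
		 * checks, rte_errno tells why; drop them here */
		for (i = nb_prep; i < nb_pkts; i++)
			rte_pktmbuf_free(pkts[i]);
	}
	nb_sent = rte_eth_tx_burst(port_id, queue_id, pkts, nb_prep);
	/* un-sent packets (nb_sent..nb_prep) are handled as usual */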

Signed-off-by: Tomasz Kulasek <tomaszx.kulasek@intel.com>
---
 config/common_base            |    1 +
 lib/librte_ether/rte_ethdev.h |   85 ++++++++++++++++++++++++++
 lib/librte_mbuf/rte_mbuf.h    |    8 +++
 lib/librte_net/Makefile       |    2 +-
 lib/librte_net/rte_pkt.h      |  132 +++++++++++++++++++++++++++++++++++++++++
 5 files changed, 227 insertions(+), 1 deletion(-)
 create mode 100644 lib/librte_net/rte_pkt.h
  

Comments

Ananyev, Konstantin Sept. 19, 2016, 1:03 p.m. UTC | #1
Hi Tomasz,

> 
> Added API for `rte_eth_tx_prep`
> 
> uint16_t rte_eth_tx_prep(uint8_t port_id, uint16_t queue_id,
> 	struct rte_mbuf **tx_pkts, uint16_t nb_pkts)
> 
> Added fields to the `struct rte_eth_desc_lim`:
> 
> 	uint16_t nb_seg_max;
> 		/**< Max number of segments per whole packet. */
> 
> 	uint16_t nb_mtu_seg_max;
> 		/**< Max number of segments per one MTU */
> 
> Created `rte_pkt.h` header with common used functions:
> 
> int rte_validate_tx_offload(struct rte_mbuf *m)
> 	to validate general requirements for tx offload in packet such a
> 	flag completness. In current implementation this funtion is called
> 	optionaly when RTE_LIBRTE_ETHDEV_DEBUG is enabled.
> 
> int rte_phdr_cksum_fix(struct rte_mbuf *m)
> 	to fix pseudo header checksum for TSO and non-TSO tcp/udp packets
> 	before hardware tx checksum offload.
> 	 - for non-TSO tcp/udp packets full pseudo-header checksum is
> 	   counted and set.
> 	 - for TSO the IP payload length is not included.
> 
> Signed-off-by: Tomasz Kulasek <tomaszx.kulasek@intel.com>
> ---
>  config/common_base            |    1 +
>  lib/librte_ether/rte_ethdev.h |   85 ++++++++++++++++++++++++++
>  lib/librte_mbuf/rte_mbuf.h    |    8 +++
>  lib/librte_net/Makefile       |    2 +-
>  lib/librte_net/rte_pkt.h      |  132 +++++++++++++++++++++++++++++++++++++++++
>  5 files changed, 227 insertions(+), 1 deletion(-)
>  create mode 100644 lib/librte_net/rte_pkt.h
> 
> diff --git a/config/common_base b/config/common_base
> index 7830535..7ada9e0 100644
> --- a/config/common_base
> +++ b/config/common_base
> @@ -120,6 +120,7 @@ CONFIG_RTE_MAX_QUEUES_PER_PORT=1024
>  CONFIG_RTE_LIBRTE_IEEE1588=n
>  CONFIG_RTE_ETHDEV_QUEUE_STAT_CNTRS=16
>  CONFIG_RTE_ETHDEV_RXTX_CALLBACKS=y
> +CONFIG_RTE_ETHDEV_TX_PREP=y
> 
>  #
>  # Support NIC bypass logic
> diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
> index b0fe033..4fa674d 100644
> --- a/lib/librte_ether/rte_ethdev.h
> +++ b/lib/librte_ether/rte_ethdev.h
> @@ -182,6 +182,7 @@ extern "C" {
>  #include <rte_pci.h>
>  #include <rte_dev.h>
>  #include <rte_devargs.h>
> +#include <rte_errno.h>
>  #include "rte_ether.h"
>  #include "rte_eth_ctrl.h"
>  #include "rte_dev_info.h"
> @@ -696,6 +697,8 @@ struct rte_eth_desc_lim {
>  	uint16_t nb_max;   /**< Max allowed number of descriptors. */
>  	uint16_t nb_min;   /**< Min allowed number of descriptors. */
>  	uint16_t nb_align; /**< Number of descriptors should be aligned to. */
> +	uint16_t nb_seg_max;     /**< Max number of segments per whole packet. */
> +	uint16_t nb_mtu_seg_max; /**< Max number of segments per one MTU */
>  };
> 
>  /**
> @@ -1181,6 +1184,11 @@ typedef uint16_t (*eth_tx_burst_t)(void *txq,
>  				   uint16_t nb_pkts);
>  /**< @internal Send output packets on a transmit queue of an Ethernet device. */
> 
> +typedef uint16_t (*eth_tx_prep_t)(void *txq,
> +				   struct rte_mbuf **tx_pkts,
> +				   uint16_t nb_pkts);
> +/**< @internal Prepare output packets on a transmit queue of an Ethernet device. */
> +
>  typedef int (*flow_ctrl_get_t)(struct rte_eth_dev *dev,
>  			       struct rte_eth_fc_conf *fc_conf);
>  /**< @internal Get current flow control parameter on an Ethernet device */
> @@ -1626,6 +1634,7 @@ enum rte_eth_dev_type {
>  struct rte_eth_dev {
>  	eth_rx_burst_t rx_pkt_burst; /**< Pointer to PMD receive function. */
>  	eth_tx_burst_t tx_pkt_burst; /**< Pointer to PMD transmit function. */
> +	eth_tx_prep_t tx_pkt_prep; /**< Pointer to PMD transmit prepare function. */
>  	struct rte_eth_dev_data *data;  /**< Pointer to device data */
>  	const struct eth_driver *driver;/**< Driver for this device */
>  	const struct eth_dev_ops *dev_ops; /**< Functions exported by PMD */
> @@ -2833,6 +2842,82 @@ rte_eth_tx_burst(uint8_t port_id, uint16_t queue_id,
>  	return (*dev->tx_pkt_burst)(dev->data->tx_queues[queue_id], tx_pkts, nb_pkts);
>  }
> 
> +/**
> + * Process a burst of output packets on a transmit queue of an Ethernet device.
> + *
> + * The rte_eth_tx_prep() function is invoked to prepare output packets to be
> + * transmitted on the output queue *queue_id* of the Ethernet device designated
> + * by its *port_id*.
> + * The *nb_pkts* parameter is the number of packets to be prepared which are
> + * supplied in the *tx_pkts* array of *rte_mbuf* structures, each of them
> + * allocated from a pool created with rte_pktmbuf_pool_create().
> + * For each packet to send, the rte_eth_tx_prep() function performs
> + * the following operations:
> + *
> + * - Check if packet meets devices requirements for tx offloads.
> + *
> + * - Check limitations about number of segments.
> + *
> + * - Check additional requirements when debug is enabled.
> + *
> + * - Update and/or reset required checksums when tx offload is set for packet.
> + *
> + * The rte_eth_tx_prep() function returns the number of packets ready to be
> + * sent. A return value equal to *nb_pkts* means that all packets are valid and
> + * ready to be sent.
> + *
> + * @param port_id
> + *   The port identifier of the Ethernet device.
> + * @param queue_id
> + *   The index of the transmit queue through which output packets must be
> + *   sent.
> + *   The value must be in the range [0, nb_tx_queue - 1] previously supplied
> + *   to rte_eth_dev_configure().
> + * @param tx_pkts
> + *   The address of an array of *nb_pkts* pointers to *rte_mbuf* structures
> + *   which contain the output packets.
> + * @param nb_pkts
> + *   The maximum number of packets to process.
> + * @return
> + *   The number of packets correct and ready to be sent. The return value can be
> + *   less than the value of the *tx_pkts* parameter when some packet doesn't
> + *   meet devices requirements with rte_errno set appropriately.
> + */
> +
> +#ifdef RTE_ETHDEV_TX_PREP

Sorry for being a bit late on that discussion, but what is the point of having
that config macro (RTE_ETHDEV_TX_PREP) at all?
As I can see right now, if the driver doesn't set up tx_pkt_prep, then nb_pkts
would be returned anyway...

BTW, here is another question of mine - should it be that way?
Shouldn't we return 0 (and set rte_errno=ENOTSUP) here if dev->tx_pkt_prep == NULL?

> +
> +static inline uint16_t
> +rte_eth_tx_prep(uint8_t port_id, uint16_t queue_id, struct rte_mbuf **tx_pkts,
> +		uint16_t nb_pkts)
> +{
> +	struct rte_eth_dev *dev = &rte_eth_devices[port_id];
> +
> +	if (!dev->tx_pkt_prep)
> +		return nb_pkts;
> +
> +#ifdef RTE_LIBRTE_ETHDEV_DEBUG
> +	if (queue_id >= dev->data->nb_tx_queues) {
> +		RTE_PMD_DEBUG_TRACE("Invalid TX queue_id=%d\n", queue_id);
> +		rte_errno = -EINVAL;
> +		return 0;
> +	}
> +#endif
> +
> +	return (*dev->tx_pkt_prep)(dev->data->tx_queues[queue_id],
> +			tx_pkts, nb_pkts);
> +}
> +
> +#else
> +
> +static inline uint16_t
> +rte_eth_tx_prep(uint8_t port_id __rte_unused, uint16_t queue_id __rte_unused,
> +		struct rte_mbuf **tx_pkts __rte_unused, uint16_t nb_pkts)
> +{
> +	return nb_pkts;
> +}
> +
> +#endif
> +
>  typedef void (*buffer_tx_error_fn)(struct rte_mbuf **unsent, uint16_t count,
>  		void *userdata);
> 
> diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> index 7ea66ed..72fd352 100644
> --- a/lib/librte_mbuf/rte_mbuf.h
> +++ b/lib/librte_mbuf/rte_mbuf.h
> @@ -211,6 +211,14 @@ extern "C" {
>   */
>  #define PKT_TX_OUTER_IPV4   (1ULL << 59)
> 
> +#define PKT_TX_OFFLOAD_MASK (    \
> +		PKT_TX_IP_CKSUM |        \
> +		PKT_TX_L4_MASK |         \
> +		PKT_TX_OUTER_IP_CKSUM |  \
> +		PKT_TX_TCP_SEG |         \
> +		PKT_TX_QINQ_PKT |        \
> +		PKT_TX_VLAN_PKT)
> +
>  /**
>   * Packet outer header is IPv6. This flag must be set when using any
>   * outer offload feature (L4 checksum) to tell the NIC that the outer
> diff --git a/lib/librte_net/Makefile b/lib/librte_net/Makefile
> index ad2e482..b5abe84 100644
> --- a/lib/librte_net/Makefile
> +++ b/lib/librte_net/Makefile
> @@ -34,7 +34,7 @@ include $(RTE_SDK)/mk/rte.vars.mk
>  CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3
> 
>  # install includes
> -SYMLINK-$(CONFIG_RTE_LIBRTE_NET)-include := rte_ip.h rte_tcp.h rte_udp.h rte_sctp.h rte_icmp.h rte_arp.h
> +SYMLINK-$(CONFIG_RTE_LIBRTE_NET)-include := rte_ip.h rte_tcp.h rte_udp.h rte_sctp.h rte_icmp.h rte_arp.h rte_pkt.h
> 
> 
>  include $(RTE_SDK)/mk/rte.install.mk
> diff --git a/lib/librte_net/rte_pkt.h b/lib/librte_net/rte_pkt.h
> new file mode 100644
> index 0000000..a3c3e3c
> --- /dev/null
> +++ b/lib/librte_net/rte_pkt.h
> @@ -0,0 +1,132 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2016 Intel Corporation. All rights reserved.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#ifndef _RTE_PKT_H_
> +#define _RTE_PKT_H_
> +
> +#include <rte_ip.h>
> +#include <rte_udp.h>
> +#include <rte_tcp.h>
> +#include <rte_sctp.h>
> +
> +/**
> + * Validate general requirements for tx offload in packet.
> + */
> +static inline int
> +rte_validate_tx_offload(struct rte_mbuf *m)
> +{
> +	uint64_t ol_flags = m->ol_flags;
> +
> +	/* Does packet set any of available offloads? */
> +	if (!(ol_flags & PKT_TX_OFFLOAD_MASK))
> +		return 0;
> +
> +	/* IP checksum can be counted only for IPv4 packet */
> +	if ((ol_flags & PKT_TX_IP_CKSUM) && (ol_flags & PKT_TX_IPV6))
> +		return -EINVAL;
> +
> +	if (ol_flags & (PKT_TX_L4_MASK | PKT_TX_TCP_SEG))
> +		/* IP type not set */
> +		if (!(ol_flags & (PKT_TX_IPV4 | PKT_TX_IPV6)))
> +			return -EINVAL;
> +
> +	if (ol_flags & PKT_TX_TCP_SEG) {
> +
> +		/* PKT_TX_IP_CKSUM offload not set for IPv4 TSO packet */
> +		if ((ol_flags & PKT_TX_IPV4) && !(ol_flags & PKT_TX_IP_CKSUM))
> +			return -EINVAL;
> +
> +		if (m->tso_segsz == 0)
> +			return -EINVAL;
> +
> +	}


I suppose these 2 if(ol_flags & PKT_TX_TCP_SEG) above could be united into one.
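
E.g. something like this, perhaps (untested, just to show one possible way to
fold them into a single block; maybe not exactly what you had in mind):

	if (ol_flags & (PKT_TX_L4_MASK | PKT_TX_TCP_SEG)) {
		/* IP type not set */
		if (!(ol_flags & (PKT_TX_IPV4 | PKT_TX_IPV6)))
			return -EINVAL;

		if (ol_flags & PKT_TX_TCP_SEG) {
			/* PKT_TX_IP_CKSUM offload not set for IPv4 TSO packet */
			if ((ol_flags & PKT_TX_IPV4) &&
					!(ol_flags & PKT_TX_IP_CKSUM))
				return -EINVAL;

			if (m->tso_segsz == 0)
				return -EINVAL;
		}
	}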

> +
> +	/* PKT_TX_OUTER_IP_CKSUM set for non outer IPv4 packet. */
> +	if ((ol_flags & PKT_TX_OUTER_IP_CKSUM) && !(ol_flags & PKT_TX_OUTER_IPV4))
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +/**
> + * Fix pseudo header checksum for TSO and non-TSO tcp/udp packets before
> + * hardware tx checksum.
> + * For non-TSO tcp/udp packets full pseudo-header checksum is counted and set.
> + * For TSO the IP payload length is not included.
> + */
> +static inline int
> +rte_phdr_cksum_fix(struct rte_mbuf *m)
> +{
> +	struct ipv4_hdr *ipv4_hdr;
> +	struct ipv6_hdr *ipv6_hdr;
> +	struct tcp_hdr *tcp_hdr;
> +	struct udp_hdr *udp_hdr;
> +
> +	if (m->ol_flags & PKT_TX_IPV4) {
> +		ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, m->l2_len);
> +
> +		if (m->ol_flags & PKT_TX_IP_CKSUM)
> +			ipv4_hdr->hdr_checksum = 0;
> +
> +		if ((m->ol_flags & PKT_TX_UDP_CKSUM) == PKT_TX_UDP_CKSUM) {
> +			/* non-TSO udp */
> +			udp_hdr = rte_pktmbuf_mtod_offset(m, struct udp_hdr *, m->l2_len +
> +					m->l3_len);
> +			udp_hdr->dgram_cksum = rte_ipv4_phdr_cksum(ipv4_hdr, m->ol_flags);
> +		} else if ((m->ol_flags & PKT_TX_TCP_CKSUM) ||
> +				(m->ol_flags & PKT_TX_TCP_SEG)) {
> +			/* non-TSO tcp or TSO */
> +			tcp_hdr = rte_pktmbuf_mtod_offset(m, struct tcp_hdr *, m->l2_len +
> +					m->l3_len);
> +			tcp_hdr->cksum = rte_ipv4_phdr_cksum(ipv4_hdr, m->ol_flags);
> +		}
> +	} else if (m->ol_flags & PKT_TX_IPV6) {
> +		ipv6_hdr = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr *, m->l2_len);
> +
> +		if ((m->ol_flags & PKT_TX_UDP_CKSUM) == PKT_TX_UDP_CKSUM) {
> +			/* non-TSO udp */
> +			udp_hdr = rte_pktmbuf_mtod_offset(m, struct udp_hdr *, m->l2_len +
> +					m->l3_len);
> +			udp_hdr->dgram_cksum = rte_ipv6_phdr_cksum(ipv6_hdr, m->ol_flags);
> +		} else if ((m->ol_flags & PKT_TX_TCP_CKSUM) ||
> +				(m->ol_flags & PKT_TX_TCP_SEG)) {
> +			/* non-TSO tcp or TSO */
> +			tcp_hdr = rte_pktmbuf_mtod_offset(m, struct tcp_hdr *, m->l2_len +
> +					m->l3_len);
> +			tcp_hdr->cksum = rte_ipv6_phdr_cksum(ipv6_hdr, m->ol_flags);
> +		}
> +	}
> +	return 0;
> +}

We probably need to take into account that in some cases outer_l*_len could be set up here
(in case of tunneled packets).
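
E.g. something along these lines, maybe (rough sketch, untested; outer_l2_len/
outer_l3_len are the existing mbuf fields):

	/* start of the inner (encapsulated) L3 header */
	uint64_t inner_l3_offset = m->l2_len;

	if (m->ol_flags & (PKT_TX_OUTER_IP_CKSUM | PKT_TX_OUTER_IPV4 |
			PKT_TX_OUTER_IPV6))
		inner_l3_offset += m->outer_l2_len + m->outer_l3_len;

	ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *,
			inner_l3_offset);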

Konstantin

> +
> +#endif /* _RTE_PKT_H_ */
> --
> 1.7.9.5
  
Tomasz Kulasek Sept. 19, 2016, 3:29 p.m. UTC | #2
Hi Konstantin,

> -----Original Message-----
> From: Ananyev, Konstantin
> Sent: Monday, September 19, 2016 15:03
> To: Kulasek, TomaszX <tomaszx.kulasek@intel.com>; dev@dpdk.org
> Cc: jerin.jacob@caviumnetworks.com
> Subject: RE: [dpdk-dev] [PATCH v2 1/6] ethdev: add Tx preparation
> 
> Hi Tomasz,
> 
> >

[...]

> > +
> > +#ifdef RTE_ETHDEV_TX_PREP
> 
> Sorry for being a bit late on that discussion, but what the point of
> having that config macro (RTE_ETHDEV_TX_PREP ) at all?
> As I can see right now, if driver doesn't setup tx_pkt_prep, then nb_pkts
> would be return anyway...
> 
> BTW, there is my another question - should it be that way?
> Shouldn't we return 0 (and set rte_errno=ENOTSUP) here if dev->tx_pk_prep
> == NULL?
> 

It's an answer to Jerin's request discussed here: http://dpdk.org/ml/archives/dev/2016-September/046437.html

When a driver doesn't support tx_prep, the default behavior is "we don't know the requirements, so we have nothing to do here". It will simplify application logic and improve performance for these drivers, I think. Catching this error with every burst may be problematic.

As for the RTE_ETHDEV_TX_PREP macro, suggested by Jerin in the same thread, I still don't think it's the best solution to the problem described by him. I have added it here for further discussion.
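
Just to illustrate the difference: with the current fallback the application
code path is the same for every PMD, it calls rte_eth_tx_prep() and treats
nb_prep < nb_pkts as a real packet problem. With the "return 0 and set
rte_errno=ENOTSUP" variant every burst would need an extra check, roughly
(hypothetical application code, only to show the extra branch):

	nb_prep = rte_eth_tx_prep(port_id, queue_id, pkts, nb_rx);
	if (nb_prep == 0 && rte_errno == ENOTSUP)
		nb_prep = nb_rx; /* PMD has no tx_prep callback, send as-is */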

Jerin, have you something to add?

Tomasz.
  
Jerin Jacob Sept. 19, 2016, 4:06 p.m. UTC | #3
On Mon, Sep 19, 2016 at 03:29:07PM +0000, Kulasek, TomaszX wrote:
> Hi Konstantin,
> 
> > -----Original Message-----
> > From: Ananyev, Konstantin
> > Sent: Monday, September 19, 2016 15:03
> > To: Kulasek, TomaszX <tomaszx.kulasek@intel.com>; dev@dpdk.org
> > Cc: jerin.jacob@caviumnetworks.com
> > Subject: RE: [dpdk-dev] [PATCH v2 1/6] ethdev: add Tx preparation
> > 
> > Hi Tomasz,
> > 
> > >
> 
> [...]
> 
> > > +
> > > +#ifdef RTE_ETHDEV_TX_PREP
> > 
> > Sorry for being a bit late on that discussion, but what the point of
> > having that config macro (RTE_ETHDEV_TX_PREP ) at all?
> > As I can see right now, if driver doesn't setup tx_pkt_prep, then nb_pkts
> > would be return anyway...
> > 
> > BTW, there is my another question - should it be that way?
> > Shouldn't we return 0 (and set rte_errno=ENOTSUP) here if dev->tx_pk_prep
> > == NULL?
> > 
> 
> It's an answer to the Jerin's request discussed here: http://dpdk.org/ml/archives/dev/2016-September/046437.html
> 
> When driver doesn't support tx_prep, default behavior is "we don't know requirements, so we have nothing to do here". It will simplify application logic and improve performance for these drivers, I think. Catching this error with every burst may be problematic.
> 
> As for RTE_ETHDEV_TX_PREP macro, suggested by Jerin in the same thread, I still don't think It's the best solution of the problem described by him. I have added it here for further discussion.
> 
> Jerin, have you something to add?

Nothing very specific to add here. I think I have tried to share the rationale in
http://dpdk.org/ml/archives/dev/2016-September/046437.html

> 
> Tomasz.
  
Ananyev, Konstantin Sept. 20, 2016, 9:06 a.m. UTC | #4
Hi Jerin,

> > > >
> >
> > [...]
> >
> > > > +
> > > > +#ifdef RTE_ETHDEV_TX_PREP
> > >
> > > Sorry for being a bit late on that discussion, but what the point of
> > > having that config macro (RTE_ETHDEV_TX_PREP ) at all?
> > > As I can see right now, if driver doesn't setup tx_pkt_prep, then
> > > nb_pkts would be return anyway...
> > >
> > > BTW, there is my another question - should it be that way?
> > > Shouldn't we return 0 (and set rte_errno=ENOTSUP) here if
> > > dev->tx_pk_prep == NULL?
> > >
> >
> > It's an answer to the Jerin's request discussed here:
> > http://dpdk.org/ml/archives/dev/2016-September/046437.html
> >
> > When driver doesn't support tx_prep, default behavior is "we don't know requirements, so we have nothing to do here". It will simplify
> application logic and improve performance for these drivers, I think. Catching this error with every burst may be problematic.
> >
> > As for RTE_ETHDEV_TX_PREP macro, suggested by Jerin in the same thread, I still don't think It's the best solution of the problem
> described by him. I have added it here for further discussion.
> >
> > Jerin, have you something to add?
> 
> Nothing very specific to add here. I think, I have tried to share the rational in, http://dpdk.org/ml/archives/dev/2016-
> September/046437.html
> 

Ok, I'm not sure I fully understand your intention here.
I think I understand why you propose rte_eth_tx_prep() to do:
	if (!dev->tx_pkt_prep)
		return nb_pkts;

That allows drivers to NOOP the tx_prep functionality without paying the
price for callback invocation.
What I don't understand is why, with that in place, we also need a NOOP for
the whole rte_eth_tx_prep():
+static inline uint16_t
+rte_eth_tx_prep(uint8_t port_id __rte_unused, uint16_t queue_id __rte_unused,
+		struct rte_mbuf **tx_pkts __rte_unused, uint16_t nb_pkts)
+{
+	return nb_pkts;
+}
+
+#endif

What are you trying to save here: just reading 'dev->tx_pkt_prep'?
If so, then it doesn't seem that performance critical to me.
Something else?
From my point of view a NOOP on the driver level is more than enough.
Again, I would prefer to avoid introducing a new config option, if possible.

Konstantin
  
Jerin Jacob Sept. 21, 2016, 8:29 a.m. UTC | #5
On Tue, Sep 20, 2016 at 09:06:42AM +0000, Ananyev, Konstantin wrote:
> 
> Hi Jerin,

Hi Konstantin,

> 
> > > > >
> > >
> > > [...]
> > >
> > > > > +
> > > > > +#ifdef RTE_ETHDEV_TX_PREP
> > > >
> > > > Sorry for being a bit late on that discussion, but what the point of
> > > > having that config macro (RTE_ETHDEV_TX_PREP ) at all?
> > > > As I can see right now, if driver doesn't setup tx_pkt_prep, then
> > > > nb_pkts would be return anyway...
> > > >
> > > > BTW, there is my another question - should it be that way?
> > > > Shouldn't we return 0 (and set rte_errno=ENOTSUP) here if
> > > > dev->tx_pk_prep == NULL?
> > > >
> > >
> > > It's an answer to the Jerin's request discussed here:
> > > http://dpdk.org/ml/archives/dev/2016-September/046437.html
> > >
> > > When driver doesn't support tx_prep, default behavior is "we don't know requirements, so we have nothing to do here". It will simplify
> > application logic and improve performance for these drivers, I think. Catching this error with every burst may be problematic.
> > >
> > > As for RTE_ETHDEV_TX_PREP macro, suggested by Jerin in the same thread, I still don't think It's the best solution of the problem
> > described by him. I have added it here for further discussion.
> > >
> > > Jerin, have you something to add?
> > 
> > Nothing very specific to add here. I think, I have tried to share the rational in, http://dpdk.org/ml/archives/dev/2016-
> > September/046437.html
> > 
> 
> Ok, not sure I am fully understand your intention here.
> I think I understand why you propose rte_eth_tx_prep() to do:
> 	if (!dev->tx_pkt_prep)
> 		return nb_pkts;
> 
> That allows drivers to NOOP the tx_prep functionality without paying the
> price for callback invocation.

In a true sense, returning nb_pkts makes it a functional NOP as well (i.e. the PMD
does not have any limitation on the Tx side, so all packets are _good_ and no
preparation is required).


> What I don't understand, why with that in place we also need a NOOP for
> the whole rte_eth_tx_prep():
> +static inline uint16_t
> +rte_eth_tx_prep(uint8_t port_id __rte_unused, uint16_t queue_id __rte_unused,
> +		struct rte_mbuf **tx_pkts __rte_unused, uint16_t nb_pkts)
> +{
> +	return nb_pkts;
> +}
> +
> +#endif
> 
> What are you trying to save here: just reading ' dev->tx_pkt_prep'?
> If so, then it seems not that performance critical for me.
> Something else?

The proposed scheme can make it a true NOOP from the compiler's perspective too, if a
target decides to do that.
I have checked the instruction generation with ARM assembly; a non-true
compiler NOOP has the following instruction overhead at minimum:

	# 1 load
	# 1  mov
	if (!dev->tx_pkt_prep)
		return nb_pkts;

	# the compiler can't predict whether this function will be executed, so
	# there is pressure on register allocation and most likely it calls for
	# some stack push and pop based load in the outer function (as it is an
	# inline function)

	return (*dev->tx_pkt_prep)(dev->data->tx_queues[queue_id], tx_pkts, nb_pkts);

	# 1 branch
	if (unlikely(nb_prep < nb_rx)) {
		# bunch of code not executed, but pressure on i cache
		int i;
		for (i = nb_prep; i < nb_rx; i++)
	                 rte_pktmbuf_free(pkts_burst[i]);
	}

For a server target (IA or high-end armv8) with an external PCIe based system it
makes sense to have the RTE_ETHDEV_TX_PREP option enabled (which is the case in
the proposed patch), but for low-end arm platforms with
- limited cores
- less i-cache
- IPC == 1
- running around 1GHz
- most importantly, an _integrated_ NIC controller with no external PCIe
  support
it does not make much sense to waste the cycles/time on it.
A cycle saved is a cycle earned.

Since DPDK compilation is based on the _target_, I really don't see any
issue with this approach, nor does it hurt anything on a server target.
So, IMO, it should be up to the target to decide what works better for it.

Jerin

> From my point of view NOOP on the driver level is more than enough.
> Again I would prefer to introduce new config option, if possible.
> 
> Konstantin
> 
> 
> 
> 
> 
> 
>
  
Ananyev, Konstantin Sept. 22, 2016, 9:36 a.m. UTC | #6
Hi Jerin,

> >
> > Hi Jerin,
> 
> Hi Konstantin,
> 
> >
> > > > > >
> > > >
> > > > [...]
> > > >
> > > > > > +
> > > > > > +#ifdef RTE_ETHDEV_TX_PREP
> > > > >
> > > > > Sorry for being a bit late on that discussion, but what the
> > > > > point of having that config macro (RTE_ETHDEV_TX_PREP ) at all?
> > > > > As I can see right now, if driver doesn't setup tx_pkt_prep,
> > > > > then nb_pkts would be return anyway...
> > > > >
> > > > > BTW, there is my another question - should it be that way?
> > > > > Shouldn't we return 0 (and set rte_errno=ENOTSUP) here if
> > > > > dev->tx_pk_prep == NULL?
> > > > >
> > > >
> > > > It's an answer to the Jerin's request discussed here:
> > > > http://dpdk.org/ml/archives/dev/2016-September/046437.html
> > > >
> > > > When driver doesn't support tx_prep, default behavior is "we don't
> > > > know requirements, so we have nothing to do here". It will
> > > > simplify
> > > application logic and improve performance for these drivers, I think. Catching this error with every burst may be problematic.
> > > >
> > > > As for RTE_ETHDEV_TX_PREP macro, suggested by Jerin in the same
> > > > thread, I still don't think It's the best solution of the problem
> > > described by him. I have added it here for further discussion.
> > > >
> > > > Jerin, have you something to add?
> > >
> > > Nothing very specific to add here. I think, I have tried to share
> > > the rational in, http://dpdk.org/ml/archives/dev/2016-
> > > September/046437.html
> > >
> >
> > Ok, not sure I am fully understand your intention here.
> > I think I understand why you propose rte_eth_tx_prep() to do:
> > 	if (!dev->tx_pkt_prep)
> > 		return nb_pkts;
> >
> > That allows drivers to NOOP the tx_prep functionality without paying
> > the price for callback invocation.
> 
> In true sense, returning the nb_pkts makes it functional NOP as well(i.e The PMD does not have any limitation on Tx side, so all
> packets are _good_(no preparation is required))
> 
> 
> > What I don't understand, why with that in place we also need a NOOP
> > for the whole rte_eth_tx_prep():
> > +static inline uint16_t
> > +rte_eth_tx_prep(uint8_t port_id __rte_unused, uint16_t queue_id __rte_unused,
> > +		struct rte_mbuf **tx_pkts __rte_unused, uint16_t nb_pkts) {
> > +	return nb_pkts;
> > +}
> > +
> > +#endif
> >
> > What are you trying to save here: just reading ' dev->tx_pkt_prep'?
> > If so, then it seems not that performance critical for me.
> > Something else?
> 
> The proposed scheme can make it as true NOOP from compiler perspective too if a target decided to do that, I have checked the
> instruction generation with arm Assembly, a non true compiler NOOP has following instructions overhead at minimum.
> 
> 	# 1 load
> 	# 1  mov
> 	if (!dev->tx_pkt_prep)
> 		return nb_pkts;

Yep.

> 
> 	# compile can't predict this function needs be executed or not so
> 	# pressure on register allocation and mostly likely it call for
> 	# some stack push and pop based load on outer function(as it is an
> 	# inline function)


Well, I suppose the compiler wouldn't try to fill the function argument registers before the branch above.

> 
> 	return (*dev->tx_pkt_prep)(dev->data->tx_queues[queue_id], tx_pkts, nb_pkts);
> 
> 	# 1 branch
> 	if (unlikely(nb_prep < nb_rx)) {
> 		# bunch of code not executed, but pressure on i cache
> 		int i;
> 		for (i = nb_prep; i < nb_rx; i++)
> 	                 rte_pktmbuf_free(pkts_burst[i]);
> 	}
> 
> From a server target(IA or high-end armv8) with external PCIe based system makes sense to have RTE_ETHDEV_TX_PREP option
> enabled(which is the case in proposed patch) but the low end arm platforms with
> - limited cores
> - less i cache
> - IPC == 1
> - running around 1GHz
> - most importantly, _integrated_ nic controller with no external PCIE
>   support
> does not make much sense to waste the cycles/time for it.
> cycle saved is cycle earned.

Ok, so it is all to save one memory dereference and a comparison plus branch.
Do you have any estimation of how badly it would hit low-end CPU performance?
The reason I am asking: obviously I would prefer to avoid introducing a new build config option
(avoiding them is the common dpdk coding practice these days) unless it is really important.

> 
> Since DPDK compilation is based on _target_, I really don't see any issue with this approach nor it does not hurt anything on server
> target.
> So, IMO, It should be upto the target to decide what works better for the target.
> 
> Jerin
> 
> > From my point of view NOOP on the driver level is more than enough.
> > Again I would prefer to introduce new config option, if possible.
> >
> > Konstantin
> >
  
Jerin Jacob Sept. 22, 2016, 9:59 a.m. UTC | #7
On Thu, Sep 22, 2016 at 09:36:15AM +0000, Ananyev, Konstantin wrote:

Hi Konstantin,

> 
> Hi Jerin,
> 
> > >
> > > Hi Jerin,
> > 
> > Hi Konstantin,
> > 
> > >
> > > > > > >
> > > > >
> > > > > [...]
> > > > >
> > > > > > > +
> > > > > > > +#ifdef RTE_ETHDEV_TX_PREP
> > > > > >
> > > > > > Sorry for being a bit late on that discussion, but what the
> > > > > > point of having that config macro (RTE_ETHDEV_TX_PREP ) at all?
> > > > > > As I can see right now, if driver doesn't setup tx_pkt_prep,
> > > > > > then nb_pkts would be return anyway...
> > > > > >
> > > > > > BTW, there is my another question - should it be that way?
> > > > > > Shouldn't we return 0 (and set rte_errno=ENOTSUP) here if
> > > > > > dev->tx_pk_prep == NULL?
> > > > > >
> > > > >
> > > > > It's an answer to the Jerin's request discussed here:
> > > > > http://dpdk.org/ml/archives/dev/2016-September/046437.html
> > > > >
> > > > > When driver doesn't support tx_prep, default behavior is "we don't
> > > > > know requirements, so we have nothing to do here". It will
> > > > > simplify
> > > > application logic and improve performance for these drivers, I think. Catching this error with every burst may be problematic.
> > > > >
> > > > > As for RTE_ETHDEV_TX_PREP macro, suggested by Jerin in the same
> > > > > thread, I still don't think It's the best solution of the problem
> > > > described by him. I have added it here for further discussion.
> > > > >
> > > > > Jerin, have you something to add?
> > > >
> > > > Nothing very specific to add here. I think, I have tried to share
> > > > the rational in, http://dpdk.org/ml/archives/dev/2016-
> > > > September/046437.html
> > > >
> > >
> > > Ok, not sure I am fully understand your intention here.
> > > I think I understand why you propose rte_eth_tx_prep() to do:
> > > 	if (!dev->tx_pkt_prep)
> > > 		return nb_pkts;
> > >
> > > That allows drivers to NOOP the tx_prep functionality without paying
> > > the price for callback invocation.
> > 
> > In true sense, returning the nb_pkts makes it functional NOP as well(i.e The PMD does not have any limitation on Tx side, so all
> > packets are _good_(no preparation is required))
> > 
> > 
> > > What I don't understand, why with that in place we also need a NOOP
> > > for the whole rte_eth_tx_prep():
> > > +static inline uint16_t
> > > +rte_eth_tx_prep(uint8_t port_id __rte_unused, uint16_t queue_id __rte_unused,
> > > +		struct rte_mbuf **tx_pkts __rte_unused, uint16_t nb_pkts) {
> > > +	return nb_pkts;
> > > +}
> > > +
> > > +#endif
> > >
> > > What are you trying to save here: just reading ' dev->tx_pkt_prep'?
> > > If so, then it seems not that performance critical for me.
> > > Something else?
> > 
> > The proposed scheme can make it as true NOOP from compiler perspective too if a target decided to do that, I have checked the
> > instruction generation with arm Assembly, a non true compiler NOOP has following instructions overhead at minimum.
> > 
> > 	# 1 load
> > 	# 1  mov
> > 	if (!dev->tx_pkt_prep)
> > 		return nb_pkts;
> 
> Yep.
> 
> > 
> > 	# compile can't predict this function needs be executed or not so
> > 	# pressure on register allocation and mostly likely it call for
> > 	# some stack push and pop based load on outer function(as it is an
> > 	# inline function)
> 
> 
> Well, I suppose compiler wouldn't try to fill function argument registers before the branch above. 

Not the case with the arm gcc compiler (maybe depending on the outer function's load).
The recent external pool manager function pointer conversion
reduced performance by around 700kpps/core in local cache mode (even though the
function pointers are not executed).

> 
> > 
> > 	return (*dev->tx_pkt_prep)(dev->data->tx_queues[queue_id], tx_pkts, nb_pkts);
> > 
> > 	# 1 branch
> > 	if (unlikely(nb_prep < nb_rx)) {
> > 		# bunch of code not executed, but pressure on i cache
> > 		int i;
> > 		for (i = nb_prep; i < nb_rx; i++)
> > 	                 rte_pktmbuf_free(pkts_burst[i]);
> > 	}
> > 
> > From a server target(IA or high-end armv8) with external PCIe based system makes sense to have RTE_ETHDEV_TX_PREP option
> > enabled(which is the case in proposed patch) but the low end arm platforms with
> > - limited cores
> > - less i cache
> > - IPC == 1
> > - running around 1GHz
> > - most importantly, _integrated_ nic controller with no external PCIE
> >   support
> > does not make much sense to waste the cycles/time for it.
> > cycle saved is cycle earned.
> 
> Ok, so it is all to save one memory de-refrence and a comparison plus branch.
> Do you have aby estimation how badly it would hit low-end cpu performance?

Around 400kpps/core. On four-core systems, around 2 Mpps (4 cores with
2x10G ports).
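(Back-of-the-envelope: 2x10G at 64B line rate is about 2 * 14.88 Mpps =
~29.8 Mpps, so a ~2 Mpps drop is roughly 7%. Rough numbers only.)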

> The reason I am asking: obviously I would prefer to avoid to introduce new build config option
> (that's the common dpdk coding practice these days) unless it is really important.  
Practice is something we need to revisit based on new use cases/usage.
I think the scheme of non-external-PCIe-based NW cards is new to DPDK.

> 
> > 
> > Since DPDK compilation is based on _target_, I really don't see any issue with this approach nor it does not hurt anything on server
> > target.
> > So, IMO, It should be upto the target to decide what works better for the target.
> > 
> > Jerin
> > 
> > > From my point of view NOOP on the driver level is more than enough.
> > > Again I would prefer to introduce new config option, if possible.
> > >
> > > Konstantin
> > >
  
Ananyev, Konstantin Sept. 23, 2016, 9:41 a.m. UTC | #8
Hi Jerin,

> 
> Hi Konstantin,
> 
> >
> > Hi Jerin,
> >
> > > >
> > > > Hi Jerin,
> > >
> > > Hi Konstantin,
> > >
> > > >
> > > > > > > >
> > > > > >
> > > > > > [...]
> > > > > >
> > > > > > > > +
> > > > > > > > +#ifdef RTE_ETHDEV_TX_PREP
> > > > > > >
> > > > > > > Sorry for being a bit late on that discussion, but what the
> > > > > > > point of having that config macro (RTE_ETHDEV_TX_PREP ) at all?
> > > > > > > As I can see right now, if driver doesn't setup tx_pkt_prep,
> > > > > > > then nb_pkts would be return anyway...
> > > > > > >
> > > > > > > BTW, there is my another question - should it be that way?
> > > > > > > Shouldn't we return 0 (and set rte_errno=ENOTSUP) here if
> > > > > > > dev->tx_pk_prep == NULL?
> > > > > > >
> > > > > >
> > > > > > It's an answer to the Jerin's request discussed here:
> > > > > > http://dpdk.org/ml/archives/dev/2016-September/046437.html
> > > > > >
> > > > > > When driver doesn't support tx_prep, default behavior is "we
> > > > > > don't know requirements, so we have nothing to do here". It
> > > > > > will simplify
> > > > > application logic and improve performance for these drivers, I think. Catching this error with every burst may be problematic.
> > > > > >
> > > > > > As for RTE_ETHDEV_TX_PREP macro, suggested by Jerin in the
> > > > > > same thread, I still don't think It's the best solution of the
> > > > > > problem
> > > > > described by him. I have added it here for further discussion.
> > > > > >
> > > > > > Jerin, have you something to add?
> > > > >
> > > > > Nothing very specific to add here. I think, I have tried to
> > > > > share the rational in, http://dpdk.org/ml/archives/dev/2016-
> > > > > September/046437.html
> > > > >
> > > >
> > > > Ok, not sure I am fully understand your intention here.
> > > > I think I understand why you propose rte_eth_tx_prep() to do:
> > > > 	if (!dev->tx_pkt_prep)
> > > > 		return nb_pkts;
> > > >
> > > > That allows drivers to NOOP the tx_prep functionality without
> > > > paying the price for callback invocation.
> > >
> > > In true sense, returning the nb_pkts makes it functional NOP as
> > > well(i.e The PMD does not have any limitation on Tx side, so all
> > > packets are _good_(no preparation is required))
> > >
> > >
> > > > What I don't understand, why with that in place we also need a
> > > > NOOP for the whole rte_eth_tx_prep():
> > > > +static inline uint16_t
> > > > +rte_eth_tx_prep(uint8_t port_id __rte_unused, uint16_t queue_id __rte_unused,
> > > > +		struct rte_mbuf **tx_pkts __rte_unused, uint16_t nb_pkts) {
> > > > +	return nb_pkts;
> > > > +}
> > > > +
> > > > +#endif
> > > >
> > > > What are you trying to save here: just reading ' dev->tx_pkt_prep'?
> > > > If so, then it seems not that performance critical for me.
> > > > Something else?
> > >
> > > The proposed scheme can make it as true NOOP from compiler
> > > perspective too if a target decided to do that, I have checked the instruction generation with arm Assembly, a non true compiler
> NOOP has following instructions overhead at minimum.
> > >
> > > 	# 1 load
> > > 	# 1  mov
> > > 	if (!dev->tx_pkt_prep)
> > > 		return nb_pkts;
> >
> > Yep.
> >
> > >
> > > 	# compile can't predict this function needs be executed or not so
> > > 	# pressure on register allocation and mostly likely it call for
> > > 	# some stack push and pop based load on outer function(as it is an
> > > 	# inline function)
> >
> >
> > Well, I suppose compiler wouldn't try to fill function argument registers before the branch above.
> 
> Not the case with arm gcc compiler(may be based outer function load).

Ok, so for my own curiosity (I am not very familiar with the ARM arch):
gcc generates several conditional execution instructions in a row to spill/fill
function argument registers, and that comes at a price at execution time if the condition is not met?

> The recent, external pool manager function pointer conversion reduced around 700kpps/core in local cache mode(even though the
> function pointers are not executed)
> 
> >
> > >
> > > 	return (*dev->tx_pkt_prep)(dev->data->tx_queues[queue_id], tx_pkts,
> > > nb_pkts);
> > >
> > > 	# 1 branch
> > > 	if (unlikely(nb_prep < nb_rx)) {
> > > 		# bunch of code not executed, but pressure on i cache
> > > 		int i;
> > > 		for (i = nb_prep; i < nb_rx; i++)
> > > 	                 rte_pktmbuf_free(pkts_burst[i]);
> > > 	}
> > >
> > > From a server target(IA or high-end armv8) with external PCIe based
> > > system makes sense to have RTE_ETHDEV_TX_PREP option enabled(which
> > > is the case in proposed patch) but the low end arm platforms with
> > > - limited cores
> > > - less i cache
> > > - IPC == 1
> > > - running around 1GHz
> > > - most importantly, _integrated_ nic controller with no external PCIE
> > >   support
> > > does not make much sense to waste the cycles/time for it.
> > > cycle saved is cycle earned.
> >
> > Ok, so it is all to save one memory de-refrence and a comparison plus branch.
> > Do you have aby estimation how badly it would hit low-end cpu performance?
> 
> around 400kpps/core. On four core systems, around 2 mpps.(4 core with
> 10G*2 ports)

So it is about ~7% for 2x10G, correct?
I agree that seems big enough to keep the config option,
even though I am not quite happy with introducing a new config option.
So no more objections from my side here.
Thanks 
Konstantin

> 
> > The reason I am asking: obviously I would prefer to avoid to introduce
> > new build config option (that's the common dpdk coding practice these days) unless it is really important.
> Practice is something we need to revisit based on the new use case/usage.
> I think, the scheme of non external pcie based NW cards is new to DPDK.
> 
> >
> > >
> > > Since DPDK compilation is based on _target_, I really don't see any
> > > issue with this approach nor it does not hurt anything on server target.
> > > So, IMO, It should be upto the target to decide what works better for the target.
> > >
> > > Jerin
> > >
> > > > From my point of view NOOP on the driver level is more than enough.
> > > > Again I would prefer to introduce new config option, if possible.
> > > >
> > > > Konstantin
> > > >
  
Jerin Jacob Sept. 23, 2016, 10:29 a.m. UTC | #9
On Fri, Sep 23, 2016 at 09:41:52AM +0000, Ananyev, Konstantin wrote:
Hi Konstantin,
> Hi Jerin,
> 
> > 
> > Hi Konstantin,
> > 
> > >
> > > Hi Jerin,
> > >
> > > > >
> > > > > Hi Jerin,
> > > >
> > > > Hi Konstantin,
> > > >
> > > > >
> > > > > > > > >
> > > > > > >
> > > > > > > [...]
> > > > > > >
> > > > > > > > > +
> > > > > > > > > +#ifdef RTE_ETHDEV_TX_PREP
> > > > > > > >
> > > > > > > > Sorry for being a bit late on that discussion, but what the
> > > > > > > > point of having that config macro (RTE_ETHDEV_TX_PREP ) at all?
> > > > > > > > As I can see right now, if driver doesn't setup tx_pkt_prep,
> > > > > > > > then nb_pkts would be return anyway...
> > > > > > > >
> > > > > > > > BTW, there is my another question - should it be that way?
> > > > > > > > Shouldn't we return 0 (and set rte_errno=ENOTSUP) here if
> > > > > > > > dev->tx_pk_prep == NULL?
> > > > > > > >
> > > > > > >
> > > > > > > It's an answer to the Jerin's request discussed here:
> > > > > > > http://dpdk.org/ml/archives/dev/2016-September/046437.html
> > > > > > >
> > > > > > > When driver doesn't support tx_prep, default behavior is "we
> > > > > > > don't know requirements, so we have nothing to do here". It
> > > > > > > will simplify
> > > > > > application logic and improve performance for these drivers, I think. Catching this error with every burst may be problematic.
> > > > > > >
> > > > > > > As for RTE_ETHDEV_TX_PREP macro, suggested by Jerin in the
> > > > > > > same thread, I still don't think It's the best solution of the
> > > > > > > problem
> > > > > > described by him. I have added it here for further discussion.
> > > > > > >
> > > > > > > Jerin, have you something to add?
> > > > > >
> > > > > > Nothing very specific to add here. I think, I have tried to
> > > > > > share the rational in, http://dpdk.org/ml/archives/dev/2016-
> > > > > > September/046437.html
> > > > > >
> > > > >
> > > > > Ok, not sure I am fully understand your intention here.
> > > > > I think I understand why you propose rte_eth_tx_prep() to do:
> > > > > 	if (!dev->tx_pkt_prep)
> > > > > 		return nb_pkts;
> > > > >
> > > > > That allows drivers to NOOP the tx_prep functionality without
> > > > > paying the price for callback invocation.
> > > >
> > > > In true sense, returning the nb_pkts makes it functional NOP as
> > > > well(i.e The PMD does not have any limitation on Tx side, so all
> > > > packets are _good_(no preparation is required))
> > > >
> > > >
> > > > > What I don't understand, why with that in place we also need a
> > > > > NOOP for the whole rte_eth_tx_prep():
> > > > > +static inline uint16_t
> > > > > +rte_eth_tx_prep(uint8_t port_id __rte_unused, uint16_t queue_id __rte_unused,
> > > > > +		struct rte_mbuf **tx_pkts __rte_unused, uint16_t nb_pkts) {
> > > > > +	return nb_pkts;
> > > > > +}
> > > > > +
> > > > > +#endif
> > > > >
> > > > > What are you trying to save here: just reading ' dev->tx_pkt_prep'?
> > > > > If so, then it seems not that performance critical for me.
> > > > > Something else?
> > > >
> > > > The proposed scheme can make it as true NOOP from compiler
> > > > perspective too if a target decided to do that, I have checked the instruction generation with arm Assembly, a non true compiler
> > NOOP has following instructions overhead at minimum.
> > > >
> > > > 	# 1 load
> > > > 	# 1  mov
> > > > 	if (!dev->tx_pkt_prep)
> > > > 		return nb_pkts;
> > >
> > > Yep.
> > >
> > > >
> > > > 	# compile can't predict this function needs be executed or not so
> > > > 	# pressure on register allocation and mostly likely it call for
> > > > 	# some stack push and pop based load on outer function(as it is an
> > > > 	# inline function)
> > >
> > >
> > > Well, I suppose compiler wouldn't try to fill function argument registers before the branch above.
> > 
> > Not the case with arm gcc compiler(may be based outer function load).
> 
> Ok, so for my own curiosity (I am not very familiar with the ARM arch):
>  gcc generates several conditional execution instructions in a row to spill/fill 
> function arguments registers, and that comes at a price at execution time if condition is not met? 

Yes. That's what I saw (at least for the gcc 5.3 + arm64 back-end case) when I was debugging
the external mempool function pointer performance regression issue.
The sad part is, I couldn't see any gcc option to override it.


> 
> > The recent, external pool manager function pointer conversion reduced around 700kpps/core in local cache mode(even though the
> > function pointers are not executed)
> > 
> > >
> > > >
> > > > 	return (*dev->tx_pkt_prep)(dev->data->tx_queues[queue_id], tx_pkts,
> > > > nb_pkts);
> > > >
> > > > 	# 1 branch
> > > > 	if (unlikely(nb_prep < nb_rx)) {
> > > > 		# bunch of code not executed, but pressure on i cache
> > > > 		int i;
> > > > 		for (i = nb_prep; i < nb_rx; i++)
> > > > 	                 rte_pktmbuf_free(pkts_burst[i]);
> > > > 	}
> > > >
> > > > From a server target(IA or high-end armv8) with external PCIe based
> > > > system makes sense to have RTE_ETHDEV_TX_PREP option enabled(which
> > > > is the case in proposed patch) but the low end arm platforms with
> > > > - limited cores
> > > > - less i cache
> > > > - IPC == 1
> > > > - running around 1GHz
> > > > - most importantly, _integrated_ nic controller with no external PCIE
> > > >   support
> > > > does not make much sense to waste the cycles/time for it.
> > > > cycle saved is cycle earned.
> > >
> > > Ok, so it is all to save one memory de-refrence and a comparison plus branch.
> > > Do you have aby estimation how badly it would hit low-end cpu performance?
> > 
> > around 400kpps/core. On four core systems, around 2 mpps.(4 core with
> > 10G*2 ports)
> 
> So it is about ~7% for 2x10G, correct?
> I agree that seems big enough to keep the config option,
> even though I am not quite happy with introducing new config option. 
> So no more objections from my side here.

Thanks.

That was for very low-end CPUs.
Even in the high-end CPU case, if it costs a 100kpps drop/core,
a Cavium configuration like 96 cores + >200G will see at least 9.6 Mpps worth
of cycles dropped.


> Thanks 
> Konstantin
> 
> > 
> > > The reason I am asking: obviously I would prefer to avoid to introduce
> > > new build config option (that's the common dpdk coding practice these days) unless it is really important.
> > Practice is something we need to revisit based on the new use case/usage.
> > I think, the scheme of non external pcie based NW cards is new to DPDK.
> > 
> > >
> > > >
> > > > Since DPDK compilation is based on _target_, I really don't see any
> > > > issue with this approach nor it does not hurt anything on server target.
> > > > So, IMO, It should be upto the target to decide what works better for the target.
> > > >
> > > > Jerin
> > > >
> > > > > From my point of view NOOP on the driver level is more than enough.
> > > > > Again I would prefer to introduce new config option, if possible.
> > > > >
> > > > > Konstantin
> > > > >
  

Patch

diff --git a/config/common_base b/config/common_base
index 7830535..7ada9e0 100644
--- a/config/common_base
+++ b/config/common_base
@@ -120,6 +120,7 @@  CONFIG_RTE_MAX_QUEUES_PER_PORT=1024
 CONFIG_RTE_LIBRTE_IEEE1588=n
 CONFIG_RTE_ETHDEV_QUEUE_STAT_CNTRS=16
 CONFIG_RTE_ETHDEV_RXTX_CALLBACKS=y
+CONFIG_RTE_ETHDEV_TX_PREP=y
 
 #
 # Support NIC bypass logic
diff --git a/lib/librte_ether/rte_ethdev.h b/lib/librte_ether/rte_ethdev.h
index b0fe033..4fa674d 100644
--- a/lib/librte_ether/rte_ethdev.h
+++ b/lib/librte_ether/rte_ethdev.h
@@ -182,6 +182,7 @@  extern "C" {
 #include <rte_pci.h>
 #include <rte_dev.h>
 #include <rte_devargs.h>
+#include <rte_errno.h>
 #include "rte_ether.h"
 #include "rte_eth_ctrl.h"
 #include "rte_dev_info.h"
@@ -696,6 +697,8 @@  struct rte_eth_desc_lim {
 	uint16_t nb_max;   /**< Max allowed number of descriptors. */
 	uint16_t nb_min;   /**< Min allowed number of descriptors. */
 	uint16_t nb_align; /**< Number of descriptors should be aligned to. */
+	uint16_t nb_seg_max;     /**< Max number of segments per whole packet. */
+	uint16_t nb_mtu_seg_max; /**< Max number of segments per one MTU */
 };
 
 /**
@@ -1181,6 +1184,11 @@  typedef uint16_t (*eth_tx_burst_t)(void *txq,
 				   uint16_t nb_pkts);
 /**< @internal Send output packets on a transmit queue of an Ethernet device. */
 
+typedef uint16_t (*eth_tx_prep_t)(void *txq,
+				   struct rte_mbuf **tx_pkts,
+				   uint16_t nb_pkts);
+/**< @internal Prepare output packets on a transmit queue of an Ethernet device. */
+
 typedef int (*flow_ctrl_get_t)(struct rte_eth_dev *dev,
 			       struct rte_eth_fc_conf *fc_conf);
 /**< @internal Get current flow control parameter on an Ethernet device */
@@ -1626,6 +1634,7 @@  enum rte_eth_dev_type {
 struct rte_eth_dev {
 	eth_rx_burst_t rx_pkt_burst; /**< Pointer to PMD receive function. */
 	eth_tx_burst_t tx_pkt_burst; /**< Pointer to PMD transmit function. */
+	eth_tx_prep_t tx_pkt_prep; /**< Pointer to PMD transmit prepare function. */
 	struct rte_eth_dev_data *data;  /**< Pointer to device data */
 	const struct eth_driver *driver;/**< Driver for this device */
 	const struct eth_dev_ops *dev_ops; /**< Functions exported by PMD */
@@ -2833,6 +2842,82 @@  rte_eth_tx_burst(uint8_t port_id, uint16_t queue_id,
 	return (*dev->tx_pkt_burst)(dev->data->tx_queues[queue_id], tx_pkts, nb_pkts);
 }
 
+/**
+ * Process a burst of output packets on a transmit queue of an Ethernet device.
+ *
+ * The rte_eth_tx_prep() function is invoked to prepare output packets to be
+ * transmitted on the output queue *queue_id* of the Ethernet device designated
+ * by its *port_id*.
+ * The *nb_pkts* parameter is the number of packets to be prepared which are
+ * supplied in the *tx_pkts* array of *rte_mbuf* structures, each of them
+ * allocated from a pool created with rte_pktmbuf_pool_create().
+ * For each packet to send, the rte_eth_tx_prep() function performs
+ * the following operations:
+ *
+ * - Check if packet meets devices requirements for tx offloads.
+ *
+ * - Check limitations about number of segments.
+ *
+ * - Check additional requirements when debug is enabled.
+ *
+ * - Update and/or reset required checksums when tx offload is set for packet.
+ *
+ * The rte_eth_tx_prep() function returns the number of packets ready to be
+ * sent. A return value equal to *nb_pkts* means that all packets are valid and
+ * ready to be sent.
+ *
+ * @param port_id
+ *   The port identifier of the Ethernet device.
+ * @param queue_id
+ *   The index of the transmit queue through which output packets must be
+ *   sent.
+ *   The value must be in the range [0, nb_tx_queue - 1] previously supplied
+ *   to rte_eth_dev_configure().
+ * @param tx_pkts
+ *   The address of an array of *nb_pkts* pointers to *rte_mbuf* structures
+ *   which contain the output packets.
+ * @param nb_pkts
+ *   The maximum number of packets to process.
+ * @return
+ *   The number of packets correct and ready to be sent. The return value can be
+ *   less than the value of the *tx_pkts* parameter when some packet doesn't
+ *   meet devices requirements with rte_errno set appropriately.
+ */
+
+#ifdef RTE_ETHDEV_TX_PREP
+
+static inline uint16_t
+rte_eth_tx_prep(uint8_t port_id, uint16_t queue_id, struct rte_mbuf **tx_pkts,
+		uint16_t nb_pkts)
+{
+	struct rte_eth_dev *dev = &rte_eth_devices[port_id];
+
+	if (!dev->tx_pkt_prep)
+		return nb_pkts;
+
+#ifdef RTE_LIBRTE_ETHDEV_DEBUG
+	if (queue_id >= dev->data->nb_tx_queues) {
+		RTE_PMD_DEBUG_TRACE("Invalid TX queue_id=%d\n", queue_id);
+		rte_errno = -EINVAL;
+		return 0;
+	}
+#endif
+
+	return (*dev->tx_pkt_prep)(dev->data->tx_queues[queue_id],
+			tx_pkts, nb_pkts);
+}
+
+#else
+
+static inline uint16_t
+rte_eth_tx_prep(uint8_t port_id __rte_unused, uint16_t queue_id __rte_unused,
+		struct rte_mbuf **tx_pkts __rte_unused, uint16_t nb_pkts)
+{
+	return nb_pkts;
+}
+
+#endif
+
 typedef void (*buffer_tx_error_fn)(struct rte_mbuf **unsent, uint16_t count,
 		void *userdata);
 
diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 7ea66ed..72fd352 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -211,6 +211,14 @@  extern "C" {
  */
 #define PKT_TX_OUTER_IPV4   (1ULL << 59)
 
+#define PKT_TX_OFFLOAD_MASK (    \
+		PKT_TX_IP_CKSUM |        \
+		PKT_TX_L4_MASK |         \
+		PKT_TX_OUTER_IP_CKSUM |  \
+		PKT_TX_TCP_SEG |         \
+		PKT_TX_QINQ_PKT |        \
+		PKT_TX_VLAN_PKT)
+
 /**
  * Packet outer header is IPv6. This flag must be set when using any
  * outer offload feature (L4 checksum) to tell the NIC that the outer
diff --git a/lib/librte_net/Makefile b/lib/librte_net/Makefile
index ad2e482..b5abe84 100644
--- a/lib/librte_net/Makefile
+++ b/lib/librte_net/Makefile
@@ -34,7 +34,7 @@  include $(RTE_SDK)/mk/rte.vars.mk
 CFLAGS += $(WERROR_FLAGS) -I$(SRCDIR) -O3
 
 # install includes
-SYMLINK-$(CONFIG_RTE_LIBRTE_NET)-include := rte_ip.h rte_tcp.h rte_udp.h rte_sctp.h rte_icmp.h rte_arp.h
+SYMLINK-$(CONFIG_RTE_LIBRTE_NET)-include := rte_ip.h rte_tcp.h rte_udp.h rte_sctp.h rte_icmp.h rte_arp.h rte_pkt.h
 
 
 include $(RTE_SDK)/mk/rte.install.mk
diff --git a/lib/librte_net/rte_pkt.h b/lib/librte_net/rte_pkt.h
new file mode 100644
index 0000000..a3c3e3c
--- /dev/null
+++ b/lib/librte_net/rte_pkt.h
@@ -0,0 +1,132 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_PKT_H_
+#define _RTE_PKT_H_
+
+#include <rte_ip.h>
+#include <rte_udp.h>
+#include <rte_tcp.h>
+#include <rte_sctp.h>
+
+/**
+ * Validate general requirements for tx offload in packet.
+ */
+static inline int
+rte_validate_tx_offload(struct rte_mbuf *m)
+{
+	uint64_t ol_flags = m->ol_flags;
+
+	/* Does packet set any of available offloads? */
+	if (!(ol_flags & PKT_TX_OFFLOAD_MASK))
+		return 0;
+
+	/* IP checksum can be counted only for IPv4 packet */
+	if ((ol_flags & PKT_TX_IP_CKSUM) && (ol_flags & PKT_TX_IPV6))
+		return -EINVAL;
+
+	if (ol_flags & (PKT_TX_L4_MASK | PKT_TX_TCP_SEG))
+		/* IP type not set */
+		if (!(ol_flags & (PKT_TX_IPV4 | PKT_TX_IPV6)))
+			return -EINVAL;
+
+	if (ol_flags & PKT_TX_TCP_SEG) {
+
+		/* PKT_TX_IP_CKSUM offload not set for IPv4 TSO packet */
+		if ((ol_flags & PKT_TX_IPV4) && !(ol_flags & PKT_TX_IP_CKSUM))
+			return -EINVAL;
+
+		if (m->tso_segsz == 0)
+			return -EINVAL;
+
+	}
+
+	/* PKT_TX_OUTER_IP_CKSUM set for non outer IPv4 packet. */
+	if ((ol_flags & PKT_TX_OUTER_IP_CKSUM) && !(ol_flags & PKT_TX_OUTER_IPV4))
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
+ * Fix pseudo header checksum for TSO and non-TSO tcp/udp packets before
+ * hardware tx checksum.
+ * For non-TSO tcp/udp packets full pseudo-header checksum is counted and set.
+ * For TSO the IP payload length is not included.
+ */
+static inline int
+rte_phdr_cksum_fix(struct rte_mbuf *m)
+{
+	struct ipv4_hdr *ipv4_hdr;
+	struct ipv6_hdr *ipv6_hdr;
+	struct tcp_hdr *tcp_hdr;
+	struct udp_hdr *udp_hdr;
+
+	if (m->ol_flags & PKT_TX_IPV4) {
+		ipv4_hdr = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, m->l2_len);
+
+		if (m->ol_flags & PKT_TX_IP_CKSUM)
+			ipv4_hdr->hdr_checksum = 0;
+
+		if ((m->ol_flags & PKT_TX_UDP_CKSUM) == PKT_TX_UDP_CKSUM) {
+			/* non-TSO udp */
+			udp_hdr = rte_pktmbuf_mtod_offset(m, struct udp_hdr *, m->l2_len +
+					m->l3_len);
+			udp_hdr->dgram_cksum = rte_ipv4_phdr_cksum(ipv4_hdr, m->ol_flags);
+		} else if ((m->ol_flags & PKT_TX_TCP_CKSUM) ||
+				(m->ol_flags & PKT_TX_TCP_SEG)) {
+			/* non-TSO tcp or TSO */
+			tcp_hdr = rte_pktmbuf_mtod_offset(m, struct tcp_hdr *, m->l2_len +
+					m->l3_len);
+			tcp_hdr->cksum = rte_ipv4_phdr_cksum(ipv4_hdr, m->ol_flags);
+		}
+	} else if (m->ol_flags & PKT_TX_IPV6) {
+		ipv6_hdr = rte_pktmbuf_mtod_offset(m, struct ipv6_hdr *, m->l2_len);
+
+		if ((m->ol_flags & PKT_TX_UDP_CKSUM) == PKT_TX_UDP_CKSUM) {
+			/* non-TSO udp */
+			udp_hdr = rte_pktmbuf_mtod_offset(m, struct udp_hdr *, m->l2_len +
+					m->l3_len);
+			udp_hdr->dgram_cksum = rte_ipv6_phdr_cksum(ipv6_hdr, m->ol_flags);
+		} else if ((m->ol_flags & PKT_TX_TCP_CKSUM) ||
+				(m->ol_flags & PKT_TX_TCP_SEG)) {
+			/* non-TSO tcp or TSO */
+			tcp_hdr = rte_pktmbuf_mtod_offset(m, struct tcp_hdr *, m->l2_len +
+					m->l3_len);
+			tcp_hdr->cksum = rte_ipv6_phdr_cksum(ipv6_hdr, m->ol_flags);
+		}
+	}
+	return 0;
+}
+
+#endif /* _RTE_PKT_H_ */