[v5] net/mlx4: support hardware TSO

Message ID 1531132986-5054-1-git-send-email-motih@mellanox.com (mailing list archive)
State Superseded, archived
Delegated to: Shahaf Shuler
Headers
Series [v5] net/mlx4: support hardware TSO |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Moti Haimovsky July 9, 2018, 10:43 a.m. UTC
  Implement support for hardware TSO.

Signed-off-by: Moti Haimovsky <motih@mellanox.com>
---
v5:
* Modification to the code according to review inputs from Matan
  Azrad.
* Code optimization to the TSO header copy routine.
* Rearranged the TSO data-segments creation routine.
in reply to 
1530715998-15703-1-git-send-email-motih@mellanox.com

v4:
* Bug fixes in filling TSO data segments.
* Modifications according to review inputs from Adrien Mazarguil
  and Matan Azrad.
in reply to
1530190137-17848-1-git-send-email-motih@mellanox.com

v3:
* Fixed compilation errors in compilers without GNU C extensions
  caused by a declaration of zero-length array in the code.
in reply to
1530187032-6489-1-git-send-email-motih@mellanox.com

v2:
* Fixed coding style warning.
in reply to
1530184583-30166-1-git-send-email-motih@mellanox.com

v1:
* Fixed coding style warnings.
in reply to
1530181779-19716-1-git-send-email-motih@mellanox.com
---
 doc/guides/nics/features/mlx4.ini |   1 +
 doc/guides/nics/mlx4.rst          |   3 +
 drivers/net/mlx4/Makefile         |   5 +
 drivers/net/mlx4/mlx4.c           |   9 +
 drivers/net/mlx4/mlx4.h           |   5 +
 drivers/net/mlx4/mlx4_prm.h       |  15 ++
 drivers/net/mlx4/mlx4_rxtx.c      | 372 +++++++++++++++++++++++++++++++++++++-
 drivers/net/mlx4/mlx4_rxtx.h      |   2 +-
 drivers/net/mlx4/mlx4_txq.c       |   8 +-
 9 files changed, 416 insertions(+), 4 deletions(-)
  

Comments

Matan Azrad July 9, 2018, 1:07 p.m. UTC | #1
Hi Moti 

Please see some comments below.

From: Mordechay Haimovsky
> Implement support for hardware TSO.
> 
> Signed-off-by: Moti Haimovsky <motih@mellanox.com>
> ---
> v5:
> * Modification to the code according to review inputs from Matan
>   Azrad.
> * Code optimization to the TSO header copy routine.
> * Rearranged the TSO data-segments creation routine.
> in reply to
> 1530715998-15703-1-git-send-email-motih@mellanox.com
> 
> v4:
> * Bug fixes in filling TSO data segments.
> * Modifications according to review inputs from Adrien Mazarguil
>   and Matan Azrad.
> in reply to
> 1530190137-17848-1-git-send-email-motih@mellanox.com
> 
> v3:
> * Fixed compilation errors in compilers without GNU C extensions
>   caused by a declaration of zero-length array in the code.
> in reply to
> 1530187032-6489-1-git-send-email-motih@mellanox.com
> 
> v2:
> * Fixed coding style warning.
> in reply to
> 1530184583-30166-1-git-send-email-motih@mellanox.com
> 
> v1:
> * Fixed coding style warnings.
> in reply to
> 1530181779-19716-1-git-send-email-motih@mellanox.com
> ---
>  doc/guides/nics/features/mlx4.ini |   1 +
>  doc/guides/nics/mlx4.rst          |   3 +
>  drivers/net/mlx4/Makefile         |   5 +
>  drivers/net/mlx4/mlx4.c           |   9 +
>  drivers/net/mlx4/mlx4.h           |   5 +
>  drivers/net/mlx4/mlx4_prm.h       |  15 ++
>  drivers/net/mlx4/mlx4_rxtx.c      | 372
> +++++++++++++++++++++++++++++++++++++-
>  drivers/net/mlx4/mlx4_rxtx.h      |   2 +-
>  drivers/net/mlx4/mlx4_txq.c       |   8 +-
>  9 files changed, 416 insertions(+), 4 deletions(-)
> 
> diff --git a/doc/guides/nics/features/mlx4.ini
> b/doc/guides/nics/features/mlx4.ini
> index f6efd21..98a3f61 100644
> --- a/doc/guides/nics/features/mlx4.ini
> +++ b/doc/guides/nics/features/mlx4.ini
> @@ -13,6 +13,7 @@ Queue start/stop     = Y
>  MTU update           = Y
>  Jumbo frame          = Y
>  Scattered Rx         = Y
> +TSO                  = Y
>  Promiscuous mode     = Y
>  Allmulticast mode    = Y
>  Unicast MAC filter   = Y
> diff --git a/doc/guides/nics/mlx4.rst b/doc/guides/nics/mlx4.rst index
> 491106a..12adaeb 100644
> --- a/doc/guides/nics/mlx4.rst
> +++ b/doc/guides/nics/mlx4.rst
> @@ -142,6 +142,9 @@ Limitations
>    The ability to enable/disable CRC stripping requires OFED version
>    4.3-1.5.0.0 and above  or rdma-core version v18 and above.
> 
> +- TSO (Transmit Segmentation Offload) is supported in OFED version
> +  4.4 and above or in rdma-core version v18 and above.
> +
>  Prerequisites
>  -------------
> 
> diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile index
> 73f9d40..63bc003 100644
> --- a/drivers/net/mlx4/Makefile
> +++ b/drivers/net/mlx4/Makefile
> @@ -85,6 +85,11 @@ mlx4_autoconf.h.new: FORCE
>  mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
>  	$Q $(RM) -f -- '$@'
>  	$Q : > '$@'
> +	$Q sh -- '$<' '$@' \
> +		HAVE_IBV_MLX4_WQE_LSO_SEG \
> +		infiniband/mlx4dv.h \
> +		type 'struct mlx4_wqe_lso_seg' \
> +		$(AUTOCONF_OUTPUT)
> 
>  # Create mlx4_autoconf.h or update it in case it differs from the new one.
> 
> diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c index
> d151a90..5d8c76d 100644
> --- a/drivers/net/mlx4/mlx4.c
> +++ b/drivers/net/mlx4/mlx4.c
> @@ -677,6 +677,15 @@ struct mlx4_conf {
> 
> 	IBV_RAW_PACKET_CAP_SCATTER_FCS);
>  		DEBUG("FCS stripping toggling is %ssupported",
>  		      priv->hw_fcs_strip ? "" : "not ");
> +		priv->tso =
> +			((device_attr_ex.tso_caps.max_tso > 0) &&
> +			 (device_attr_ex.tso_caps.supported_qpts &
> +			  (1 << IBV_QPT_RAW_PACKET)));
> +		if (priv->tso)
> +			priv->tso_max_payload_sz =
> +					device_attr_ex.tso_caps.max_tso;
> +		DEBUG("TSO is %ssupported",
> +		      priv->tso ? "" : "not ");
>  		/* Configure the first MAC address by default. */
>  		err = mlx4_get_mac(priv, &mac.addr_bytes);
>  		if (err) {
> diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index
> 300cb4d..89d8c38 100644
> --- a/drivers/net/mlx4/mlx4.h
> +++ b/drivers/net/mlx4/mlx4.h
> @@ -47,6 +47,9 @@
>  /** Interrupt alarm timeout value in microseconds. */  #define
> MLX4_INTR_ALARM_TIMEOUT 100000
> 
> +/* Maximum packet headers size (L2+L3+L4) for TSO. */ #define
> +MLX4_MAX_TSO_HEADER 192
> +
>  /** Port parameter. */
>  #define MLX4_PMD_PORT_KVARG "port"
> 
> @@ -90,6 +93,8 @@ struct priv {
>  	uint32_t hw_csum:1; /**< Checksum offload is supported. */
>  	uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels.
> */
>  	uint32_t hw_fcs_strip:1; /**< FCS stripping toggling is supported. */
> +	uint32_t tso:1; /**< Transmit segmentation offload is supported. */
> +	uint32_t tso_max_payload_sz; /**< Max supported TSO payload
> size. */
>  	uint64_t hw_rss_sup; /**< Supported RSS hash fields (Verbs
> format). */
>  	struct rte_intr_handle intr_handle; /**< Port interrupt handle. */
>  	struct mlx4_drop *drop; /**< Shared resources for drop flow rules.
> */ diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
> index b771d8c..aef77ba 100644
> --- a/drivers/net/mlx4/mlx4_prm.h
> +++ b/drivers/net/mlx4/mlx4_prm.h
> @@ -19,6 +19,7 @@
>  #ifdef PEDANTIC
>  #pragma GCC diagnostic error "-Wpedantic"
>  #endif
> +#include "mlx4_autoconf.h"
> 
>  /* ConnectX-3 Tx queue basic block. */
>  #define MLX4_TXBB_SHIFT 6
> @@ -40,6 +41,7 @@
>  /* Work queue element (WQE) flags. */
>  #define MLX4_WQE_CTRL_IIP_HDR_CSUM (1 << 28)  #define
> MLX4_WQE_CTRL_IL4_HDR_CSUM (1 << 27)
> +#define MLX4_WQE_CTRL_RR (1 << 6)
> 
>  /* CQE checksum flags. */
>  enum {
> @@ -98,6 +100,19 @@ struct mlx4_cq {
>  	int arm_sn; /**< Rx event counter. */
>  };
> 
> +#ifndef HAVE_IBV_MLX4_WQE_LSO_SEG
> +/*
> + * WQE LSO segment structure.
> + * Defined here as backward compatibility for rdma-core v17 and below.
> + * Similar definition is found in infiniband/mlx4dv.h in rdma-core v18
> + * and above.
> + */
> +struct mlx4_wqe_lso_seg {
> +	rte_be32_t mss_hdr_size;
> +	rte_be32_t header[];
> +};
> +#endif
> +
>  /**
>   * Retrieve a CQE entry from a CQ.
>   *
> diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
> index 78b6dd5..b695539 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.c
> +++ b/drivers/net/mlx4/mlx4_rxtx.c
> @@ -38,10 +38,29 @@
>   * DWORD (32 byte) of a TXBB.
>   */
>  struct pv {
> -	volatile struct mlx4_wqe_data_seg *dseg;
> +	union {
> +		volatile struct mlx4_wqe_data_seg *dseg;
> +		volatile uint32_t *dst;
> +	};
>  	uint32_t val;
>  };
> 
> +/** A helper structure for TSO packet handling. */ struct tso_info {
> +	/** Pointer to the array of saved first DWORD (32 byte) of a TXBB. */
> +	struct pv *pv;
> +	/** Current entry in the pv array. */
> +	int pv_counter;
> +	/** Total size of the WQE including padding. */
> +	uint32_t wqe_size;
> +	/** Size of TSO header to prepend to each packet to send. */
> +	uint16_t tso_header_size;
> +	/** Total size of the TSO segment in the WQE. */
> +	uint16_t wqe_tso_seg_size;
> +	/** Raw WQE size in units of 16 Bytes and without padding. */
> +	uint8_t fence_size;
> +};
> +
>  /** A table to translate Rx completion flags to packet type. */  uint32_t
> mlx4_ptype_table[0x100] __rte_cache_aligned = {
>  	/*
> @@ -368,6 +387,345 @@ struct pv {
>  }
> 
>  /**
> + * Obtain and calculate TSO information needed for assembling a TSO WQE.
> + *
> + * @param buf
> + *   Pointer to the first packet mbuf.
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param tinfo
> + *   Pointer to a structure to fill the info with.
> + *
> + * @return
> + *   0 on success, negative value upon error.
> + */
> +static inline int
> +mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf,
> +			     struct txq *txq,
> +			     struct tso_info *tinfo)
> +{
> +	struct mlx4_sq *sq = &txq->msq;
> +	const uint8_t tunneled = txq->priv->hw_csum_l2tun &&
> +				 (buf->ol_flags & PKT_TX_TUNNEL_MASK);
> +
> +	tinfo->tso_header_size = buf->l2_len + buf->l3_len + buf->l4_len;
> +	if (tunneled)
> +		tinfo->tso_header_size +=
> +				buf->outer_l2_len + buf->outer_l3_len;
> +	if (unlikely(buf->tso_segsz == 0 ||
> +		     tinfo->tso_header_size == 0 ||
> +		     tinfo->tso_header_size > MLX4_MAX_TSO_HEADER ||
> +		     tinfo->tso_header_size > buf->data_len))
> +		return -EINVAL;
> +	/*
> +	 * Calculate the WQE TSO segment size
> +	 * Note:
> +	 * 1. An LSO segment must be padded such that the subsequent data
> +	 *    segment is 16-byte aligned.
> +	 * 2. The start address of the TSO segment is always 16 Bytes aligned.
> +	 */
> +	tinfo->wqe_tso_seg_size = RTE_ALIGN(sizeof(struct
> mlx4_wqe_lso_seg) +
> +					    tinfo->tso_header_size,
> +					    sizeof(struct
> mlx4_wqe_data_seg));
> +	tinfo->fence_size = ((sizeof(struct mlx4_wqe_ctrl_seg) +
> +			     tinfo->wqe_tso_seg_size) >> MLX4_SEG_SHIFT) +
> +			     buf->nb_segs;
> +	tinfo->wqe_size =
> +		RTE_ALIGN((uint32_t)(tinfo->fence_size <<
> MLX4_SEG_SHIFT),
> +			  MLX4_TXBB_SIZE);
> +	/* Validate WQE size and WQE space in the send queue. */
> +	if (sq->remain_size < tinfo->wqe_size ||
> +	    tinfo->wqe_size > MLX4_MAX_WQE_SIZE)
> +		return -ENOMEM;
> +	/* Init pv. */
> +	tinfo->pv = (struct pv *)txq->bounce_buf;
> +	tinfo->pv_counter = 0;
> +	return 0;
> +}
> +
> +/**
> + * Fill the TSO WQE data segments with info on buffers to transmit .
> + *
> + * @param buf
> + *   Pointer to the first packet mbuf.
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param tinfo
> + *   Pointer to TSO info to use.
> + * @param dseg
> + *   Pointer to the first data segment in the TSO WQE.
> + * @param ctrl
> + *   Pointer to the control segment in the TSO WQE.
> + *
> + * @return
> + *   0 on success, negative value upon error.
> + */
> +static inline volatile struct mlx4_wqe_ctrl_seg *
> +mlx4_tx_burst_fill_tso_dsegs(struct rte_mbuf *buf,
> +			     struct txq *txq,
> +			     struct tso_info *tinfo,
> +			     volatile struct mlx4_wqe_data_seg *dseg,
> +			     volatile struct mlx4_wqe_ctrl_seg *ctrl) {
> +	uint32_t lkey;
> +	int nb_segs = buf->nb_segs;
> +	int nb_segs_txbb;
> +	struct mlx4_sq *sq = &txq->msq;
> +	struct rte_mbuf *sbuf = buf;
> +	struct pv *pv = tinfo->pv;
> +	int *pv_counter = &tinfo->pv_counter;
> +	volatile struct mlx4_wqe_ctrl_seg *ctrl_next =
> +			(volatile struct mlx4_wqe_ctrl_seg *)
> +				((volatile uint8_t *)ctrl + tinfo->wqe_size);
> +	uint16_t sb_of = tinfo->tso_header_size;
> +	uint16_t data_len;
> +
> +	do {
> +		/* how many dseg entries do we have in the current TXBB ?
> */
> +		nb_segs_txbb = (MLX4_TXBB_SIZE -
> +				((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1))) >>
> +			       MLX4_SEG_SHIFT;
> +		switch (nb_segs_txbb) {
> +		default:
> +			/* Should never happen. */
> +			rte_panic("%p: Invalid number of SGEs(%d) for a
> TXBB",
> +			(void *)txq, nb_segs_txbb);
> +			/* rte_panic never returns. */

Since this default case should never happen because of the calculation above, I think we don't need it.
Just "break" if the compiler complains about the missing default case.

> +		case 4:
> +			/* Memory region key for this memory pool. */
> +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> +			if (unlikely(lkey == (uint32_t)-1))
> +				goto err;
> +			dseg->addr =
> +
> rte_cpu_to_be_64(rte_pktmbuf_mtod_offset(sbuf,
> +								     uintptr_t,
> +								     sb_of));
> +			dseg->lkey = lkey;
> +			/*
> +			 * This data segment starts at the beginning of a new
> +			 * TXBB, so we need to postpone its byte_count
> writing
> +			 * for later.
> +			 */
> +			pv[*pv_counter].dseg = dseg;
> +			/*
> +			 * Zero length segment is treated as inline segment
> +			 * with zero data.
> +			 */
> +			data_len = sbuf->data_len - sb_of;

Is there a chance that data_len will be negative? Would it wrap around in that case?
Maybe it is better to change it to int16_t and replace the next check with:
data_len > 0 ? data_len : 0x80000000


And I think I found a way to remove the sb_of calculation for each segment:

Each segment prepares the next segment's parameters, while only the pre-loop calculation for the first segment has to account for the header offset.

The parameters: data_len and sb_of.

So before the loop:
sb_of = tinfo->tso_header_size;
data_len = sbuf->data_len - sb_of;

And inside the loop (after the nb_segs check):
sb_of = 0;
data_len = sbuf->data_len; /* of the next sbuf */

This way each segment calculates the next segment's parameters, and we don't need the "- sb_of" subtraction per segment.
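For illustration, a rough and untested sketch of how one case label could then look (same identifiers as in the patch above, not a definitive implementation):

	case 3:
		lkey = mlx4_tx_mb2mr(txq, sbuf);
		if (unlikely(lkey == (uint32_t)-1))
			goto err;
		mlx4_fill_tx_data_seg(dseg, lkey,
				      rte_pktmbuf_mtod_offset(sbuf, uintptr_t,
							      sb_of),
				      rte_cpu_to_be_32(data_len ?
						       data_len :
						       0x80000000));
		dseg++;
		if (--nb_segs == 0)
			return ctrl_next;
		/* Prepare the next segment's parameters; no "- sb_of" here. */
		sbuf = sbuf->next;
		sb_of = 0;
		data_len = sbuf->data_len;
		/* fallthrough */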

> +			pv[(*pv_counter)++].val =
> +				rte_cpu_to_be_32(data_len ?
> +						 data_len :
> +						 0x80000000);
> +			sb_of = 0;
> +			sbuf = sbuf->next;
> +			dseg++;
> +			if (--nb_segs == 0)
> +				return ctrl_next;
> +			/* fallthrough */
> +		case 3:
> +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> +			if (unlikely(lkey == (uint32_t)-1))
> +				goto err;
> +			data_len = sbuf->data_len - sb_of;
> +			mlx4_fill_tx_data_seg(dseg,
> +					lkey,
> +					rte_pktmbuf_mtod_offset(sbuf,
> +								uintptr_t,
> +								sb_of),
> +					rte_cpu_to_be_32(data_len ?
> +							 data_len :
> +							 0x80000000));
> +			sb_of = 0;
> +			sbuf = sbuf->next;
> +			dseg++;
> +			if (--nb_segs == 0)
> +				return ctrl_next;
> +			/* fallthrough */
> +		case 2:
> +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> +			if (unlikely(lkey == (uint32_t)-1))
> +				goto err;
> +			data_len = sbuf->data_len - sb_of;
> +			mlx4_fill_tx_data_seg(dseg,
> +					lkey,
> +					rte_pktmbuf_mtod_offset(sbuf,
> +								uintptr_t,
> +								sb_of),
> +					rte_cpu_to_be_32(data_len ?
> +							 data_len :
> +							 0x80000000));
> +			sb_of = 0;
> +			sbuf = sbuf->next;
> +			dseg++;
> +			if (--nb_segs == 0)
> +				return ctrl_next;
> +			/* fallthrough */
> +		case 1:
> +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> +			if (unlikely(lkey == (uint32_t)-1))
> +				goto err;
> +			data_len = sbuf->data_len - sb_of;
> +			mlx4_fill_tx_data_seg(dseg,
> +					lkey,
> +					rte_pktmbuf_mtod_offset(sbuf,
> +								uintptr_t,
> +								sb_of),
> +					rte_cpu_to_be_32(data_len ?
> +							 data_len :
> +							 0x80000000));
> +			sb_of = 0;
> +			sbuf = sbuf->next;
> +			dseg++;
> +			if (--nb_segs == 0)
> +				return ctrl_next;
> +		}
> +		/* Wrap dseg if it points at the end of the queue. */
> +		if ((volatile uint8_t *)dseg >= sq->eob)
> +			dseg = (volatile struct mlx4_wqe_data_seg *)
> +					((volatile uint8_t *)dseg - sq->size);
> +	} while (true);
> +err:
> +	return NULL;
> +}
> +
> +/**
> + * Fill the packet's l2, l3 and l4 headers to the WQE.
> + *
> + * This will be used as the header for each TSO segment that is transmitted.
> + *
> + * @param buf
> + *   Pointer to the first packet mbuf.
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param tinfo
> + *   Pointer to TSO info to use.
> + * @param ctrl
> + *   Pointer to the control segment in the TSO WQE.
> + *
> + * @return
> + *   0 on success, negative value upon error.
> + */
> +static inline volatile struct mlx4_wqe_data_seg *
> +mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
> +			   struct txq *txq,
> +			   struct tso_info *tinfo,
> +			   volatile struct mlx4_wqe_ctrl_seg *ctrl) {
> +	volatile struct mlx4_wqe_lso_seg *tseg =
> +		(volatile struct mlx4_wqe_lso_seg *)(ctrl + 1);
> +	struct mlx4_sq *sq = &txq->msq;
> +	struct pv *pv = tinfo->pv;
> +	int *pv_counter = &tinfo->pv_counter;
> +	int remain_size = tinfo->tso_header_size;
> +	char *from = rte_pktmbuf_mtod(buf, char *);
> +	uint16_t txbb_avail_space;
> +	/* Union to overcome volatile constraints when copying TSO header.
> */
> +	union {
> +		volatile uint8_t *vto;
> +		uint8_t *to;
> +	} thdr = { .vto = (volatile uint8_t *)tseg->header, };
> +
> +	/*
> +	 * TSO data always starts at offset 20 from the beginning of the TXBB
> +	 * (16 byte ctrl + 4byte TSO desc). Since each TXBB is 64Byte aligned
> +	 * we can write the first 44 TSO header bytes without worry for TxQ
> +	 * wrapping or overwriting the first TXBB 32bit word.
> +	 */
> +	txbb_avail_space = MLX4_TXBB_SIZE -
> +			   (sizeof(struct mlx4_wqe_ctrl_seg) +
> +			    sizeof(struct mlx4_wqe_lso_seg));

I think a better name would be txbb_tail_size.

> +	while (remain_size >= (int)(txbb_avail_space + sizeof(uint32_t))) {
> +		/* Copy to end of txbb. */
> +		rte_memcpy(thdr.to, from, txbb_avail_space);
> +		from += txbb_avail_space;
> +		thdr.to += txbb_avail_space;
> +		/* New TXBB, Check for TxQ wrap. */
> +		if (thdr.to >= sq->eob)
> +			thdr.vto = sq->buf;
> +		/* New TXBB, stash the first 32bits for later use. */
> +		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
> +		pv[(*pv_counter)++].val = *(uint32_t *)from,
> +		from += sizeof(uint32_t);
> +		thdr.to += sizeof(uint32_t);
> +		remain_size -= (txbb_avail_space + sizeof(uint32_t));

You don't need the () here.

> +		/* Avail space in new TXBB is TXBB size - 4 */
> +		txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
> +	}
> +	if (remain_size > txbb_avail_space) {
> +		rte_memcpy(thdr.to, from, txbb_avail_space);
> +		from += txbb_avail_space;
> +		thdr.to += txbb_avail_space;
> +		remain_size -= txbb_avail_space;
> +		/* New TXBB, Check for TxQ wrap. */
> +		if (thdr.to >= sq->eob)
> +			thdr.vto = sq->buf;
> +		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
> +		rte_memcpy(&pv[*pv_counter].val, from, remain_size);
> +		(*pv_counter)++;
> +	} else {

Here it should be else if (remain_size > 0).

> +		rte_memcpy(thdr.to, from, remain_size);
> +	}
> +
> +	tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
> +					      tinfo->tso_header_size);
> +	/* Calculate data segment location */
> +	return (volatile struct mlx4_wqe_data_seg *)
> +				((uintptr_t)tseg + tinfo->wqe_tso_seg_size);
> }
> +
> +/**
> + * Write data segments and header for TSO uni/multi segment packet.
> + *
> + * @param buf
> + *   Pointer to the first packet mbuf.
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param ctrl
> + *   Pointer to the WQE control segment.
> + *
> + * @return
> + *   Pointer to the next WQE control segment on success, NULL otherwise.
> + */
> +static volatile struct mlx4_wqe_ctrl_seg * mlx4_tx_burst_tso(struct
> +rte_mbuf *buf, struct txq *txq,
> +		  volatile struct mlx4_wqe_ctrl_seg *ctrl) {
> +	volatile struct mlx4_wqe_data_seg *dseg;
> +	volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
> +	struct mlx4_sq *sq = &txq->msq;
> +	struct tso_info tinfo;
> +	struct pv *pv;
> +	int pv_counter;
> +	int ret;
> +
> +	ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
> +	if (unlikely(ret))
> +		goto error;
> +	dseg = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo, ctrl);
> +	if (unlikely(dseg == NULL))
> +		goto error;
> +	if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
> +		dseg = (volatile struct mlx4_wqe_data_seg *)
> +					((uintptr_t)dseg - sq->size);
> +	ctrl_next = mlx4_tx_burst_fill_tso_dsegs(buf, txq, &tinfo, dseg, ctrl);
> +	if (unlikely(ctrl_next == NULL))
> +		goto error;
> +	/* Write the first DWORD of each TXBB save earlier. */
> +	pv = tinfo.pv;
> +	pv_counter = tinfo.pv_counter;
> +	/* Need a barrier here before writing the first TXBB word. */
> +	rte_io_wmb();

> +	for (--pv_counter; pv_counter  >= 0; pv_counter--)

Since we don't need the first check, a do-while statement is better.
To be fully safe you can add a likely() check before the memory barrier.

> +		*pv[pv_counter].dst = pv[pv_counter].val;
> +	ctrl->fence_size = tinfo.fence_size;
> +	sq->remain_size -= tinfo.wqe_size;
> +	return ctrl_next;
> +error:
> +	txq->stats.odropped++;
> +	return NULL;
> +}
> +
> +/**
>   * Write data segments of multi-segment packet.
>   *
>   * @param buf
> @@ -560,6 +918,7 @@ struct pv {
>  			uint16_t flags16[2];
>  		} srcrb;
>  		uint32_t lkey;
> +		bool tso = txq->priv->tso && (buf->ol_flags &
> PKT_TX_TCP_SEG);
> 
>  		/* Clean up old buffer. */
>  		if (likely(elt->buf != NULL)) {
> @@ -578,7 +937,16 @@ struct pv {
>  			} while (tmp != NULL);
>  		}
>  		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
> -		if (buf->nb_segs == 1) {
> +		if (tso) {
> +			/* Change opcode to TSO */
> +			owner_opcode &= ~MLX4_OPCODE_CONFIG_CMD;
> +			owner_opcode |= MLX4_OPCODE_LSO |
> MLX4_WQE_CTRL_RR;
> +			ctrl_next = mlx4_tx_burst_tso(buf, txq, ctrl);
> +			if (!ctrl_next) {
> +				elt->buf = NULL;
> +				break;
> +			}
> +		} else if (buf->nb_segs == 1) {
>  			/* Validate WQE space in the send queue. */
>  			if (sq->remain_size < MLX4_TXBB_SIZE) {
>  				elt->buf = NULL;
> diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
> index 4c025e3..ffa8abf 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.h
> +++ b/drivers/net/mlx4/mlx4_rxtx.h
> @@ -90,7 +90,7 @@ struct mlx4_txq_stats {
>  	unsigned int idx; /**< Mapping index. */
>  	uint64_t opackets; /**< Total of successfully sent packets. */
>  	uint64_t obytes; /**< Total of successfully sent bytes. */
> -	uint64_t odropped; /**< Total of packets not sent when Tx ring full.
> */
> +	uint64_t odropped; /**< Total number of packets failed to transmit.
> */
>  };
> 
>  /** Tx queue descriptor. */
> diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
> index 6edaadb..9aa7440 100644
> --- a/drivers/net/mlx4/mlx4_txq.c
> +++ b/drivers/net/mlx4/mlx4_txq.c
> @@ -116,8 +116,14 @@
>  			     DEV_TX_OFFLOAD_UDP_CKSUM |
>  			     DEV_TX_OFFLOAD_TCP_CKSUM);
>  	}
> -	if (priv->hw_csum_l2tun)
> +	if (priv->tso)
> +		offloads |= DEV_TX_OFFLOAD_TCP_TSO;
> +	if (priv->hw_csum_l2tun) {
>  		offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
> +		if (priv->tso)
> +			offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
> +				     DEV_TX_OFFLOAD_GRE_TNL_TSO);
> +	}
>  	return offloads;
>  }
> 
> --
> 1.8.3.1
  
Moti Haimovsky July 9, 2018, 4:22 p.m. UTC | #2
inline

> -----Original Message-----
> From: Matan Azrad
> Sent: Monday, July 9, 2018 4:07 PM
> To: Mordechay Haimovsky <motih@mellanox.com>; Adrien Mazarguil
> <adrien.mazarguil@6wind.com>
> Cc: dev@dpdk.org
> Subject: RE: [PATCH v5] net/mlx4: support hardware TSO
> 
> 
> 
> Hi Moti
> 
> Please see some comments below.
> 
> From: Mordechay Haimovsky
> > Implement support for hardware TSO.
> >
> > Signed-off-by: Moti Haimovsky <motih@mellanox.com>
> > ---
> > v5:
> > * Modification to the code according to review inputs from Matan
> >   Azrad.
> > * Code optimization to the TSO header copy routine.
> > * Rearranged the TSO data-segments creation routine.
> > in reply to
> > 1530715998-15703-1-git-send-email-motih@mellanox.com
> >
> > v4:
> > * Bug fixes in filling TSO data segments.
> > * Modifications according to review inputs from Adrien Mazarguil
> >   and Matan Azrad.
> > in reply to
> > 1530190137-17848-1-git-send-email-motih@mellanox.com
> >
> > v3:
> > * Fixed compilation errors in compilers without GNU C extensions
> >   caused by a declaration of zero-length array in the code.
> > in reply to
> > 1530187032-6489-1-git-send-email-motih@mellanox.com
> >
> > v2:
> > * Fixed coding style warning.
> > in reply to
> > 1530184583-30166-1-git-send-email-motih@mellanox.com
> >
> > v1:
> > * Fixed coding style warnings.
> > in reply to
> > 1530181779-19716-1-git-send-email-motih@mellanox.com
> > ---
> >  doc/guides/nics/features/mlx4.ini |   1 +
> >  doc/guides/nics/mlx4.rst          |   3 +
> >  drivers/net/mlx4/Makefile         |   5 +
> >  drivers/net/mlx4/mlx4.c           |   9 +
> >  drivers/net/mlx4/mlx4.h           |   5 +
> >  drivers/net/mlx4/mlx4_prm.h       |  15 ++
> >  drivers/net/mlx4/mlx4_rxtx.c      | 372
> > +++++++++++++++++++++++++++++++++++++-
> >  drivers/net/mlx4/mlx4_rxtx.h      |   2 +-
> >  drivers/net/mlx4/mlx4_txq.c       |   8 +-
> >  9 files changed, 416 insertions(+), 4 deletions(-)
> >
> > diff --git a/doc/guides/nics/features/mlx4.ini
> > b/doc/guides/nics/features/mlx4.ini
> > index f6efd21..98a3f61 100644
> > --- a/doc/guides/nics/features/mlx4.ini
> > +++ b/doc/guides/nics/features/mlx4.ini
> > @@ -13,6 +13,7 @@ Queue start/stop     = Y
> >  MTU update           = Y
> >  Jumbo frame          = Y
> >  Scattered Rx         = Y
> > +TSO                  = Y
> >  Promiscuous mode     = Y
> >  Allmulticast mode    = Y
> >  Unicast MAC filter   = Y
> > diff --git a/doc/guides/nics/mlx4.rst b/doc/guides/nics/mlx4.rst index
> > 491106a..12adaeb 100644
> > --- a/doc/guides/nics/mlx4.rst
> > +++ b/doc/guides/nics/mlx4.rst
> > @@ -142,6 +142,9 @@ Limitations
> >    The ability to enable/disable CRC stripping requires OFED version
> >    4.3-1.5.0.0 and above  or rdma-core version v18 and above.
> >
> > +- TSO (Transmit Segmentation Offload) is supported in OFED version
> > +  4.4 and above or in rdma-core version v18 and above.
> > +
> >  Prerequisites
> >  -------------
> >
> > diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
> > index
> > 73f9d40..63bc003 100644
> > --- a/drivers/net/mlx4/Makefile
> > +++ b/drivers/net/mlx4/Makefile
> > @@ -85,6 +85,11 @@ mlx4_autoconf.h.new: FORCE
> >  mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
> >  	$Q $(RM) -f -- '$@'
> >  	$Q : > '$@'
> > +	$Q sh -- '$<' '$@' \
> > +		HAVE_IBV_MLX4_WQE_LSO_SEG \
> > +		infiniband/mlx4dv.h \
> > +		type 'struct mlx4_wqe_lso_seg' \
> > +		$(AUTOCONF_OUTPUT)
> >
> >  # Create mlx4_autoconf.h or update it in case it differs from the new one.
> >
> > diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c index
> > d151a90..5d8c76d 100644
> > --- a/drivers/net/mlx4/mlx4.c
> > +++ b/drivers/net/mlx4/mlx4.c
> > @@ -677,6 +677,15 @@ struct mlx4_conf {
> >
> > 	IBV_RAW_PACKET_CAP_SCATTER_FCS);
> >  		DEBUG("FCS stripping toggling is %ssupported",
> >  		      priv->hw_fcs_strip ? "" : "not ");
> > +		priv->tso =
> > +			((device_attr_ex.tso_caps.max_tso > 0) &&
> > +			 (device_attr_ex.tso_caps.supported_qpts &
> > +			  (1 << IBV_QPT_RAW_PACKET)));
> > +		if (priv->tso)
> > +			priv->tso_max_payload_sz =
> > +					device_attr_ex.tso_caps.max_tso;
> > +		DEBUG("TSO is %ssupported",
> > +		      priv->tso ? "" : "not ");
> >  		/* Configure the first MAC address by default. */
> >  		err = mlx4_get_mac(priv, &mac.addr_bytes);
> >  		if (err) {
> > diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h index
> > 300cb4d..89d8c38 100644
> > --- a/drivers/net/mlx4/mlx4.h
> > +++ b/drivers/net/mlx4/mlx4.h
> > @@ -47,6 +47,9 @@
> >  /** Interrupt alarm timeout value in microseconds. */  #define
> > MLX4_INTR_ALARM_TIMEOUT 100000
> >
> > +/* Maximum packet headers size (L2+L3+L4) for TSO. */ #define
> > +MLX4_MAX_TSO_HEADER 192
> > +
> >  /** Port parameter. */
> >  #define MLX4_PMD_PORT_KVARG "port"
> >
> > @@ -90,6 +93,8 @@ struct priv {
> >  	uint32_t hw_csum:1; /**< Checksum offload is supported. */
> >  	uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels.
> > */
> >  	uint32_t hw_fcs_strip:1; /**< FCS stripping toggling is supported.
> > */
> > +	uint32_t tso:1; /**< Transmit segmentation offload is supported. */
> > +	uint32_t tso_max_payload_sz; /**< Max supported TSO payload
> > size. */
> >  	uint64_t hw_rss_sup; /**< Supported RSS hash fields (Verbs
> format).
> > */
> >  	struct rte_intr_handle intr_handle; /**< Port interrupt handle. */
> >  	struct mlx4_drop *drop; /**< Shared resources for drop flow rules.
> > */ diff --git a/drivers/net/mlx4/mlx4_prm.h
> > b/drivers/net/mlx4/mlx4_prm.h index b771d8c..aef77ba 100644
> > --- a/drivers/net/mlx4/mlx4_prm.h
> > +++ b/drivers/net/mlx4/mlx4_prm.h
> > @@ -19,6 +19,7 @@
> >  #ifdef PEDANTIC
> >  #pragma GCC diagnostic error "-Wpedantic"
> >  #endif
> > +#include "mlx4_autoconf.h"
> >
> >  /* ConnectX-3 Tx queue basic block. */  #define MLX4_TXBB_SHIFT 6 @@
> > -40,6 +41,7 @@
> >  /* Work queue element (WQE) flags. */  #define
> > MLX4_WQE_CTRL_IIP_HDR_CSUM (1 << 28)  #define
> > MLX4_WQE_CTRL_IL4_HDR_CSUM (1 << 27)
> > +#define MLX4_WQE_CTRL_RR (1 << 6)
> >
> >  /* CQE checksum flags. */
> >  enum {
> > @@ -98,6 +100,19 @@ struct mlx4_cq {
> >  	int arm_sn; /**< Rx event counter. */  };
> >
> > +#ifndef HAVE_IBV_MLX4_WQE_LSO_SEG
> > +/*
> > + * WQE LSO segment structure.
> > + * Defined here as backward compatibility for rdma-core v17 and below.
> > + * Similar definition is found in infiniband/mlx4dv.h in rdma-core
> > +v18
> > + * and above.
> > + */
> > +struct mlx4_wqe_lso_seg {
> > +	rte_be32_t mss_hdr_size;
> > +	rte_be32_t header[];
> > +};
> > +#endif
> > +
> >  /**
> >   * Retrieve a CQE entry from a CQ.
> >   *
> > diff --git a/drivers/net/mlx4/mlx4_rxtx.c
> > b/drivers/net/mlx4/mlx4_rxtx.c index 78b6dd5..b695539 100644
> > --- a/drivers/net/mlx4/mlx4_rxtx.c
> > +++ b/drivers/net/mlx4/mlx4_rxtx.c
> > @@ -38,10 +38,29 @@
> >   * DWORD (32 byte) of a TXBB.
> >   */
> >  struct pv {
> > -	volatile struct mlx4_wqe_data_seg *dseg;
> > +	union {
> > +		volatile struct mlx4_wqe_data_seg *dseg;
> > +		volatile uint32_t *dst;
> > +	};
> >  	uint32_t val;
> >  };
> >
> > +/** A helper structure for TSO packet handling. */ struct tso_info {
> > +	/** Pointer to the array of saved first DWORD (32 byte) of a TXBB. */
> > +	struct pv *pv;
> > +	/** Current entry in the pv array. */
> > +	int pv_counter;
> > +	/** Total size of the WQE including padding. */
> > +	uint32_t wqe_size;
> > +	/** Size of TSO header to prepend to each packet to send. */
> > +	uint16_t tso_header_size;
> > +	/** Total size of the TSO segment in the WQE. */
> > +	uint16_t wqe_tso_seg_size;
> > +	/** Raw WQE size in units of 16 Bytes and without padding. */
> > +	uint8_t fence_size;
> > +};
> > +
> >  /** A table to translate Rx completion flags to packet type. */
> > uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = {
> >  	/*
> > @@ -368,6 +387,345 @@ struct pv {
> >  }
> >
> >  /**
> > + * Obtain and calculate TSO information needed for assembling a TSO
> WQE.
> > + *
> > + * @param buf
> > + *   Pointer to the first packet mbuf.
> > + * @param txq
> > + *   Pointer to Tx queue structure.
> > + * @param tinfo
> > + *   Pointer to a structure to fill the info with.
> > + *
> > + * @return
> > + *   0 on success, negative value upon error.
> > + */
> > +static inline int
> > +mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf,
> > +			     struct txq *txq,
> > +			     struct tso_info *tinfo)
> > +{
> > +	struct mlx4_sq *sq = &txq->msq;
> > +	const uint8_t tunneled = txq->priv->hw_csum_l2tun &&
> > +				 (buf->ol_flags & PKT_TX_TUNNEL_MASK);
> > +
> > +	tinfo->tso_header_size = buf->l2_len + buf->l3_len + buf->l4_len;
> > +	if (tunneled)
> > +		tinfo->tso_header_size +=
> > +				buf->outer_l2_len + buf->outer_l3_len;
> > +	if (unlikely(buf->tso_segsz == 0 ||
> > +		     tinfo->tso_header_size == 0 ||
> > +		     tinfo->tso_header_size > MLX4_MAX_TSO_HEADER ||
> > +		     tinfo->tso_header_size > buf->data_len))
> > +		return -EINVAL;
> > +	/*
> > +	 * Calculate the WQE TSO segment size
> > +	 * Note:
> > +	 * 1. An LSO segment must be padded such that the subsequent data
> > +	 *    segment is 16-byte aligned.
> > +	 * 2. The start address of the TSO segment is always 16 Bytes aligned.
> > +	 */
> > +	tinfo->wqe_tso_seg_size = RTE_ALIGN(sizeof(struct
> > mlx4_wqe_lso_seg) +
> > +					    tinfo->tso_header_size,
> > +					    sizeof(struct
> > mlx4_wqe_data_seg));
> > +	tinfo->fence_size = ((sizeof(struct mlx4_wqe_ctrl_seg) +
> > +			     tinfo->wqe_tso_seg_size) >> MLX4_SEG_SHIFT) +
> > +			     buf->nb_segs;
> > +	tinfo->wqe_size =
> > +		RTE_ALIGN((uint32_t)(tinfo->fence_size <<
> > MLX4_SEG_SHIFT),
> > +			  MLX4_TXBB_SIZE);
> > +	/* Validate WQE size and WQE space in the send queue. */
> > +	if (sq->remain_size < tinfo->wqe_size ||
> > +	    tinfo->wqe_size > MLX4_MAX_WQE_SIZE)
> > +		return -ENOMEM;
> > +	/* Init pv. */
> > +	tinfo->pv = (struct pv *)txq->bounce_buf;
> > +	tinfo->pv_counter = 0;
> > +	return 0;
> > +}
> > +
> > +/**
> > + * Fill the TSO WQE data segments with info on buffers to transmit .
> > + *
> > + * @param buf
> > + *   Pointer to the first packet mbuf.
> > + * @param txq
> > + *   Pointer to Tx queue structure.
> > + * @param tinfo
> > + *   Pointer to TSO info to use.
> > + * @param dseg
> > + *   Pointer to the first data segment in the TSO WQE.
> > + * @param ctrl
> > + *   Pointer to the control segment in the TSO WQE.
> > + *
> > + * @return
> > + *   0 on success, negative value upon error.
> > + */
> > +static inline volatile struct mlx4_wqe_ctrl_seg *
> > +mlx4_tx_burst_fill_tso_dsegs(struct rte_mbuf *buf,
> > +			     struct txq *txq,
> > +			     struct tso_info *tinfo,
> > +			     volatile struct mlx4_wqe_data_seg *dseg,
> > +			     volatile struct mlx4_wqe_ctrl_seg *ctrl) {
> > +	uint32_t lkey;
> > +	int nb_segs = buf->nb_segs;
> > +	int nb_segs_txbb;
> > +	struct mlx4_sq *sq = &txq->msq;
> > +	struct rte_mbuf *sbuf = buf;
> > +	struct pv *pv = tinfo->pv;
> > +	int *pv_counter = &tinfo->pv_counter;
> > +	volatile struct mlx4_wqe_ctrl_seg *ctrl_next =
> > +			(volatile struct mlx4_wqe_ctrl_seg *)
> > +				((volatile uint8_t *)ctrl + tinfo->wqe_size);
> > +	uint16_t sb_of = tinfo->tso_header_size;
> > +	uint16_t data_len;
> > +
> > +	do {
> > +		/* how many dseg entries do we have in the current TXBB ?
> > */
> > +		nb_segs_txbb = (MLX4_TXBB_SIZE -
> > +				((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1))) >>
> > +			       MLX4_SEG_SHIFT;
> > +		switch (nb_segs_txbb) {
> > +		default:
> > +			/* Should never happen. */
> > +			rte_panic("%p: Invalid number of SGEs(%d) for a
> > TXBB",
> > +			(void *)txq, nb_segs_txbb);
> > +			/* rte_panic never returns. */
> 
> Since this default case should not happen because of the above calculation I
> think we don't need it.
> Just "break" if the compiler complain of default case lack.
> 
Although "default" is not mandatory in switch case statement it is a good practice to have it even just for code clarity.
so I will keep it there.

> > +		case 4:
> > +			/* Memory region key for this memory pool. */
> > +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> > +			if (unlikely(lkey == (uint32_t)-1))
> > +				goto err;
> > +			dseg->addr =
> > +
> > rte_cpu_to_be_64(rte_pktmbuf_mtod_offset(sbuf,
> > +								     uintptr_t,
> > +								     sb_of));
> > +			dseg->lkey = lkey;
> > +			/*
> > +			 * This data segment starts at the beginning of a new
> > +			 * TXBB, so we need to postpone its byte_count
> > writing
> > +			 * for later.
> > +			 */
> > +			pv[*pv_counter].dseg = dseg;
> > +			/*
> > +			 * Zero length segment is treated as inline segment
> > +			 * with zero data.
> > +			 */
> > +			data_len = sbuf->data_len - sb_of;
> 
> Is there a chance that the data_len will be negative? Rolled in this case?
Since we verify ahead of time that all the L2, L3 and L4 headers reside in the same fragment, there is no reason for
data_len to become negative. This is why I use uint16_t, which is the same data type used in struct rte_mbuf
for representing data_len, and the same as what we do in mlx4_tx_burst_segs.

> Maybe it is better to change it for int16_t and to replace the next check to
> be:
> data_len > 0 ? data_len : 0x80000000
> 
I will keep this the way it is for two reasons:
1. It seems to me more cumbersome than what I wrote.
2. Code-consistency-wise, this is how we also wrote it in mlx4_tx_burst_segs;
   what's good there is also good here.

> 
> And I think I found a way to remove the sb_of calculations for each segment:
> 
> Each segment will create the next segment parameters while only the pre
> loop calculation for the first segment parameters will calculate the header
> offset:
> 
> The parameters: data_len and sb_of.
> 
> So before the loop:
> sb_of = tinfo->tso_header_size;
> data_len = sbuf->data_len - sb_of;
> 
> And inside the loop (after the check of nb_segs):
> sb_of = 0;
> data_len = sbuf->data_len(the next sbuf);
> 
> so each segment calculates the next segment parameters and we don't need
> the "- sb_of" calculation per segment.
> 
NICE :)

> > +			pv[(*pv_counter)++].val =
> > +				rte_cpu_to_be_32(data_len ?
> > +						 data_len :
> > +						 0x80000000);
> > +			sb_of = 0;
> > +			sbuf = sbuf->next;
> > +			dseg++;
> > +			if (--nb_segs == 0)
> > +				return ctrl_next;
> > +			/* fallthrough */
> > +		case 3:
> > +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> > +			if (unlikely(lkey == (uint32_t)-1))
> > +				goto err;
> > +			data_len = sbuf->data_len - sb_of;
> > +			mlx4_fill_tx_data_seg(dseg,
> > +					lkey,
> > +					rte_pktmbuf_mtod_offset(sbuf,
> > +								uintptr_t,
> > +								sb_of),
> > +					rte_cpu_to_be_32(data_len ?
> > +							 data_len :
> > +							 0x80000000));
> > +			sb_of = 0;
> > +			sbuf = sbuf->next;
> > +			dseg++;
> > +			if (--nb_segs == 0)
> > +				return ctrl_next;
> > +			/* fallthrough */
> > +		case 2:
> > +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> > +			if (unlikely(lkey == (uint32_t)-1))
> > +				goto err;
> > +			data_len = sbuf->data_len - sb_of;
> > +			mlx4_fill_tx_data_seg(dseg,
> > +					lkey,
> > +					rte_pktmbuf_mtod_offset(sbuf,
> > +								uintptr_t,
> > +								sb_of),
> > +					rte_cpu_to_be_32(data_len ?
> > +							 data_len :
> > +							 0x80000000));
> > +			sb_of = 0;
> > +			sbuf = sbuf->next;
> > +			dseg++;
> > +			if (--nb_segs == 0)
> > +				return ctrl_next;
> > +			/* fallthrough */
> > +		case 1:
> > +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> > +			if (unlikely(lkey == (uint32_t)-1))
> > +				goto err;
> > +			data_len = sbuf->data_len - sb_of;
> > +			mlx4_fill_tx_data_seg(dseg,
> > +					lkey,
> > +					rte_pktmbuf_mtod_offset(sbuf,
> > +								uintptr_t,
> > +								sb_of),
> > +					rte_cpu_to_be_32(data_len ?
> > +							 data_len :
> > +							 0x80000000));
> > +			sb_of = 0;
> > +			sbuf = sbuf->next;
> > +			dseg++;
> > +			if (--nb_segs == 0)
> > +				return ctrl_next;
> > +		}
> > +		/* Wrap dseg if it points at the end of the queue. */
> > +		if ((volatile uint8_t *)dseg >= sq->eob)
> > +			dseg = (volatile struct mlx4_wqe_data_seg *)
> > +					((volatile uint8_t *)dseg - sq->size);
> > +	} while (true);
> > +err:
> > +	return NULL;
> > +}
> > +
> > +/**
> > + * Fill the packet's l2, l3 and l4 headers to the WQE.
> > + *
> > + * This will be used as the header for each TSO segment that is
> transmitted.
> > + *
> > + * @param buf
> > + *   Pointer to the first packet mbuf.
> > + * @param txq
> > + *   Pointer to Tx queue structure.
> > + * @param tinfo
> > + *   Pointer to TSO info to use.
> > + * @param ctrl
> > + *   Pointer to the control segment in the TSO WQE.
> > + *
> > + * @return
> > + *   0 on success, negative value upon error.
> > + */
> > +static inline volatile struct mlx4_wqe_data_seg *
> > +mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
> > +			   struct txq *txq,
> > +			   struct tso_info *tinfo,
> > +			   volatile struct mlx4_wqe_ctrl_seg *ctrl) {
> > +	volatile struct mlx4_wqe_lso_seg *tseg =
> > +		(volatile struct mlx4_wqe_lso_seg *)(ctrl + 1);
> > +	struct mlx4_sq *sq = &txq->msq;
> > +	struct pv *pv = tinfo->pv;
> > +	int *pv_counter = &tinfo->pv_counter;
> > +	int remain_size = tinfo->tso_header_size;
> > +	char *from = rte_pktmbuf_mtod(buf, char *);
> > +	uint16_t txbb_avail_space;
> > +	/* Union to overcome volatile constraints when copying TSO header.
> > */
> > +	union {
> > +		volatile uint8_t *vto;
> > +		uint8_t *to;
> > +	} thdr = { .vto = (volatile uint8_t *)tseg->header, };
> > +
> > +	/*
> > +	 * TSO data always starts at offset 20 from the beginning of the TXBB
> > +	 * (16 byte ctrl + 4byte TSO desc). Since each TXBB is 64Byte aligned
> > +	 * we can write the first 44 TSO header bytes without worry for TxQ
> > +	 * wrapping or overwriting the first TXBB 32bit word.
> > +	 */
> > +	txbb_avail_space = MLX4_TXBB_SIZE -
> > +			   (sizeof(struct mlx4_wqe_ctrl_seg) +
> > +			    sizeof(struct mlx4_wqe_lso_seg));
> 
> I think that better name is txbb_tail_size.
I think that txbb_avail_space is good enough, so no change here.

> 
> > +	while (remain_size >= (int)(txbb_avail_space + sizeof(uint32_t))) {
> > +		/* Copy to end of txbb. */
> > +		rte_memcpy(thdr.to, from, txbb_avail_space);
> > +		from += txbb_avail_space;
> > +		thdr.to += txbb_avail_space;
> > +		/* New TXBB, Check for TxQ wrap. */
> > +		if (thdr.to >= sq->eob)
> > +			thdr.vto = sq->buf;
> > +		/* New TXBB, stash the first 32bits for later use. */
> > +		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
> > +		pv[(*pv_counter)++].val = *(uint32_t *)from,
> > +		from += sizeof(uint32_t);
> > +		thdr.to += sizeof(uint32_t);
> > +		remain_size -= (txbb_avail_space + sizeof(uint32_t));
> 
> You don't need the () here.
True
> 
> > +		/* Avail space in new TXBB is TXBB size - 4 */
> > +		txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
> > +	}
> > +	if (remain_size > txbb_avail_space) {
> > +		rte_memcpy(thdr.to, from, txbb_avail_space);
> > +		from += txbb_avail_space;
> > +		thdr.to += txbb_avail_space;
> > +		remain_size -= txbb_avail_space;
> > +		/* New TXBB, Check for TxQ wrap. */
> > +		if (thdr.to >= sq->eob)
> > +			thdr.vto = sq->buf;
> > +		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
> > +		rte_memcpy(&pv[*pv_counter].val, from, remain_size);
> > +		(*pv_counter)++;
> > +	} else {
> 
> Here it should be else if (remain_size > 0).
true
> 
> > +		rte_memcpy(thdr.to, from, remain_size);
> > +	}
> > +
> > +	tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
> > +					      tinfo->tso_header_size);
> > +	/* Calculate data segment location */
> > +	return (volatile struct mlx4_wqe_data_seg *)
> > +				((uintptr_t)tseg + tinfo->wqe_tso_seg_size);
> > }
> > +
> > +/**
> > + * Write data segments and header for TSO uni/multi segment packet.
> > + *
> > + * @param buf
> > + *   Pointer to the first packet mbuf.
> > + * @param txq
> > + *   Pointer to Tx queue structure.
> > + * @param ctrl
> > + *   Pointer to the WQE control segment.
> > + *
> > + * @return
> > + *   Pointer to the next WQE control segment on success, NULL otherwise.
> > + */
> > +static volatile struct mlx4_wqe_ctrl_seg * mlx4_tx_burst_tso(struct
> > +rte_mbuf *buf, struct txq *txq,
> > +		  volatile struct mlx4_wqe_ctrl_seg *ctrl) {
> > +	volatile struct mlx4_wqe_data_seg *dseg;
> > +	volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
> > +	struct mlx4_sq *sq = &txq->msq;
> > +	struct tso_info tinfo;
> > +	struct pv *pv;
> > +	int pv_counter;
> > +	int ret;
> > +
> > +	ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
> > +	if (unlikely(ret))
> > +		goto error;
> > +	dseg = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo, ctrl);
> > +	if (unlikely(dseg == NULL))
> > +		goto error;
> > +	if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
> > +		dseg = (volatile struct mlx4_wqe_data_seg *)
> > +					((uintptr_t)dseg - sq->size);
> > +	ctrl_next = mlx4_tx_burst_fill_tso_dsegs(buf, txq, &tinfo, dseg, ctrl);
> > +	if (unlikely(ctrl_next == NULL))
> > +		goto error;
> > +	/* Write the first DWORD of each TXBB save earlier. */
> > +	pv = tinfo.pv;
> > +	pv_counter = tinfo.pv_counter;
> > +	/* Need a barrier here before writing the first TXBB word. */
> > +	rte_io_wmb();
> 
> > +	for (--pv_counter; pv_counter  >= 0; pv_counter--)
> 
> Since we don't need the first check do while statement is better.
> To be fully safe you can use likely check before the memory barrier.
> 
I will bring back the if statement, but I will not change the loop as it is the same as in
mlx4_tx_burst_segs and I do want to keep the code consistent.

> > +		*pv[pv_counter].dst = pv[pv_counter].val;
> > +	ctrl->fence_size = tinfo.fence_size;
> > +	sq->remain_size -= tinfo.wqe_size;
> > +	return ctrl_next;
> > +error:
> > +	txq->stats.odropped++;
> > +	return NULL;
> > +}
> > +
> > +/**
> >   * Write data segments of multi-segment packet.
> >   *
> >   * @param buf
> > @@ -560,6 +918,7 @@ struct pv {
> >  			uint16_t flags16[2];
> >  		} srcrb;
> >  		uint32_t lkey;
> > +		bool tso = txq->priv->tso && (buf->ol_flags &
> > PKT_TX_TCP_SEG);
> >
> >  		/* Clean up old buffer. */
> >  		if (likely(elt->buf != NULL)) {
> > @@ -578,7 +937,16 @@ struct pv {
> >  			} while (tmp != NULL);
> >  		}
> >  		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
> > -		if (buf->nb_segs == 1) {
> > +		if (tso) {
> > +			/* Change opcode to TSO */
> > +			owner_opcode &= ~MLX4_OPCODE_CONFIG_CMD;
> > +			owner_opcode |= MLX4_OPCODE_LSO |
> > MLX4_WQE_CTRL_RR;
> > +			ctrl_next = mlx4_tx_burst_tso(buf, txq, ctrl);
> > +			if (!ctrl_next) {
> > +				elt->buf = NULL;
> > +				break;
> > +			}
> > +		} else if (buf->nb_segs == 1) {
> >  			/* Validate WQE space in the send queue. */
> >  			if (sq->remain_size < MLX4_TXBB_SIZE) {
> >  				elt->buf = NULL;
> > diff --git a/drivers/net/mlx4/mlx4_rxtx.h
> > b/drivers/net/mlx4/mlx4_rxtx.h index 4c025e3..ffa8abf 100644
> > --- a/drivers/net/mlx4/mlx4_rxtx.h
> > +++ b/drivers/net/mlx4/mlx4_rxtx.h
> > @@ -90,7 +90,7 @@ struct mlx4_txq_stats {
> >  	unsigned int idx; /**< Mapping index. */
> >  	uint64_t opackets; /**< Total of successfully sent packets. */
> >  	uint64_t obytes; /**< Total of successfully sent bytes. */
> > -	uint64_t odropped; /**< Total of packets not sent when Tx ring full.
> > */
> > +	uint64_t odropped; /**< Total number of packets failed to transmit.
> > */
> >  };
> >
> >  /** Tx queue descriptor. */
> > diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
> > index 6edaadb..9aa7440 100644
> > --- a/drivers/net/mlx4/mlx4_txq.c
> > +++ b/drivers/net/mlx4/mlx4_txq.c
> > @@ -116,8 +116,14 @@
> >  			     DEV_TX_OFFLOAD_UDP_CKSUM |
> >  			     DEV_TX_OFFLOAD_TCP_CKSUM);
> >  	}
> > -	if (priv->hw_csum_l2tun)
> > +	if (priv->tso)
> > +		offloads |= DEV_TX_OFFLOAD_TCP_TSO;
> > +	if (priv->hw_csum_l2tun) {
> >  		offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
> > +		if (priv->tso)
> > +			offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
> > +				     DEV_TX_OFFLOAD_GRE_TNL_TSO);
> > +	}
> >  	return offloads;
> >  }
> >
> > --
> > 1.8.3.1
  
Matan Azrad July 9, 2018, 6:18 p.m. UTC | #3
Hi Moti

I am continuing the discussion here even though the next version is already out, just so the full discussion stays in one place.

From: Mordechay Haimovsky
> inline
> 
> > From: Matan Azrad
> > Hi Moti
> >
> > Please see some comments below.
> >
> > From: Mordechay Haimovsky
> > > Implement support for hardware TSO.
> > >
> > > Signed-off-by: Moti Haimovsky <motih@mellanox.com>
...
> > > +	do {
> > > +		/* how many dseg entries do we have in the current TXBB ?
> > > */
> > > +		nb_segs_txbb = (MLX4_TXBB_SIZE -
> > > +				((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1))) >>
> > > +			       MLX4_SEG_SHIFT;
> > > +		switch (nb_segs_txbb) {
> > > +		default:
> > > +			/* Should never happen. */
> > > +			rte_panic("%p: Invalid number of SGEs(%d) for a
> > > TXBB",
> > > +			(void *)txq, nb_segs_txbb);
> > > +			/* rte_panic never returns. */
> >
> > Since this default case should not happen because of the above
> > calculation I think we don't need it.
> > Just "break" if the compiler complain of default case lack.
> >
> Although "default" is not mandatory in switch case statement it is a good
> practice to have it even just for code clarity.
> so I will keep it there.

But the rte_panic code (and the whole default block) is redundant, and we don't need redundant code in our data path.
You can keep a comment there for clarification if you want.
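For example, something like this (a sketch only, assuming the offset calculation above keeps nb_segs_txbb within 1..4):

		switch (nb_segs_txbb) {
		/*
		 * nb_segs_txbb is derived from the 16-byte aligned dseg
		 * offset inside a 64-byte TXBB, so it is always 1..4 and
		 * no default label is needed.
		 */
		case 4:
			...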
 

> > > +		case 4:
> > > +			/* Memory region key for this memory pool. */
> > > +			lkey = mlx4_tx_mb2mr(txq, sbuf);
> > > +			if (unlikely(lkey == (uint32_t)-1))
> > > +				goto err;
> > > +			dseg->addr =
> > > +
> > > rte_cpu_to_be_64(rte_pktmbuf_mtod_offset(sbuf,
> > > +								     uintptr_t,
> > > +								     sb_of));
> > > +			dseg->lkey = lkey;
> > > +			/*
> > > +			 * This data segment starts at the beginning of a new
> > > +			 * TXBB, so we need to postpone its byte_count
> > > writing
> > > +			 * for later.
> > > +			 */
> > > +			pv[*pv_counter].dseg = dseg;
> > > +			/*
> > > +			 * Zero length segment is treated as inline segment
> > > +			 * with zero data.
> > > +			 */
> > > +			data_len = sbuf->data_len - sb_of;
> >
> > Is there a chance that the data_len will be negative? Rolled in this case?
> Since we verify ahead the all l2,l3 and L4 headers reside in the same fragment
> there is no reason for data_len to become negative, this is why I use uint16_t
> which is  the same data type used in struct rte_mbuf for representing
> data_len , and as we do it in mlx4_tx_burst_segs.
> 
> > Maybe it is better to change it for int16_t and to replace the next
> > check to
> > be:
> > data_len > 0 ? data_len : 0x80000000
> >
> I will keep this the way it is for 2 reasons:
> 1. Seems to me more cumbersome then what I wrote.

OK, you are right here; if it cannot be negative we shouldn't change it :)

> 2. Code consistency wise, this is how we also wrote it in mlx4_tx_burst_segs,
>      What's good there is also good here.

I don't agree; this is really a different case from that one. A lot of the assumptions are different and the code may reflect that.

> > And I think I found a way to remove the sb_of calculations for each
> segment:
> >
> > Each segment will create the next segment parameters while only the
> > pre loop calculation for the first segment parameters will calculate
> > the header
> > offset:
> >
> > The parameters: data_len and sb_of.
> >
> > So before the loop:
> > sb_of = tinfo->tso_header_size;
> > data_len = sbuf->data_len - sb_of;
> >
> > And inside the loop (after the check of nb_segs):
> > sb_of = 0;
> > data_len = sbuf->data_len(the next sbuf);
> >
> > so each segment calculates the next segment parameters and we don't
> > need the "- sb_of" calculation per segment.
> >
> NICE :)
> 

Sorry for seeing it only now, but we don't even need the "sb_of = 0" per segment:
We can add one more parameter for the next segment,
addr = rte_pktmbuf_mtod_offset(sbuf, uintptr_t, tinfo->tso_header_size)
before the loop
and
addr = rte_pktmbuf_mtod(sbuf, uintptr_t)
inside the loop,

so finally we save 2 cycles per segment :)
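In other words (a rough, untested sketch of the combined suggestion):

	/* Before the loop, for the first segment only. */
	addr = rte_pktmbuf_mtod_offset(sbuf, uintptr_t, tinfo->tso_header_size);
	data_len = sbuf->data_len - tinfo->tso_header_size;
	...
	/* In each case label, after the nb_segs check, prepare the next segment. */
	sbuf = sbuf->next;
	addr = rte_pktmbuf_mtod(sbuf, uintptr_t);
	data_len = sbuf->data_len;

and each case label then just passes addr and data_len to mlx4_fill_tx_data_seg().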
...
> > > +static inline volatile struct mlx4_wqe_data_seg *
> > > +mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
> > > +			   struct txq *txq,
> > > +			   struct tso_info *tinfo,
> > > +			   volatile struct mlx4_wqe_ctrl_seg *ctrl) {
> > > +	volatile struct mlx4_wqe_lso_seg *tseg =
> > > +		(volatile struct mlx4_wqe_lso_seg *)(ctrl + 1);
> > > +	struct mlx4_sq *sq = &txq->msq;
> > > +	struct pv *pv = tinfo->pv;
> > > +	int *pv_counter = &tinfo->pv_counter;
> > > +	int remain_size = tinfo->tso_header_size;
> > > +	char *from = rte_pktmbuf_mtod(buf, char *);
> > > +	uint16_t txbb_avail_space;
> > > +	/* Union to overcome volatile constraints when copying TSO header.
> > > */
> > > +	union {
> > > +		volatile uint8_t *vto;
> > > +		uint8_t *to;
> > > +	} thdr = { .vto = (volatile uint8_t *)tseg->header, };
> > > +
> > > +	/*
> > > +	 * TSO data always starts at offset 20 from the beginning of the TXBB
> > > +	 * (16 byte ctrl + 4byte TSO desc). Since each TXBB is 64Byte aligned
> > > +	 * we can write the first 44 TSO header bytes without worry for TxQ
> > > +	 * wrapping or overwriting the first TXBB 32bit word.
> > > +	 */
> > > +	txbb_avail_space = MLX4_TXBB_SIZE -
> > > +			   (sizeof(struct mlx4_wqe_ctrl_seg) +
> > > +			    sizeof(struct mlx4_wqe_lso_seg));
> >
> > I think that better name is txbb_tail_size.
> I think that txbb_avail_space is good enough, so no change here.

My suggestion is because this size is only the tail size of the TXBB, without the first 4 bytes, so the name may be more descriptive.
I can also understand your naming, since "avail" points to the TXBB space that is available to write (the first 4B are just not available now, only later).
I'm not going to argue about it :)
 
> 
> >
> > > +	while (remain_size >= (int)(txbb_avail_space + sizeof(uint32_t))) {
> > > +		/* Copy to end of txbb. */
> > > +		rte_memcpy(thdr.to, from, txbb_avail_space);
> > > +		from += txbb_avail_space;
> > > +		thdr.to += txbb_avail_space;
> > > +		/* New TXBB, Check for TxQ wrap. */
> > > +		if (thdr.to >= sq->eob)
> > > +			thdr.vto = sq->buf;
> > > +		/* New TXBB, stash the first 32bits for later use. */
> > > +		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
> > > +		pv[(*pv_counter)++].val = *(uint32_t *)from,
> > > +		from += sizeof(uint32_t);
> > > +		thdr.to += sizeof(uint32_t);
> > > +		remain_size -= (txbb_avail_space + sizeof(uint32_t));
> >
> > You don't need the () here.
> True
> >
> > > +		/* Avail space in new TXBB is TXBB size - 4 */
> > > +		txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
> > > +	}
> > > +	if (remain_size > txbb_avail_space) {
> > > +		rte_memcpy(thdr.to, from, txbb_avail_space);
> > > +		from += txbb_avail_space;
> > > +		thdr.to += txbb_avail_space;
> > > +		remain_size -= txbb_avail_space;
> > > +		/* New TXBB, Check for TxQ wrap. */
> > > +		if (thdr.to >= sq->eob)
> > > +			thdr.vto = sq->buf;
> > > +		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
> > > +		rte_memcpy(&pv[*pv_counter].val, from, remain_size);
> > > +		(*pv_counter)++;
> > > +	} else {
> >
> > Here it should be else if (remain_size > 0).
> true
> >
> > > +		rte_memcpy(thdr.to, from, remain_size);
> > > +	}
> > > +
> > > +	tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
> > > +					      tinfo->tso_header_size);
> > > +	/* Calculate data segment location */
> > > +	return (volatile struct mlx4_wqe_data_seg *)
> > > +				((uintptr_t)tseg + tinfo->wqe_tso_seg_size);
> > > }
> > > +
> > > +/**
> > > + * Write data segments and header for TSO uni/multi segment packet.
> > > + *
> > > + * @param buf
> > > + *   Pointer to the first packet mbuf.
> > > + * @param txq
> > > + *   Pointer to Tx queue structure.
> > > + * @param ctrl
> > > + *   Pointer to the WQE control segment.
> > > + *
> > > + * @return
> > > + *   Pointer to the next WQE control segment on success, NULL
> otherwise.
> > > + */
> > > +static volatile struct mlx4_wqe_ctrl_seg * mlx4_tx_burst_tso(struct
> > > +rte_mbuf *buf, struct txq *txq,
> > > +		  volatile struct mlx4_wqe_ctrl_seg *ctrl) {
> > > +	volatile struct mlx4_wqe_data_seg *dseg;
> > > +	volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
> > > +	struct mlx4_sq *sq = &txq->msq;
> > > +	struct tso_info tinfo;
> > > +	struct pv *pv;
> > > +	int pv_counter;
> > > +	int ret;
> > > +
> > > +	ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
> > > +	if (unlikely(ret))
> > > +		goto error;
> > > +	dseg = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo, ctrl);
> > > +	if (unlikely(dseg == NULL))
> > > +		goto error;
> > > +	if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
> > > +		dseg = (volatile struct mlx4_wqe_data_seg *)
> > > +					((uintptr_t)dseg - sq->size);
> > > +	ctrl_next = mlx4_tx_burst_fill_tso_dsegs(buf, txq, &tinfo, dseg, ctrl);
> > > +	if (unlikely(ctrl_next == NULL))
> > > +		goto error;
> > > +	/* Write the first DWORD of each TXBB save earlier. */
> > > +	pv = tinfo.pv;
> > > +	pv_counter = tinfo.pv_counter;
> > > +	/* Need a barrier here before writing the first TXBB word. */
> > > +	rte_io_wmb();
> >
> > > +	for (--pv_counter; pv_counter  >= 0; pv_counter--)
> >
> > Since we don't need the first check do while statement is better.
> > To be fully safe you can use likely check before the memory barrier.
> >
> Will return the if statement But will not change the loop as it is the same as in
> mlx4_tx_burst_segs and I do want to have a consistent code.

I don't agree with this statement; as above, different assumptions lead to different optimized code.

Here, and in the mlx4_tx_burst_segs code, we don't need the first check in the for loop, and we don't need redundant checks in our datapath.

So both should be updated.

The difference is in the prior check: here it should use likely and there it should not.
So actually, there the "for" loop can stay as is, but we don't need the first if check of pv_counter because it is already checked in the for loop.

I suggest optimizing both (a prior patch for the mlx4_tx_burst_segs optimization).
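For the TSO path here, a sketch of what the suggested structure would presumably look like (same names as in the patch):

	/* Write the first DWORD of each TXBB saved earlier. */
	if (likely(pv_counter)) {
		/* Need a barrier here before writing the first TXBB word. */
		rte_io_wmb();
		do {
			--pv_counter;
			*pv[pv_counter].dst = pv[pv_counter].val;
		} while (pv_counter > 0);
	}

In mlx4_tx_burst_segs the for loop itself can stay; only the separate if check on pv_counter before it would be dropped.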

> > > +		*pv[pv_counter].dst = pv[pv_counter].val;
> > > +	ctrl->fence_size = tinfo.fence_size;
> > > +	sq->remain_size -= tinfo.wqe_size;
> > > +	return ctrl_next;
  

Patch

diff --git a/doc/guides/nics/features/mlx4.ini b/doc/guides/nics/features/mlx4.ini
index f6efd21..98a3f61 100644
--- a/doc/guides/nics/features/mlx4.ini
+++ b/doc/guides/nics/features/mlx4.ini
@@ -13,6 +13,7 @@  Queue start/stop     = Y
 MTU update           = Y
 Jumbo frame          = Y
 Scattered Rx         = Y
+TSO                  = Y
 Promiscuous mode     = Y
 Allmulticast mode    = Y
 Unicast MAC filter   = Y
diff --git a/doc/guides/nics/mlx4.rst b/doc/guides/nics/mlx4.rst
index 491106a..12adaeb 100644
--- a/doc/guides/nics/mlx4.rst
+++ b/doc/guides/nics/mlx4.rst
@@ -142,6 +142,9 @@  Limitations
   The ability to enable/disable CRC stripping requires OFED version
   4.3-1.5.0.0 and above  or rdma-core version v18 and above.
 
+- TSO (Transmit Segmentation Offload) is supported in OFED version
+  4.4 and above or in rdma-core version v18 and above.
+
 Prerequisites
 -------------
 
diff --git a/drivers/net/mlx4/Makefile b/drivers/net/mlx4/Makefile
index 73f9d40..63bc003 100644
--- a/drivers/net/mlx4/Makefile
+++ b/drivers/net/mlx4/Makefile
@@ -85,6 +85,11 @@  mlx4_autoconf.h.new: FORCE
 mlx4_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
 	$Q $(RM) -f -- '$@'
 	$Q : > '$@'
+	$Q sh -- '$<' '$@' \
+		HAVE_IBV_MLX4_WQE_LSO_SEG \
+		infiniband/mlx4dv.h \
+		type 'struct mlx4_wqe_lso_seg' \
+		$(AUTOCONF_OUTPUT)
 
 # Create mlx4_autoconf.h or update it in case it differs from the new one.
 
diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index d151a90..5d8c76d 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -677,6 +677,15 @@  struct mlx4_conf {
 					IBV_RAW_PACKET_CAP_SCATTER_FCS);
 		DEBUG("FCS stripping toggling is %ssupported",
 		      priv->hw_fcs_strip ? "" : "not ");
+		priv->tso =
+			((device_attr_ex.tso_caps.max_tso > 0) &&
+			 (device_attr_ex.tso_caps.supported_qpts &
+			  (1 << IBV_QPT_RAW_PACKET)));
+		if (priv->tso)
+			priv->tso_max_payload_sz =
+					device_attr_ex.tso_caps.max_tso;
+		DEBUG("TSO is %ssupported",
+		      priv->tso ? "" : "not ");
 		/* Configure the first MAC address by default. */
 		err = mlx4_get_mac(priv, &mac.addr_bytes);
 		if (err) {
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 300cb4d..89d8c38 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -47,6 +47,9 @@ 
 /** Interrupt alarm timeout value in microseconds. */
 #define MLX4_INTR_ALARM_TIMEOUT 100000
 
+/* Maximum packet headers size (L2+L3+L4) for TSO. */
+#define MLX4_MAX_TSO_HEADER 192
+
 /** Port parameter. */
 #define MLX4_PMD_PORT_KVARG "port"
 
@@ -90,6 +93,8 @@  struct priv {
 	uint32_t hw_csum:1; /**< Checksum offload is supported. */
 	uint32_t hw_csum_l2tun:1; /**< Checksum support for L2 tunnels. */
 	uint32_t hw_fcs_strip:1; /**< FCS stripping toggling is supported. */
+	uint32_t tso:1; /**< Transmit segmentation offload is supported. */
+	uint32_t tso_max_payload_sz; /**< Max supported TSO payload size. */
 	uint64_t hw_rss_sup; /**< Supported RSS hash fields (Verbs format). */
 	struct rte_intr_handle intr_handle; /**< Port interrupt handle. */
 	struct mlx4_drop *drop; /**< Shared resources for drop flow rules. */
diff --git a/drivers/net/mlx4/mlx4_prm.h b/drivers/net/mlx4/mlx4_prm.h
index b771d8c..aef77ba 100644
--- a/drivers/net/mlx4/mlx4_prm.h
+++ b/drivers/net/mlx4/mlx4_prm.h
@@ -19,6 +19,7 @@ 
 #ifdef PEDANTIC
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
+#include "mlx4_autoconf.h"
 
 /* ConnectX-3 Tx queue basic block. */
 #define MLX4_TXBB_SHIFT 6
@@ -40,6 +41,7 @@ 
 /* Work queue element (WQE) flags. */
 #define MLX4_WQE_CTRL_IIP_HDR_CSUM (1 << 28)
 #define MLX4_WQE_CTRL_IL4_HDR_CSUM (1 << 27)
+#define MLX4_WQE_CTRL_RR (1 << 6)
 
 /* CQE checksum flags. */
 enum {
@@ -98,6 +100,19 @@  struct mlx4_cq {
 	int arm_sn; /**< Rx event counter. */
 };
 
+#ifndef HAVE_IBV_MLX4_WQE_LSO_SEG
+/*
+ * WQE LSO segment structure.
+ * Defined here as backward compatibility for rdma-core v17 and below.
+ * Similar definition is found in infiniband/mlx4dv.h in rdma-core v18
+ * and above.
+ */
+struct mlx4_wqe_lso_seg {
+	rte_be32_t mss_hdr_size;
+	rte_be32_t header[];
+};
+#endif
+
 /**
  * Retrieve a CQE entry from a CQ.
  *
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 78b6dd5..b695539 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -38,10 +38,29 @@ 
  * DWORD (32 byte) of a TXBB.
  */
 struct pv {
-	volatile struct mlx4_wqe_data_seg *dseg;
+	union {
+		volatile struct mlx4_wqe_data_seg *dseg;
+		volatile uint32_t *dst;
+	};
 	uint32_t val;
 };
 
+/** A helper structure for TSO packet handling. */
+struct tso_info {
+	/** Pointer to the array of saved first DWORD (32 byte) of a TXBB. */
+	struct pv *pv;
+	/** Current entry in the pv array. */
+	int pv_counter;
+	/** Total size of the WQE including padding. */
+	uint32_t wqe_size;
+	/** Size of TSO header to prepend to each packet to send. */
+	uint16_t tso_header_size;
+	/** Total size of the TSO segment in the WQE. */
+	uint16_t wqe_tso_seg_size;
+	/** Raw WQE size in units of 16 Bytes and without padding. */
+	uint8_t fence_size;
+};
+
 /** A table to translate Rx completion flags to packet type. */
 uint32_t mlx4_ptype_table[0x100] __rte_cache_aligned = {
 	/*
@@ -368,6 +387,345 @@  struct pv {
 }
 
 /**
+ * Obtain and calculate TSO information needed for assembling a TSO WQE.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to a structure to fill the info with.
+ *
+ * @return
+ *   0 on success, negative value upon error.
+ */
+static inline int
+mlx4_tx_burst_tso_get_params(struct rte_mbuf *buf,
+			     struct txq *txq,
+			     struct tso_info *tinfo)
+{
+	struct mlx4_sq *sq = &txq->msq;
+	const uint8_t tunneled = txq->priv->hw_csum_l2tun &&
+				 (buf->ol_flags & PKT_TX_TUNNEL_MASK);
+
+	tinfo->tso_header_size = buf->l2_len + buf->l3_len + buf->l4_len;
+	if (tunneled)
+		tinfo->tso_header_size +=
+				buf->outer_l2_len + buf->outer_l3_len;
+	if (unlikely(buf->tso_segsz == 0 ||
+		     tinfo->tso_header_size == 0 ||
+		     tinfo->tso_header_size > MLX4_MAX_TSO_HEADER ||
+		     tinfo->tso_header_size > buf->data_len))
+		return -EINVAL;
+	/*
+	 * Calculate the WQE TSO segment size
+	 * Note:
+	 * 1. An LSO segment must be padded such that the subsequent data
+	 *    segment is 16-byte aligned.
+	 * 2. The start address of the TSO segment is always 16 Bytes aligned.
+	 */
+	tinfo->wqe_tso_seg_size = RTE_ALIGN(sizeof(struct mlx4_wqe_lso_seg) +
+					    tinfo->tso_header_size,
+					    sizeof(struct mlx4_wqe_data_seg));
+	tinfo->fence_size = ((sizeof(struct mlx4_wqe_ctrl_seg) +
+			     tinfo->wqe_tso_seg_size) >> MLX4_SEG_SHIFT) +
+			     buf->nb_segs;
+	tinfo->wqe_size =
+		RTE_ALIGN((uint32_t)(tinfo->fence_size << MLX4_SEG_SHIFT),
+			  MLX4_TXBB_SIZE);
+	/* Validate WQE size and WQE space in the send queue. */
+	if (sq->remain_size < tinfo->wqe_size ||
+	    tinfo->wqe_size > MLX4_MAX_WQE_SIZE)
+		return -ENOMEM;
+	/* Init pv. */
+	tinfo->pv = (struct pv *)txq->bounce_buf;
+	tinfo->pv_counter = 0;
+	return 0;
+}
+
+/**
+ * Fill the TSO WQE data segments with info on buffers to transmit.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to TSO info to use.
+ * @param dseg
+ *   Pointer to the first data segment in the TSO WQE.
+ * @param ctrl
+ *   Pointer to the control segment in the TSO WQE.
+ *
+ * @return
+ *   Pointer to the next WQE control segment on success, NULL otherwise.
+ */
+static inline volatile struct mlx4_wqe_ctrl_seg *
+mlx4_tx_burst_fill_tso_dsegs(struct rte_mbuf *buf,
+			     struct txq *txq,
+			     struct tso_info *tinfo,
+			     volatile struct mlx4_wqe_data_seg *dseg,
+			     volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+	uint32_t lkey;
+	int nb_segs = buf->nb_segs;
+	int nb_segs_txbb;
+	struct mlx4_sq *sq = &txq->msq;
+	struct rte_mbuf *sbuf = buf;
+	struct pv *pv = tinfo->pv;
+	int *pv_counter = &tinfo->pv_counter;
+	volatile struct mlx4_wqe_ctrl_seg *ctrl_next =
+			(volatile struct mlx4_wqe_ctrl_seg *)
+				((volatile uint8_t *)ctrl + tinfo->wqe_size);
+	uint16_t sb_of = tinfo->tso_header_size;
+	uint16_t data_len;
+
+	do {
+		/* how many dseg entries do we have in the current TXBB ? */
+		nb_segs_txbb = (MLX4_TXBB_SIZE -
+				((uintptr_t)dseg & (MLX4_TXBB_SIZE - 1))) >>
+			       MLX4_SEG_SHIFT;
+		switch (nb_segs_txbb) {
+		default:
+			/* Should never happen. */
+			rte_panic("%p: Invalid number of SGEs(%d) for a TXBB",
+			(void *)txq, nb_segs_txbb);
+			/* rte_panic never returns. */
+		case 4:
+			/* Memory region key for this memory pool. */
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto err;
+			dseg->addr =
+			    rte_cpu_to_be_64(rte_pktmbuf_mtod_offset(sbuf,
+								     uintptr_t,
+								     sb_of));
+			dseg->lkey = lkey;
+			/*
+			 * This data segment starts at the beginning of a new
+			 * TXBB, so we need to postpone its byte_count writing
+			 * for later.
+			 */
+			pv[*pv_counter].dseg = dseg;
+			/*
+			 * Zero length segment is treated as inline segment
+			 * with zero data.
+			 */
+			data_len = sbuf->data_len - sb_of;
+			pv[(*pv_counter)++].val =
+				rte_cpu_to_be_32(data_len ?
+						 data_len :
+						 0x80000000);
+			sb_of = 0;
+			sbuf = sbuf->next;
+			dseg++;
+			if (--nb_segs == 0)
+				return ctrl_next;
+			/* fallthrough */
+		case 3:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto err;
+			data_len = sbuf->data_len - sb_of;
+			mlx4_fill_tx_data_seg(dseg,
+					lkey,
+					rte_pktmbuf_mtod_offset(sbuf,
+								uintptr_t,
+								sb_of),
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			sb_of = 0;
+			sbuf = sbuf->next;
+			dseg++;
+			if (--nb_segs == 0)
+				return ctrl_next;
+			/* fallthrough */
+		case 2:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto err;
+			data_len = sbuf->data_len - sb_of;
+			mlx4_fill_tx_data_seg(dseg,
+					lkey,
+					rte_pktmbuf_mtod_offset(sbuf,
+								uintptr_t,
+								sb_of),
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			sb_of = 0;
+			sbuf = sbuf->next;
+			dseg++;
+			if (--nb_segs == 0)
+				return ctrl_next;
+			/* fallthrough */
+		case 1:
+			lkey = mlx4_tx_mb2mr(txq, sbuf);
+			if (unlikely(lkey == (uint32_t)-1))
+				goto err;
+			data_len = sbuf->data_len - sb_of;
+			mlx4_fill_tx_data_seg(dseg,
+					lkey,
+					rte_pktmbuf_mtod_offset(sbuf,
+								uintptr_t,
+								sb_of),
+					rte_cpu_to_be_32(data_len ?
+							 data_len :
+							 0x80000000));
+			sb_of = 0;
+			sbuf = sbuf->next;
+			dseg++;
+			if (--nb_segs == 0)
+				return ctrl_next;
+		}
+		/* Wrap dseg if it points at the end of the queue. */
+		if ((volatile uint8_t *)dseg >= sq->eob)
+			dseg = (volatile struct mlx4_wqe_data_seg *)
+					((volatile uint8_t *)dseg - sq->size);
+	} while (true);
+err:
+	return NULL;
+}
+
+/**
+ * Copy the packet's l2, l3 and l4 headers into the WQE.
+ *
+ * This will be used as the header for each TSO segment that is transmitted.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param tinfo
+ *   Pointer to TSO info to use.
+ * @param ctrl
+ *   Pointer to the control segment in the TSO WQE.
+ *
+ * @return
+ *   Pointer to the first data segment of the TSO WQE on success, NULL otherwise.
+ */
+static inline volatile struct mlx4_wqe_data_seg *
+mlx4_tx_burst_fill_tso_hdr(struct rte_mbuf *buf,
+			   struct txq *txq,
+			   struct tso_info *tinfo,
+			   volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+	volatile struct mlx4_wqe_lso_seg *tseg =
+		(volatile struct mlx4_wqe_lso_seg *)(ctrl + 1);
+	struct mlx4_sq *sq = &txq->msq;
+	struct pv *pv = tinfo->pv;
+	int *pv_counter = &tinfo->pv_counter;
+	int remain_size = tinfo->tso_header_size;
+	char *from = rte_pktmbuf_mtod(buf, char *);
+	uint16_t txbb_avail_space;
+	/* Union to overcome volatile constraints when copying TSO header. */
+	union {
+		volatile uint8_t *vto;
+		uint8_t *to;
+	} thdr = { .vto = (volatile uint8_t *)tseg->header, };
+
+	/*
+	 * TSO data always starts at offset 20 from the beginning of the TXBB
+	 * (16 byte ctrl + 4byte TSO desc). Since each TXBB is 64Byte aligned
+	 * we can write the first 44 TSO header bytes without worry for TxQ
+	 * wrapping or overwriting the first TXBB 32bit word.
+	 */
+	txbb_avail_space = MLX4_TXBB_SIZE -
+			   (sizeof(struct mlx4_wqe_ctrl_seg) +
+			    sizeof(struct mlx4_wqe_lso_seg));
+	while (remain_size >= (int)(txbb_avail_space + sizeof(uint32_t))) {
+		/* Copy to end of txbb. */
+		rte_memcpy(thdr.to, from, txbb_avail_space);
+		from += txbb_avail_space;
+		thdr.to += txbb_avail_space;
+		/* New TXBB, Check for TxQ wrap. */
+		if (thdr.to >= sq->eob)
+			thdr.vto = sq->buf;
+		/* New TXBB, stash the first 32bits for later use. */
+		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
+		pv[(*pv_counter)++].val = *(uint32_t *)from,
+		from += sizeof(uint32_t);
+		thdr.to += sizeof(uint32_t);
+		remain_size -= (txbb_avail_space + sizeof(uint32_t));
+		/* Avail space in new TXBB is TXBB size - 4 */
+		txbb_avail_space = MLX4_TXBB_SIZE - sizeof(uint32_t);
+	}
+	if (remain_size > txbb_avail_space) {
+		rte_memcpy(thdr.to, from, txbb_avail_space);
+		from += txbb_avail_space;
+		thdr.to += txbb_avail_space;
+		remain_size -= txbb_avail_space;
+		/* New TXBB, Check for TxQ wrap. */
+		if (thdr.to >= sq->eob)
+			thdr.vto = sq->buf;
+		pv[*pv_counter].dst = (volatile uint32_t *)thdr.to;
+		rte_memcpy(&pv[*pv_counter].val, from, remain_size);
+		(*pv_counter)++;
+	} else {
+		rte_memcpy(thdr.to, from, remain_size);
+	}
+
+	tseg->mss_hdr_size = rte_cpu_to_be_32((buf->tso_segsz << 16) |
+					      tinfo->tso_header_size);
+	/* Calculate data segment location */
+	return (volatile struct mlx4_wqe_data_seg *)
+				((uintptr_t)tseg + tinfo->wqe_tso_seg_size);
+}
+
+/**
+ * Write data segments and header for TSO uni/multi segment packet.
+ *
+ * @param buf
+ *   Pointer to the first packet mbuf.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param ctrl
+ *   Pointer to the WQE control segment.
+ *
+ * @return
+ *   Pointer to the next WQE control segment on success, NULL otherwise.
+ */
+static volatile struct mlx4_wqe_ctrl_seg *
+mlx4_tx_burst_tso(struct rte_mbuf *buf, struct txq *txq,
+		  volatile struct mlx4_wqe_ctrl_seg *ctrl)
+{
+	volatile struct mlx4_wqe_data_seg *dseg;
+	volatile struct mlx4_wqe_ctrl_seg *ctrl_next;
+	struct mlx4_sq *sq = &txq->msq;
+	struct tso_info tinfo;
+	struct pv *pv;
+	int pv_counter;
+	int ret;
+
+	ret = mlx4_tx_burst_tso_get_params(buf, txq, &tinfo);
+	if (unlikely(ret))
+		goto error;
+	dseg = mlx4_tx_burst_fill_tso_hdr(buf, txq, &tinfo, ctrl);
+	if (unlikely(dseg == NULL))
+		goto error;
+	if ((uintptr_t)dseg >= (uintptr_t)sq->eob)
+		dseg = (volatile struct mlx4_wqe_data_seg *)
+					((uintptr_t)dseg - sq->size);
+	ctrl_next = mlx4_tx_burst_fill_tso_dsegs(buf, txq, &tinfo, dseg, ctrl);
+	if (unlikely(ctrl_next == NULL))
+		goto error;
+	/* Write the first DWORD of each TXBB saved earlier. */
+	pv = tinfo.pv;
+	pv_counter = tinfo.pv_counter;
+	/* Need a barrier here before writing the first TXBB word. */
+	rte_io_wmb();
+	for (--pv_counter; pv_counter  >= 0; pv_counter--)
+		*pv[pv_counter].dst = pv[pv_counter].val;
+	ctrl->fence_size = tinfo.fence_size;
+	sq->remain_size -= tinfo.wqe_size;
+	return ctrl_next;
+error:
+	txq->stats.odropped++;
+	return NULL;
+}
+
+/**
  * Write data segments of multi-segment packet.
  *
  * @param buf
@@ -560,6 +918,7 @@  struct pv {
 			uint16_t flags16[2];
 		} srcrb;
 		uint32_t lkey;
+		bool tso = txq->priv->tso && (buf->ol_flags & PKT_TX_TCP_SEG);
 
 		/* Clean up old buffer. */
 		if (likely(elt->buf != NULL)) {
@@ -578,7 +937,16 @@  struct pv {
 			} while (tmp != NULL);
 		}
 		RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
-		if (buf->nb_segs == 1) {
+		if (tso) {
+			/* Change opcode to TSO */
+			owner_opcode &= ~MLX4_OPCODE_CONFIG_CMD;
+			owner_opcode |= MLX4_OPCODE_LSO | MLX4_WQE_CTRL_RR;
+			ctrl_next = mlx4_tx_burst_tso(buf, txq, ctrl);
+			if (!ctrl_next) {
+				elt->buf = NULL;
+				break;
+			}
+		} else if (buf->nb_segs == 1) {
 			/* Validate WQE space in the send queue. */
 			if (sq->remain_size < MLX4_TXBB_SIZE) {
 				elt->buf = NULL;
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index 4c025e3..ffa8abf 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -90,7 +90,7 @@  struct mlx4_txq_stats {
 	unsigned int idx; /**< Mapping index. */
 	uint64_t opackets; /**< Total of successfully sent packets. */
 	uint64_t obytes; /**< Total of successfully sent bytes. */
-	uint64_t odropped; /**< Total of packets not sent when Tx ring full. */
+	uint64_t odropped; /**< Total number of packets failed to transmit. */
 };
 
 /** Tx queue descriptor. */
diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
index 6edaadb..9aa7440 100644
--- a/drivers/net/mlx4/mlx4_txq.c
+++ b/drivers/net/mlx4/mlx4_txq.c
@@ -116,8 +116,14 @@ 
 			     DEV_TX_OFFLOAD_UDP_CKSUM |
 			     DEV_TX_OFFLOAD_TCP_CKSUM);
 	}
-	if (priv->hw_csum_l2tun)
+	if (priv->tso)
+		offloads |= DEV_TX_OFFLOAD_TCP_TSO;
+	if (priv->hw_csum_l2tun) {
 		offloads |= DEV_TX_OFFLOAD_OUTER_IPV4_CKSUM;
+		if (priv->tso)
+			offloads |= (DEV_TX_OFFLOAD_VXLAN_TNL_TSO |
+				     DEV_TX_OFFLOAD_GRE_TNL_TSO);
+	}
 	return offloads;
 }
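
For completeness, and not as part of the patch, a minimal sketch of how an application would request the TSO offload advertised above and prepare the first mbuf of a chain for a TSO send, assuming the generic DPDK 18.x ethdev/mbuf API (the offload set, header sizes and MSS value are illustrative only):

	struct rte_eth_conf port_conf = {
		.txmode = {
			.offloads = DEV_TX_OFFLOAD_TCP_TSO |
				    DEV_TX_OFFLOAD_IPV4_CKSUM |
				    DEV_TX_OFFLOAD_TCP_CKSUM,
		},
	};
	/* rte_eth_dev_configure(port_id, nb_rxq, nb_txq, &port_conf); */

	/* Per-packet TSO setup on the first mbuf of the chain. */
	mbuf->ol_flags |= PKT_TX_TCP_SEG | PKT_TX_IPV4 | PKT_TX_IP_CKSUM;
	mbuf->l2_len = sizeof(struct ether_hdr);
	mbuf->l3_len = sizeof(struct ipv4_hdr);
	mbuf->l4_len = sizeof(struct tcp_hdr);	/* plus TCP options, if any */
	mbuf->tso_segsz = 1460;			/* payload bytes per segment (MSS) */

The PMD then builds one TSO WQE per such mbuf chain, and the hardware replicates the l2+l3+l4 header copied by mlx4_tx_burst_fill_tso_hdr() in front of every tso_segsz-sized payload segment.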