[dpdk-dev] [PATCH v2 05/13] mlx5: add support for scattered RX and TX buffers

Adrien Mazarguil adrien.mazarguil at 6wind.com
Fri Oct 30 19:52:34 CET 2015


A dedicated RX callback is added to handle scattered buffers. For better
performance, it is only used when jumbo frames are enabled and the MTU is
larger than what fits in a single mbuf.
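
As a rough illustration of the selection logic (a minimal sketch; the
struct and function names below are made up for the example and are not
the driver's symbols):

#include <stdint.h>

struct rx_conf {
	unsigned int jumbo_frame; /* Jumbo frames enabled? */
	uint32_t max_rx_pkt_len; /* Maximum RX packet length. */
};

/*
 * Return nonzero when the scattered (multi-segment) RX path is needed,
 * i.e. jumbo frames are enabled and a maximum-sized packet may not fit
 * in the data room of a single mbuf.
 */
static int
use_scattered_rx(const struct rx_conf *conf, uint32_t mb_len,
		 uint32_t headroom)
{
	return (conf->jumbo_frame &&
		(conf->max_rx_pkt_len > (mb_len - headroom)));
}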

On the TX path, scattered buffers are likewise handled in a separate
function. When a packet consists of more than MLX5_PMD_SGE_WR_N segments,
the remaining segments are linearized into the last SGE entry.
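
The idea behind linearization, reduced to its simplest form (a sketch
over a generic segment chain; this is not the driver's actual
linearize_mbuf()):

#include <stddef.h>
#include <string.h>

struct seg {
	const void *data; /* Segment payload. */
	size_t len; /* Payload length. */
	struct seg *next; /* Next segment, NULL for the last one. */
};

/*
 * Copy a segment chain back-to-back into a buffer of capacity cap.
 * Return the number of bytes copied, or 0 if the chain does not fit
 * (the same 0-on-failure convention as the patch).
 */
static size_t
linearize(unsigned char *linear, size_t cap, const struct seg *s)
{
	size_t size = 0;

	for (; (s != NULL); s = s->next) {
		if ((size + s->len) > cap)
			return 0;
		memcpy(&linear[size], s->data, s->len);
		size += s->len;
	}
	return size;
}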

Signed-off-by: Adrien Mazarguil <adrien.mazarguil at 6wind.com>
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro at 6wind.com>
---
 drivers/net/mlx5/mlx5_rxq.c  | 175 +++++++++++++++++++-
 drivers/net/mlx5/mlx5_rxtx.c | 376 +++++++++++++++++++++++++++++++++++++++++++
 drivers/net/mlx5/mlx5_rxtx.h |  10 ++
 3 files changed, 557 insertions(+), 4 deletions(-)
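
Side note for reviewers: the scattered RX elements are chained through
their work requests so the whole ring can be (re)posted with a single
ibv_post_recv() call. A minimal sketch of that chaining, assuming only
libibverbs (the element layout is simplified and the SGE count is
hardcoded in place of MLX5_PMD_SGE_WR_N):

#include <stddef.h>
#include <infiniband/verbs.h>

struct elt {
	struct ibv_recv_wr wr; /* Work request. */
	struct ibv_sge sges[4]; /* Stands in for MLX5_PMD_SGE_WR_N. */
};

/* Link every WR to the next one; the last WR must point to NULL so a
 * single ibv_post_recv() can (re)post the whole chain. */
static void
chain_wrs(struct elt *elts, unsigned int n)
{
	unsigned int i;

	for (i = 0; (i != n); ++i) {
		elts[i].wr.wr_id = i;
		elts[i].wr.sg_list = elts[i].sges;
		elts[i].wr.num_sge = sizeof(elts[i].sges) /
				     sizeof(elts[i].sges[0]);
		elts[i].wr.next = ((i + 1) < n) ? &elts[i + 1].wr : NULL;
	}
}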

diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 8450fe3..1eddfc7 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -65,6 +65,153 @@
 #include "mlx5_defs.h"
 
 /**
+ * Allocate RX queue elements with scattered packets support.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ * @param elts_n
+ *   Number of elements to allocate.
+ * @param[in] pool
+ *   If not NULL, fetch buffers from this array instead of allocating them
+ *   with rte_pktmbuf_alloc().
+ *
+ * @return
+ *   0 on success, errno value on failure.
+ */
+static int
+rxq_alloc_elts_sp(struct rxq *rxq, unsigned int elts_n,
+		  struct rte_mbuf **pool)
+{
+	unsigned int i;
+	struct rxq_elt_sp (*elts)[elts_n] =
+		rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
+				  rxq->socket);
+	int ret = 0;
+
+	if (elts == NULL) {
+		ERROR("%p: can't allocate packets array", (void *)rxq);
+		ret = ENOMEM;
+		goto error;
+	}
+	/* For each WR (packet). */
+	for (i = 0; (i != elts_n); ++i) {
+		unsigned int j;
+		struct rxq_elt_sp *elt = &(*elts)[i];
+		struct ibv_recv_wr *wr = &elt->wr;
+		struct ibv_sge (*sges)[RTE_DIM(elt->sges)] = &elt->sges;
+
+		/* These two arrays must have the same size. */
+		assert(RTE_DIM(elt->sges) == RTE_DIM(elt->bufs));
+		/* Configure WR. */
+		wr->wr_id = i;
+		wr->next = &(*elts)[(i + 1)].wr;
+		wr->sg_list = &(*sges)[0];
+		wr->num_sge = RTE_DIM(*sges);
+		/* For each SGE (segment). */
+		for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
+			struct ibv_sge *sge = &(*sges)[j];
+			struct rte_mbuf *buf;
+
+			if (pool != NULL) {
+				buf = *(pool++);
+				assert(buf != NULL);
+				rte_pktmbuf_reset(buf);
+			} else
+				buf = rte_pktmbuf_alloc(rxq->mp);
+			if (buf == NULL) {
+				assert(pool == NULL);
+				ERROR("%p: empty mbuf pool", (void *)rxq);
+				ret = ENOMEM;
+				goto error;
+			}
+			elt->bufs[j] = buf;
+			/* Headroom is reserved by rte_pktmbuf_alloc(). */
+			assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
+			/* Buffer is supposed to be empty. */
+			assert(rte_pktmbuf_data_len(buf) == 0);
+			assert(rte_pktmbuf_pkt_len(buf) == 0);
+			/* sge->addr must be able to store a pointer. */
+			assert(sizeof(sge->addr) >= sizeof(uintptr_t));
+			if (j == 0) {
+				/* The first SGE keeps its headroom. */
+				sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
+				sge->length = (buf->buf_len -
+					       RTE_PKTMBUF_HEADROOM);
+			} else {
+				/* Subsequent SGEs lose theirs. */
+				assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
+				SET_DATA_OFF(buf, 0);
+				sge->addr = (uintptr_t)buf->buf_addr;
+				sge->length = buf->buf_len;
+			}
+			sge->lkey = rxq->mr->lkey;
+			/* Redundant check for tailroom. */
+			assert(sge->length == rte_pktmbuf_tailroom(buf));
+		}
+	}
+	/* The last WR pointer must be NULL. */
+	(*elts)[(i - 1)].wr.next = NULL;
+	DEBUG("%p: allocated and configured %u WRs (%zu segments)",
+	      (void *)rxq, elts_n, (elts_n * RTE_DIM((*elts)[0].sges)));
+	rxq->elts_n = elts_n;
+	rxq->elts_head = 0;
+	rxq->elts.sp = elts;
+	assert(ret == 0);
+	return 0;
+error:
+	if (elts != NULL) {
+		assert(pool == NULL);
+		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
+			unsigned int j;
+			struct rxq_elt_sp *elt = &(*elts)[i];
+
+			for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
+				struct rte_mbuf *buf = elt->bufs[j];
+
+				if (buf != NULL)
+					rte_pktmbuf_free_seg(buf);
+			}
+		}
+		rte_free(elts);
+	}
+	DEBUG("%p: failed, freed everything", (void *)rxq);
+	assert(ret > 0);
+	return ret;
+}
+
+/**
+ * Free RX queue elements with scattered packets support.
+ *
+ * @param rxq
+ *   Pointer to RX queue structure.
+ */
+static void
+rxq_free_elts_sp(struct rxq *rxq)
+{
+	unsigned int i;
+	unsigned int elts_n = rxq->elts_n;
+	struct rxq_elt_sp (*elts)[elts_n] = rxq->elts.sp;
+
+	DEBUG("%p: freeing WRs", (void *)rxq);
+	rxq->elts_n = 0;
+	rxq->elts.sp = NULL;
+	if (elts == NULL)
+		return;
+	for (i = 0; (i != RTE_DIM(*elts)); ++i) {
+		unsigned int j;
+		struct rxq_elt_sp *elt = &(*elts)[i];
+
+		for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
+			struct rte_mbuf *buf = elt->bufs[j];
+
+			if (buf != NULL)
+				rte_pktmbuf_free_seg(buf);
+		}
+	}
+	rte_free(elts);
+}
+
+/**
  * Allocate RX queue elements.
  *
  * @param rxq
@@ -224,7 +371,10 @@ rxq_cleanup(struct rxq *rxq)
 	struct ibv_exp_release_intf_params params;
 
 	DEBUG("cleaning up %p", (void *)rxq);
-	rxq_free_elts(rxq);
+	if (rxq->sp)
+		rxq_free_elts_sp(rxq);
+	else
+		rxq_free_elts(rxq);
 	if (rxq->if_qp != NULL) {
 		assert(rxq->priv != NULL);
 		assert(rxq->priv->ctx != NULL);
@@ -445,6 +595,15 @@ rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
 		rte_pktmbuf_tailroom(buf)) == tmpl.mb_len);
 	assert(rte_pktmbuf_headroom(buf) == RTE_PKTMBUF_HEADROOM);
 	rte_pktmbuf_free(buf);
+	/* Enable scattered packets support for this queue if necessary. */
+	if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
+	    (dev->data->dev_conf.rxmode.max_rx_pkt_len >
+	     (tmpl.mb_len - RTE_PKTMBUF_HEADROOM))) {
+		tmpl.sp = 1;
+		desc /= MLX5_PMD_SGE_WR_N;
+	}
+	DEBUG("%p: %s scattered packets support (%u WRs)",
+	      (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc);
 	/* Use the entire RX mempool as the memory region. */
 	tmpl.mr = ibv_reg_mr(priv->pd,
 			     (void *)mp->elt_va_start,
@@ -528,14 +687,19 @@ skip_mr:
 	/* Allocate descriptors for RX queues, except for the RSS parent. */
 	if (parent)
 		goto skip_alloc;
-	ret = rxq_alloc_elts(&tmpl, desc, NULL);
+	if (tmpl.sp)
+		ret = rxq_alloc_elts_sp(&tmpl, desc, NULL);
+	else
+		ret = rxq_alloc_elts(&tmpl, desc, NULL);
 	if (ret) {
 		ERROR("%p: RXQ allocation failed: %s",
 		      (void *)dev, strerror(ret));
 		goto error;
 	}
 	ret = ibv_post_recv(tmpl.qp,
-			    &(*tmpl.elts.no_sp)[0].wr,
+			    (tmpl.sp ?
+			     &(*tmpl.elts.sp)[0].wr :
+			     &(*tmpl.elts.no_sp)[0].wr),
 			    &bad_wr);
 	if (ret) {
 		ERROR("%p: ibv_post_recv() failed for WR %p: %s",
@@ -655,7 +819,10 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		      (void *)dev, (void *)rxq);
 		(*priv->rxqs)[idx] = rxq;
 		/* Update receive callback. */
-		dev->rx_pkt_burst = mlx5_rx_burst;
+		if (rxq->sp)
+			dev->rx_pkt_burst = mlx5_rx_burst_sp;
+		else
+			dev->rx_pkt_burst = mlx5_rx_burst;
 	}
 	priv_unlock(priv);
 	return -ret;
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 0f1e541..ed6faa1 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -173,6 +173,154 @@ txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
 	return txq->mp2mr[i].lkey;
 }
 
+#if MLX5_PMD_SGE_WR_N > 1
+
+/**
+ * Copy scattered mbuf contents to a single linear buffer.
+ *
+ * @param[out] linear
+ *   Linear output buffer.
+ * @param[in] buf
+ *   Scattered input buffer.
+ *
+ * @return
+ *   Number of bytes copied to the output buffer or 0 if not large enough.
+ */
+static unsigned int
+linearize_mbuf(linear_t *linear, struct rte_mbuf *buf)
+{
+	unsigned int size = 0;
+	unsigned int offset;
+
+	do {
+		unsigned int len = DATA_LEN(buf);
+
+		offset = size;
+		size += len;
+		if (unlikely(size > sizeof(*linear)))
+			return 0;
+		memcpy(&(*linear)[offset],
+		       rte_pktmbuf_mtod(buf, uint8_t *),
+		       len);
+		buf = NEXT(buf);
+	} while (buf != NULL);
+	return size;
+}
+
+/**
+ * Handle scattered buffers for mlx5_tx_burst().
+ *
+ * @param txq
+ *   TX queue structure.
+ * @param segs
+ *   Number of segments in buf.
+ * @param elt
+ *   TX queue element to fill.
+ * @param[in] buf
+ *   Buffer to process.
+ * @param elts_head
+ *   Index of the linear buffer to use if necessary (normally txq->elts_head).
+ * @param[out] sges
+ *   Array filled with SGEs on success.
+ *
+ * @return
+ *   A structure containing the processed packet size in bytes and the
+ *   number of SGEs. Both fields are set to (unsigned int)-1 in case of
+ *   failure.
+ */
+static struct tx_burst_sg_ret {
+	unsigned int length;
+	unsigned int num;
+}
+tx_burst_sg(struct txq *txq, unsigned int segs, struct txq_elt *elt,
+	    struct rte_mbuf *buf, unsigned int elts_head,
+	    struct ibv_sge (*sges)[MLX5_PMD_SGE_WR_N])
+{
+	unsigned int sent_size = 0;
+	unsigned int j;
+	int linearize = 0;
+
+	/* When there are too many segments, extra segments are
+	 * linearized in the last SGE. */
+	if (unlikely(segs > RTE_DIM(*sges))) {
+		segs = (RTE_DIM(*sges) - 1);
+		linearize = 1;
+	}
+	/* Update element. */
+	elt->buf = buf;
+	/* Register segments as SGEs. */
+	for (j = 0; (j != segs); ++j) {
+		struct ibv_sge *sge = &(*sges)[j];
+		uint32_t lkey;
+
+		/* Retrieve Memory Region key for this memory pool. */
+		lkey = txq_mp2mr(txq, buf->pool);
+		if (unlikely(lkey == (uint32_t)-1)) {
+			/* MR does not exist. */
+			DEBUG("%p: unable to get MP <-> MR association",
+			      (void *)txq);
+			/* Clean up TX element. */
+			elt->buf = NULL;
+			goto stop;
+		}
+		/* Update SGE. */
+		sge->addr = rte_pktmbuf_mtod(buf, uintptr_t);
+		if (txq->priv->vf)
+			rte_prefetch0((volatile void *)
+				      (uintptr_t)sge->addr);
+		sge->length = DATA_LEN(buf);
+		sge->lkey = lkey;
+		sent_size += sge->length;
+		buf = NEXT(buf);
+	}
+	/* If buf is not NULL here and is not going to be linearized,
+	 * nb_segs is not valid. */
+	assert(j == segs);
+	assert((buf == NULL) || (linearize));
+	/* Linearize extra segments. */
+	if (linearize) {
+		struct ibv_sge *sge = &(*sges)[segs];
+		linear_t *linear = &(*txq->elts_linear)[elts_head];
+		unsigned int size = linearize_mbuf(linear, buf);
+
+		assert(segs == (RTE_DIM(*sges) - 1));
+		if (size == 0) {
+			/* Invalid packet. */
+			DEBUG("%p: packet too large to be linearized.",
+			      (void *)txq);
+			/* Clean up TX element. */
+			elt->buf = NULL;
+			goto stop;
+		}
+		/* If MLX5_PMD_SGE_WR_N is 1, free mbuf immediately. */
+		if (RTE_DIM(*sges) == 1) {
+			do {
+				struct rte_mbuf *next = NEXT(buf);
+
+				rte_pktmbuf_free_seg(buf);
+				buf = next;
+			} while (buf != NULL);
+			elt->buf = NULL;
+		}
+		/* Update SGE. */
+		sge->addr = (uintptr_t)&(*linear)[0];
+		sge->length = size;
+		sge->lkey = txq->mr_linear->lkey;
+		sent_size += size;
+	}
+	return (struct tx_burst_sg_ret){
+		.length = sent_size,
+		.num = segs,
+	};
+stop:
+	return (struct tx_burst_sg_ret){
+		.length = -1,
+		.num = -1,
+	};
+}
+
+#endif /* MLX5_PMD_SGE_WR_N > 1 */
+
 /**
  * DPDK callback for TX.
  *
@@ -282,9 +430,28 @@ mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 			if (unlikely(err))
 				goto stop;
 		} else {
+#if MLX5_PMD_SGE_WR_N > 1
+			struct ibv_sge sges[MLX5_PMD_SGE_WR_N];
+			struct tx_burst_sg_ret ret;
+
+			ret = tx_burst_sg(txq, segs, elt, buf, elts_head,
+					  &sges);
+			if (ret.length == (unsigned int)-1)
+				goto stop;
+			RTE_MBUF_PREFETCH_TO_FREE(elt_next->buf);
+			/* Put SG list into send queue. */
+			err = txq->if_qp->send_pending_sg_list
+				(txq->qp,
+				 sges,
+				 ret.num,
+				 send_flags);
+			if (unlikely(err))
+				goto stop;
+#else /* MLX5_PMD_SGE_WR_N > 1 */
 			DEBUG("%p: TX scattered buffers support not"
 			      " compiled in", (void *)txq);
 			goto stop;
+#endif /* MLX5_PMD_SGE_WR_N > 1 */
 		}
 		elts_head = elts_head_next;
 	}
@@ -307,8 +474,215 @@ stop:
 }
 
 /**
+ * DPDK callback for RX with scattered packets support.
+ *
+ * @param dpdk_rxq
+ *   Generic pointer to RX queue structure.
+ * @param[out] pkts
+ *   Array to store received packets.
+ * @param pkts_n
+ *   Maximum number of packets in array.
+ *
+ * @return
+ *   Number of packets successfully received (<= pkts_n).
+ */
+uint16_t
+mlx5_rx_burst_sp(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+	struct rxq *rxq = (struct rxq *)dpdk_rxq;
+	struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;
+	const unsigned int elts_n = rxq->elts_n;
+	unsigned int elts_head = rxq->elts_head;
+	struct ibv_recv_wr head;
+	struct ibv_recv_wr **next = &head.next;
+	struct ibv_recv_wr *bad_wr;
+	unsigned int i;
+	unsigned int pkts_ret = 0;
+	int ret;
+
+	if (unlikely(!rxq->sp))
+		return mlx5_rx_burst(dpdk_rxq, pkts, pkts_n);
+	if (unlikely(elts == NULL)) /* See RTE_DEV_CMD_SET_MTU. */
+		return 0;
+	for (i = 0; (i != pkts_n); ++i) {
+		struct rxq_elt_sp *elt = &(*elts)[elts_head];
+		struct ibv_recv_wr *wr = &elt->wr;
+		uint64_t wr_id = wr->wr_id;
+		unsigned int len;
+		unsigned int pkt_buf_len;
+		struct rte_mbuf *pkt_buf = NULL; /* Buffer returned in pkts. */
+		struct rte_mbuf **pkt_buf_next = &pkt_buf;
+		unsigned int seg_headroom = RTE_PKTMBUF_HEADROOM;
+		unsigned int j = 0;
+		uint32_t flags;
+
+		/* Sanity checks. */
+#ifdef NDEBUG
+		(void)wr_id;
+#endif
+		assert(wr_id < rxq->elts_n);
+		assert(wr->sg_list == elt->sges);
+		assert(wr->num_sge == RTE_DIM(elt->sges));
+		assert(elts_head < rxq->elts_n);
+		assert(rxq->elts_head < rxq->elts_n);
+		ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL,
+						    &flags);
+		if (unlikely(ret < 0)) {
+			struct ibv_wc wc;
+			int wcs_n;
+
+			DEBUG("rxq=%p, poll_length() failed (ret=%d)",
+			      (void *)rxq, ret);
+			/* ibv_poll_cq() must be used in case of failure. */
+			wcs_n = ibv_poll_cq(rxq->cq, 1, &wc);
+			if (unlikely(wcs_n == 0))
+				break;
+			if (unlikely(wcs_n < 0)) {
+				DEBUG("rxq=%p, ibv_poll_cq() failed (wcs_n=%d)",
+				      (void *)rxq, wcs_n);
+				break;
+			}
+			assert(wcs_n == 1);
+			if (unlikely(wc.status != IBV_WC_SUCCESS)) {
+				/* Whatever, just repost the offending WR. */
+				DEBUG("rxq=%p, wr_id=%" PRIu64 ": bad work"
+				      " completion status (%d): %s",
+				      (void *)rxq, wc.wr_id, wc.status,
+				      ibv_wc_status_str(wc.status));
+				/* Link completed WRs together for repost. */
+				*next = wr;
+				next = &wr->next;
+				goto repost;
+			}
+			ret = wc.byte_len;
+		}
+		if (ret == 0)
+			break;
+		len = ret;
+		pkt_buf_len = len;
+		/* Link completed WRs together for repost. */
+		*next = wr;
+		next = &wr->next;
+		/*
+		 * Replace spent segments with new ones, concatenate and
+		 * return them as pkt_buf.
+		 */
+		while (1) {
+			struct ibv_sge *sge = &elt->sges[j];
+			struct rte_mbuf *seg = elt->bufs[j];
+			struct rte_mbuf *rep;
+			unsigned int seg_tailroom;
+
+			/*
+			 * Fetch initial bytes of packet descriptor into a
+			 * cacheline while allocating rep.
+			 */
+			rte_prefetch0(seg);
+			rep = __rte_mbuf_raw_alloc(rxq->mp);
+			if (unlikely(rep == NULL)) {
+				/*
+				 * Unable to allocate a replacement mbuf,
+				 * repost WR.
+				 */
+				DEBUG("rxq=%p, wr_id=%" PRIu64 ":"
+				      " can't allocate a new mbuf",
+				      (void *)rxq, wr_id);
+				if (pkt_buf != NULL) {
+					*pkt_buf_next = NULL;
+					rte_pktmbuf_free(pkt_buf);
+				}
+				/* Increment out of memory counters. */
+				++rxq->priv->dev->data->rx_mbuf_alloc_failed;
+				goto repost;
+			}
+#ifndef NDEBUG
+			/* Poison user-modifiable fields in rep. */
+			NEXT(rep) = (void *)((uintptr_t)-1);
+			SET_DATA_OFF(rep, 0xdead);
+			DATA_LEN(rep) = 0xd00d;
+			PKT_LEN(rep) = 0xdeadd00d;
+			NB_SEGS(rep) = 0x2a;
+			PORT(rep) = 0x2a;
+			rep->ol_flags = -1;
+#endif
+			assert(rep->buf_len == seg->buf_len);
+			assert(rep->buf_len == rxq->mb_len);
+			/* Reconfigure sge to use rep instead of seg. */
+			assert(sge->lkey == rxq->mr->lkey);
+			sge->addr = ((uintptr_t)rep->buf_addr + seg_headroom);
+			elt->bufs[j] = rep;
+			++j;
+			/* Update pkt_buf if it's the first segment, or link
+			 * seg to the previous one and update pkt_buf_next. */
+			*pkt_buf_next = seg;
+			pkt_buf_next = &NEXT(seg);
+			/* Update seg information. */
+			seg_tailroom = (seg->buf_len - seg_headroom);
+			assert(sge->length == seg_tailroom);
+			SET_DATA_OFF(seg, seg_headroom);
+			if (likely(len <= seg_tailroom)) {
+				/* Last segment. */
+				DATA_LEN(seg) = len;
+				PKT_LEN(seg) = len;
+				/* Sanity check. */
+				assert(rte_pktmbuf_headroom(seg) ==
+				       seg_headroom);
+				assert(rte_pktmbuf_tailroom(seg) ==
+				       (seg_tailroom - len));
+				break;
+			}
+			DATA_LEN(seg) = seg_tailroom;
+			PKT_LEN(seg) = seg_tailroom;
+			/* Sanity check. */
+			assert(rte_pktmbuf_headroom(seg) == seg_headroom);
+			assert(rte_pktmbuf_tailroom(seg) == 0);
+			/* Fix len and clear headroom for next segments. */
+			len -= seg_tailroom;
+			seg_headroom = 0;
+		}
+		/* Update head and tail segments. */
+		*pkt_buf_next = NULL;
+		assert(pkt_buf != NULL);
+		assert(j != 0);
+		NB_SEGS(pkt_buf) = j;
+		PORT(pkt_buf) = rxq->port_id;
+		PKT_LEN(pkt_buf) = pkt_buf_len;
+
+		/* Return packet. */
+		*(pkts++) = pkt_buf;
+		++pkts_ret;
+repost:
+		if (++elts_head >= elts_n)
+			elts_head = 0;
+		continue;
+	}
+	if (unlikely(i == 0))
+		return 0;
+	*next = NULL;
+	/* Repost WRs. */
+#ifdef DEBUG_RECV
+	DEBUG("%p: reposting %d WRs", (void *)rxq, i);
+#endif
+	ret = ibv_post_recv(rxq->qp, head.next, &bad_wr);
+	if (unlikely(ret)) {
+		/* Inability to repost WRs is fatal. */
+		DEBUG("%p: ibv_post_recv(): failed for WR %p: %s",
+		      (void *)rxq->priv,
+		      (void *)bad_wr,
+		      strerror(ret));
+		abort();
+	}
+	rxq->elts_head = elts_head;
+	return pkts_ret;
+}
+
+/**
  * DPDK callback for RX.
  *
+ * The following function is the same as mlx5_rx_burst_sp(), except it does
+ * not manage scattered packets. It improves performance when the MRU is
+ * lower than the size of the first segment.
+ *
  * @param dpdk_rxq
  *   Generic pointer to RX queue structure.
  * @param[out] pkts
@@ -331,6 +705,8 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 	unsigned int pkts_ret = 0;
 	int ret;
 
+	if (unlikely(rxq->sp))
+		return mlx5_rx_burst_sp(dpdk_rxq, pkts, pkts_n);
 	for (i = 0; (i != pkts_n); ++i) {
 		struct rxq_elt *elt = &(*elts)[elts_head];
 		struct ibv_recv_wr *wr = &elt->wr;
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 3733d3e..c7f634e 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -60,6 +60,13 @@
 #include "mlx5.h"
 #include "mlx5_defs.h"
 
+/* RX element (scattered packets). */
+struct rxq_elt_sp {
+	struct ibv_recv_wr wr; /* Work Request. */
+	struct ibv_sge sges[MLX5_PMD_SGE_WR_N]; /* Scatter/Gather Elements. */
+	struct rte_mbuf *bufs[MLX5_PMD_SGE_WR_N]; /* SGEs buffers. */
+};
+
 /* RX element. */
 struct rxq_elt {
 	struct ibv_recv_wr wr; /* Work Request. */
@@ -84,8 +91,10 @@ struct rxq {
 	unsigned int elts_n; /* (*elts)[] length. */
 	unsigned int elts_head; /* Current index in (*elts)[]. */
 	union {
+		struct rxq_elt_sp (*sp)[]; /* Scattered RX elements. */
 		struct rxq_elt (*no_sp)[]; /* RX elements. */
 	} elts;
+	unsigned int sp:1; /* Use scattered RX elements. */
 	uint32_t mb_len; /* Length of a mp-issued mbuf. */
 	unsigned int socket; /* CPU socket ID for allocations. */
 	struct ibv_exp_res_domain *rd; /* Resource Domain. */
@@ -151,6 +160,7 @@ void mlx5_tx_queue_release(void *);
 /* mlx5_rxtx.c */
 
 uint16_t mlx5_tx_burst(void *, struct rte_mbuf **, uint16_t);
+uint16_t mlx5_rx_burst_sp(void *, struct rte_mbuf **, uint16_t);
 uint16_t mlx5_rx_burst(void *, struct rte_mbuf **, uint16_t);
 uint16_t removed_tx_burst(void *, struct rte_mbuf **, uint16_t);
 uint16_t removed_rx_burst(void *, struct rte_mbuf **, uint16_t);
-- 
2.1.0