[dpdk-dev] [PATCH v3 2/6] net/mlx4: restore full Rx support bypassing Verbs

Adrien Mazarguil adrien.mazarguil at 6wind.com
Wed Oct 4 20:48:54 CEST 2017


From: Moti Haimovsky <motih at mellanox.com>

This patch adds support for accessing the hardware directly when handling
Rx packets, eliminating the need to use Verbs in the Rx data path.

The number of scatter elements (segments) per packet is calculated on the
fly, according to the maximum expected packet size.

Signed-off-by: Vasily Philipov <vasilyf at mellanox.com>
Signed-off-by: Moti Haimovsky <motih at mellanox.com>
Acked-by: Adrien Mazarguil <adrien.mazarguil at 6wind.com>
---
 drivers/net/mlx4/mlx4_rxq.c   | 174 ++++++++++++++++++----------
 drivers/net/mlx4/mlx4_rxtx.c  | 226 +++++++++++++++++++++----------------
 drivers/net/mlx4/mlx4_rxtx.h  |  19 ++--
 drivers/net/mlx4/mlx4_utils.h |  20 ++++
 4 files changed, 270 insertions(+), 169 deletions(-)

diff --git a/drivers/net/mlx4/mlx4_rxq.c b/drivers/net/mlx4/mlx4_rxq.c
index 409983f..44d095d 100644
--- a/drivers/net/mlx4/mlx4_rxq.c
+++ b/drivers/net/mlx4/mlx4_rxq.c
@@ -51,6 +51,7 @@
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
 
+#include <rte_byteorder.h>
 #include <rte_common.h>
 #include <rte_errno.h>
 #include <rte_ethdev.h>
@@ -77,20 +78,18 @@ static int
 mlx4_rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n)
 {
 	unsigned int i;
-	struct rxq_elt (*elts)[elts_n] =
-		rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
-				  rxq->socket);
+	const uint32_t sges_n = 1 << rxq->sges_n;
+	struct rte_mbuf *(*elts)[elts_n] =
+		rte_calloc_socket("RXQ", 1, sizeof(*elts), 0, rxq->socket);
 
+	assert(rte_is_power_of_2(elts_n));
 	if (elts == NULL) {
 		rte_errno = ENOMEM;
 		ERROR("%p: can't allocate packets array", (void *)rxq);
 		goto error;
 	}
-	/* For each WR (packet). */
 	for (i = 0; (i != elts_n); ++i) {
-		struct rxq_elt *elt = &(*elts)[i];
-		struct ibv_recv_wr *wr = &elt->wr;
-		struct ibv_sge *sge = &(*elts)[i].sge;
+		volatile struct mlx4_wqe_data_seg *scat = &(*rxq->wqes)[i];
 		struct rte_mbuf *buf = rte_pktmbuf_alloc(rxq->mp);
 
 		if (buf == NULL) {
@@ -98,37 +97,35 @@ mlx4_rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n)
 			ERROR("%p: empty mbuf pool", (void *)rxq);
 			goto error;
 		}
-		elt->buf = buf;
-		wr->next = &(*elts)[(i + 1)].wr;
-		wr->sg_list = sge;
-		wr->num_sge = 1;
 		/* Headroom is reserved by rte_pktmbuf_alloc(). */
 		assert(buf->data_off == RTE_PKTMBUF_HEADROOM);
 		/* Buffer is supposed to be empty. */
 		assert(rte_pktmbuf_data_len(buf) == 0);
 		assert(rte_pktmbuf_pkt_len(buf) == 0);
-		/* sge->addr must be able to store a pointer. */
-		assert(sizeof(sge->addr) >= sizeof(uintptr_t));
-		/* SGE keeps its headroom. */
-		sge->addr = (uintptr_t)
-			((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
-		sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
-		sge->lkey = rxq->mr->lkey;
-		/* Redundant check for tailroom. */
-		assert(sge->length == rte_pktmbuf_tailroom(buf));
+		/* Only the first segment keeps headroom. */
+		if (i % sges_n)
+			buf->data_off = 0;
+		buf->port = rxq->port_id;
+		buf->data_len = rte_pktmbuf_tailroom(buf);
+		buf->pkt_len = rte_pktmbuf_tailroom(buf);
+		buf->nb_segs = 1;
+		*scat = (struct mlx4_wqe_data_seg){
+			.addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf,
+								  uintptr_t)),
+			.byte_count = rte_cpu_to_be_32(buf->data_len),
+			.lkey = rte_cpu_to_be_32(rxq->mr->lkey),
+		};
+		(*elts)[i] = buf;
 	}
-	/* The last WR pointer must be NULL. */
-	(*elts)[(i - 1)].wr.next = NULL;
-	DEBUG("%p: allocated and configured %u single-segment WRs",
-	      (void *)rxq, elts_n);
-	rxq->elts_n = elts_n;
-	rxq->elts_head = 0;
+	DEBUG("%p: allocated and configured %u segments (max %u packets)",
+	      (void *)rxq, elts_n, elts_n >> rxq->sges_n);
+	rxq->elts_n = log2above(elts_n);
 	rxq->elts = elts;
 	return 0;
 error:
 	if (elts != NULL) {
 		for (i = 0; (i != RTE_DIM(*elts)); ++i)
-			rte_pktmbuf_free_seg((*elts)[i].buf);
+			rte_pktmbuf_free_seg((*rxq->elts)[i]);
 		rte_free(elts);
 	}
 	DEBUG("%p: failed, freed everything", (void *)rxq);
@@ -146,17 +143,16 @@ static void
 mlx4_rxq_free_elts(struct rxq *rxq)
 {
 	unsigned int i;
-	unsigned int elts_n = rxq->elts_n;
-	struct rxq_elt (*elts)[elts_n] = rxq->elts;
 
-	DEBUG("%p: freeing WRs", (void *)rxq);
+	if (rxq->elts == NULL)
+		return;
+	DEBUG("%p: freeing Rx queue elements", (void *)rxq);
+	for (i = 0; i != (1u << rxq->elts_n); ++i)
+		if ((*rxq->elts)[i] != NULL)
+			rte_pktmbuf_free_seg((*rxq->elts)[i]);
+	rte_free(rxq->elts);
 	rxq->elts_n = 0;
 	rxq->elts = NULL;
-	if (elts == NULL)
-		return;
-	for (i = 0; (i != RTE_DIM(*elts)); ++i)
-		rte_pktmbuf_free_seg((*elts)[i].buf);
-	rte_free(elts);
 }
 
 /**
@@ -193,12 +189,15 @@ mlx4_rxq_cleanup(struct rxq *rxq)
  *   Completion queue to associate with QP.
  * @param desc
  *   Number of descriptors in QP (hint only).
+ * @param sges_n
+ *   Maximum number of segments per packet.
  *
  * @return
  *   QP pointer or NULL in case of error and rte_errno is set.
  */
 static struct ibv_qp *
-mlx4_rxq_setup_qp(struct priv *priv, struct ibv_cq *cq, uint16_t desc)
+mlx4_rxq_setup_qp(struct priv *priv, struct ibv_cq *cq, uint16_t desc,
+		  uint32_t sges_n)
 {
 	struct ibv_qp *qp;
 	struct ibv_qp_init_attr attr = {
@@ -211,8 +210,8 @@ mlx4_rxq_setup_qp(struct priv *priv, struct ibv_cq *cq, uint16_t desc)
 			.max_recv_wr = ((priv->device_attr.max_qp_wr < desc) ?
 					priv->device_attr.max_qp_wr :
 					desc),
-			/* Max number of scatter/gather elements in a WR. */
-			.max_recv_sge = 1,
+			/* Maximum number of segments per packet. */
+			.max_recv_sge = sges_n,
 		},
 		.qp_type = IBV_QPT_RAW_PACKET,
 	};
@@ -248,13 +247,15 @@ mlx4_rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
 	       struct rte_mempool *mp)
 {
 	struct priv *priv = dev->data->dev_private;
+	struct mlx4dv_obj mlxdv;
+	struct mlx4dv_qp dv_qp;
+	struct mlx4dv_cq dv_cq;
 	struct rxq tmpl = {
 		.priv = priv,
 		.mp = mp,
 		.socket = socket
 	};
 	struct ibv_qp_attr mod;
-	struct ibv_recv_wr *bad_wr;
 	unsigned int mb_len;
 	int ret;
 
@@ -269,11 +270,31 @@ mlx4_rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
 	assert(mb_len >= RTE_PKTMBUF_HEADROOM);
 	if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
 	    (mb_len - RTE_PKTMBUF_HEADROOM)) {
-		;
+		tmpl.sges_n = 0;
 	} else if (dev->data->dev_conf.rxmode.enable_scatter) {
-		WARN("%p: scattered mode has been requested but is"
-		     " not supported, this may lead to packet loss",
-		     (void *)dev);
+		uint32_t size =
+			RTE_PKTMBUF_HEADROOM +
+			dev->data->dev_conf.rxmode.max_rx_pkt_len;
+		uint32_t sges_n;
+
+		/*
+		 * Determine the number of SGEs needed for a full packet
+		 * and round it to the next power of two.
+		 */
+		sges_n = log2above((size / mb_len) + !!(size % mb_len));
+		tmpl.sges_n = sges_n;
+		/* Make sure sges_n did not overflow. */
+		size = mb_len * (1 << tmpl.sges_n);
+		size -= RTE_PKTMBUF_HEADROOM;
+		if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) {
+			rte_errno = EOVERFLOW;
+			ERROR("%p: too many SGEs (%u) needed to handle"
+			      " requested maximum packet size %u",
+			      (void *)dev,
+			      1 << sges_n,
+			      dev->data->dev_conf.rxmode.max_rx_pkt_len);
+			goto error;
+		}
 	} else {
 		WARN("%p: the requested maximum Rx packet size (%u) is"
 		     " larger than a single mbuf (%u) and scattered"
@@ -282,6 +303,17 @@ mlx4_rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
 		     dev->data->dev_conf.rxmode.max_rx_pkt_len,
 		     mb_len - RTE_PKTMBUF_HEADROOM);
 	}
+	DEBUG("%p: maximum number of segments per packet: %u",
+	      (void *)dev, 1 << tmpl.sges_n);
+	if (desc % (1 << tmpl.sges_n)) {
+		rte_errno = EINVAL;
+		ERROR("%p: number of RX queue descriptors (%u) is not a"
+		      " multiple of maximum segments per packet (%u)",
+		      (void *)dev,
+		      desc,
+		      1 << tmpl.sges_n);
+		goto error;
+	}
 	/* Use the entire Rx mempool as the memory region. */
 	tmpl.mr = mlx4_mp2mr(priv->pd, mp);
 	if (tmpl.mr == NULL) {
@@ -306,7 +338,8 @@ mlx4_rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
 			goto error;
 		}
 	}
-	tmpl.cq = ibv_create_cq(priv->ctx, desc, NULL, tmpl.channel, 0);
+	tmpl.cq = ibv_create_cq(priv->ctx, desc >> tmpl.sges_n, NULL,
+				tmpl.channel, 0);
 	if (tmpl.cq == NULL) {
 		rte_errno = ENOMEM;
 		ERROR("%p: CQ creation failure: %s",
@@ -317,7 +350,8 @@ mlx4_rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
 	      priv->device_attr.max_qp_wr);
 	DEBUG("priv->device_attr.max_sge is %d",
 	      priv->device_attr.max_sge);
-	tmpl.qp = mlx4_rxq_setup_qp(priv, tmpl.cq, desc);
+	tmpl.qp = mlx4_rxq_setup_qp(priv, tmpl.cq, desc >> tmpl.sges_n,
+				    1 << tmpl.sges_n);
 	if (tmpl.qp == NULL) {
 		ERROR("%p: QP creation failure: %s",
 		      (void *)dev, strerror(rte_errno));
@@ -336,21 +370,6 @@ mlx4_rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
 		      (void *)dev, strerror(rte_errno));
 		goto error;
 	}
-	ret = mlx4_rxq_alloc_elts(&tmpl, desc);
-	if (ret) {
-		ERROR("%p: RXQ allocation failed: %s",
-		      (void *)dev, strerror(rte_errno));
-		goto error;
-	}
-	ret = ibv_post_recv(tmpl.qp, &(*tmpl.elts)[0].wr, &bad_wr);
-	if (ret) {
-		rte_errno = ret;
-		ERROR("%p: ibv_post_recv() failed for WR %p: %s",
-		      (void *)dev,
-		      (void *)bad_wr,
-		      strerror(rte_errno));
-		goto error;
-	}
 	mod = (struct ibv_qp_attr){
 		.qp_state = IBV_QPS_RTR
 	};
@@ -361,14 +380,43 @@ mlx4_rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
 		      (void *)dev, strerror(rte_errno));
 		goto error;
 	}
+	/* Retrieve device queue information. */
+	mlxdv.cq.in = tmpl.cq;
+	mlxdv.cq.out = &dv_cq;
+	mlxdv.qp.in = tmpl.qp;
+	mlxdv.qp.out = &dv_qp;
+	ret = mlx4dv_init_obj(&mlxdv, MLX4DV_OBJ_QP | MLX4DV_OBJ_CQ);
+	if (ret) {
+		ERROR("%p: failed to obtain device information", (void *)dev);
+		goto error;
+	}
+	tmpl.wqes =
+		(volatile struct mlx4_wqe_data_seg (*)[])
+		((uintptr_t)dv_qp.buf.buf + dv_qp.rq.offset);
+	tmpl.rq_db = dv_qp.rdb;
+	tmpl.rq_ci = 0;
+	tmpl.mcq.buf = dv_cq.buf.buf;
+	tmpl.mcq.cqe_cnt = dv_cq.cqe_cnt;
+	tmpl.mcq.set_ci_db = dv_cq.set_ci_db;
+	tmpl.mcq.cqe_64 = (dv_cq.cqe_size & 64) ? 1 : 0;
 	/* Save port ID. */
 	tmpl.port_id = dev->data->port_id;
 	DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id);
+	ret = mlx4_rxq_alloc_elts(&tmpl, desc);
+	if (ret) {
+		ERROR("%p: RXQ allocation failed: %s",
+		      (void *)dev, strerror(rte_errno));
+		goto error;
+	}
 	/* Clean up rxq in case we're reinitializing it. */
 	DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq);
 	mlx4_rxq_cleanup(rxq);
 	*rxq = tmpl;
 	DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);
+	/* Update doorbell counter. */
+	rxq->rq_ci = desc >> rxq->sges_n;
+	rte_wmb();
+	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
 	return 0;
 error:
 	ret = rte_errno;
@@ -406,6 +454,12 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	struct rxq *rxq = dev->data->rx_queues[idx];
 	int ret;
 
+	if (!rte_is_power_of_2(desc)) {
+		desc = 1 << log2above(desc);
+		WARN("%p: increased number of descriptors in RX queue %u"
+		     " to the next power of two (%d)",
+		     (void *)dev, idx, desc);
+	}
 	DEBUG("%p: configuring queue %u for %u descriptors",
 	      (void *)dev, idx, desc);
 	if (idx >= dev->data->nb_rx_queues) {
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 35367a2..fd8ef7b 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -509,9 +509,44 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 }
 
 /**
- * DPDK callback for Rx.
+ * Poll one CQE from CQ.
  *
- * The following function doesn't manage scattered packets.
+ * @param rxq
+ *   Pointer to the receive queue structure (its mcq member is polled).
+ * @param[out] out
+ *   CQE at the current consumer index; written even when 0 is returned.
+ *
+ * @return
+ *   Byte count (byte_cnt) of the CQE, 0 in case there is no completion.
+ */
+static unsigned int
+mlx4_cq_poll_one(struct rxq *rxq, struct mlx4_cqe **out)
+{
+	int ret = 0; /* NOTE(review): int here, returned as unsigned int. */
+	struct mlx4_cqe *cqe = NULL;
+	struct mlx4_cq *cq = &rxq->mcq;
+
+	cqe = (struct mlx4_cqe *)mlx4_get_cqe(cq, cq->cons_index);
+	if (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
+	    !!(cq->cons_index & cq->cqe_cnt))
+		goto out; /* Owner bit mismatch: no completion, ret stays 0. */
+	/*
+	 * Make sure we read CQ entry contents after we've checked the
+	 * ownership bit.
+	 */
+	rte_rmb();
+	assert(!(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK));
+	assert((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) !=
+	       MLX4_CQE_OPCODE_ERROR);
+	ret = rte_be_to_cpu_32(cqe->byte_cnt);
+	++cq->cons_index; /* Consume this CQE. */
+out:
+	*out = cqe; /* Set on both paths, consumed or not. */
+	return ret;
+}
+
+/**
+ * DPDK callback for Rx with scattered packets support.
  *
  * @param dpdk_rxq
  *   Generic pointer to Rx queue structure.
@@ -526,112 +561,107 @@ mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
 uint16_t
 mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
-	struct rxq *rxq = (struct rxq *)dpdk_rxq;
-	struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
-	const unsigned int elts_n = rxq->elts_n;
-	unsigned int elts_head = rxq->elts_head;
-	struct ibv_wc wcs[pkts_n];
-	struct ibv_recv_wr *wr_head = NULL;
-	struct ibv_recv_wr **wr_next = &wr_head;
-	struct ibv_recv_wr *wr_bad = NULL;
-	unsigned int i;
-	unsigned int pkts_ret = 0;
-	int ret;
+	struct rxq *rxq = dpdk_rxq;
+	const uint32_t wr_cnt = (1 << rxq->elts_n) - 1;
+	const uint16_t sges_n = rxq->sges_n;
+	struct rte_mbuf *pkt = NULL;
+	struct rte_mbuf *seg = NULL;
+	unsigned int i = 0;
+	uint32_t rq_ci = rxq->rq_ci << sges_n;
+	int len = 0;
 
-	ret = ibv_poll_cq(rxq->cq, pkts_n, wcs);
-	if (unlikely(ret == 0))
-		return 0;
-	if (unlikely(ret < 0)) {
-		DEBUG("rxq=%p, ibv_poll_cq() failed (wc_n=%d)",
-		      (void *)rxq, ret);
-		return 0;
-	}
-	assert(ret <= (int)pkts_n);
-	/* For each work completion. */
-	for (i = 0; i != (unsigned int)ret; ++i) {
-		struct ibv_wc *wc = &wcs[i];
-		struct rxq_elt *elt = &(*elts)[elts_head];
-		struct ibv_recv_wr *wr = &elt->wr;
-		uint32_t len = wc->byte_len;
-		struct rte_mbuf *seg = elt->buf;
-		struct rte_mbuf *rep;
+	while (pkts_n) {
+		struct mlx4_cqe *cqe;
+		uint32_t idx = rq_ci & wr_cnt;
+		struct rte_mbuf *rep = (*rxq->elts)[idx];
+		volatile struct mlx4_wqe_data_seg *scat = &(*rxq->wqes)[idx];
 
-		/* Sanity checks. */
-		assert(wr->sg_list == &elt->sge);
-		assert(wr->num_sge == 1);
-		assert(elts_head < rxq->elts_n);
-		assert(rxq->elts_head < rxq->elts_n);
-		/*
-		 * Fetch initial bytes of packet descriptor into a
-		 * cacheline while allocating rep.
-		 */
-		rte_mbuf_prefetch_part1(seg);
-		rte_mbuf_prefetch_part2(seg);
-		/* Link completed WRs together for repost. */
-		*wr_next = wr;
-		wr_next = &wr->next;
-		if (unlikely(wc->status != IBV_WC_SUCCESS)) {
-			/* Whatever, just repost the offending WR. */
-			DEBUG("rxq=%p: bad work completion status (%d): %s",
-			      (void *)rxq, wc->status,
-			      ibv_wc_status_str(wc->status));
-			/* Increment dropped packets counter. */
-			++rxq->stats.idropped;
-			goto repost;
-		}
+		/* Update the 'next' pointer of the previous segment. */
+		if (pkt)
+			seg->next = rep;
+		seg = rep;
+		rte_prefetch0(seg);
+		rte_prefetch0(scat);
 		rep = rte_mbuf_raw_alloc(rxq->mp);
 		if (unlikely(rep == NULL)) {
-			/*
-			 * Unable to allocate a replacement mbuf,
-			 * repost WR.
-			 */
-			DEBUG("rxq=%p: can't allocate a new mbuf",
-			      (void *)rxq);
-			/* Increase out of memory counters. */
 			++rxq->stats.rx_nombuf;
-			++rxq->priv->dev->data->rx_mbuf_alloc_failed;
-			goto repost;
+			if (!pkt) {
+				/*
+				 * No buffers before we even started,
+				 * bail out silently.
+				 */
+				break;
+			}
+			while (pkt != seg) {
+				assert(pkt != (*rxq->elts)[idx]);
+				rep = pkt->next;
+				pkt->next = NULL;
+				pkt->nb_segs = 1;
+				rte_mbuf_raw_free(pkt);
+				pkt = rep;
+			}
+			break;
+		}
+		if (!pkt) {
+			/* Looking for the new packet. */
+			len = mlx4_cq_poll_one(rxq, &cqe);
+			if (!len) {
+				rte_mbuf_raw_free(rep);
+				break;
+			}
+			if (unlikely(len < 0)) {
+				/* Rx error, packet is likely too large. */
+				rte_mbuf_raw_free(rep);
+				++rxq->stats.idropped;
+				goto skip;
+			}
+			pkt = seg;
+			pkt->packet_type = 0;
+			pkt->ol_flags = 0;
+			pkt->pkt_len = len;
 		}
-		/* Reconfigure sge to use rep instead of seg. */
-		elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
-		assert(elt->sge.lkey == rxq->mr->lkey);
-		elt->buf = rep;
-		/* Update seg information. */
-		seg->data_off = RTE_PKTMBUF_HEADROOM;
-		seg->nb_segs = 1;
-		seg->port = rxq->port_id;
-		seg->next = NULL;
-		seg->pkt_len = len;
+		rep->nb_segs = 1;
+		rep->port = rxq->port_id;
+		rep->data_len = seg->data_len;
+		rep->data_off = seg->data_off;
+		(*rxq->elts)[idx] = rep;
+		/*
+		 * Fill NIC descriptor with the new buffer. The lkey and size
+		 * of the buffers are already known, only the buffer address
+		 * changes.
+		 */
+		scat->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
+		if (len > seg->data_len) {
+			len -= seg->data_len;
+			++pkt->nb_segs;
+			++rq_ci;
+			continue;
+		}
+		/* The last segment. */
 		seg->data_len = len;
-		seg->packet_type = 0;
-		seg->ol_flags = 0;
+		/* Increment bytes counter. */
+		rxq->stats.ibytes += pkt->pkt_len;
 		/* Return packet. */
-		*(pkts++) = seg;
-		++pkts_ret;
-		/* Increase bytes counter. */
-		rxq->stats.ibytes += len;
-repost:
-		if (++elts_head >= elts_n)
-			elts_head = 0;
-		continue;
+		*(pkts++) = pkt;
+		pkt = NULL;
+		--pkts_n;
+		++i;
+skip:
+		/* Align consumer index to the next stride. */
+		rq_ci >>= sges_n;
+		++rq_ci;
+		rq_ci <<= sges_n;
 	}
-	if (unlikely(i == 0))
+	if (unlikely(i == 0 && (rq_ci >> sges_n) == rxq->rq_ci))
 		return 0;
-	/* Repost WRs. */
-	*wr_next = NULL;
-	assert(wr_head);
-	ret = ibv_post_recv(rxq->qp, wr_head, &wr_bad);
-	if (unlikely(ret)) {
-		/* Inability to repost WRs is fatal. */
-		DEBUG("%p: recv_burst(): failed (ret=%d)",
-		      (void *)rxq->priv,
-		      ret);
-		abort();
-	}
-	rxq->elts_head = elts_head;
-	/* Increase packets counter. */
-	rxq->stats.ipackets += pkts_ret;
-	return pkts_ret;
+	/* Update the consumer index. */
+	rxq->rq_ci = rq_ci >> sges_n;
+	rte_wmb();
+	*rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+	*rxq->mcq.set_ci_db = rte_cpu_to_be_32(rxq->mcq.cons_index & 0xffffff);
+	/* Increment packets counter. */
+	rxq->stats.ipackets += i;
+	return i;
 }
 
 /**
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index cc5951c..ac84177 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -62,13 +62,6 @@ struct mlx4_rxq_stats {
 	uint64_t rx_nombuf; /**< Total of Rx mbuf allocation failures. */
 };
 
-/** Rx element. */
-struct rxq_elt {
-	struct ibv_recv_wr wr; /**< Work request. */
-	struct ibv_sge sge; /**< Scatter/gather element. */
-	struct rte_mbuf *buf; /**< Buffer. */
-};
-
 /** Rx queue descriptor. */
 struct rxq {
 	struct priv *priv; /**< Back pointer to private data. */
@@ -77,10 +70,14 @@ struct rxq {
 	struct ibv_cq *cq; /**< Completion queue. */
 	struct ibv_qp *qp; /**< Queue pair. */
 	struct ibv_comp_channel *channel; /**< Rx completion channel. */
-	unsigned int port_id; /**< Port ID for incoming packets. */
-	unsigned int elts_n; /**< (*elts)[] length. */
-	unsigned int elts_head; /**< Current index in (*elts)[]. */
-	struct rxq_elt (*elts)[]; /**< Rx elements. */
+	uint16_t rq_ci; /**< Saved RQ consumer index. */
+	uint16_t port_id; /**< Port ID for incoming packets. */
+	uint16_t sges_n; /**< Number of segments per packet (log2 value). */
+	uint16_t elts_n; /**< Mbuf queue size (log2 value). */
+	struct rte_mbuf *(*elts)[]; /**< Rx elements. */
+	volatile struct mlx4_wqe_data_seg (*wqes)[]; /**< HW queue entries. */
+	volatile uint32_t *rq_db; /**< RQ doorbell record. */
+	struct mlx4_cq mcq;  /**< Info for directly manipulating the CQ. */
 	struct mlx4_rxq_stats stats; /**< Rx queue counters. */
 	unsigned int socket; /**< CPU socket ID for allocations. */
 };
diff --git a/drivers/net/mlx4/mlx4_utils.h b/drivers/net/mlx4/mlx4_utils.h
index 0fbdc71..d6f729f 100644
--- a/drivers/net/mlx4/mlx4_utils.h
+++ b/drivers/net/mlx4/mlx4_utils.h
@@ -108,4 +108,24 @@ pmd_drv_log_basename(const char *s)
 
 int mlx4_fd_set_non_blocking(int fd);
 
+/**
+ * Return the base-2 logarithm of the nearest power of two at or above v.
+ *
+ * @param v
+ *   Input value.
+ *
+ * @return
+ *   log2 of the nearest power of two at or above v (0 when v is 0 or 1).
+ */
+static inline unsigned int
+log2above(unsigned int v)
+{
+	unsigned int l; /* Number of right shifts performed. */
+	unsigned int r; /* Becomes 1 if any shifted-out bit was set. */
+
+	for (l = 0, r = 0; (v >> 1); ++l, v >>= 1)
+		r |= (v & 1);
+	return l + r;
+}
+
 #endif /* MLX4_UTILS_H_ */
-- 
2.1.4



More information about the dev mailing list