[dpdk-dev] [PATCH v2 2/6] net/mlx4: get back Rx flow functionality

Matan Azrad matan at mellanox.com
Tue Oct 3 12:48:27 CEST 2017


From: Moti Haimovsky <motih at mellanox.com>

This patch adds support for accessing the hardware directly when handling
Rx packets eliminating the need to use verbs in the Rx datapath.

Now the number of scatters is calculated on the fly, according to the
maximum expected packet size.

Signed-off-by: Vasily Philipov <vasilyf at mellanox.com>
---
 drivers/net/mlx4/mlx4.h       |  11 --
 drivers/net/mlx4/mlx4_rxq.c   | 176 +++++++++++++++++++-------------
 drivers/net/mlx4/mlx4_rxtx.c  | 229 ++++++++++++++++++++++++------------------
 drivers/net/mlx4/mlx4_rxtx.h  |  19 ++--
 drivers/net/mlx4/mlx4_utils.h |  20 ++++
 5 files changed, 268 insertions(+), 187 deletions(-)

diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 93e5502..b6e1ef2 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -57,17 +57,6 @@
 /* Maximum size for inline data. */
 #define MLX4_PMD_MAX_INLINE 0
 
-/*
- * Maximum number of cached Memory Pools (MPs) per TX queue. Each RTE MP
- * from which buffers are to be transmitted will have to be mapped by this
- * driver to their own Memory Region (MR). This is a slow operation.
- *
- * This value is always 1 for RX queues.
- */
-#ifndef MLX4_PMD_TX_MP_CACHE
-#define MLX4_PMD_TX_MP_CACHE 8
-#endif
-
 /* Interrupt alarm timeout value in microseconds. */
 #define MLX4_INTR_ALARM_TIMEOUT 100000
 
diff --git a/drivers/net/mlx4/mlx4_rxq.c b/drivers/net/mlx4/mlx4_rxq.c
index 409983f..d7447c4 100644
--- a/drivers/net/mlx4/mlx4_rxq.c
+++ b/drivers/net/mlx4/mlx4_rxq.c
@@ -51,6 +51,7 @@
 #pragma GCC diagnostic error "-Wpedantic"
 #endif
 
+#include <rte_byteorder.h>
 #include <rte_common.h>
 #include <rte_errno.h>
 #include <rte_ethdev.h>
@@ -77,60 +78,63 @@
 mlx4_rxq_alloc_elts(struct rxq *rxq, unsigned int elts_n)
 {
 	unsigned int i;
-	struct rxq_elt (*elts)[elts_n] =
-		rte_calloc_socket("RXQ elements", 1, sizeof(*elts), 0,
-				  rxq->socket);
+	const unsigned int sge_n = 1 << rxq->sge_n;
+	struct rte_mbuf *(*elts)[elts_n] =
+		rte_calloc_socket("RXQ", 1, sizeof(*elts), 0, rxq->socket);
 
 	if (elts == NULL) {
+		elts_n = 0;
 		rte_errno = ENOMEM;
 		ERROR("%p: can't allocate packets array", (void *)rxq);
 		goto error;
 	}
-	/* For each WR (packet). */
-	for (i = 0; (i != elts_n); ++i) {
-		struct rxq_elt *elt = &(*elts)[i];
-		struct ibv_recv_wr *wr = &elt->wr;
-		struct ibv_sge *sge = &(*elts)[i].sge;
-		struct rte_mbuf *buf = rte_pktmbuf_alloc(rxq->mp);
+	rxq->elts = elts;
+	for (i = 0; i != elts_n; ++i) {
+		struct rte_mbuf *buf;
+		volatile struct mlx4_wqe_data_seg *scat =
+			&(*rxq->hw.wqes)[i];
 
+		buf = rte_pktmbuf_alloc(rxq->mp);
 		if (buf == NULL) {
 			rte_errno = ENOMEM;
 			ERROR("%p: empty mbuf pool", (void *)rxq);
 			goto error;
 		}
-		elt->buf = buf;
-		wr->next = &(*elts)[(i + 1)].wr;
-		wr->sg_list = sge;
-		wr->num_sge = 1;
 		/* Headroom is reserved by rte_pktmbuf_alloc(). */
 		assert(buf->data_off == RTE_PKTMBUF_HEADROOM);
 		/* Buffer is supposed to be empty. */
 		assert(rte_pktmbuf_data_len(buf) == 0);
 		assert(rte_pktmbuf_pkt_len(buf) == 0);
-		/* sge->addr must be able to store a pointer. */
-		assert(sizeof(sge->addr) >= sizeof(uintptr_t));
-		/* SGE keeps its headroom. */
-		sge->addr = (uintptr_t)
-			((uint8_t *)buf->buf_addr + RTE_PKTMBUF_HEADROOM);
-		sge->length = (buf->buf_len - RTE_PKTMBUF_HEADROOM);
-		sge->lkey = rxq->mr->lkey;
-		/* Redundant check for tailroom. */
-		assert(sge->length == rte_pktmbuf_tailroom(buf));
+		assert(!buf->next);
+		/* Only the first segment keeps headroom. */
+		if (i % sge_n)
+			buf->data_off = 0;
+		buf->port = rxq->port_id;
+		buf->data_len = rte_pktmbuf_tailroom(buf);
+		buf->pkt_len = rte_pktmbuf_tailroom(buf);
+		buf->nb_segs = 1;
+		/* scat->addr must be able to store a pointer. */
+		assert(sizeof(scat->addr) >= sizeof(uintptr_t));
+		*scat = (struct mlx4_wqe_data_seg){
+			.addr =
+			    rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, uintptr_t)),
+			.byte_count = rte_cpu_to_be_32(buf->data_len),
+			.lkey = rte_cpu_to_be_32(rxq->mr->lkey),
+		};
+		(*rxq->elts)[i] = buf;
 	}
-	/* The last WR pointer must be NULL. */
-	(*elts)[(i - 1)].wr.next = NULL;
-	DEBUG("%p: allocated and configured %u single-segment WRs",
-	      (void *)rxq, elts_n);
-	rxq->elts_n = elts_n;
-	rxq->elts_head = 0;
-	rxq->elts = elts;
+	DEBUG("%p: allocated and configured %u segments (max %u packets)",
+	      (void *)rxq, elts_n, elts_n >> rxq->sge_n);
+	rxq->elts_n = log2above(elts_n);
 	return 0;
 error:
-	if (elts != NULL) {
-		for (i = 0; (i != RTE_DIM(*elts)); ++i)
-			rte_pktmbuf_free_seg((*elts)[i].buf);
-		rte_free(elts);
+	for (i = 0; i != elts_n; ++i) {
+		if ((*rxq->elts)[i] != NULL)
+			rte_pktmbuf_free_seg((*rxq->elts)[i]);
+		(*rxq->elts)[i] = NULL;
 	}
+	rte_free(rxq->elts);
+	rxq->elts = NULL;
 	DEBUG("%p: failed, freed everything", (void *)rxq);
 	assert(rte_errno > 0);
 	return -rte_errno;
@@ -146,17 +150,18 @@
 mlx4_rxq_free_elts(struct rxq *rxq)
 {
 	unsigned int i;
-	unsigned int elts_n = rxq->elts_n;
-	struct rxq_elt (*elts)[elts_n] = rxq->elts;
 
 	DEBUG("%p: freeing WRs", (void *)rxq);
+	if (rxq->elts == NULL)
+		return;
+
+	for (i = 0; i != (1u << rxq->elts_n); ++i) {
+		if ((*rxq->elts)[i] != NULL)
+			rte_pktmbuf_free_seg((*rxq->elts)[i]);
+	}
+	rte_free(rxq->elts);
 	rxq->elts_n = 0;
 	rxq->elts = NULL;
-	if (elts == NULL)
-		return;
-	for (i = 0; (i != RTE_DIM(*elts)); ++i)
-		rte_pktmbuf_free_seg((*elts)[i].buf);
-	rte_free(elts);
 }
 
 /**
@@ -198,7 +203,8 @@
  *   QP pointer or NULL in case of error and rte_errno is set.
  */
 static struct ibv_qp *
-mlx4_rxq_setup_qp(struct priv *priv, struct ibv_cq *cq, uint16_t desc)
+mlx4_rxq_setup_qp(struct priv *priv, struct ibv_cq *cq,
+		  uint16_t desc, unsigned int sge_n)
 {
 	struct ibv_qp *qp;
 	struct ibv_qp_init_attr attr = {
@@ -212,7 +218,7 @@
 					priv->device_attr.max_qp_wr :
 					desc),
 			/* Max number of scatter/gather elements in a WR. */
-			.max_recv_sge = 1,
+			.max_recv_sge = sge_n,
 		},
 		.qp_type = IBV_QPT_RAW_PACKET,
 	};
@@ -248,32 +254,43 @@
 	       struct rte_mempool *mp)
 {
 	struct priv *priv = dev->data->dev_private;
+	struct mlx4dv_obj mlxdv;
+	struct mlx4dv_qp dv_qp;
+	struct mlx4dv_cq dv_cq;
 	struct rxq tmpl = {
 		.priv = priv,
 		.mp = mp,
 		.socket = socket
 	};
 	struct ibv_qp_attr mod;
-	struct ibv_recv_wr *bad_wr;
 	unsigned int mb_len;
 	int ret;
 
 	(void)conf; /* Thresholds configuration (ignored). */
 	mb_len = rte_pktmbuf_data_room_size(mp);
-	if (desc == 0) {
-		rte_errno = EINVAL;
-		ERROR("%p: invalid number of Rx descriptors", (void *)dev);
-		goto error;
-	}
 	/* Enable scattered packets support for this queue if necessary. */
 	assert(mb_len >= RTE_PKTMBUF_HEADROOM);
 	if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
 	    (mb_len - RTE_PKTMBUF_HEADROOM)) {
-		;
+		tmpl.sge_n = 0;
 	} else if (dev->data->dev_conf.rxmode.enable_scatter) {
-		WARN("%p: scattered mode has been requested but is"
-		     " not supported, this may lead to packet loss",
-		     (void *)dev);
+		unsigned int sges_n;
+		unsigned int rx_pkt_len =
+				dev->data->dev_conf.rxmode.jumbo_frame ?
+				dev->data->dev_conf.rxmode.max_rx_pkt_len :
+				ETHER_MTU;
+
+		if (rx_pkt_len < ETHER_MTU)
+			rx_pkt_len = ETHER_MTU;
+		/* Only the first mbuf has a headroom */
+		rx_pkt_len = rx_pkt_len - mb_len + RTE_PKTMBUF_HEADROOM;
+		/*
+		 * Determine the number of SGEs needed for a full packet
+		 * and round it to the next power of two.
+		 */
+		sges_n = (rx_pkt_len / mb_len) + !!(rx_pkt_len % mb_len) + 1;
+		tmpl.sge_n = log2above(sges_n);
+		desc >>= tmpl.sge_n;
 	} else {
 		WARN("%p: the requested maximum Rx packet size (%u) is"
 		     " larger than a single mbuf (%u) and scattered"
@@ -282,6 +299,8 @@
 		     dev->data->dev_conf.rxmode.max_rx_pkt_len,
 		     mb_len - RTE_PKTMBUF_HEADROOM);
 	}
+	DEBUG("%p: number of sges %u (%u WRs)",
+	      (void *)dev, 1 << tmpl.sge_n, desc);
 	/* Use the entire Rx mempool as the memory region. */
 	tmpl.mr = mlx4_mp2mr(priv->pd, mp);
 	if (tmpl.mr == NULL) {
@@ -317,7 +336,7 @@
 	      priv->device_attr.max_qp_wr);
 	DEBUG("priv->device_attr.max_sge is %d",
 	      priv->device_attr.max_sge);
-	tmpl.qp = mlx4_rxq_setup_qp(priv, tmpl.cq, desc);
+	tmpl.qp = mlx4_rxq_setup_qp(priv, tmpl.cq, desc, 1 << tmpl.sge_n);
 	if (tmpl.qp == NULL) {
 		ERROR("%p: QP creation failure: %s",
 		      (void *)dev, strerror(rte_errno));
@@ -336,21 +355,6 @@
 		      (void *)dev, strerror(rte_errno));
 		goto error;
 	}
-	ret = mlx4_rxq_alloc_elts(&tmpl, desc);
-	if (ret) {
-		ERROR("%p: RXQ allocation failed: %s",
-		      (void *)dev, strerror(rte_errno));
-		goto error;
-	}
-	ret = ibv_post_recv(tmpl.qp, &(*tmpl.elts)[0].wr, &bad_wr);
-	if (ret) {
-		rte_errno = ret;
-		ERROR("%p: ibv_post_recv() failed for WR %p: %s",
-		      (void *)dev,
-		      (void *)bad_wr,
-		      strerror(rte_errno));
-		goto error;
-	}
 	mod = (struct ibv_qp_attr){
 		.qp_state = IBV_QPS_RTR
 	};
@@ -361,14 +365,44 @@
 		      (void *)dev, strerror(rte_errno));
 		goto error;
 	}
+	/* Get HW depended info */
+	mlxdv.cq.in = tmpl.cq;
+	mlxdv.cq.out = &dv_cq;
+	mlxdv.qp.in = tmpl.qp;
+	mlxdv.qp.out = &dv_qp;
+	ret = mlx4dv_init_obj(&mlxdv, MLX4DV_OBJ_QP | MLX4DV_OBJ_CQ);
+	if (ret) {
+		ERROR("%p: Failed to retrieve device obj info", (void *)dev);
+		goto error;
+	}
+	/* Init HW depended fields */
+	tmpl.hw.wqes =
+		(volatile struct mlx4_wqe_data_seg (*)[])
+		((char *)dv_qp.buf.buf + dv_qp.rq.offset);
+	tmpl.hw.rq_db = dv_qp.rdb;
+	tmpl.hw.rq_ci = 0;
+	tmpl.mcq.buf = dv_cq.buf.buf;
+	tmpl.mcq.cqe_cnt = dv_cq.cqe_cnt;
+	tmpl.mcq.set_ci_db = dv_cq.set_ci_db;
+	tmpl.mcq.cqe_64 = (dv_cq.cqe_size & 64) ? 1 : 0;
 	/* Save port ID. */
 	tmpl.port_id = dev->data->port_id;
 	DEBUG("%p: RTE port ID: %u", (void *)rxq, tmpl.port_id);
+	ret = mlx4_rxq_alloc_elts(&tmpl, desc << tmpl.sge_n);
+	if (ret) {
+		ERROR("%p: RXQ allocation failed: %s",
+		      (void *)dev, strerror(rte_errno));
+		goto error;
+	}
 	/* Clean up rxq in case we're reinitializing it. */
 	DEBUG("%p: cleaning-up old rxq just in case", (void *)rxq);
 	mlx4_rxq_cleanup(rxq);
 	*rxq = tmpl;
 	DEBUG("%p: rxq updated with %p", (void *)rxq, (void *)&tmpl);
+	/* Update doorbell counter. */
+	rxq->hw.rq_ci = desc;
+	rte_wmb();
+	*rxq->hw.rq_db = rte_cpu_to_be_32(rxq->hw.rq_ci);
 	return 0;
 error:
 	ret = rte_errno;
@@ -406,6 +440,12 @@
 	struct rxq *rxq = dev->data->rx_queues[idx];
 	int ret;
 
+	if (!rte_is_power_of_2(desc)) {
+		desc = 1 << log2above(desc);
+		WARN("%p: increased number of descriptors in RX queue %u"
+		     " to the next power of two (%d)",
+		     (void *)dev, idx, desc);
+	}
 	DEBUG("%p: configuring queue %u for %u descriptors",
 	      (void *)dev, idx, desc);
 	if (idx >= dev->data->nb_rx_queues) {
diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 55c8e9a..e45bb3b 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -521,9 +521,45 @@
 }
 
 /**
- * DPDK callback for Rx.
+ * Poll one CQE from CQ.
  *
- * The following function doesn't manage scattered packets.
+ * @param rxq
+ *   Pointer to the receive queue structure.
+ * @param[out] out
+ *   Just polled CQE.
+ *
+ * @return
+ *   Number of bytes of the CQE, 0 in case there is no completion.
+ */
+static unsigned int
+mlx4_cq_poll_one(struct rxq *rxq,
+		 struct mlx4_cqe **out)
+{
+	int ret = 0;
+	struct mlx4_cqe *cqe = NULL;
+	struct mlx4_cq *cq = &rxq->mcq;
+
+	cqe = (struct mlx4_cqe *)mlx4_get_cqe(cq, cq->cons_index);
+	if (!!(cqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^
+	    !!(cq->cons_index & cq->cqe_cnt))
+		goto out;
+	/*
+	 * Make sure we read CQ entry contents after we've checked the
+	 * ownership bit.
+	 */
+	rte_rmb();
+	assert(!(cqe->owner_sr_opcode & MLX4_CQE_IS_SEND_MASK));
+	assert((cqe->owner_sr_opcode & MLX4_CQE_OPCODE_MASK) !=
+	       MLX4_CQE_OPCODE_ERROR);
+	ret = rte_be_to_cpu_32(cqe->byte_cnt);
+	++cq->cons_index;
+out:
+	*out = cqe;
+	return ret;
+}
+
+/**
+ * DPDK callback for RX with scattered packets support.
  *
  * @param dpdk_rxq
  *   Generic pointer to Rx queue structure.
@@ -538,112 +574,109 @@
 uint16_t
 mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 {
-	struct rxq *rxq = (struct rxq *)dpdk_rxq;
-	struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts;
-	const unsigned int elts_n = rxq->elts_n;
-	unsigned int elts_head = rxq->elts_head;
-	struct ibv_wc wcs[pkts_n];
-	struct ibv_recv_wr *wr_head = NULL;
-	struct ibv_recv_wr **wr_next = &wr_head;
-	struct ibv_recv_wr *wr_bad = NULL;
-	unsigned int i;
-	unsigned int pkts_ret = 0;
-	int ret;
+	struct rxq *rxq = dpdk_rxq;
+	const unsigned int wr_cnt = (1 << rxq->elts_n) - 1;
+	const unsigned int sge_n = rxq->sge_n;
+	struct rte_mbuf *pkt = NULL;
+	struct rte_mbuf *seg = NULL;
+	unsigned int i = 0;
+	unsigned int rq_ci = (rxq->hw.rq_ci << sge_n);
+	int len = 0;
 
-	ret = ibv_poll_cq(rxq->cq, pkts_n, wcs);
-	if (unlikely(ret == 0))
-		return 0;
-	if (unlikely(ret < 0)) {
-		DEBUG("rxq=%p, ibv_poll_cq() failed (wc_n=%d)",
-		      (void *)rxq, ret);
-		return 0;
-	}
-	assert(ret <= (int)pkts_n);
-	/* For each work completion. */
-	for (i = 0; i != (unsigned int)ret; ++i) {
-		struct ibv_wc *wc = &wcs[i];
-		struct rxq_elt *elt = &(*elts)[elts_head];
-		struct ibv_recv_wr *wr = &elt->wr;
-		uint32_t len = wc->byte_len;
-		struct rte_mbuf *seg = elt->buf;
-		struct rte_mbuf *rep;
+	while (pkts_n) {
+		struct mlx4_cqe *cqe;
+		unsigned int idx = rq_ci & wr_cnt;
+		struct rte_mbuf *rep = (*rxq->elts)[idx];
+		volatile struct mlx4_wqe_data_seg *scat =
+					&(*rxq->hw.wqes)[idx];
 
-		/* Sanity checks. */
-		assert(wr->sg_list == &elt->sge);
-		assert(wr->num_sge == 1);
-		assert(elts_head < rxq->elts_n);
-		assert(rxq->elts_head < rxq->elts_n);
-		/*
-		 * Fetch initial bytes of packet descriptor into a
-		 * cacheline while allocating rep.
-		 */
-		rte_mbuf_prefetch_part1(seg);
-		rte_mbuf_prefetch_part2(seg);
-		/* Link completed WRs together for repost. */
-		*wr_next = wr;
-		wr_next = &wr->next;
-		if (unlikely(wc->status != IBV_WC_SUCCESS)) {
-			/* Whatever, just repost the offending WR. */
-			DEBUG("rxq=%p: bad work completion status (%d): %s",
-			      (void *)rxq, wc->status,
-			      ibv_wc_status_str(wc->status));
-			/* Increment dropped packets counter. */
-			++rxq->stats.idropped;
-			goto repost;
-		}
+		/* Update the 'next' pointer of the previous segment. */
+		if (pkt)
+			seg->next = rep;
+		seg = rep;
+		rte_prefetch0(seg);
+		rte_prefetch0(scat);
 		rep = rte_mbuf_raw_alloc(rxq->mp);
 		if (unlikely(rep == NULL)) {
-			/*
-			 * Unable to allocate a replacement mbuf,
-			 * repost WR.
-			 */
-			DEBUG("rxq=%p: can't allocate a new mbuf",
-			      (void *)rxq);
-			/* Increase out of memory counters. */
 			++rxq->stats.rx_nombuf;
-			++rxq->priv->dev->data->rx_mbuf_alloc_failed;
-			goto repost;
+			if (!pkt) {
+				/*
+				 * No buffers before we even started,
+				 * bail out silently.
+				 */
+				break;
+			}
+			while (pkt != seg) {
+				assert(pkt != (*rxq->elts)[idx]);
+				rep = pkt->next;
+				pkt->next = NULL;
+				pkt->nb_segs = 1;
+				rte_mbuf_raw_free(pkt);
+				pkt = rep;
+			}
+			break;
+		}
+		if (!pkt) {
+			/* Looking for the new packet */
+			len = mlx4_cq_poll_one(rxq, &cqe);
+			if (!len) {
+				rte_mbuf_raw_free(rep);
+				break;
+			}
+			if (unlikely(len < 0)) {
+				/* RX error, packet is likely too large. */
+				rte_mbuf_raw_free(rep);
+				++rxq->stats.idropped;
+				goto skip;
+			}
+			pkt = seg;
+			pkt->packet_type = 0;
+			pkt->ol_flags = 0;
+			pkt->pkt_len = len;
 		}
-		/* Reconfigure sge to use rep instead of seg. */
-		elt->sge.addr = (uintptr_t)rep->buf_addr + RTE_PKTMBUF_HEADROOM;
-		assert(elt->sge.lkey == rxq->mr->lkey);
-		elt->buf = rep;
-		/* Update seg information. */
-		seg->data_off = RTE_PKTMBUF_HEADROOM;
-		seg->nb_segs = 1;
-		seg->port = rxq->port_id;
-		seg->next = NULL;
-		seg->pkt_len = len;
+		rep->nb_segs = 1;
+		rep->port = rxq->port_id;
+		rep->data_len = seg->data_len;
+		rep->data_off = seg->data_off;
+		(*rxq->elts)[idx] = rep;
+		/*
+		 * Fill NIC descriptor with the new buffer. The lkey and size
+		 * of the buffers are already known, only the buffer address
+		 * changes.
+		 */
+		scat->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
+		if (len > seg->data_len) {
+			len -= seg->data_len;
+			++pkt->nb_segs;
+			++rq_ci;
+			continue;
+		}
+		/* The last segment. */
 		seg->data_len = len;
-		seg->packet_type = 0;
-		seg->ol_flags = 0;
+		/* Increment bytes counter. */
+		rxq->stats.ibytes += pkt->pkt_len;
 		/* Return packet. */
-		*(pkts++) = seg;
-		++pkts_ret;
-		/* Increase bytes counter. */
-		rxq->stats.ibytes += len;
-repost:
-		if (++elts_head >= elts_n)
-			elts_head = 0;
-		continue;
+		*(pkts++) = pkt;
+		pkt = NULL;
+		--pkts_n;
+		++i;
+skip:
+		/* Align consumer index to the next stride. */
+		rq_ci >>= sge_n;
+		++rq_ci;
+		rq_ci <<= sge_n;
 	}
-	if (unlikely(i == 0))
+	if (unlikely((i == 0) && ((rq_ci >> sge_n) == rxq->hw.rq_ci)))
 		return 0;
-	/* Repost WRs. */
-	*wr_next = NULL;
-	assert(wr_head);
-	ret = ibv_post_recv(rxq->qp, wr_head, &wr_bad);
-	if (unlikely(ret)) {
-		/* Inability to repost WRs is fatal. */
-		DEBUG("%p: recv_burst(): failed (ret=%d)",
-		      (void *)rxq->priv,
-		      ret);
-		abort();
-	}
-	rxq->elts_head = elts_head;
-	/* Increase packets counter. */
-	rxq->stats.ipackets += pkts_ret;
-	return pkts_ret;
+	/* Update the consumer index. */
+	rxq->hw.rq_ci = rq_ci >> sge_n;
+	rte_wmb();
+	*rxq->hw.rq_db = rte_cpu_to_be_32(rxq->hw.rq_ci);
+	*rxq->mcq.set_ci_db =
+		rte_cpu_to_be_32(rxq->mcq.cons_index & 0xffffff);
+	/* Increment packets counter. */
+	rxq->stats.ipackets += i;
+	return i;
 }
 
 /**
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index b515472..df83552 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -62,13 +62,6 @@ struct mlx4_rxq_stats {
 	uint64_t rx_nombuf; /**< Total of Rx mbuf allocation failures. */
 };
 
-/** Rx element. */
-struct rxq_elt {
-	struct ibv_recv_wr wr; /**< Work request. */
-	struct ibv_sge sge; /**< Scatter/gather element. */
-	struct rte_mbuf *buf; /**< Buffer. */
-};
-
 /** Rx queue descriptor. */
 struct rxq {
 	struct priv *priv; /**< Back pointer to private data. */
@@ -78,9 +71,15 @@ struct rxq {
 	struct ibv_qp *qp; /**< Queue pair. */
 	struct ibv_comp_channel *channel; /**< Rx completion channel. */
 	unsigned int port_id; /**< Port ID for incoming packets. */
-	unsigned int elts_n; /**< (*elts)[] length. */
-	unsigned int elts_head; /**< Current index in (*elts)[]. */
-	struct rxq_elt (*elts)[]; /**< Rx elements. */
+	unsigned int elts_n; /**< Log 2 of Mbufs. */
+	struct rte_mbuf *(*elts)[]; /**< Rx elements. */
+	struct {
+		volatile struct mlx4_wqe_data_seg(*wqes)[];
+		volatile uint32_t *rq_db;
+		uint16_t rq_ci;
+	} hw;
+	struct mlx4_cq mcq;  /**< Info for directly manipulating the CQ. */
+	unsigned int sge_n; /**< Log 2 of SGEs number. */
 	struct mlx4_rxq_stats stats; /**< Rx queue counters. */
 	unsigned int socket; /**< CPU socket ID for allocations. */
 };
diff --git a/drivers/net/mlx4/mlx4_utils.h b/drivers/net/mlx4/mlx4_utils.h
index 0fbdc71..d6f729f 100644
--- a/drivers/net/mlx4/mlx4_utils.h
+++ b/drivers/net/mlx4/mlx4_utils.h
@@ -108,4 +108,24 @@
 
 int mlx4_fd_set_non_blocking(int fd);
 
+/**
+ * Return nearest power of two above input value.
+ *
+ * @param v
+ *   Input value.
+ *
+ * @return
+ *   Nearest power of two above input value.
+ */
+static inline unsigned int
+log2above(unsigned int v)
+{
+	unsigned int l;
+	unsigned int r;
+
+	for (l = 0, r = 0; (v >> 1); ++l, v >>= 1)
+		r |= (v & 1);
+	return l + r;
+}
+
 #endif /* MLX4_UTILS_H_ */
-- 
1.8.3.1



More information about the dev mailing list