[dpdk-dev] [PATCH v1 08/21] net/mlx5: separate DPDK from Verbs Rx queue objects

Nelio Laranjeiro nelio.laranjeiro at 6wind.com
Wed Aug 2 16:10:24 CEST 2017


Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro at 6wind.com>
---
 drivers/net/mlx5/mlx5.c      |   3 +
 drivers/net/mlx5/mlx5.h      |   2 +-
 drivers/net/mlx5/mlx5_flow.c |  97 +++-----
 drivers/net/mlx5/mlx5_rxq.c  | 564 ++++++++++++++++++++++++++-----------------
 drivers/net/mlx5/mlx5_rxtx.h |  26 +-
 drivers/net/mlx5/mlx5_vlan.c |   2 +-
 6 files changed, 401 insertions(+), 293 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 0d8ca52..c158d8e 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -200,6 +200,9 @@ mlx5_dev_close(struct rte_eth_dev *dev)
 	}
 	if (priv->reta_idx != NULL)
 		rte_free(priv->reta_idx);
+	i = mlx5_priv_rxq_ibv_verify(priv);
+	if (i)
+		WARN("%p: some Verbs Rx queue still remain", (void*)priv);
 	i = priv_flow_verify(priv);
 	if (i)
 		WARN("%p: some flows still remain", (void*)priv);
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 1ae5f59..228fd34 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -146,6 +146,7 @@ struct priv {
 	struct rte_flow_drop *flow_drop_queue; /* Flow drop queue. */
 	TAILQ_HEAD(mlx5_flows, rte_flow) flows; /* RTE Flow rules. */
 	LIST_HEAD(mr, mlx5_mr) mr; /* Memory region. */
+	LIST_HEAD(rxqibv, mlx5_rxq_ibv) rxqsibv; /* Verbs Rx queues. */
 	uint32_t link_speed_capa; /* Link speed capabilities. */
 	struct mlx5_xstats_ctrl xstats_ctrl; /* Extended stats control. */
 	rte_spinlock_t lock; /* Lock for control functions. */
@@ -287,7 +288,6 @@ int mlx5_flow_flush(struct rte_eth_dev *, struct rte_flow_error *);
 int mlx5_flow_isolate(struct rte_eth_dev *, int, struct rte_flow_error *);
 int priv_flow_start(struct priv *);
 void priv_flow_stop(struct priv *);
-int priv_flow_rxq_in_use(struct priv *, struct mlx5_rxq_data *);
 int priv_flow_verify(struct priv *);
 
 /* mlx5_mr.c */
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index bcbb984..9ed8d05 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -95,11 +95,11 @@ struct rte_flow {
 	struct ibv_exp_flow *ibv_flow; /**< Verbs flow. */
 	struct ibv_exp_wq *wq; /**< Verbs work queue. */
 	struct ibv_cq *cq; /**< Verbs completion queue. */
-	uint16_t rxqs_n; /**< Number of queues in this flow, 0 if drop queue. */
 	uint32_t mark:1; /**< Set if the flow is marked. */
 	uint32_t drop:1; /**< Drop queue. */
 	uint64_t hash_fields; /**< Fields that participate in the hash. */
-	struct mlx5_rxq_data *rxqs[]; /**< Pointer to the queues array. */
+	uint16_t queues[RTE_MAX_QUEUES_PER_PORT]; /**< List of queues. */
+	uint16_t queues_n; /**< Number of queues in the list. */
 };
 
 /** Static initializer for items. */
@@ -1097,23 +1097,21 @@ priv_flow_create_action_queue(struct priv *priv,
 	assert(priv->pd);
 	assert(priv->ctx);
 	assert(!flow->actions.drop);
-	rte_flow = rte_calloc(__func__, 1, sizeof(*rte_flow) +
-			      sizeof(*rte_flow->rxqs) * flow->actions.queues_n,
-			      0);
+	rte_flow = rte_calloc(__func__, 1, sizeof(*rte_flow), 0);
 	if (!rte_flow) {
 		rte_flow_error_set(error, ENOMEM, RTE_FLOW_ERROR_TYPE_HANDLE,
 				   NULL, "cannot allocate flow memory");
 		return NULL;
 	}
 	for (i = 0; i < flow->actions.queues_n; ++i) {
-		struct mlx5_rxq_ctrl *rxq;
+		struct mlx5_rxq_ibv *rxq =
+			mlx5_priv_rxq_ibv_get(priv, flow->actions.queues[i]);
 
-		rxq = container_of((*priv->rxqs)[flow->actions.queues[i]],
-				   struct mlx5_rxq_ctrl, rxq);
 		wqs[i] = rxq->wq;
-		rte_flow->rxqs[i] = &rxq->rxq;
-		++rte_flow->rxqs_n;
-		rxq->rxq.mark |= flow->actions.mark;
+		rte_flow->queues[i] = flow->actions.queues[i];
+		++rte_flow->queues_n;
+		(*priv->rxqs)[flow->actions.queues[i]]->mark |=
+			flow->actions.mark;
 	}
 	/* finalise indirection table. */
 	for (j = 0; i < wqs_n; ++i, ++j) {
@@ -1294,6 +1292,8 @@ static void
 priv_flow_destroy(struct priv *priv,
 		  struct rte_flow *flow)
 {
+	unsigned int i;
+
 	TAILQ_REMOVE(&priv->flows, flow, next);
 	if (flow->ibv_flow)
 		claim_zero(ibv_exp_destroy_flow(flow->ibv_flow));
@@ -1303,37 +1303,33 @@ priv_flow_destroy(struct priv *priv,
 		claim_zero(ibv_destroy_qp(flow->qp));
 	if (flow->ind_table)
 		claim_zero(ibv_exp_destroy_rwq_ind_table(flow->ind_table));
-	if (flow->mark) {
+	for (i = 0; i != flow->queues_n; ++i) {
 		struct rte_flow *tmp;
-		struct mlx5_rxq_data *rxq;
-		uint32_t mark_n = 0;
-		uint32_t queue_n;
+		struct mlx5_rxq_data *rxq = (*priv->rxqs)[flow->queues[i]];
+		struct mlx5_rxq_ctrl *rxq_ctrl =
+			container_of(rxq, struct mlx5_rxq_ctrl, rxq);
 
 		/*
 		 * To remove the mark from the queue, the queue must not be
 		 * present in any other marked flow (RSS or not).
 		 */
-		for (queue_n = 0; queue_n < flow->rxqs_n; ++queue_n) {
-			rxq = flow->rxqs[queue_n];
-			for (tmp = TAILQ_FIRST(&priv->flows);
-			     tmp;
-			     tmp = TAILQ_NEXT(tmp, next)) {
-				uint32_t tqueue_n;
+		if (flow->mark) {
+			int mark = 0;
+
+			TAILQ_FOREACH(tmp, &priv->flows, next) {
+				unsigned int j;
 
 				if (tmp->drop)
 					continue;
-				for (tqueue_n = 0;
-				     tqueue_n < tmp->rxqs_n;
-				     ++tqueue_n) {
-					struct mlx5_rxq_data *trxq;
-
-					trxq = tmp->rxqs[tqueue_n];
-					if (rxq == trxq)
-						++mark_n;
-				}
+				if (!tmp->mark)
+					continue;
+				for (j = 0; (j != tmp->queues_n) && !mark; j++)
+					if (tmp->queues[j] == flow->queues[i])
+						mark = 1;
 			}
-			rxq->mark = !!mark_n;
+			rxq->mark = mark;
 		}
+		mlx5_priv_rxq_ibv_release(priv, rxq_ctrl->ibv);
 	}
 free:
 	rte_free(flow->ibv_attr);
@@ -1532,8 +1528,8 @@ priv_flow_stop(struct priv *priv)
 		if (flow->mark) {
 			unsigned int n;
 
-			for (n = 0; n < flow->rxqs_n; ++n)
-				flow->rxqs[n]->mark = 0;
+			for (n = 0; n < flow->queues_n; ++n)
+				(*priv->rxqs)[flow->queues[n]]->mark = 0;
 		}
 		DEBUG("Flow %p removed", (void *)flow);
 	}
@@ -1575,39 +1571,8 @@ priv_flow_start(struct priv *priv)
 		if (flow->mark) {
 			unsigned int n;
 
-			for (n = 0; n < flow->rxqs_n; ++n)
-				flow->rxqs[n]->mark = 1;
-		}
-	}
-	return 0;
-}
-
-/**
- * Verify if the Rx queue is used in a flow.
- *
- * @param priv
- *   Pointer to private structure.
- * @param rxq
- *   Pointer to the queue to search.
- *
- * @return
- *   Nonzero if the queue is used by a flow.
- */
-int
-priv_flow_rxq_in_use(struct priv *priv, struct mlx5_rxq_data *rxq)
-{
-	struct rte_flow *flow;
-
-	for (flow = TAILQ_FIRST(&priv->flows);
-	     flow;
-	     flow = TAILQ_NEXT(flow, next)) {
-		unsigned int n;
-
-		if (flow->drop)
-			continue;
-		for (n = 0; n < flow->rxqs_n; ++n) {
-			if (flow->rxqs[n] == rxq)
-				return 1;
+			for (n = 0; n < flow->queues_n; ++n)
+				(*priv->rxqs)[flow->queues[n]]->mark = 1;
 		}
 	}
 	return 0;
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 80cfd96..1663734 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -378,7 +378,7 @@ priv_create_hash_rxqs(struct priv *priv)
 
 		rxq_ctrl = container_of((*priv->rxqs)[(*priv->reta_idx)[i]],
 					struct mlx5_rxq_ctrl, rxq);
-		wqs[i] = rxq_ctrl->wq;
+		wqs[i] = rxq_ctrl->ibv->wq;
 	}
 	/* Get number of hash RX queues to configure. */
 	for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
@@ -647,8 +647,6 @@ rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl, unsigned int elts_n)
 	/* Iterate on segments. */
 	for (i = 0; (i != elts_n); ++i) {
 		struct rte_mbuf *buf;
-		volatile struct mlx5_wqe_data_seg *scat =
-			&(*rxq_ctrl->rxq.wqes)[i];
 
 		buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
 		if (buf == NULL) {
@@ -669,13 +667,6 @@ rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl, unsigned int elts_n)
 		DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
 		PKT_LEN(buf) = DATA_LEN(buf);
 		NB_SEGS(buf) = 1;
-		/* scat->addr must be able to store a pointer. */
-		assert(sizeof(scat->addr) >= sizeof(uintptr_t));
-		*scat = (struct mlx5_wqe_data_seg){
-			.addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
-			.byte_count = htonl(DATA_LEN(buf)),
-			.lkey = rxq_ctrl->mr->lkey,
-		};
 		(*rxq_ctrl->rxq.elts)[i] = buf;
 	}
 	if (rxq_check_vec_support(&rxq_ctrl->rxq) > 0) {
@@ -761,65 +752,12 @@ mlx5_rxq_cleanup(struct mlx5_rxq_ctrl *rxq_ctrl)
 {
 	DEBUG("cleaning up %p", (void *)rxq_ctrl);
 	rxq_free_elts(rxq_ctrl);
-	if (rxq_ctrl->wq != NULL)
-		claim_zero(ibv_exp_destroy_wq(rxq_ctrl->wq));
-	if (rxq_ctrl->cq != NULL)
-		claim_zero(ibv_destroy_cq(rxq_ctrl->cq));
-	if (rxq_ctrl->channel != NULL)
-		claim_zero(ibv_destroy_comp_channel(rxq_ctrl->channel));
-	if (rxq_ctrl->mr != NULL)
-		priv_mr_release(rxq_ctrl->priv, rxq_ctrl->mr);
+	if (rxq_ctrl->ibv)
+		mlx5_priv_rxq_ibv_release(rxq_ctrl->priv, rxq_ctrl->ibv);
 	memset(rxq_ctrl, 0, sizeof(*rxq_ctrl));
 }
 
 /**
- * Initialize RX queue.
- *
- * @param tmpl
- *   Pointer to RX queue control template.
- *
- * @return
- *   0 on success, errno value on failure.
- */
-static inline int
-rxq_setup(struct mlx5_rxq_ctrl *tmpl)
-{
-	struct ibv_cq *ibcq = tmpl->cq;
-	struct ibv_mlx5_cq_info cq_info;
-	struct mlx5_rwq *rwq = container_of(tmpl->wq, struct mlx5_rwq, wq);
-	const uint16_t desc_n =
-		(1 << tmpl->rxq.elts_n) + tmpl->priv->rx_vec_en *
-		MLX5_VPMD_DESCS_PER_LOOP;
-	struct rte_mbuf *(*elts)[desc_n] =
-		rte_calloc_socket("RXQ", 1, sizeof(*elts), 0, tmpl->socket);
-	if (ibv_mlx5_exp_get_cq_info(ibcq, &cq_info)) {
-		ERROR("Unable to query CQ info. check your OFED.");
-		return ENOTSUP;
-	}
-	if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
-		ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
-		      "it should be set to %u", RTE_CACHE_LINE_SIZE);
-		return EINVAL;
-	}
-	if (elts == NULL)
-		return ENOMEM;
-	tmpl->rxq.rq_db = rwq->rq.db;
-	tmpl->rxq.cqe_n = log2above(cq_info.cqe_cnt);
-	tmpl->rxq.cq_ci = 0;
-	tmpl->rxq.rq_ci = 0;
-	tmpl->rxq.rq_pi = 0;
-	tmpl->rxq.cq_db = cq_info.dbrec;
-	tmpl->rxq.wqes =
-		(volatile struct mlx5_wqe_data_seg (*)[])
-		(uintptr_t)rwq->rq.buff;
-	tmpl->rxq.cqes =
-		(volatile struct mlx5_cqe (*)[])
-		(uintptr_t)cq_info.buf;
-	tmpl->rxq.elts = elts;
-	return 0;
-}
-
-/**
  * Configure a RX queue.
  *
  * @param dev
@@ -848,25 +786,24 @@ mlx5_rxq_ctrl_setup(struct rte_eth_dev *dev, struct mlx5_rxq_ctrl *rxq_ctrl,
 		.priv = priv,
 		.socket = socket,
 		.rxq = {
+			.elts = rte_calloc_socket("RXQ", 1,
+						  desc *
+						  sizeof(struct rte_mbuf *), 0,
+						  socket),
 			.elts_n = log2above(desc),
 			.mp = mp,
 			.rss_hash = priv->rxqs_n > 1,
 		},
 	};
-	struct ibv_exp_wq_attr mod;
-	union {
-		struct ibv_exp_cq_init_attr cq;
-		struct ibv_exp_wq_init_attr wq;
-		struct ibv_exp_cq_attr cq_attr;
-	} attr;
 	unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
-	unsigned int cqe_n = desc - 1;
 	const uint16_t desc_n =
 		desc + priv->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP;
 	struct rte_mbuf *(*elts)[desc_n] = NULL;
 	int ret = 0;
 
 	(void)conf; /* Thresholds configuration (ignored). */
+	if (dev->data->dev_conf.intr_conf.rxq)
+		tmpl.memory_channel = 1;
 	/* Enable scattered packets support for this queue if necessary. */
 	assert(mb_len >= RTE_PKTMBUF_HEADROOM);
 	if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
@@ -919,78 +856,13 @@ mlx5_rxq_ctrl_setup(struct rte_eth_dev *dev, struct mlx5_rxq_ctrl *rxq_ctrl,
 	if (priv->hw_csum_l2tun)
 		tmpl.rxq.csum_l2tun =
 			!!dev->data->dev_conf.rxmode.hw_ip_checksum;
-	/* Use the entire RX mempool as the memory region. */
-	tmpl.mr = priv_mr_get(priv, mp);
-	if (tmpl.mr == NULL) {
-		tmpl.mr = priv_mr_new(priv, mp);
-		if (tmpl.mr == NULL) {
-			ret = EINVAL;
-			ERROR("%p: MR creation failure: %s",
-			      (void *)dev, strerror(ret));
-			goto error;
-		}
-	}
-	if (dev->data->dev_conf.intr_conf.rxq) {
-		tmpl.channel = ibv_create_comp_channel(priv->ctx);
-		if (tmpl.channel == NULL) {
-			ret = ENOMEM;
-			ERROR("%p: Rx interrupt completion channel creation"
-			      " failure: %s",
-			      (void *)dev, strerror(ret));
-			goto error;
-		}
-	}
-	attr.cq = (struct ibv_exp_cq_init_attr){
-		.comp_mask = 0,
-	};
-	if (priv->cqe_comp) {
-		attr.cq.comp_mask |= IBV_EXP_CQ_INIT_ATTR_FLAGS;
-		attr.cq.flags |= IBV_EXP_CQ_COMPRESSED_CQE;
-		/*
-		 * For vectorized Rx, it must not be doubled in order to
-		 * make cq_ci and rq_ci aligned.
-		 */
-		if (rxq_check_vec_support(&tmpl.rxq) < 0)
-			cqe_n = (desc * 2) - 1; /* Double the number of CQEs. */
-	}
-	tmpl.cq = ibv_exp_create_cq(priv->ctx, cqe_n, NULL, tmpl.channel, 0,
-				    &attr.cq);
-	if (tmpl.cq == NULL) {
-		ret = ENOMEM;
-		ERROR("%p: CQ creation failure: %s",
-		      (void *)dev, strerror(ret));
-		goto error;
-	}
-	DEBUG("priv->device_attr.max_qp_wr is %d",
-	      priv->device_attr.max_qp_wr);
-	DEBUG("priv->device_attr.max_sge is %d",
-	      priv->device_attr.max_sge);
 	/* Configure VLAN stripping. */
 	tmpl.rxq.vlan_strip = (priv->hw_vlan_strip &&
 			       !!dev->data->dev_conf.rxmode.hw_vlan_strip);
-	attr.wq = (struct ibv_exp_wq_init_attr){
-		.wq_context = NULL, /* Could be useful in the future. */
-		.wq_type = IBV_EXP_WQT_RQ,
-		/* Max number of outstanding WRs. */
-		.max_recv_wr = desc >> tmpl.rxq.sges_n,
-		/* Max number of scatter/gather elements in a WR. */
-		.max_recv_sge = 1 << tmpl.rxq.sges_n,
-		.pd = priv->pd,
-		.cq = tmpl.cq,
-		.comp_mask =
-			IBV_EXP_CREATE_WQ_VLAN_OFFLOADS |
-			0,
-		.vlan_offloads = (tmpl.rxq.vlan_strip ?
-				  IBV_EXP_RECEIVE_WQ_CVLAN_STRIP :
-				  0),
-	};
 	/* By default, FCS (CRC) is stripped by hardware. */
 	if (dev->data->dev_conf.rxmode.hw_strip_crc) {
 		tmpl.rxq.crc_present = 0;
 	} else if (priv->hw_fcs_strip) {
-		/* Ask HW/Verbs to leave CRC in place when supported. */
-		attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_SCATTER_FCS;
-		attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
 		tmpl.rxq.crc_present = 1;
 	} else {
 		WARN("%p: CRC stripping has been disabled but will still"
@@ -1004,59 +876,9 @@ mlx5_rxq_ctrl_setup(struct rte_eth_dev *dev, struct mlx5_rxq_ctrl *rxq_ctrl,
 	      (void *)dev,
 	      tmpl.rxq.crc_present ? "disabled" : "enabled",
 	      tmpl.rxq.crc_present << 2);
-	if (!mlx5_getenv_int("MLX5_PMD_ENABLE_PADDING"))
-		; /* Nothing else to do. */
-	else if (priv->hw_padding) {
-		INFO("%p: enabling packet padding on queue %p",
-		     (void *)dev, (void *)rxq_ctrl);
-		attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_RX_END_PADDING;
-		attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
-	} else
-		WARN("%p: packet padding has been requested but is not"
-		     " supported, make sure MLNX_OFED and firmware are"
-		     " up to date",
-		     (void *)dev);
-
-	tmpl.wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
-	if (tmpl.wq == NULL) {
-		ret = (errno ? errno : EINVAL);
-		ERROR("%p: WQ creation failure: %s",
-		      (void *)dev, strerror(ret));
-		goto error;
-	}
-	/*
-	 * Make sure number of WRs*SGEs match expectations since a queue
-	 * cannot allocate more than "desc" buffers.
-	 */
-	if (((int)attr.wq.max_recv_wr != (desc >> tmpl.rxq.sges_n)) ||
-	    ((int)attr.wq.max_recv_sge != (1 << tmpl.rxq.sges_n))) {
-		ERROR("%p: requested %u*%u but got %u*%u WRs*SGEs",
-		      (void *)dev,
-		      (desc >> tmpl.rxq.sges_n), (1 << tmpl.rxq.sges_n),
-		      attr.wq.max_recv_wr, attr.wq.max_recv_sge);
-		ret = EINVAL;
-		goto error;
-	}
 	/* Save port ID. */
 	tmpl.rxq.port_id = dev->data->port_id;
 	DEBUG("%p: RTE port ID: %u", (void *)rxq_ctrl, tmpl.rxq.port_id);
-	/* Change queue state to ready. */
-	mod = (struct ibv_exp_wq_attr){
-		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
-		.wq_state = IBV_EXP_WQS_RDY,
-	};
-	ret = ibv_exp_modify_wq(tmpl.wq, &mod);
-	if (ret) {
-		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed: %s",
-		      (void *)dev, strerror(ret));
-		goto error;
-	}
-	ret = rxq_setup(&tmpl);
-	if (ret) {
-		ERROR("%p: cannot initialize RX queue structure: %s",
-		      (void *)dev, strerror(ret));
-		goto error;
-	}
 	ret = rxq_alloc_elts(&tmpl, desc);
 	if (ret) {
 		ERROR("%p: RXQ allocation failed: %s",
@@ -1075,17 +897,12 @@ mlx5_rxq_ctrl_setup(struct rte_eth_dev *dev, struct mlx5_rxq_ctrl *rxq_ctrl,
 	rte_free(tmpl.rxq.elts);
 	tmpl.rxq.elts = elts;
 	*rxq_ctrl = tmpl;
-	/* Update doorbell counter. */
-	rxq_ctrl->rxq.rq_ci = desc >> rxq_ctrl->rxq.sges_n;
-	rte_wmb();
-	*rxq_ctrl->rxq.rq_db = htonl(rxq_ctrl->rxq.rq_ci);
 	DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)&tmpl);
 	assert(ret == 0);
 	return 0;
 error:
-	elts = tmpl.rxq.elts;
+	rte_free(tmpl.rxq.elts);
 	mlx5_rxq_cleanup(&tmpl);
-	rte_free(elts);
 	assert(ret > 0);
 	return ret;
 }
@@ -1175,14 +992,20 @@ mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		}
 	}
 	ret = mlx5_rxq_ctrl_setup(dev, rxq_ctrl, desc, socket, conf, mp);
-	if (ret)
+	if (ret) {
 		rte_free(rxq_ctrl);
-	else {
-		rxq_ctrl->rxq.stats.idx = idx;
-		DEBUG("%p: adding RX queue %p to list",
-		      (void *)dev, (void *)rxq_ctrl);
-		(*priv->rxqs)[idx] = &rxq_ctrl->rxq;
+		goto out;
 	}
+	rxq_ctrl->rxq.stats.idx = idx;
+	DEBUG("%p: adding RX queue %p to list",
+	      (void *)dev, (void *)rxq_ctrl);
+	(*priv->rxqs)[idx] = &rxq_ctrl->rxq;
+	rxq_ctrl->ibv = mlx5_priv_rxq_ibv_new(priv, idx);
+	if (!rxq_ctrl->ibv) {
+		ret = EAGAIN;
+		goto out;
+	}
+out:
 	priv_unlock(priv);
 	return -ret;
 }
@@ -1209,7 +1032,7 @@ mlx5_rx_queue_release(void *dpdk_rxq)
 	rxq_ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq);
 	priv = rxq_ctrl->priv;
 	priv_lock(priv);
-	if (priv_flow_rxq_in_use(priv, rxq))
+	if (!mlx5_priv_rxq_ibv_releasable(priv, rxq_ctrl->ibv))
 		rte_panic("Rx queue %p is still used by a flow and cannot be"
 			  " removed\n", (void *)rxq_ctrl);
 	for (i = 0; (i != priv->rxqs_n); ++i)
@@ -1253,15 +1076,14 @@ priv_rx_intr_vec_enable(struct priv *priv)
 	}
 	intr_handle->type = RTE_INTR_HANDLE_EXT;
 	for (i = 0; i != n; ++i) {
-		struct mlx5_rxq_data *rxq = (*priv->rxqs)[i];
-		struct mlx5_rxq_ctrl *rxq_ctrl =
-			container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+		/* This rxq ibv must not be released in this function. */
+		struct mlx5_rxq_ibv *rxq = mlx5_priv_rxq_ibv_get(priv, i);
 		int fd;
 		int flags;
 		int rc;
 
 		/* Skip queues that cannot request interrupts. */
-		if (!rxq || !rxq_ctrl->channel) {
+		if (!rxq || !rxq->channel) {
 			/* Use invalid intr_vec[] index to disable entry. */
 			intr_handle->intr_vec[i] =
 				RTE_INTR_VEC_RXTX_OFFSET +
@@ -1275,7 +1097,7 @@ priv_rx_intr_vec_enable(struct priv *priv)
 			priv_rx_intr_vec_disable(priv);
 			return -1;
 		}
-		fd = rxq_ctrl->channel->fd;
+		fd = rxq->channel->fd;
 		flags = fcntl(fd, F_GETFL);
 		rc = fcntl(fd, F_SETFL, flags | O_NONBLOCK);
 		if (rc < 0) {
@@ -1305,7 +1127,27 @@ void
 priv_rx_intr_vec_disable(struct priv *priv)
 {
 	struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
+	unsigned int i;
+	unsigned int rxqs_n = priv->rxqs_n;
+	unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
 
+	if (!priv->dev->data->dev_conf.intr_conf.rxq)
+		return;
+	for (i = 0; i != n; ++i) {
+		struct mlx5_rxq_ctrl *ctrl;
+		struct mlx5_rxq_data *rxq;
+
+		if (intr_handle->intr_vec[i] == RTE_INTR_VEC_RXTX_OFFSET +
+		    RTE_MAX_RXTX_INTR_VEC_ID)
+			continue;
+		/**
+		 * Need to access directly the queue to release the reference
+		 * kept in priv_rx_intr_vec_enable().
+		 */
+		rxq = (*priv->rxqs)[i];
+		ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+		mlx5_priv_rxq_ibv_release(priv, ctrl->ibv);
+	}
 	rte_intr_free_epoll_fd(intr_handle);
 	free(intr_handle->intr_vec);
 	intr_handle->nb_efd = 0;
@@ -1329,19 +1171,19 @@ int
 mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id)
 {
 	struct priv *priv = mlx5_get_priv(dev);
-	struct mlx5_rxq_data *rxq = (*priv->rxqs)[rx_queue_id];
-	struct mlx5_rxq_ctrl *rxq_ctrl =
-		container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+	struct mlx5_rxq_ibv *rxq = mlx5_priv_rxq_ibv_get(priv, rx_queue_id);
 	int ret;
 
-	if (!rxq || !rxq_ctrl->channel) {
+	if (!rxq || !rxq->channel) {
 		ret = EINVAL;
 	} else {
-		ibv_mlx5_exp_update_cq_ci(rxq_ctrl->cq, rxq->cq_ci);
-		ret = ibv_req_notify_cq(rxq_ctrl->cq, 0);
+		ibv_mlx5_exp_update_cq_ci(rxq->cq,
+					  (*priv->rxqs)[rx_queue_id]->cq_ci);
+		ret = ibv_req_notify_cq(rxq->cq, 0);
 	}
 	if (ret)
 		WARN("unable to arm interrupt on rx queue %d", rx_queue_id);
+	mlx5_priv_rxq_ibv_release(priv, rxq);
 	return -ret;
 }
 
@@ -1360,26 +1202,312 @@ int
 mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id)
 {
 	struct priv *priv = mlx5_get_priv(dev);
-	struct mlx5_rxq_data *rxq = (*priv->rxqs)[rx_queue_id];
-	struct mlx5_rxq_ctrl *rxq_ctrl =
-		container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+	struct mlx5_rxq_ibv *rxq = mlx5_priv_rxq_ibv_get(priv, rx_queue_id);
 	struct ibv_cq *ev_cq;
 	void *ev_ctx;
 	int ret;
 
-	if (!rxq || !rxq_ctrl->channel) {
+	if (!rxq || !rxq->channel) {
 		ret = EINVAL;
 	} else {
-		ret = ibv_get_cq_event(rxq_ctrl->cq->channel, &ev_cq, &ev_ctx);
-		if (ret || ev_cq != rxq_ctrl->cq)
+		ret = ibv_get_cq_event(rxq->cq->channel, &ev_cq, &ev_ctx);
+		if (ret || ev_cq != rxq->cq)
 			ret = EINVAL;
 	}
 	if (ret)
 		WARN("unable to disable interrupt on rx queue %d",
 		     rx_queue_id);
 	else
-		ibv_ack_cq_events(rxq_ctrl->cq, 1);
+		ibv_ack_cq_events(rxq->cq, 1);
+	mlx5_priv_rxq_ibv_release(priv, rxq);
 	return -ret;
 }
 
 #endif /* HAVE_UPDATE_CQ_CI */
+
+/**
+ * Create the Verbs objects (MR, channel, CQ, WQ) backing an Rx queue.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param idx
+ *   Queue index in DPDK Rx queue array; the queue must already be set up.
+ *
+ * @return
+ *   The new object holding one reference, NULL on failure (nothing leaked).
+ */
+struct mlx5_rxq_ibv*
+mlx5_priv_rxq_ibv_new(struct priv *priv, uint16_t idx)
+{
+	struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx];
+	struct mlx5_rxq_ctrl *rxq_ctrl =
+		container_of(rxq, struct mlx5_rxq_ctrl, rxq);
+	struct ibv_exp_wq_attr mod;
+	union {
+		struct ibv_exp_cq_init_attr cq;
+		struct ibv_exp_wq_init_attr wq;
+		struct ibv_exp_cq_attr cq_attr;
+	} attr;
+	unsigned int cqe_n = (1 << rxq->elts_n) - 1;
+	struct mlx5_rxq_ibv *tmpl;
+	struct ibv_mlx5_cq_info cq_info;
+	struct mlx5_rwq *rwq;
+	unsigned int i;
+	int ret = 0;
+
+	assert(!rxq_ctrl->ibv);
+	tmpl = rte_calloc_socket(__func__, 1, sizeof(*tmpl), 0,
+				 rxq_ctrl->socket);
+	if (!tmpl) {
+		ERROR("%p: cannot allocate verbs ressources",
+		       (void*)rxq_ctrl);
+		return NULL; /* Nothing acquired, error path needs tmpl. */
+	}
+	/* Use the entire RX mempool as the memory region. */
+	tmpl->mr = priv_mr_get(priv, rxq->mp);
+	if (!tmpl->mr) {
+		tmpl->mr = priv_mr_new(priv, rxq->mp);
+		if (!tmpl->mr) {
+			ERROR("%p: MR creation failure", (void *)rxq_ctrl);
+			goto error;
+		}
+	}
+	if (rxq_ctrl->memory_channel) {
+		tmpl->channel = ibv_create_comp_channel(priv->ctx);
+		if (!tmpl->channel) {
+			ERROR("%p: Comp Channel creation failure",
+			      (void *)rxq_ctrl);
+			goto error;
+		}
+	}
+	attr.cq = (struct ibv_exp_cq_init_attr){
+		.comp_mask = 0,
+	};
+	if (priv->cqe_comp) {
+		attr.cq.comp_mask |= IBV_EXP_CQ_INIT_ATTR_FLAGS;
+		attr.cq.flags |= IBV_EXP_CQ_COMPRESSED_CQE;
+		/*
+		 * For vectorized Rx, it must not be doubled in order to
+		 * make cq_ci and rq_ci aligned.
+		 */
+		if (rxq_check_vec_support(rxq) < 0)
+			cqe_n *= 2;
+	}
+	tmpl->cq = ibv_exp_create_cq(priv->ctx, cqe_n, NULL, tmpl->channel, 0,
+				     &attr.cq);
+	if (tmpl->cq == NULL) {
+		ERROR("%p: CQ creation failure", (void *)rxq_ctrl);
+		goto error;
+	}
+	if (ibv_mlx5_exp_get_cq_info(tmpl->cq, &cq_info)) {
+		ERROR("Unable to query CQ info. check your OFED.");
+		goto error;
+	}
+	if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
+		ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
+		      "it should be set to %u", RTE_CACHE_LINE_SIZE);
+		goto error;
+	}
+	DEBUG("priv->device_attr.max_qp_wr is %d",
+	      priv->device_attr.max_qp_wr);
+	DEBUG("priv->device_attr.max_sge is %d",
+	      priv->device_attr.max_sge);
+	attr.wq = (struct ibv_exp_wq_init_attr){
+		.wq_context = NULL, /* Could be useful in the future. */
+		.wq_type = IBV_EXP_WQT_RQ,
+		/* Max number of outstanding WRs. */
+		.max_recv_wr = (1 << rxq->elts_n) >> rxq->sges_n,
+		/* Max number of scatter/gather elements in a WR. */
+		.max_recv_sge = 1 << rxq->sges_n,
+		.pd = priv->pd,
+		.cq = tmpl->cq,
+		.comp_mask =
+			IBV_EXP_CREATE_WQ_VLAN_OFFLOADS |
+			0,
+		.vlan_offloads = (rxq->vlan_strip ?
+				  IBV_EXP_RECEIVE_WQ_CVLAN_STRIP :
+				  0),
+	};
+	/* Ask Verbs to leave the FCS (CRC) in place when the queue keeps it. */
+	if (rxq->crc_present) {
+		attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_SCATTER_FCS;
+		attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
+	}
+	if (priv->hw_padding) {
+		attr.wq.flags |= IBV_EXP_CREATE_WQ_FLAG_RX_END_PADDING;
+		attr.wq.comp_mask |= IBV_EXP_CREATE_WQ_FLAGS;
+	}
+	tmpl->wq = ibv_exp_create_wq(priv->ctx, &attr.wq);
+	if (tmpl->wq == NULL) {
+		ERROR("%p: WQ creation failure", (void *)rxq_ctrl);
+		goto error;
+	}
+	/*
+	 * Make sure number of WRs*SGEs match expectations since a queue
+	 * cannot allocate more than "desc" buffers.
+	 */
+	if (((int)attr.wq.max_recv_wr != ((1 << rxq->elts_n) >> rxq->sges_n)) ||
+	    ((int)attr.wq.max_recv_sge != (1 << rxq->sges_n))) {
+		ERROR("%p: requested %u*%u but got %u*%u WRs*SGEs",
+		      (void *)rxq_ctrl,
+		      ((1 << rxq->elts_n) >> rxq->sges_n),
+		      (1 << rxq->sges_n),
+		      attr.wq.max_recv_wr, attr.wq.max_recv_sge);
+		goto error;
+	}
+	/* Change queue state to ready. */
+	mod = (struct ibv_exp_wq_attr){
+		.attr_mask = IBV_EXP_WQ_ATTR_STATE,
+		.wq_state = IBV_EXP_WQS_RDY,
+	};
+	ret = ibv_exp_modify_wq(tmpl->wq, &mod);
+	if (ret) {
+		ERROR("%p: WQ state to IBV_EXP_WQS_RDY failed",
+		      (void *)rxq_ctrl);
+		goto error;
+	}
+	/* Fill the rings. */
+	rwq = container_of(tmpl->wq, struct mlx5_rwq, wq);
+	rxq->wqes = (volatile struct mlx5_wqe_data_seg (*)[])
+		(uintptr_t)rwq->rq.buff;
+	for (i = 0; (i != (unsigned int)(1 << rxq->elts_n)); ++i) {
+		struct rte_mbuf *buf = (*rxq->elts)[i];
+		volatile struct mlx5_wqe_data_seg *scat = &(*rxq->wqes)[i];
+
+		/* scat->addr must be able to store a pointer. */
+		assert(sizeof(scat->addr) >= sizeof(uintptr_t));
+		*scat = (struct mlx5_wqe_data_seg){
+			.addr = htonll(rte_pktmbuf_mtod(buf, uintptr_t)),
+			.byte_count = htonl(DATA_LEN(buf)),
+			.lkey = tmpl->mr->lkey,
+		};
+	}
+	rxq->rq_db = rwq->rq.db;
+	rxq->cqe_n = log2above(cq_info.cqe_cnt);
+	rxq->cq_ci = 0;
+	rxq->cq_db = cq_info.dbrec;
+	rxq->cqes = (volatile struct mlx5_cqe (*)[])(uintptr_t)cq_info.buf;
+	/* Update doorbell counter. */
+	rxq->rq_ci = (1 << rxq->elts_n) >> rxq->sges_n;
+	rte_wmb();
+	*rxq->rq_db = htonl(rxq->rq_ci);
+	DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)tmpl);
+	tmpl->rxq_ctrl = rxq_ctrl;
+	rte_atomic32_inc(&tmpl->refcnt);
+	DEBUG("%p: Verbs Rx queue %p: refcnt %d", (void*)priv,
+	      (void*)tmpl, rte_atomic32_read(&tmpl->refcnt));
+	LIST_INSERT_HEAD(&priv->rxqsibv, tmpl, next);
+	return tmpl;
+error:
+	if (tmpl->wq)
+		claim_zero(ibv_exp_destroy_wq(tmpl->wq));
+	if (tmpl->cq)
+		claim_zero(ibv_destroy_cq(tmpl->cq));
+	if (tmpl->channel)
+		claim_zero(ibv_destroy_comp_channel(tmpl->channel));
+	if (tmpl->mr)
+		priv_mr_release(priv, tmpl->mr);
+	rte_free(tmpl); /* Not linked in priv->rxqsibv yet. */
+	return NULL;
+}
+
+/**
+ * Get an Rx queue Verbs object and take a reference on it.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param idx
+ *   Queue index in DPDK Rx queue array, not checked against rxqs_n.
+ *
+ * @return
+ *   The Verbs object, NULL if the queue or its Verbs object does not exist.
+ */
+struct mlx5_rxq_ibv*
+mlx5_priv_rxq_ibv_get(struct priv *priv, uint16_t idx)
+{
+	struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx];
+	struct mlx5_rxq_ctrl *ctrl;
+
+	/* Unconfigured slots are NULL, container_of() would be bogus there. */
+	ctrl = rxq ? container_of(rxq, struct mlx5_rxq_ctrl, rxq) : NULL;
+	if (!ctrl || !ctrl->ibv)
+		return NULL;
+	(void)priv_mr_get(priv, rxq->mp); /* Dropped again on release. */
+	rte_atomic32_inc(&ctrl->ibv->refcnt);
+	DEBUG("%p: Verbs Rx queue %p: refcnt %d", (void*)priv,
+	      (void*)ctrl->ibv, rte_atomic32_read(&ctrl->ibv->refcnt));
+	return ctrl->ibv;
+}
+
+/**
+ * Drop one reference on an Rx Verbs queue object, destroy it on the last one.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param rxq
+ *   Verbs Rx queue object, NULL is tolerated (failed lookup).
+ *
+ * @return
+ *   0 once destroyed, EBUSY if still referenced, EINVAL if rxq is NULL.
+ */
+int
+mlx5_priv_rxq_ibv_release(struct priv *priv, struct mlx5_rxq_ibv *rxq)
+{
+	if (!rxq) /* mlx5_priv_rxq_ibv_get() may have returned NULL. */
+		return EINVAL;
+	assert(rxq->wq);
+	assert(rxq->cq);
+	assert(rxq->mr);
+	/* Give back the MR reference that goes with this object reference. */
+	if (!priv_mr_release(priv, rxq->mr))
+		rxq->mr = NULL;
+	DEBUG("%p: Verbs Rx queue %p: refcnt %d", (void*)priv,
+	      (void*)rxq, rte_atomic32_read(&rxq->refcnt));
+	if (rte_atomic32_dec_and_test(&rxq->refcnt)) {
+		claim_zero(ibv_exp_destroy_wq(rxq->wq));
+		claim_zero(ibv_destroy_cq(rxq->cq));
+		if (rxq->channel)
+			claim_zero(ibv_destroy_comp_channel(rxq->channel));
+		LIST_REMOVE(rxq, next);
+		rte_free(rxq);
+		return 0;
+	}
+	return EBUSY;
+}
+
+/**
+ * Count the Verbs Rx queue objects still linked to the device.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ *
+ * @return the number of objects left in the list, 0 when it is empty.
+ */
+int
+mlx5_priv_rxq_ibv_verify(struct priv *priv)
+{
+	struct mlx5_rxq_ibv *leftover;
+	int count = 0;
+
+	LIST_FOREACH(leftover, &priv->rxqsibv, next) {
+		DEBUG("%p: Verbs Rx queue %p still referenced", (void*)priv,
+		      (void*)leftover);
+		count++;
+	}
+	return count;
+}
+
+/**
+ * Tell whether exactly one reference is left on the object.
+ *
+ * @param priv
+ *   Pointer to private structure (unused).
+ * @param rxq
+ *   Verbs Rx queue object; 1 is returned when its refcnt is 1, 0 otherwise.
+ */
+int
+mlx5_priv_rxq_ibv_releasable(struct priv *priv, struct mlx5_rxq_ibv *rxq)
+{
+	(void)priv;
+	return rte_atomic32_read(&rxq->refcnt) == 1 ? 1 : 0;
+}
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index c7c7518..abdbf6a 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -130,15 +130,24 @@ struct mlx5_rxq_data {
 	struct rte_mbuf fake_mbuf; /* elts padding for vectorized Rx. */
 } __rte_cache_aligned;
 
-/* RX queue control descriptor. */
-struct mlx5_rxq_ctrl {
-	struct priv *priv; /* Back pointer to private data. */
+/* Verbs Rx queue elements. */
+struct mlx5_rxq_ibv {
+	LIST_ENTRY(mlx5_rxq_ibv) next; /* Pointer to the next element. */
+	rte_atomic32_t refcnt; /* Reference counter. */
+	struct mlx5_rxq_ctrl *rxq_ctrl; /* Back pointer to parent. */
 	struct ibv_cq *cq; /* Completion Queue. */
 	struct ibv_exp_wq *wq; /* Work Queue. */
-	struct mlx5_mr *mr; /* Memory Region (for mp). */
 	struct ibv_comp_channel *channel;
-	unsigned int socket; /* CPU socket ID for allocations. */
+	struct mlx5_mr *mr; /* Memory Region (for mp). */
+};
+
+/* RX queue control descriptor. */
+struct mlx5_rxq_ctrl {
+	struct priv *priv; /* Back pointer to private data. */
+	struct mlx5_rxq_ibv *ibv; /* Verbs elements. */
 	struct mlx5_rxq_data rxq; /* Data path structure. */
+	unsigned int socket; /* CPU socket ID for allocations. */
+	unsigned int memory_channel:1; /* Need memory channel. */
 };
 
 /* Hash RX queue types. */
@@ -298,7 +307,6 @@ void priv_destroy_hash_rxqs(struct priv *);
 int priv_allow_flow_type(struct priv *, enum hash_rxq_flow_type);
 int priv_rehash_flows(struct priv *);
 void mlx5_rxq_cleanup(struct mlx5_rxq_ctrl *);
-int mlx5_rxq_rehash(struct rte_eth_dev *, struct mlx5_rxq_ctrl *);
 int mlx5_rxq_ctrl_setup(struct rte_eth_dev *, struct mlx5_rxq_ctrl *,
 			uint16_t, unsigned int, const struct rte_eth_rxconf *,
 			struct rte_mempool *);
@@ -311,6 +319,11 @@ void priv_rx_intr_vec_disable(struct priv *priv);
 int mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id);
 int mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id);
 #endif /* HAVE_UPDATE_CQ_CI */
+struct mlx5_rxq_ibv* mlx5_priv_rxq_ibv_new(struct priv *priv, uint16_t idx);
+struct mlx5_rxq_ibv* mlx5_priv_rxq_ibv_get(struct priv *priv, uint16_t idx);
+int mlx5_priv_rxq_ibv_release(struct priv *priv, struct mlx5_rxq_ibv *rxq);
+int mlx5_priv_rxq_ibv_releasable(struct priv *priv, struct mlx5_rxq_ibv *rxq);
+int mlx5_priv_rxq_ibv_verify(struct priv *priv);
 
 /* mlx5_txq.c */
 
@@ -347,7 +360,6 @@ uint16_t mlx5_rx_burst_vec(void *, struct rte_mbuf **, uint16_t);
 
 /* mlx5_mr.c */
 
-struct ibv_mr *mlx5_mp2mr(struct ibv_pd *, struct rte_mempool *);
 void mlx5_txq_mp2mr_iter(struct rte_mempool *, void *);
 uint32_t mlx5_txq_mp2mr_reg(struct mlx5_txq_data *, struct rte_mempool *,
 			    unsigned int);
diff --git a/drivers/net/mlx5/mlx5_vlan.c b/drivers/net/mlx5/mlx5_vlan.c
index 512052a..dffa1cd 100644
--- a/drivers/net/mlx5/mlx5_vlan.c
+++ b/drivers/net/mlx5/mlx5_vlan.c
@@ -153,7 +153,7 @@ priv_vlan_strip_queue_set(struct priv *priv, uint16_t idx, int on)
 		.vlan_offloads = vlan_offloads,
 	};
 
-	err = ibv_exp_modify_wq(rxq_ctrl->wq, &mod);
+	err = ibv_exp_modify_wq(rxq_ctrl->ibv->wq, &mod);
 	if (err) {
 		ERROR("%p: failed to modified stripping mode: %s",
 		      (void *)priv, strerror(err));
-- 
2.1.4



More information about the dev mailing list