[dpdk-dev] [PATCH v2 09/30] net/mlx5: add reference counter on memory region

Nelio Laranjeiro nelio.laranjeiro at 6wind.com
Thu Oct 5 14:49:41 CEST 2017


This patch introduces the Memory Region as a shared object: users obtain
a reference to an existing one by calling priv_mr_get(), or create it by
calling priv_mr_new().  The latter registers the memory pool with the
kernel driver and retrieves the associated memory region.

This should help reduce the memory consumption caused by registering
the same memory pool multiple times.

Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro at 6wind.com>
---
 drivers/net/mlx5/mlx5.c      |   3 +
 drivers/net/mlx5/mlx5.h      |   8 ++
 drivers/net/mlx5/mlx5_mr.c   | 210 ++++++++++++++++++++++++++++++-------------
 drivers/net/mlx5/mlx5_rxq.c  |  17 ++--
 drivers/net/mlx5/mlx5_rxtx.h |  52 +++++++----
 drivers/net/mlx5/mlx5_txq.c  |   8 +-
 6 files changed, 206 insertions(+), 92 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 6d17d30..eb0d6c5 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -256,6 +256,9 @@ mlx5_dev_close(struct rte_eth_dev *dev)
 	ret = priv_flow_verify(priv);
 	if (ret)
 		WARN("%p: some flows still remain", (void *)priv);
+	ret = priv_mr_verify(priv);
+	if (ret)
+		WARN("%p: some Memory Region still remain", (void *)priv);
 	priv_unlock(priv);
 	memset(priv, 0, sizeof(*priv));
 }
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index c6563bd..f563722 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -146,6 +146,7 @@ struct priv {
 	unsigned int reta_idx_n; /* RETA index size. */
 	struct rte_flow_drop *flow_drop_queue; /* Flow drop queue. */
 	TAILQ_HEAD(mlx5_flows, rte_flow) flows; /* RTE Flow rules. */
+	LIST_HEAD(mr, mlx5_mr) mr; /* Memory region. */
 	uint32_t link_speed_capa; /* Link speed capabilities. */
 	struct mlx5_xstats_ctrl xstats_ctrl; /* Extended stats control. */
 	rte_spinlock_t lock; /* Lock for control functions. */
@@ -299,4 +300,11 @@ int priv_socket_uninit(struct priv *priv);
 void priv_socket_handle(struct priv *priv);
 int priv_socket_connect(struct priv *priv);
 
+/* mlx5_mr.c */
+
+struct mlx5_mr *priv_mr_new(struct priv *, struct rte_mempool *);
+struct mlx5_mr *priv_mr_get(struct priv *, struct rte_mempool *);
+int priv_mr_release(struct priv *, struct mlx5_mr *);
+int priv_mr_verify(struct priv *);
+
 #endif /* RTE_PMD_MLX5_H_ */
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index 6199746..3f14c47 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -42,6 +42,7 @@
 #endif
 
 #include <rte_mempool.h>
+#include <rte_malloc.h>
 
 #include "mlx5.h"
 #include "mlx5_rxtx.h"
@@ -111,54 +112,6 @@ static int mlx5_check_mempool(struct rte_mempool *mp, uintptr_t *start,
 }
 
 /**
- * Register mempool as a memory region.
- *
- * @param pd
- *   Pointer to protection domain.
- * @param mp
- *   Pointer to memory pool.
- *
- * @return
- *   Memory region pointer, NULL in case of error.
- */
-struct ibv_mr *
-mlx5_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
-{
-	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
-	uintptr_t start;
-	uintptr_t end;
-	unsigned int i;
-
-	if (mlx5_check_mempool(mp, &start, &end) != 0) {
-		ERROR("mempool %p: not virtually contiguous",
-		      (void *)mp);
-		return NULL;
-	}
-
-	DEBUG("mempool %p area start=%p end=%p size=%zu",
-	      (void *)mp, (void *)start, (void *)end,
-	      (size_t)(end - start));
-	/* Round start and end to page boundary if found in memory segments. */
-	for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
-		uintptr_t addr = (uintptr_t)ms[i].addr;
-		size_t len = ms[i].len;
-		unsigned int align = ms[i].hugepage_sz;
-
-		if ((start > addr) && (start < addr + len))
-			start = RTE_ALIGN_FLOOR(start, align);
-		if ((end > addr) && (end < addr + len))
-			end = RTE_ALIGN_CEIL(end, align);
-	}
-	DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
-	      (void *)mp, (void *)start, (void *)end,
-	      (size_t)(end - start));
-	return ibv_reg_mr(pd,
-			  (void *)start,
-			  end - start,
-			  IBV_ACCESS_LOCAL_WRITE);
-}
-
-/**
  * Register a Memory Region (MR) <-> Memory Pool (MP) association in
  * txq->mp2mr[]. If mp2mr[] is full, remove an entry first.
  *
@@ -180,12 +133,14 @@ mlx5_txq_mp2mr_reg(struct mlx5_txq_data *txq, struct rte_mempool *mp,
 {
 	struct mlx5_txq_ctrl *txq_ctrl =
 		container_of(txq, struct mlx5_txq_ctrl, txq);
-	struct ibv_mr *mr;
+	struct mlx5_mr *mr;
 
 	/* Add a new entry, register MR first. */
 	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
 	      (void *)txq_ctrl, mp->name, (void *)mp);
-	mr = mlx5_mp2mr(txq_ctrl->priv->pd, mp);
+	mr = priv_mr_get(txq_ctrl->priv, mp);
+	if (mr == NULL)
+		mr = priv_mr_new(txq_ctrl->priv, mp);
 	if (unlikely(mr == NULL)) {
 		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
 		      (void *)txq_ctrl);
@@ -196,20 +151,17 @@ mlx5_txq_mp2mr_reg(struct mlx5_txq_data *txq, struct rte_mempool *mp,
 		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
 		      (void *)txq_ctrl);
 		--idx;
-		claim_zero(ibv_dereg_mr(txq_ctrl->txq.mp2mr[0].mr));
+		priv_mr_release(txq_ctrl->priv, txq_ctrl->txq.mp2mr[0]);
 		memmove(&txq_ctrl->txq.mp2mr[0], &txq_ctrl->txq.mp2mr[1],
 			(sizeof(txq_ctrl->txq.mp2mr) -
 			 sizeof(txq_ctrl->txq.mp2mr[0])));
 	}
 	/* Store the new entry. */
-	txq_ctrl->txq.mp2mr[idx].start = (uintptr_t)mr->addr;
-	txq_ctrl->txq.mp2mr[idx].end = (uintptr_t)mr->addr + mr->length;
-	txq_ctrl->txq.mp2mr[idx].mr = mr;
-	txq_ctrl->txq.mp2mr[idx].lkey = rte_cpu_to_be_32(mr->lkey);
+	txq_ctrl->txq.mp2mr[idx] = mr;
 	DEBUG("%p: new MR lkey for MP \"%s\" (%p): 0x%08" PRIu32,
 	      (void *)txq_ctrl, mp->name, (void *)mp,
-	      txq_ctrl->txq.mp2mr[idx].lkey);
-	return txq_ctrl->txq.mp2mr[idx].lkey;
+	      txq_ctrl->txq.mp2mr[idx]->lkey);
+	return mr->lkey;
 }
 
 struct txq_mp2mr_mbuf_check_data {
@@ -275,15 +227,149 @@ mlx5_txq_mp2mr_iter(struct rte_mempool *mp, void *arg)
 		return;
 	}
 	for (i = 0; (i != RTE_DIM(txq_ctrl->txq.mp2mr)); ++i) {
-		struct ibv_mr *mr = txq_ctrl->txq.mp2mr[i].mr;
-
-		if (unlikely(mr == NULL)) {
+		if (unlikely(txq_ctrl->txq.mp2mr[i] == NULL)) {
 			/* Unknown MP, add a new MR for it. */
 			break;
 		}
-		if (start >= (uintptr_t)mr->addr &&
-		    end <= (uintptr_t)mr->addr + mr->length)
+		if (start >= (uintptr_t)txq_ctrl->txq.mp2mr[i]->start &&
+		    end <= (uintptr_t)txq_ctrl->txq.mp2mr[i]->end)
 			return;
 	}
 	mlx5_txq_mp2mr_reg(&txq_ctrl->txq, mp, i);
 }
+
+/**
+ * Register a new memory region from the mempool and store it in the memory
+ * region list.
+ *
+ * @param  priv
+ *   Pointer to private structure.
+ * @param mp
+ *   Pointer to the memory pool to register.
+ * @return
+ *   The memory region on success, NULL on failure.
+ */
+struct mlx5_mr*
+priv_mr_new(struct priv *priv, struct rte_mempool *mp)
+{
+	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+	uintptr_t start;
+	uintptr_t end;
+	unsigned int i;
+	struct mlx5_mr *mr;
+
+	mr = rte_zmalloc_socket(__func__, sizeof(*mr), 0, mp->socket_id);
+	if (!mr) {
+		DEBUG("unable to configure MR, ibv_reg_mr() failed.");
+		return NULL;
+	}
+	if (mlx5_check_mempool(mp, &start, &end) != 0) {
+		ERROR("mempool %p: not virtually contiguous",
+		      (void *)mp);
+		return NULL;
+	}
+	DEBUG("mempool %p area start=%p end=%p size=%zu",
+	      (void *)mp, (void *)start, (void *)end,
+	      (size_t)(end - start));
+	/* Round start and end to page boundary if found in memory segments. */
+	for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
+		uintptr_t addr = (uintptr_t)ms[i].addr;
+		size_t len = ms[i].len;
+		unsigned int align = ms[i].hugepage_sz;
+
+		if ((start > addr) && (start < addr + len))
+			start = RTE_ALIGN_FLOOR(start, align);
+		if ((end > addr) && (end < addr + len))
+			end = RTE_ALIGN_CEIL(end, align);
+	}
+	DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
+	      (void *)mp, (void *)start, (void *)end,
+	      (size_t)(end - start));
+	mr->mr = ibv_reg_mr(priv->pd, (void *)start, end - start,
+			    IBV_ACCESS_LOCAL_WRITE);
+	mr->mp = mp;
+	mr->lkey = rte_cpu_to_be_32(mr->mr->lkey);
+	mr->start = start;
+	mr->end = (uintptr_t)mr->mr->addr + mr->mr->length;
+	rte_atomic32_inc(&mr->refcnt);
+	DEBUG("%p: new Memory Region %p refcnt: %d", (void *)priv,
+	      (void *)mr, rte_atomic32_read(&mr->refcnt));
+	LIST_INSERT_HEAD(&priv->mr, mr, next);
+	return mr;
+}
+
+/**
+ * Search the memory region object in the memory region list.
+ *
+ * @param  priv
+ *   Pointer to private structure.
+ * @param mp
+ *   Pointer to the memory pool whose memory region is looked up.
+ * @return
+ *   The memory region on success, NULL if none is registered for mp.
+ */
+struct mlx5_mr*
+priv_mr_get(struct priv *priv, struct rte_mempool *mp)
+{
+	struct mlx5_mr *mr;
+
+	assert(mp);
+	if (LIST_EMPTY(&priv->mr))
+		return NULL;
+	LIST_FOREACH(mr, &priv->mr, next) {
+		if (mr->mp == mp) {
+			rte_atomic32_inc(&mr->refcnt);
+			DEBUG("Memory Region %p refcnt: %d",
+			      (void *)mr, rte_atomic32_read(&mr->refcnt));
+			return mr;
+		}
+	}
+	return NULL;
+}
+
+/**
+ * Release the memory region object.
+ *
+ * @param  mr
+ *   Pointer to memory region to release.
+ *
+ * @return
+ *   0 on success, errno on failure.
+ */
+int
+priv_mr_release(struct priv *priv, struct mlx5_mr *mr)
+{
+	(void)priv;
+	assert(mr);
+	DEBUG("Memory Region %p refcnt: %d",
+	      (void *)mr, rte_atomic32_read(&mr->refcnt));
+	if (rte_atomic32_dec_and_test(&mr->refcnt)) {
+		claim_zero(ibv_dereg_mr(mr->mr));
+		LIST_REMOVE(mr, next);
+		rte_free(mr);
+		return 0;
+	}
+	return EBUSY;
+}
+
+/**
+ * Verify the memory region list is empty.
+ *
+ * @param priv
+ *  Pointer to private structure.
+ *
+ * @return the number of objects not released.
+ */
+int
+priv_mr_verify(struct priv *priv)
+{
+	int ret = 0;
+	struct mlx5_mr *mr;
+
+	LIST_FOREACH(mr, &priv->mr, next) {
+		DEBUG("%p: mr %p still referenced", (void *)priv,
+		      (void *)mr);
+		++ret;
+	}
+	return ret;
+}
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 683a4a7..0d645ec 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -673,7 +673,7 @@ rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl, unsigned int elts_n)
 			.addr =
 			    rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, uintptr_t)),
 			.byte_count = rte_cpu_to_be_32(DATA_LEN(buf)),
-			.lkey = rte_cpu_to_be_32(rxq_ctrl->mr->lkey),
+			.lkey = rxq_ctrl->mr->lkey,
 		};
 		(*rxq_ctrl->rxq.elts)[i] = buf;
 	}
@@ -767,7 +767,7 @@ mlx5_rxq_cleanup(struct mlx5_rxq_ctrl *rxq_ctrl)
 	if (rxq_ctrl->channel != NULL)
 		claim_zero(ibv_destroy_comp_channel(rxq_ctrl->channel));
 	if (rxq_ctrl->mr != NULL)
-		claim_zero(ibv_dereg_mr(rxq_ctrl->mr));
+		priv_mr_release(rxq_ctrl->priv, rxq_ctrl->mr);
 	memset(rxq_ctrl, 0, sizeof(*rxq_ctrl));
 }
 
@@ -929,12 +929,15 @@ rxq_ctrl_setup(struct rte_eth_dev *dev, struct mlx5_rxq_ctrl *rxq_ctrl,
 		tmpl.rxq.csum_l2tun =
 			!!dev->data->dev_conf.rxmode.hw_ip_checksum;
 	/* Use the entire RX mempool as the memory region. */
-	tmpl.mr = mlx5_mp2mr(priv->pd, mp);
+	tmpl.mr = priv_mr_get(priv, mp);
 	if (tmpl.mr == NULL) {
-		ret = EINVAL;
-		ERROR("%p: MR creation failure: %s",
-		      (void *)dev, strerror(ret));
-		goto error;
+		tmpl.mr = priv_mr_new(priv, mp);
+		if (tmpl.mr == NULL) {
+			ret = EINVAL;
+			ERROR("%p: MR creation failure: %s",
+			      (void *)dev, strerror(ret));
+			goto error;
+		}
 	}
 	if (dev->data->dev_conf.intr_conf.rxq) {
 		tmpl.channel = ibv_create_comp_channel(priv->ctx);
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 4f877cb..b0f17c0 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -36,6 +36,7 @@
 
 #include <stddef.h>
 #include <stdint.h>
+#include <sys/queue.h>
 
 /* Verbs header. */
 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
@@ -52,6 +53,7 @@
 #include <rte_mempool.h>
 #include <rte_common.h>
 #include <rte_hexdump.h>
+#include <rte_atomic.h>
 
 #include "mlx5_utils.h"
 #include "mlx5.h"
@@ -80,6 +82,17 @@ struct mlx5_txq_stats {
 
 struct priv;
 
+/* Memory region queue object. */
+struct mlx5_mr {
+	LIST_ENTRY(mlx5_mr) next; /**< Pointer to the next element. */
+	rte_atomic32_t refcnt; /*<< Reference counter. */
+	uint32_t lkey; /*<< rte_cpu_to_be_32(mr->lkey) */
+	uintptr_t start; /* Start address of MR */
+	uintptr_t end; /* End address of MR */
+	struct ibv_mr *mr; /*<< Memory Region. */
+	struct rte_mempool *mp; /*<< Memory Pool. */
+};
+
 /* Compressed CQE context. */
 struct rxq_zip {
 	uint16_t ai; /* Array index. */
@@ -126,7 +139,7 @@ struct mlx5_rxq_ctrl {
 	struct priv *priv; /* Back pointer to private data. */
 	struct ibv_cq *cq; /* Completion Queue. */
 	struct ibv_wq *wq; /* Work Queue. */
-	struct ibv_mr *mr; /* Memory Region (for mp). */
+	struct mlx5_mr *mr; /* Memory Region (for mp). */
 	struct ibv_comp_channel *channel;
 	unsigned int socket; /* CPU socket ID for allocations. */
 	struct mlx5_rxq_data rxq; /* Data path structure. */
@@ -252,6 +265,7 @@ struct mlx5_txq_data {
 	uint16_t mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB. */
 	uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
 	uint16_t inline_max_packet_sz; /* Max packet size for inlining. */
+	uint16_t mr_cache_idx; /* Index of last hit entry. */
 	uint32_t qp_num_8s; /* QP number shifted by 8. */
 	uint32_t flags; /* Flags for Tx Queue. */
 	volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
@@ -259,13 +273,7 @@ struct mlx5_txq_data {
 	volatile uint32_t *qp_db; /* Work queue doorbell. */
 	volatile uint32_t *cq_db; /* Completion queue doorbell. */
 	volatile void *bf_reg; /* Blueflame register. */
-	struct {
-		uintptr_t start; /* Start address of MR */
-		uintptr_t end; /* End address of MR */
-		struct ibv_mr *mr; /* Memory Region (for mp). */
-		uint32_t lkey; /* rte_cpu_to_be_32(mr->lkey) */
-	} mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MP to MR translation table. */
-	uint16_t mr_cache_idx; /* Index of last hit entry. */
+	struct mlx5_mr *mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MR translation table. */
 	struct rte_mbuf *(*elts)[]; /* TX elements. */
 	struct mlx5_txq_stats stats; /* TX queue counters. */
 } __rte_cache_aligned;
@@ -564,26 +572,34 @@ mlx5_tx_mb2mr(struct mlx5_txq_data *txq, struct rte_mbuf *mb)
 {
 	uint16_t i = txq->mr_cache_idx;
 	uintptr_t addr = rte_pktmbuf_mtod(mb, uintptr_t);
+	uint32_t lkey;
 
 	assert(i < RTE_DIM(txq->mp2mr));
-	if (likely(txq->mp2mr[i].start <= addr && txq->mp2mr[i].end >= addr))
-		return txq->mp2mr[i].lkey;
+	if (likely(txq->mp2mr[i]->start <= addr && txq->mp2mr[i]->end >= addr))
+		return txq->mp2mr[i]->lkey;
 	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
-		if (unlikely(txq->mp2mr[i].mr == NULL)) {
+		if (unlikely(txq->mp2mr[i]->mr == NULL)) {
 			/* Unknown MP, add a new MR for it. */
 			break;
 		}
-		if (txq->mp2mr[i].start <= addr &&
-		    txq->mp2mr[i].end >= addr) {
-			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
-			assert(rte_cpu_to_be_32(txq->mp2mr[i].mr->lkey) ==
-			       txq->mp2mr[i].lkey);
+		if (txq->mp2mr[i]->start <= addr &&
+		    txq->mp2mr[i]->end >= addr) {
+			assert(txq->mp2mr[i]->lkey != (uint32_t)-1);
+			assert(rte_cpu_to_be_32(txq->mp2mr[i]->mr->lkey) ==
+			       txq->mp2mr[i]->lkey);
 			txq->mr_cache_idx = i;
-			return txq->mp2mr[i].lkey;
+			return txq->mp2mr[i]->lkey;
 		}
 	}
 	txq->mr_cache_idx = 0;
-	return mlx5_txq_mp2mr_reg(txq, mlx5_tx_mb2mp(mb), i);
+	lkey = mlx5_txq_mp2mr_reg(txq, mlx5_tx_mb2mp(mb), i);
+	/*
+	 * Request the reference to use in this queue, the original one is
+	 * kept by the control plane.
+	 */
+	if (lkey != (uint32_t)-1)
+		rte_atomic32_inc(&txq->mp2mr[i]->refcnt);
+	return lkey;
 }
 
 /**
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index f551f87..1899850 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -142,11 +142,9 @@ mlx5_txq_cleanup(struct mlx5_txq_ctrl *txq_ctrl)
 		claim_zero(ibv_destroy_qp(txq_ctrl->qp));
 	if (txq_ctrl->cq != NULL)
 		claim_zero(ibv_destroy_cq(txq_ctrl->cq));
-	for (i = 0; (i != RTE_DIM(txq_ctrl->txq.mp2mr)); ++i) {
-		if (txq_ctrl->txq.mp2mr[i].mr == NULL)
-			break;
-		claim_zero(ibv_dereg_mr(txq_ctrl->txq.mp2mr[i].mr));
-	}
+	for (i = 0; (i != RTE_DIM(txq_ctrl->txq.mp2mr)); ++i)
+		if (txq_ctrl->txq.mp2mr[i])
+			priv_mr_release(txq_ctrl->priv, txq_ctrl->txq.mp2mr[i]);
 	memset(txq_ctrl, 0, sizeof(*txq_ctrl));
 }
 
-- 
2.1.4



More information about the dev mailing list