[dpdk-dev] [PATCH 2/2] net/mlx4: share memory region resources

Adrien Mazarguil adrien.mazarguil at 6wind.com
Thu Nov 2 19:14:22 CET 2017


Memory regions assigned to hardware and used during Tx/Rx are mapped to
mbuf pools. Each Rx queue creates its own MR based on the mempool provided
during queue setup, while each Tx queue looks up and registers MRs for all
existing mbuf pools instead.

Since most applications use few large mbuf pools (usually only a single one
per NUMA node) common to all Tx/Rx queues, the above approach wastes
hardware resources due to redundant MRs. This negatively affects
performance, particularly with large numbers of queues.

This patch therefore makes the entire MR registration common to all queues
using a reference count. A spinlock is added to protect against
asynchronous registration that may occur from the Tx side where new
mempools are discovered based on mbuf data.

Signed-off-by: Adrien Mazarguil <adrien.mazarguil at 6wind.com>
---

This patch is related to and applies on top of Matan/Ophir's series:

 "net/mlx4: Tx path improvements" (v5)

 drivers/net/mlx4/mlx4.h      | 18 +++++++-
 drivers/net/mlx4/mlx4_mr.c   | 90 ++++++++++++++++++++++++++++++++-------
 drivers/net/mlx4/mlx4_rxq.c  |  5 +--
 drivers/net/mlx4/mlx4_rxtx.h |  4 +-
 drivers/net/mlx4/mlx4_txq.c  |  2 +-
 5 files changed, 97 insertions(+), 22 deletions(-)

diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index 70cf453..bccd30c 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -51,6 +51,7 @@
 #include <rte_ether.h>
 #include <rte_interrupts.h>
 #include <rte_mempool.h>
+#include <rte_spinlock.h>
 
 /** Maximum number of simultaneous MAC addresses. This value is arbitrary. */
 #define MLX4_MAX_MAC_ADDRESSES 128
@@ -100,6 +101,18 @@ struct rxq;
 struct txq;
 struct rte_flow;
 
+/** Memory region descriptor. */
+struct mlx4_mr {
+	LIST_ENTRY(mlx4_mr) next; /**< Next entry in list. */
+	uintptr_t start; /**< Base address for memory region. */
+	uintptr_t end; /**< End address for memory region. */
+	uint32_t lkey; /**< L_Key extracted from @p mr. */
+	uint32_t refcnt; /**< Reference count for this object. */
+	struct priv *priv; /**< Back pointer to private data. */
+	struct ibv_mr *mr; /**< Memory region associated with @p mp. */
+	struct rte_mempool *mp; /**< Target memory pool (mempool). */
+};
+
 /** Private data structure. */
 struct priv {
 	struct rte_eth_dev *dev; /**< Ethernet device. */
@@ -119,6 +132,8 @@ struct priv {
 	struct mlx4_drop *drop; /**< Shared resources for drop flow rules. */
 	LIST_HEAD(, mlx4_rss) rss; /**< Shared targets for Rx flow rules. */
 	LIST_HEAD(, rte_flow) flows; /**< Configured flow rule handles. */
+	LIST_HEAD(, mlx4_mr) mr; /**< Registered memory regions. */
+	rte_spinlock_t mr_lock; /**< Lock for @p mr access. */
 	struct ether_addr mac[MLX4_MAX_MAC_ADDRESSES];
 	/**< Configured MAC addresses. Unused entries are zeroed. */
 };
@@ -159,7 +174,8 @@ int mlx4_rx_intr_enable(struct rte_eth_dev *dev, uint16_t idx);
 
 /* mlx4_mr.c */
 
-struct ibv_mr *mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp);
+struct mlx4_mr *mlx4_mr_get(struct priv *priv, struct rte_mempool *mp);
+void mlx4_mr_put(struct mlx4_mr *mr);
 uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
 			 uint32_t i);
 
diff --git a/drivers/net/mlx4/mlx4_mr.c b/drivers/net/mlx4/mlx4_mr.c
index 8105cc5..2a3e269 100644
--- a/drivers/net/mlx4/mlx4_mr.c
+++ b/drivers/net/mlx4/mlx4_mr.c
@@ -55,8 +55,10 @@
 #include <rte_branch_prediction.h>
 #include <rte_common.h>
 #include <rte_errno.h>
+#include <rte_malloc.h>
 #include <rte_memory.h>
 #include <rte_mempool.h>
+#include <rte_spinlock.h>
 
 #include "mlx4_rxtx.h"
 #include "mlx4_utils.h"
@@ -135,24 +137,27 @@ mlx4_check_mempool(struct rte_mempool *mp, uintptr_t *start, uintptr_t *end)
 }
 
 /**
- * Register mempool as a memory region.
+ * Obtain a memory region from a memory pool.
  *
- * @param pd
- *   Pointer to protection domain.
+ * If a matching memory region already exists, it is returned with its
+ * reference count incremented, otherwise a new one is registered.
+ *
+ * @param priv
+ *   Pointer to private structure.
  * @param mp
  *   Pointer to memory pool.
  *
  * @return
  *   Memory region pointer, NULL in case of error and rte_errno is set.
  */
-struct ibv_mr *
-mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
+struct mlx4_mr *
+mlx4_mr_get(struct priv *priv, struct rte_mempool *mp)
 {
 	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
 	uintptr_t start;
 	uintptr_t end;
 	unsigned int i;
-	struct ibv_mr *mr;
+	struct mlx4_mr *mr;
 
 	if (mlx4_check_mempool(mp, &start, &end) != 0) {
 		rte_errno = EINVAL;
@@ -177,16 +182,71 @@ mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
 	DEBUG("mempool %p using start=%p end=%p size=%zu for MR",
 	      (void *)mp, (void *)start, (void *)end,
 	      (size_t)(end - start));
-	mr = ibv_reg_mr(pd,
-			(void *)start,
-			end - start,
-			IBV_ACCESS_LOCAL_WRITE);
-	if (!mr)
+	rte_spinlock_lock(&priv->mr_lock);
+	LIST_FOREACH(mr, &priv->mr, next)
+		if (mp == mr->mp && start >= mr->start && end <= mr->end)
+			break;
+	if (mr) {
+		++mr->refcnt;
+		goto release;
+	}
+	mr = rte_malloc(__func__, sizeof(*mr), 0);
+	if (!mr) {
+		rte_errno = ENOMEM;
+		goto release;
+	}
+	*mr = (struct mlx4_mr){
+		.start = start,
+		.end = end,
+		.refcnt = 1,
+		.priv = priv,
+		.mr = ibv_reg_mr(priv->pd, (void *)start, end - start,
+				 IBV_ACCESS_LOCAL_WRITE),
+		.mp = mp,
+	};
+	if (mr->mr) {
+		mr->lkey = mr->mr->lkey;
+		LIST_INSERT_HEAD(&priv->mr, mr, next);
+	} else {
+		rte_free(mr);
+		mr = NULL;
 		rte_errno = errno ? errno : EINVAL;
+	}
+release:
+	rte_spinlock_unlock(&priv->mr_lock);
 	return mr;
 }
 
 /**
+ * Release a memory region.
+ *
+ * This function decrements the reference count of the memory region and
+ * destroys it once the count reaches 0.
+ *
+ * Note: to avoid race conditions, given that this function may be used
+ * from the data plane, it is extremely important that each user holds
+ * its own reference.
+ *
+ * @param mr
+ *   Memory region to release.
+ */
+void
+mlx4_mr_put(struct mlx4_mr *mr)
+{
+	struct priv *priv = mr->priv;
+
+	rte_spinlock_lock(&priv->mr_lock);
+	assert(mr->refcnt);
+	if (--mr->refcnt)
+		goto release;
+	LIST_REMOVE(mr, next);
+	claim_zero(ibv_dereg_mr(mr->mr));
+	rte_free(mr);
+release:
+	rte_spinlock_unlock(&priv->mr_lock);
+}
+
+/**
  * Add memory region (MR) <-> memory pool (MP) association to txq->mp2mr[].
  * If mp2mr[] is full, remove an entry first.
  *
@@ -203,14 +263,14 @@ mlx4_mp2mr(struct ibv_pd *pd, struct rte_mempool *mp)
 uint32_t
 mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
 {
-	struct ibv_mr *mr;
+	struct mlx4_mr *mr;
 
 	/* Add a new entry, register MR first. */
 	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
 	      (void *)txq, mp->name, (void *)mp);
-	mr = mlx4_mp2mr(txq->priv->pd, mp);
+	mr = mlx4_mr_get(txq->priv, mp);
 	if (unlikely(mr == NULL)) {
-		DEBUG("%p: unable to configure MR, ibv_reg_mr() failed.",
+		DEBUG("%p: unable to configure MR, mlx4_mr_get() failed",
 		      (void *)txq);
 		return (uint32_t)-1;
 	}
@@ -219,7 +279,7 @@ mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
 		DEBUG("%p: MR <-> MP table full, dropping oldest entry.",
 		      (void *)txq);
 		--i;
-		claim_zero(ibv_dereg_mr(txq->mp2mr[0].mr));
+		mlx4_mr_put(txq->mp2mr[0].mr);
 		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
 			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
 	}
diff --git a/drivers/net/mlx4/mlx4_rxq.c b/drivers/net/mlx4/mlx4_rxq.c
index ee1e7a9..8b97a89 100644
--- a/drivers/net/mlx4/mlx4_rxq.c
+++ b/drivers/net/mlx4/mlx4_rxq.c
@@ -804,9 +804,8 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		goto error;
 	}
 	/* Use the entire Rx mempool as the memory region. */
-	rxq->mr = mlx4_mp2mr(priv->pd, mp);
+	rxq->mr = mlx4_mr_get(priv, mp);
 	if (!rxq->mr) {
-		rte_errno = EINVAL;
 		ERROR("%p: MR creation failure: %s",
 		      (void *)dev, strerror(rte_errno));
 		goto error;
@@ -869,6 +868,6 @@ mlx4_rx_queue_release(void *dpdk_rxq)
 	if (rxq->channel)
 		claim_zero(ibv_destroy_comp_channel(rxq->channel));
 	if (rxq->mr)
-		claim_zero(ibv_dereg_mr(rxq->mr));
+		mlx4_mr_put(rxq->mr);
 	rte_free(rxq);
 }
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index 185dbdf..4acad80 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -67,7 +67,7 @@ struct mlx4_rxq_stats {
 struct rxq {
 	struct priv *priv; /**< Back pointer to private data. */
 	struct rte_mempool *mp; /**< Memory pool for allocations. */
-	struct ibv_mr *mr; /**< Memory region (for mp). */
+	struct mlx4_mr *mr; /**< Memory region. */
 	struct ibv_cq *cq; /**< Completion queue. */
 	struct ibv_wq *wq; /**< Work queue. */
 	struct ibv_comp_channel *channel; /**< Rx completion channel. */
@@ -134,7 +134,7 @@ struct txq {
 	/**< Memory used for storing the first DWORD of data TXBBs. */
 	struct {
 		const struct rte_mempool *mp; /**< Cached memory pool. */
-		struct ibv_mr *mr; /**< Memory region (for mp). */
+		struct mlx4_mr *mr; /**< Memory region (for mp). */
 		uint32_t lkey; /**< mr->lkey copy. */
 	} mp2mr[MLX4_PMD_TX_MP_CACHE]; /**< MP to MR translation table. */
 	struct priv *priv; /**< Back pointer to private data. */
diff --git a/drivers/net/mlx4/mlx4_txq.c b/drivers/net/mlx4/mlx4_txq.c
index a9c5bd2..7882a4d 100644
--- a/drivers/net/mlx4/mlx4_txq.c
+++ b/drivers/net/mlx4/mlx4_txq.c
@@ -408,7 +408,7 @@ mlx4_tx_queue_release(void *dpdk_txq)
 		if (!txq->mp2mr[i].mp)
 			break;
 		assert(txq->mp2mr[i].mr);
-		claim_zero(ibv_dereg_mr(txq->mp2mr[i].mr));
+		mlx4_mr_put(txq->mp2mr[i].mr);
 	}
 	rte_free(txq);
 }
-- 
2.1.4


More information about the dev mailing list