[dpdk-stable] [PATCH 17.11] net/mlx5: fix multi-chunk mempool support

Yongseok Koh yskoh at mellanox.com
Thu Dec 13 02:31:36 CET 2018


A mempool consisting of multiple memory chunks isn't supported by the mlx5
PMD. Such mempools are common when system memory is highly fragmented, and
the PMD then fails to initialize. In that case, the application can't be
launched or restarted.

This patch enables non-contiguous mempool support by registering the entire
memseg[] array up front. The MR search then has three layers (a condensed
sketch follows the list):

- L1 checks the last-hit entry of each Rx/Tx queue.
- L2 linearly searches the per-queue cache table, an array of 8 entries by
  default.
- L3 binary-searches the per-queue lookup table.
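
For illustration only (not part of the change): a condensed, self-contained
sketch of how the three layers chain together on the datapath. Names are
simplified from struct mlx5_mr_ctrl / struct mlx5_mr_cache and from the
mlx5_rx_mb2mr() / mlx5_tx_mb2mr() / mlx5_mr_mb2mr_bh() helpers added below.

    #include <stdint.h>

    struct mr_cache { uintptr_t start, end; uint32_t lkey; };

    struct mr_ctrl {
            uint16_t mru;                /* L1: index of the last hit */
            uint16_t bh_n;               /* size of the L3 table */
            struct mr_cache cache[8];    /* L2: small per-queue cache */
            struct mr_cache *cache_bh;   /* L3: sorted lookup table */
    };

    static uint32_t
    mr_lookup(struct mr_ctrl *c, uintptr_t addr)
    {
            uint16_t i, base = 0, n = c->bh_n;

            /* L1: check the last-hit entry. */
            if (addr >= c->cache[c->mru].start && addr < c->cache[c->mru].end)
                    return c->cache[c->mru].lkey;
            /* L2: linear search of the per-queue cache array. */
            for (i = 0; i < 8 && c->cache[i].start != 0; ++i) {
                    if (addr >= c->cache[i].start && addr < c->cache[i].end) {
                            c->mru = i;
                            return c->cache[i].lkey;
                    }
            }
            /* L3: binary search of the sorted lookup table; its first
             * entry is a {0, 0, UINT32_MAX} pad so misses below the
             * lowest address resolve to UINT32_MAX. */
            if (n == 0)
                    return UINT32_MAX;
            while (n > 1) {
                    uint16_t delta = n >> 1;

                    if (addr < c->cache_bh[base + delta].start) {
                            n = delta;
                    } else {
                            base += delta;
                            n -= delta;
                    }
            }
            if (addr >= c->cache_bh[base].start &&
                addr < c->cache_bh[base].end)
                    return c->cache_bh[base].lkey;
            /* Miss: the driver then pulls the entry from the global,
             * all-inclusive MR cache built at configure time. */
            return UINT32_MAX;
    }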

On Rx, as the mempool is statically bound when the Rx queue is set up, the
per-queue lookup table is built when the queue is created.

On Tx, the per-queue lookup table is filled on demand at runtime.
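
Schematically, where each table gets filled (illustration only, using the
helpers introduced below; dev, mp, rxq, txq and mb stand for the obvious
objects and error handling is omitted):

    /* Rx: the mempool is known at queue creation, so the per-queue
     * lookup table is filled once from its chunks (see mlx5_rxq_new()). */
    rxq->mr_ctrl.bh_n = mlx5_mr_update_mp(dev, *rxq->mr_ctrl.cache_bh,
                                          rxq->mr_ctrl.bh_n, mp);

    /* Tx: any mempool may show up, so a missing entry is pulled from the
     * global MR cache the first time an address from it is transmitted
     * (see mlx5_tx_mb2mr() -> mlx5_mr_mb2mr_bh() -> mlx5_mr_update_addr()). */
    uint32_t lkey = mlx5_tx_mb2mr(txq, mb);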

Fixes: 491770fafcfb ("net/mlx5: split memory registration function")

Signed-off-by: Yongseok Koh <yskoh at mellanox.com>
---
 drivers/net/mlx5/mlx5.c          |   5 +-
 drivers/net/mlx5/mlx5.h          |  18 +-
 drivers/net/mlx5/mlx5_defs.h     |   6 +
 drivers/net/mlx5/mlx5_ethdev.c   |   5 +
 drivers/net/mlx5/mlx5_mr.c       | 583 +++++++++++++++++++++------------------
 drivers/net/mlx5/mlx5_rxq.c      |  36 +--
 drivers/net/mlx5/mlx5_rxtx.c     |   3 +
 drivers/net/mlx5/mlx5_rxtx.h     | 175 +++++++-----
 drivers/net/mlx5/mlx5_rxtx_vec.h |   6 +-
 drivers/net/mlx5/mlx5_trigger.c  |  14 -
 drivers/net/mlx5/mlx5_txq.c      |  23 +-
 11 files changed, 470 insertions(+), 404 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 46590378ba..e117ec8439 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -243,6 +243,7 @@ mlx5_dev_close(struct rte_eth_dev *dev)
 		priv->txqs_n = 0;
 		priv->txqs = NULL;
 	}
+	mlx5_mr_deregister_memseg(dev);
 	if (priv->pd != NULL) {
 		assert(priv->ctx != NULL);
 		claim_zero(ibv_dealloc_pd(priv->pd));
@@ -283,10 +284,6 @@ mlx5_dev_close(struct rte_eth_dev *dev)
 	if (ret)
 		DRV_LOG(WARNING, "port %u some flows still remain",
 			dev->data->port_id);
-	ret = mlx5_mr_verify(dev);
-	if (ret)
-		DRV_LOG(WARNING, "port %u some memory region still remain",
-			dev->data->port_id);
 	memset(priv, 0, sizeof(*priv));
 }
 
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index a5bff00f5e..08b667f98e 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -154,7 +154,9 @@ struct priv {
 	struct mlx5_hrxq_drop *flow_drop_queue; /* Flow drop queue. */
 	struct mlx5_flows flows; /* RTE Flow rules. */
 	struct mlx5_flows ctrl_flows; /* Control flow rules. */
-	LIST_HEAD(mr, mlx5_mr) mr; /* Memory region. */
+	struct mlx5_mr (*mr)[]; /* Static MR table. */
+	struct mlx5_mr_cache (*mr_cache)[]; /* Global MR cache table. */
+	unsigned int mr_n; /* Size of static MR table. */
 	LIST_HEAD(rxq, mlx5_rxq_ctrl) rxqsctrl; /* DPDK Rx queues. */
 	LIST_HEAD(rxqibv, mlx5_rxq_ibv) rxqsibv; /* Verbs Rx queues. */
 	LIST_HEAD(hrxq, mlx5_hrxq) hrxqs; /* Verbs Hash Rx queues. */
@@ -303,16 +305,14 @@ void mlx5_flow_delete_drop_queue(struct rte_eth_dev *dev);
 
 /* mlx5_socket.c */
 
-int mlx5_socket_init(struct rte_eth_dev *priv);
-void mlx5_socket_uninit(struct rte_eth_dev *priv);
-void mlx5_socket_handle(struct rte_eth_dev *priv);
-int mlx5_socket_connect(struct rte_eth_dev *priv);
+int mlx5_socket_init(struct rte_eth_dev *dev);
+void mlx5_socket_uninit(struct rte_eth_dev *dev);
+void mlx5_socket_handle(struct rte_eth_dev *dev);
+int mlx5_socket_connect(struct rte_eth_dev *dev);
 
 /* mlx5_mr.c */
 
-struct mlx5_mr *mlx5_mr_new(struct rte_eth_dev *dev, struct rte_mempool *mp);
-struct mlx5_mr *mlx5_mr_get(struct rte_eth_dev *dev, struct rte_mempool *mp);
-int mlx5_mr_release(struct mlx5_mr *mr);
-int mlx5_mr_verify(struct rte_eth_dev *dev);
+int mlx5_mr_register_memseg(struct rte_eth_dev *dev);
+void mlx5_mr_deregister_memseg(struct rte_eth_dev *dev);
 
 #endif /* RTE_PMD_MLX5_H_ */
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index 8ee93a63c7..1de3bdc417 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -129,6 +129,12 @@
  */
 #define MLX5_UAR_OFFSET (1ULL << 32)
 
+/* Size of per-queue MR cache table. */
+#define MLX5_MR_CACHE_N 8
+
+/* First entry must be NULL for comparison. */
+#define MLX5_MR_LOOKUP_TABLE_PAD 1
+
 /* Definition of static_assert found in /usr/include/assert.h */
 #ifndef HAVE_STATIC_ASSERT
 #define static_assert _Static_assert
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index e441483a98..198c30b3b3 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -408,6 +408,11 @@ mlx5_dev_configure(struct rte_eth_dev *dev)
 	ret = mlx5_rss_reta_index_resize(dev, reta_idx_n);
 	if (ret)
 		return ret;
+	if (mlx5_mr_register_memseg(dev)) {
+		DRV_LOG(ERR, "%p: MR registration failed", (void *)dev);
+		rte_errno = ENOMEM;
+		return -rte_errno;
+	}
 	/* When the number of RX queues is not a power of two, the remaining
 	 * table entries are padded with reused WQs and hashes are not spread
 	 * uniformly. */
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index a50c520880..c3410a6294 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -47,355 +47,398 @@
 #include "mlx5.h"
 #include "mlx5_rxtx.h"
 
-struct mlx5_check_mempool_data {
-	int ret;
-	char *start;
-	char *end;
+struct mr_update_mempool_data {
+	struct rte_eth_dev *dev;
+	struct mlx5_mr_cache *lkp_tbl;
+	uint16_t tbl_sz;
 };
 
-/* Called by mlx5_check_mempool() when iterating the memory chunks. */
-static void
-mlx5_check_mempool_cb(struct rte_mempool *mp __rte_unused,
-		      void *opaque, struct rte_mempool_memhdr *memhdr,
-		      unsigned int mem_idx __rte_unused)
+/**
+ * Look up LKEY from the given lookup table by binary search, store the last
+ * index and return the searched LKEY.
+ *
+ * @param lkp_tbl
+ *   Pointer to lookup table.
+ * @param n
+ *   Size of lookup table.
+ * @param[out] idx
+ *   Pointer to index. Even on search failure, returns index where it stops
+ *   searching so that index can be used when inserting a new entry.
+ * @param addr
+ *   Search key.
+ *
+ * @return
+ *   Searched LKEY on success, UINT32_MAX on no match.
+ */
+static uint32_t
+mlx5_mr_lookup(struct mlx5_mr_cache *lkp_tbl, uint16_t n, uint16_t *idx,
+	       uintptr_t addr)
 {
-	struct mlx5_check_mempool_data *data = opaque;
+	uint16_t base = 0;
 
-	/* It already failed, skip the next chunks. */
-	if (data->ret != 0)
-		return;
-	/* It is the first chunk. */
-	if (data->start == NULL && data->end == NULL) {
-		data->start = memhdr->addr;
-		data->end = data->start + memhdr->len;
-		return;
-	}
-	if (data->end == memhdr->addr) {
-		data->end += memhdr->len;
-		return;
-	}
-	if (data->start == (char *)memhdr->addr + memhdr->len) {
-		data->start -= memhdr->len;
-		return;
-	}
-	/* Error, mempool is not virtually contiguous. */
-	data->ret = -1;
+	/* First entry must be NULL for comparison. */
+	assert(n == 0 || (lkp_tbl[0].start == 0 &&
+			  lkp_tbl[0].lkey == UINT32_MAX));
+	/* Binary search. */
+	do {
+		register uint16_t delta = n >> 1;
+
+		if (addr < lkp_tbl[base + delta].start) {
+			n = delta;
+		} else {
+			base += delta;
+			n -= delta;
+		}
+	} while (n > 1);
+	assert(addr >= lkp_tbl[base].start);
+	*idx = base;
+	if (addr < lkp_tbl[base].end)
+		return lkp_tbl[base].lkey;
+	/* Not found. */
+	return UINT32_MAX;
 }
 
 /**
- * Check if a mempool can be used: it must be virtually contiguous.
+ * Insert an entry to LKEY lookup table.
  *
- * @param[in] mp
- *   Pointer to memory pool.
- * @param[out] start
- *   Pointer to the start address of the mempool virtual memory area
- * @param[out] end
- *   Pointer to the end address of the mempool virtual memory area
+ * @param lkp_tbl
+ *   Pointer to lookup table. The size of array must be enough to add one more
+ *   entry.
+ * @param n
+ *   Size of lookup table.
+ * @param entry
+ *   Pointer to new entry to insert.
  *
  * @return
- *   0 on success (mempool is virtually contiguous), -1 on error.
+ *   Size of the updated lookup table.
  */
 static int
-mlx5_check_mempool(struct rte_mempool *mp, uintptr_t *start,
-		   uintptr_t *end)
+mlx5_mr_insert(struct mlx5_mr_cache *lkp_tbl, uint16_t n,
+	       struct mlx5_mr_cache *entry)
 {
-	struct mlx5_check_mempool_data data;
+	uint16_t idx = 0;
+	size_t shift;
 
-	memset(&data, 0, sizeof(data));
-	rte_mempool_mem_iter(mp, mlx5_check_mempool_cb, &data);
-	*start = (uintptr_t)data.start;
-	*end = (uintptr_t)data.end;
-	return data.ret;
+	/* Check if the entry exists. */
+	if (mlx5_mr_lookup(lkp_tbl, n, &idx, entry->start) != UINT32_MAX)
+		return n;
+	/* Insert entry. */
+	++idx;
+	shift = (n - idx) * sizeof(struct mlx5_mr_cache);
+	if (shift)
+		memmove(&lkp_tbl[idx + 1], &lkp_tbl[idx], shift);
+	lkp_tbl[idx] = *entry;
+	DRV_LOG(DEBUG, "%p: inserted lkp_tbl[%u], start = 0x%lx, end = 0x%lx",
+		(void *)lkp_tbl, idx, lkp_tbl[idx].start, lkp_tbl[idx].end);
+	return n + 1;
 }
 
 /**
- * Register a Memory Region (MR) <-> Memory Pool (MP) association in
- * txq->mp2mr[]. If mp2mr[] is full, remove an entry first.
+ * Incrementally update LKEY lookup table for a specific address from registered
+ * Memory Regions.
  *
- * @param txq
- *   Pointer to TX queue structure.
- * @param[in] mp
- *   Memory Pool for which a Memory Region lkey must be returned.
- * @param idx
- *   Index of the next available entry.
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param lkp_tbl
+ *   Pointer to lookup table to fill. The size of array must be at least
+ *   (priv->mr_n + 1).
+ * @param n
+ *   Size of lookup table.
+ * @param addr
+ *   Search key.
  *
  * @return
- *   mr on success, NULL on failure and rte_errno is set.
+ *   Size of the updated lookup table.
  */
-struct mlx5_mr *
-mlx5_txq_mp2mr_reg(struct mlx5_txq_data *txq, struct rte_mempool *mp,
-		   unsigned int idx)
+static int
+mlx5_mr_update_addr(struct rte_eth_dev *dev, struct mlx5_mr_cache *lkp_tbl,
+		    uint16_t n, uintptr_t addr)
 {
-	struct mlx5_txq_ctrl *txq_ctrl =
-		container_of(txq, struct mlx5_txq_ctrl, txq);
-	struct rte_eth_dev *dev;
-	struct mlx5_mr *mr;
+	struct priv *priv = dev->data->dev_private;
+	uint16_t idx;
+	uint32_t ret __rte_unused;
 
-	rte_spinlock_lock(&txq_ctrl->priv->mr_lock);
-	/* Add a new entry, register MR first. */
-	DRV_LOG(DEBUG, "port %u discovered new memory pool \"%s\" (%p)",
-		PORT_ID(txq_ctrl->priv), mp->name, (void *)mp);
-	dev = ETH_DEV(txq_ctrl->priv);
-	mr = mlx5_mr_get(dev, mp);
-	if (mr == NULL) {
-		if (rte_eal_process_type() != RTE_PROC_PRIMARY) {
-			DRV_LOG(DEBUG,
-				"port %u using unregistered mempool 0x%p(%s)"
-				" in secondary process, please create mempool"
-				" before rte_eth_dev_start()",
-				PORT_ID(txq_ctrl->priv), (void *)mp, mp->name);
-			rte_spinlock_unlock(&txq_ctrl->priv->mr_lock);
-			rte_errno = ENOTSUP;
-			return NULL;
-		}
-		mr = mlx5_mr_new(dev, mp);
-	}
-	if (unlikely(mr == NULL)) {
-		DRV_LOG(DEBUG,
-			"port %u unable to configure memory region,"
-			" ibv_reg_mr() failed.",
-			PORT_ID(txq_ctrl->priv));
-		rte_spinlock_unlock(&txq_ctrl->priv->mr_lock);
-		return NULL;
+	if (n == 0) {
+		/* First entry must be NULL for comparison. */
+		lkp_tbl[n++] = (struct mlx5_mr_cache) {
+			.lkey = UINT32_MAX,
+		};
 	}
-	if (unlikely(idx == RTE_DIM(txq->mp2mr))) {
-		/* Table is full, remove oldest entry. */
-		DRV_LOG(DEBUG,
-			"port %u memory region <-> memory pool table full, "
-			" dropping oldest entry",
-			PORT_ID(txq_ctrl->priv));
-		--idx;
-		mlx5_mr_release(txq->mp2mr[0]);
-		memmove(&txq->mp2mr[0], &txq->mp2mr[1],
-			(sizeof(txq->mp2mr) - sizeof(txq->mp2mr[0])));
-	}
-	/* Store the new entry. */
-	txq_ctrl->txq.mp2mr[idx] = mr;
-	DRV_LOG(DEBUG,
-		"port %u new memory region lkey for MP \"%s\" (%p): 0x%08"
-		PRIu32,
-		PORT_ID(txq_ctrl->priv), mp->name, (void *)mp,
-		txq_ctrl->txq.mp2mr[idx]->lkey);
-	rte_spinlock_unlock(&txq_ctrl->priv->mr_lock);
-	return mr;
+	ret = mlx5_mr_lookup(*priv->mr_cache, MR_TABLE_SZ(priv->mr_n),
+			     &idx, addr);
+	/* Lookup must succeed, the global cache is all-inclusive. */
+	assert(ret != UINT32_MAX);
+	DRV_LOG(DEBUG, "port %u adding LKEY (0x%x) for addr 0x%lx",
+		dev->data->port_id, (*priv->mr_cache)[idx].lkey, addr);
+	return mlx5_mr_insert(lkp_tbl, n, &(*priv->mr_cache)[idx]);
 }
 
-struct mlx5_mp2mr_mbuf_check_data {
-	int ret;
-};
-
 /**
- * Callback function for rte_mempool_obj_iter() to check whether a given
- * mempool object looks like a mbuf.
+ * Bottom-half of LKEY search on datapath. First searches cache_bh[] and, on
+ * miss, searches the global MR cache table and copies the new entry into the
+ * per-queue local caches.
  *
- * @param[in] mp
- *   The mempool pointer
- * @param[in] arg
- *   Context data (struct txq_mp2mr_mbuf_check_data). Contains the
- *   return value.
- * @param[in] obj
- *   Object address.
- * @param index
- *   Object index, unused.
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param mr_ctrl
+ *   Pointer to per-queue MR control structure.
+ * @param addr
+ *   Search key.
+ *
+ * @return
+ *   LKEY on success.
  */
-static void
-txq_mp2mr_mbuf_check(struct rte_mempool *mp, void *arg, void *obj,
-	uint32_t index __rte_unused)
+static inline uint32_t
+mlx5_mr_mb2mr_bh(struct rte_eth_dev *dev, struct mlx5_mr_ctrl *mr_ctrl,
+		 uintptr_t addr)
 {
-	struct mlx5_mp2mr_mbuf_check_data *data = arg;
-	struct rte_mbuf *buf = obj;
+	uint32_t lkey;
+	uint16_t bh_idx = 0;
+	struct mlx5_mr_cache *mr_cache = &mr_ctrl->cache[mr_ctrl->head];
 
-	/*
-	 * Check whether mbuf structure fits element size and whether mempool
-	 * pointer is valid.
-	 */
-	if (sizeof(*buf) > mp->elt_size || buf->pool != mp)
-		data->ret = -1;
+	/* Binary-search MR translation table. */
+	lkey = mlx5_mr_lookup(*mr_ctrl->cache_bh, mr_ctrl->bh_n, &bh_idx, addr);
+	if (likely(lkey != UINT32_MAX)) {
+		/* Update cache. */
+		*mr_cache = (*mr_ctrl->cache_bh)[bh_idx];
+		mr_ctrl->mru = mr_ctrl->head;
+		/* Point to the next victim, the oldest. */
+		mr_ctrl->head = (mr_ctrl->head + 1) % MLX5_MR_CACHE_N;
+		return lkey;
+	}
+	/* Missed in the per-queue lookup table. Search in the global cache. */
+	mr_ctrl->bh_n = mlx5_mr_update_addr(dev, *mr_ctrl->cache_bh,
+					    mr_ctrl->bh_n, addr);
+	/* Search again with updated entries. */
+	lkey = mlx5_mr_lookup(*mr_ctrl->cache_bh, mr_ctrl->bh_n, &bh_idx, addr);
+	/* Must always succeed. */
+	assert(lkey != UINT32_MAX);
+	/* Update cache. */
+	*mr_cache = (*mr_ctrl->cache_bh)[bh_idx];
+	mr_ctrl->mru = mr_ctrl->head;
+	/* Point to the next victim, the oldest. */
+	mr_ctrl->head = (mr_ctrl->head + 1) % MLX5_MR_CACHE_N;
+	return lkey;
 }
 
 /**
- * Iterator function for rte_mempool_walk() to register existing mempools and
- * fill the MP to MR cache of a TX queue.
+ * Bottom-half of mlx5_rx_mb2mr() if search on mr_cache_bh[] fails.
  *
- * @param[in] mp
- *   Memory Pool to register.
- * @param *arg
- *   Pointer to TX queue structure.
+ * @param rxq
+ *   Pointer to Rx queue structure.
+ * @param addr
+ *   Search key.
+ *
+ * @return
+ *   LKEY on success.
  */
-void
-mlx5_mp2mr_iter(struct rte_mempool *mp, void *arg)
+uint32_t
+mlx5_rx_mb2mr_bh(struct mlx5_rxq_data *rxq, uintptr_t addr)
 {
-	struct priv *priv = (struct priv *)arg;
-	struct mlx5_mp2mr_mbuf_check_data data = {
-		.ret = 0,
-	};
-	struct mlx5_mr *mr;
+	struct mlx5_rxq_ctrl *rxq_ctrl =
+		container_of(rxq, struct mlx5_rxq_ctrl, rxq);
 
-	/* Register mempool only if the first element looks like a mbuf. */
-	if (rte_mempool_obj_iter(mp, txq_mp2mr_mbuf_check, &data) == 0 ||
-			data.ret == -1)
-		return;
-	mr = mlx5_mr_get(ETH_DEV(priv), mp);
-	if (mr) {
-		mlx5_mr_release(mr);
-		return;
-	}
-	mr = mlx5_mr_new(ETH_DEV(priv), mp);
-	if (!mr)
-		DRV_LOG(ERR, "port %u cannot create memory region: %s",
-			PORT_ID(priv), strerror(rte_errno));
+	DRV_LOG(DEBUG,
+		"port %u not found in rxq->mr_cache[], last-hit=%u, head=%u",
+		PORT_ID(rxq_ctrl->priv), rxq->mr_ctrl.mru, rxq->mr_ctrl.head);
+	return mlx5_mr_mb2mr_bh(ETH_DEV(rxq_ctrl->priv), &rxq->mr_ctrl, addr);
 }
 
 /**
- * Register a new memory region from the mempool and store it in the memory
- * region list.
+ * Bottom-half of mlx5_tx_mb2mr() if search on cache_bh[] fails.
  *
- * @param dev
- *   Pointer to Ethernet device.
- * @param mp
- *   Pointer to the memory pool to register.
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param addr
+ *   Search key.
  *
  * @return
- *   The memory region on success, NULL on failure and rte_errno is set.
+ *   LKEY on success.
  */
-struct mlx5_mr *
-mlx5_mr_new(struct rte_eth_dev *dev, struct rte_mempool *mp)
+uint32_t
+mlx5_tx_mb2mr_bh(struct mlx5_txq_data *txq, uintptr_t addr)
 {
-	struct priv *priv = dev->data->dev_private;
-	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
-	uintptr_t start;
-	uintptr_t end;
-	unsigned int i;
-	struct mlx5_mr *mr;
-
-	mr = rte_zmalloc_socket(__func__, sizeof(*mr), 0, mp->socket_id);
-	if (!mr) {
-		DRV_LOG(DEBUG,
-			"port %u unable to configure memory region,"
-			" ibv_reg_mr() failed.",
-			dev->data->port_id);
-		rte_errno = ENOMEM;
-		return NULL;
-	}
-	if (mlx5_check_mempool(mp, &start, &end) != 0) {
-		DRV_LOG(ERR, "port %u mempool %p: not virtually contiguous",
-			dev->data->port_id, (void *)mp);
-		rte_errno = ENOMEM;
-		return NULL;
-	}
-	DRV_LOG(DEBUG, "port %u mempool %p area start=%p end=%p size=%zu",
-		dev->data->port_id, (void *)mp, (void *)start, (void *)end,
-		(size_t)(end - start));
-	/* Save original addresses for exact MR lookup. */
-	mr->start = start;
-	mr->end = end;
-	/* Round start and end to page boundary if found in memory segments. */
-	for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i) {
-		uintptr_t addr = (uintptr_t)ms[i].addr;
-		size_t len = ms[i].len;
-		unsigned int align = ms[i].hugepage_sz;
+	struct mlx5_txq_ctrl *txq_ctrl =
+		container_of(txq, struct mlx5_txq_ctrl, txq);
 
-		if ((start > addr) && (start < addr + len))
-			start = RTE_ALIGN_FLOOR(start, align);
-		if ((end > addr) && (end < addr + len))
-			end = RTE_ALIGN_CEIL(end, align);
-	}
 	DRV_LOG(DEBUG,
-		"port %u mempool %p using start=%p end=%p size=%zu for memory"
-		" region",
-		dev->data->port_id, (void *)mp, (void *)start, (void *)end,
-		(size_t)(end - start));
-	mr->mr = ibv_reg_mr(priv->pd, (void *)start, end - start,
-			    IBV_ACCESS_LOCAL_WRITE);
-	if (!mr->mr) {
-		rte_errno = ENOMEM;
-		return NULL;
-	}
-	mr->mp = mp;
-	mr->lkey = rte_cpu_to_be_32(mr->mr->lkey);
-	rte_atomic32_inc(&mr->refcnt);
-	DRV_LOG(DEBUG, "port %u new memory Region %p refcnt: %d",
-		dev->data->port_id, (void *)mr, rte_atomic32_read(&mr->refcnt));
-	LIST_INSERT_HEAD(&priv->mr, mr, next);
-	return mr;
+		"port %u not found in txq->mr_cache[], last-hit=%u, head=%u",
+		PORT_ID(txq_ctrl->priv), txq->mr_ctrl.mru, txq->mr_ctrl.head);
+	return mlx5_mr_mb2mr_bh(ETH_DEV(txq_ctrl->priv), &txq->mr_ctrl, addr);
+}
+
+/* Called by mr_update_mempool() when iterating the memory chunks. */
+static void
+mr_update_mempool_cb(struct rte_mempool *mp __rte_unused,
+		    void *opaque, struct rte_mempool_memhdr *memhdr,
+		    unsigned int mem_idx __rte_unused)
+{
+	struct mr_update_mempool_data *data = opaque;
+
+	DRV_LOG(DEBUG, "port %u adding chunk[%u] of %s",
+		data->dev->data->port_id, mem_idx, mp->name);
+	data->tbl_sz =
+		mlx5_mr_update_addr(data->dev, data->lkp_tbl, data->tbl_sz,
+				    (uintptr_t)memhdr->addr);
 }
 
 /**
- * Search the memory region object in the memory region list.
+ * Incrementally update LKEY lookup table for a specific Memory Pool from
+ * registered Memory Regions.
  *
  * @param dev
  *   Pointer to Ethernet device.
- * @param mp
- *   Pointer to the memory pool to register.
+ * @param[out] lkp_tbl
+ *   Pointer to lookup table to fill. The size of array must be at least
+ *   (priv->mr_n + 1).
+ * @param n
+ *   Size of lookup table.
+ * @param[in] mp
+ *   Pointer to Memory Pool.
  *
  * @return
- *   The memory region on success.
+ *   Size of the updated lookup table.
  */
-struct mlx5_mr *
-mlx5_mr_get(struct rte_eth_dev *dev, struct rte_mempool *mp)
+int
+mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_cache *lkp_tbl,
+		  uint16_t n, struct rte_mempool *mp)
 {
-	struct priv *priv = dev->data->dev_private;
-	struct mlx5_mr *mr;
+	struct mr_update_mempool_data data = {
+		.dev = dev,
+		.lkp_tbl = lkp_tbl,
+		.tbl_sz = n
+	};
 
-	assert(mp);
-	if (LIST_EMPTY(&priv->mr))
-		return NULL;
-	LIST_FOREACH(mr, &priv->mr, next) {
-		if (mr->mp == mp) {
-			rte_atomic32_inc(&mr->refcnt);
-			DRV_LOG(DEBUG, "port %u memory region %p refcnt: %d",
-				dev->data->port_id, (void *)mr,
-				rte_atomic32_read(&mr->refcnt));
-			return mr;
-		}
-	}
-	return NULL;
+	rte_mempool_mem_iter(mp, mr_update_mempool_cb, &data);
+	return data.tbl_sz;
+}
+
+/* Called by qsort() to compare MR entries. */
+static int
+mr_comp_addr(const void *m1, const void *m2)
+{
+	const struct mlx5_mr *mi1 = m1;
+	const struct mlx5_mr *mi2 = m2;
+
+	if (mi1->memseg->addr < mi2->memseg->addr)
+		return -1;
+	else if (mi1->memseg->addr > mi2->memseg->addr)
+		return 1;
+	else
+		return 0;
 }
 
 /**
- * Release the memory region object.
+ * Register entire physical memory to Verbs.
  *
- * @param  mr
- *   Pointer to memory region to release.
+ * @param dev
+ *   Pointer to Ethernet device.
  *
  * @return
- *   1 while a reference on it exists, 0 when freed.
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
  */
 int
-mlx5_mr_release(struct mlx5_mr *mr)
+mlx5_mr_register_memseg(struct rte_eth_dev *dev)
 {
-	assert(mr);
-	DRV_LOG(DEBUG, "memory region %p refcnt: %d", (void *)mr,
-		rte_atomic32_read(&mr->refcnt));
-	if (rte_atomic32_dec_and_test(&mr->refcnt)) {
-		claim_zero(ibv_dereg_mr(mr->mr));
-		LIST_REMOVE(mr, next);
-		rte_free(mr);
+	struct priv *priv = dev->data->dev_private;
+	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+	struct mlx5_mr *mr;
+	struct mlx5_mr_cache *mr_cache;
+	unsigned int i;
+
+	if (priv->mr_n != 0)
 		return 0;
+	/* Count the existing memsegs in the system. */
+	for (i = 0; (i < RTE_MAX_MEMSEG) && (ms[i].addr != NULL); ++i)
+		++priv->mr_n;
+	priv->mr = rte_calloc(__func__, priv->mr_n, sizeof(*mr), 0);
+	if (priv->mr == NULL) {
+		DRV_LOG(ERR,
+			"port %u cannot allocate memory for array of static MR",
+			dev->data->port_id);
+		rte_errno = ENOMEM;
+		return -rte_errno;
+	}
+	priv->mr_cache = rte_calloc(__func__, MR_TABLE_SZ(priv->mr_n),
+				    sizeof(*mr_cache), 0);
+	if (priv->mr_cache == NULL) {
+		DRV_LOG(ERR,
+			"port %u cannot allocate memory for array of MR cache",
+			dev->data->port_id);
+		rte_free(priv->mr);
+		rte_errno = ENOMEM;
+		return -rte_errno;
 	}
-	return 1;
+	for (i = 0; i < priv->mr_n; ++i) {
+		mr = &(*priv->mr)[i];
+		mr->memseg = &ms[i];
+		mr->ibv_mr = ibv_reg_mr(priv->pd,
+					mr->memseg->addr, mr->memseg->len,
+					IBV_ACCESS_LOCAL_WRITE);
+		if (mr->ibv_mr == NULL) {
+			rte_dump_physmem_layout(stderr);
+			DRV_LOG(ERR, "port %u cannot register memseg[%u]",
+				dev->data->port_id, i);
+			goto error;
+		}
+	}
+	/* Sort by virtual address. */
+	qsort(*priv->mr, priv->mr_n, sizeof(struct mlx5_mr), mr_comp_addr);
+	/* First entry must be NULL for comparison. */
+	(*priv->mr_cache)[0] = (struct mlx5_mr_cache) {
+		.lkey = UINT32_MAX,
+	};
+	/* Compile global all-inclusive MR cache table. */
+	for (i = 0; i < priv->mr_n; ++i) {
+		mr = &(*priv->mr)[i];
+		mr_cache = &(*priv->mr_cache)[i + 1];
+		/* Paranoid, mr[] must be sorted. */
+		assert(i == 0 || mr->memseg->addr > (mr - 1)->memseg->addr);
+		*mr_cache = (struct mlx5_mr_cache) {
+			.start = (uintptr_t)mr->memseg->addr,
+			.end = (uintptr_t)mr->memseg->addr + mr->memseg->len,
+			.lkey = rte_cpu_to_be_32(mr->ibv_mr->lkey)
+		};
+	}
+	return 0;
+error:
+	for (i = 0; i < priv->mr_n; ++i) {
+		mr = &(*priv->mr)[i];
+		if (mr->ibv_mr != NULL)
+			ibv_dereg_mr(mr->ibv_mr);
+	}
+	rte_free(priv->mr);
+	rte_free(priv->mr_cache);
+	rte_errno = ENOMEM;
+	return -rte_errno;
 }
 
 /**
- * Verify the flow list is empty
+ * Deregister all Memory Regions.
  *
  * @param dev
  *   Pointer to Ethernet device.
- *
- * @return
- *   The number of object not released.
  */
-int
-mlx5_mr_verify(struct rte_eth_dev *dev)
+void
+mlx5_mr_deregister_memseg(struct rte_eth_dev *dev)
 {
 	struct priv *priv = dev->data->dev_private;
-	int ret = 0;
-	struct mlx5_mr *mr;
+	unsigned int i;
+
+	if (priv->mr_n == 0)
+		return;
+	for (i = 0; i < priv->mr_n; ++i) {
+		struct mlx5_mr *mr;
 
-	LIST_FOREACH(mr, &priv->mr, next) {
-		DRV_LOG(DEBUG, "port %u memory region %p still referenced",
-			dev->data->port_id, (void *)mr);
-		++ret;
+		mr = &(*priv->mr)[i];
+		/* Physical memory can't be changed dynamically. */
+		assert(mr->memseg != NULL);
+		assert(mr->ibv_mr != NULL);
+		ibv_dereg_mr(mr->ibv_mr);
 	}
-	return ret;
+	rte_free(priv->mr);
+	rte_free(priv->mr_cache);
+	priv->mr = NULL;
+	priv->mr_cache = NULL;
+	priv->mr_n = 0;
 }
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 47e84c2029..7161825a57 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -595,16 +595,6 @@ mlx5_rxq_ibv_new(struct rte_eth_dev *dev, uint16_t idx)
 		goto error;
 	}
 	tmpl->rxq_ctrl = rxq_ctrl;
-	/* Use the entire RX mempool as the memory region. */
-	tmpl->mr = mlx5_mr_get(dev, rxq_data->mp);
-	if (!tmpl->mr) {
-		tmpl->mr = mlx5_mr_new(dev, rxq_data->mp);
-		if (!tmpl->mr) {
-			DRV_LOG(ERR, "port %u: memeroy region creation failure",
-				dev->data->port_id);
-			goto error;
-		}
-	}
 	if (rxq_ctrl->irq) {
 		tmpl->channel = ibv_create_comp_channel(priv->ctx);
 		if (!tmpl->channel) {
@@ -737,14 +727,14 @@ mlx5_rxq_ibv_new(struct rte_eth_dev *dev, uint16_t idx)
 	for (i = 0; (i != (unsigned int)(1 << rxq_data->elts_n)); ++i) {
 		struct rte_mbuf *buf = (*rxq_data->elts)[i];
 		volatile struct mlx5_wqe_data_seg *scat = &(*rxq_data->wqes)[i];
+		uintptr_t addr = rte_pktmbuf_mtod(buf, uintptr_t);
 
 		/* scat->addr must be able to store a pointer. */
 		assert(sizeof(scat->addr) >= sizeof(uintptr_t));
 		*scat = (struct mlx5_wqe_data_seg){
-			.addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf,
-								  uintptr_t)),
+			.addr = rte_cpu_to_be_64(addr),
 			.byte_count = rte_cpu_to_be_32(DATA_LEN(buf)),
-			.lkey = tmpl->mr->lkey,
+			.lkey = mlx5_rx_mb2mr(rxq_data, buf)
 		};
 	}
 	rxq_data->rq_db = rwq.dbrec;
@@ -780,8 +770,6 @@ mlx5_rxq_ibv_new(struct rte_eth_dev *dev, uint16_t idx)
 		claim_zero(ibv_destroy_cq(tmpl->cq));
 	if (tmpl->channel)
 		claim_zero(ibv_destroy_comp_channel(tmpl->channel));
-	if (tmpl->mr)
-		mlx5_mr_release(tmpl->mr);
 	priv->verbs_alloc_ctx.type = MLX5_VERBS_ALLOC_TYPE_NONE;
 	rte_errno = ret; /* Restore rte_errno. */
 	return NULL;
@@ -811,7 +799,6 @@ mlx5_rxq_ibv_get(struct rte_eth_dev *dev, uint16_t idx)
 		return NULL;
 	rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
 	if (rxq_ctrl->ibv) {
-		mlx5_mr_get(dev, rxq_data->mp);
 		rte_atomic32_inc(&rxq_ctrl->ibv->refcnt);
 		DRV_LOG(DEBUG, "port %u Verbs Rx queue %u: refcnt %d",
 			dev->data->port_id, rxq_ctrl->idx,
@@ -832,15 +819,9 @@ mlx5_rxq_ibv_get(struct rte_eth_dev *dev, uint16_t idx)
 int
 mlx5_rxq_ibv_release(struct mlx5_rxq_ibv *rxq_ibv)
 {
-	int ret;
-
 	assert(rxq_ibv);
 	assert(rxq_ibv->wq);
 	assert(rxq_ibv->cq);
-	assert(rxq_ibv->mr);
-	ret = mlx5_mr_release(rxq_ibv->mr);
-	if (!ret)
-		rxq_ibv->mr = NULL;
 	DRV_LOG(DEBUG, "port %u Verbs Rx queue %u: refcnt %d",
 		PORT_ID(rxq_ibv->rxq_ctrl->priv),
 		rxq_ibv->rxq_ctrl->idx, rte_atomic32_read(&rxq_ibv->refcnt));
@@ -918,10 +899,12 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	const uint16_t desc_n =
 		desc + priv->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP;
 	unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
+	const unsigned int mr_n = MR_TABLE_SZ(priv->mr_n);
 
 	tmpl = rte_calloc_socket("RXQ", 1,
 				 sizeof(*tmpl) +
-				 desc_n * sizeof(struct rte_mbuf *),
+				 desc_n * sizeof(struct rte_mbuf *) +
+				 mr_n * sizeof(struct mlx5_mr_cache),
 				 0, socket);
 	if (!tmpl) {
 		rte_errno = ENOMEM;
@@ -1023,6 +1006,13 @@ mlx5_rxq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		MLX5_VPMD_RXQ_RPLNSH_THRESH(1 << tmpl->rxq.elts_n);
 	tmpl->rxq.elts =
 		(struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1);
+	tmpl->rxq.mr_ctrl.cache_bh =
+		(struct mlx5_mr_cache (*)[mr_n])&(*tmpl->rxq.elts)[desc_n];
+	tmpl->rxq.mr_ctrl.bh_n =
+		mlx5_mr_update_mp(dev, *tmpl->rxq.mr_ctrl.cache_bh,
+				  tmpl->rxq.mr_ctrl.bh_n, mp);
+	DRV_LOG(DEBUG, "Rx MR lookup table: %u entires built",
+		MR_N(tmpl->rxq.mr_ctrl.bh_n));
 	tmpl->idx = idx;
 	rte_atomic32_inc(&tmpl->refcnt);
 	DRV_LOG(DEBUG, "port %u Rx queue %u: refcnt %d", dev->data->port_id,
diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
index 1bbce3b754..d95c4bff3d 100644
--- a/drivers/net/mlx5/mlx5_rxtx.c
+++ b/drivers/net/mlx5/mlx5_rxtx.c
@@ -1920,6 +1920,9 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
 		 * changes.
 		 */
 		wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
+		/* If there's only one MR, no need to replace LKEY in WQEs. */
+		if (unlikely(!IS_SINGLE_MR(rxq->mr_ctrl.bh_n)))
+			wqe->lkey = mlx5_rx_mb2mr(rxq, rep);
 		if (len > DATA_LEN(seg)) {
 			len -= DATA_LEN(seg);
 			++NB_SEGS(pkt);
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 36fc93ea2c..5dc58a123f 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -82,17 +82,37 @@ struct mlx5_txq_stats {
 
 struct priv;
 
-/* Memory region queue object. */
+/* Memory Region object. */
 struct mlx5_mr {
-	LIST_ENTRY(mlx5_mr) next; /**< Pointer to the next element. */
-	rte_atomic32_t refcnt; /*<< Reference counter. */
-	uint32_t lkey; /*<< rte_cpu_to_be_32(mr->lkey) */
-	uintptr_t start; /* Start address of MR */
-	uintptr_t end; /* End address of MR */
-	struct ibv_mr *mr; /*<< Memory Region. */
-	struct rte_mempool *mp; /*<< Memory Pool. */
+	const struct rte_memseg *memseg;
+	struct ibv_mr *ibv_mr; /* Verbs Memory Region. */
 };
 
+/* Cache entry for Memory Region. */
+struct mlx5_mr_cache {
+	uintptr_t start; /* Start address of MR. */
+	uintptr_t end; /* End address of MR. */
+	uint32_t lkey; /* rte_cpu_to_be_32(ibv_mr->lkey). */
+} __rte_packed;
+
+/* Per-queue MR control descriptor. */
+struct mlx5_mr_ctrl {
+	uint16_t bh_n; /* Size of MR cache table for bottom-half. */
+	uint16_t mru; /* Index of last hit entry. */
+	uint16_t head; /* Index of the oldest entry. */
+	struct mlx5_mr_cache cache[MLX5_MR_CACHE_N]; /* MR cache. */
+	struct mlx5_mr_cache (*cache_bh)[]; /* MR cache for bottom-half. */
+} __rte_packed;
+
+/* MR table size including padding at index 0. */
+#define MR_TABLE_SZ(n) ((n) + MLX5_MR_LOOKUP_TABLE_PAD)
+
+/* Actual table size excluding padding at index 0. */
+#define MR_N(n) ((n) - MLX5_MR_LOOKUP_TABLE_PAD)
+
+/* Whether there's only one entry in MR lookup table. */
+#define IS_SINGLE_MR(n) (MR_N(n) <= 1)
+
 /* Compressed CQE context. */
 struct rxq_zip {
 	uint16_t ai; /* Array index. */
@@ -122,6 +142,7 @@ struct mlx5_rxq_data {
 	uint16_t rq_pi;
 	uint16_t cq_ci;
 	uint16_t rq_repl_thresh; /* Threshold for buffer replenishment. */
+	struct mlx5_mr_ctrl mr_ctrl;
 	volatile struct mlx5_wqe_data_seg(*wqes)[];
 	volatile struct mlx5_cqe(*cqes)[];
 	struct rxq_zip zip; /* Compressed context. */
@@ -143,7 +164,6 @@ struct mlx5_rxq_ibv {
 	struct ibv_cq *cq; /* Completion Queue. */
 	struct ibv_wq *wq; /* Work Queue. */
 	struct ibv_comp_channel *channel;
-	struct mlx5_mr *mr; /* Memory Region (for mp). */
 };
 
 /* RX queue control descriptor. */
@@ -201,15 +221,14 @@ struct mlx5_txq_data {
 	uint16_t mpw_hdr_dseg:1; /* Enable DSEGs in the title WQEBB. */
 	uint16_t max_inline; /* Multiple of RTE_CACHE_LINE_SIZE to inline. */
 	uint16_t inline_max_packet_sz; /* Max packet size for inlining. */
-	uint16_t mr_cache_idx; /* Index of last hit entry. */
 	uint32_t qp_num_8s; /* QP number shifted by 8. */
 	uint32_t flags; /* Flags for Tx Queue. */
+	struct mlx5_mr_ctrl mr_ctrl;
 	volatile struct mlx5_cqe (*cqes)[]; /* Completion queue. */
 	volatile void *wqes; /* Work queue (use volatile to write into). */
 	volatile uint32_t *qp_db; /* Work queue doorbell. */
 	volatile uint32_t *cq_db; /* Completion queue doorbell. */
 	volatile void *bf_reg; /* Blueflame register remapped. */
-	struct mlx5_mr *mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MR translation table. */
 	struct rte_mbuf *(*elts)[]; /* TX elements. */
 	struct mlx5_txq_stats stats; /* TX queue counters. */
 } __rte_cache_aligned;
@@ -338,9 +357,10 @@ uint16_t mlx5_rx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts,
 
 /* mlx5_mr.c */
 
-void mlx5_mp2mr_iter(struct rte_mempool *mp, void *arg);
-struct mlx5_mr *mlx5_txq_mp2mr_reg(struct mlx5_txq_data *txq,
-				   struct rte_mempool *mp, unsigned int idx);
+int mlx5_mr_update_mp(struct rte_eth_dev *dev, struct mlx5_mr_cache *lkp_tbl,
+		      uint16_t n, struct rte_mempool *mp);
+uint32_t mlx5_rx_mb2mr_bh(struct mlx5_rxq_data *rxq, uintptr_t addr);
+uint32_t mlx5_tx_mb2mr_bh(struct mlx5_txq_data *txq, uintptr_t addr);
 
 #ifndef NDEBUG
 /**
@@ -528,75 +548,98 @@ mlx5_tx_complete(struct mlx5_txq_data *txq)
 }
 
 /**
- * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which
- * the cloned mbuf is allocated is returned instead.
+ * Look up LKEY from the given lookup table by linear search. The last-hit
+ * entry is checked first. On miss, the entire array is searched. If found,
+ * the last-hit index is updated and the LKEY returned.
  *
- * @param buf
+ * @param lkp_tbl
+ *   Pointer to lookup table.
+ * @param[in,out] cached_idx
+ *   Pointer to last-hit index.
+ * @param n
+ *   Size of lookup table.
+ * @param addr
+ *   Search key.
+ *
+ * @return
+ *   Searched LKEY on success, UINT32_MAX on no match.
+ */
+static __rte_always_inline uint32_t
+mlx5_mr_lookup_cache(struct mlx5_mr_cache *lkp_tbl, uint16_t *cached_idx,
+		     uint16_t n, uintptr_t addr)
+{
+	uint16_t idx;
+
+	if (likely(addr >= lkp_tbl[*cached_idx].start &&
+		   addr < lkp_tbl[*cached_idx].end))
+		return lkp_tbl[*cached_idx].lkey;
+	for (idx = 0; idx < n && lkp_tbl[idx].start != 0; ++idx) {
+		if (addr >= lkp_tbl[idx].start &&
+		    addr < lkp_tbl[idx].end) {
+			/* Found. */
+			*cached_idx = idx;
+			return lkp_tbl[idx].lkey;
+		}
+	}
+	return UINT32_MAX;
+}
+
+/**
+ * Query LKEY from a packet buffer for Rx.
+ *
+ * @param rxq
+ *   Pointer to Rx queue structure.
+ * @param mb
  *   Pointer to mbuf.
  *
  * @return
- *   Memory pool where data is located for given mbuf.
+ *   LKEY on success.
  */
-static struct rte_mempool *
-mlx5_tx_mb2mp(struct rte_mbuf *buf)
+static __rte_always_inline uint32_t
+mlx5_rx_mb2mr(struct mlx5_rxq_data *rxq, struct rte_mbuf *mb)
 {
-	if (unlikely(RTE_MBUF_INDIRECT(buf)))
-		return rte_mbuf_from_indirect(buf)->pool;
-	return buf->pool;
+	const uintptr_t addr = (uintptr_t)mb->buf_addr;
+	uint32_t lkey;
+
+	/* Linear search on MR cache array. */
+	lkey = mlx5_mr_lookup_cache(rxq->mr_ctrl.cache,
+				    &rxq->mr_ctrl.mru,
+				    MLX5_MR_CACHE_N, addr);
+	if (likely(lkey != UINT32_MAX))
+		return lkey;
+	DEBUG("No found in rxq->mr_cache[], last-hit = %u, head = %u)",
+	      rxq->mr_ctrl.mru, rxq->mr_ctrl.head);
+	/* Take slower bottom-half (binary search) on miss. */
+	return mlx5_rx_mb2mr_bh(rxq, addr);
 }
 
 /**
- * Get Memory Region (MR) <-> rte_mbuf association from txq->mp2mr[].
- * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
- * remove an entry first.
+ * Query LKEY from a packet buffer for Tx. On miss, take the bottom-half path.
  *
  * @param txq
- *   Pointer to TX queue structure.
- * @param[in] mp
- *   Memory Pool for which a Memory Region lkey must be returned.
+ *   Pointer to Tx queue structure.
+ * @param mb
+ *   Pointer to mbuf.
  *
  * @return
- *   mr->lkey on success, (uint32_t)-1 on failure.
+ *   LKEY on success.
  */
 static __rte_always_inline uint32_t
 mlx5_tx_mb2mr(struct mlx5_txq_data *txq, struct rte_mbuf *mb)
 {
-	uint16_t i = txq->mr_cache_idx;
-	uintptr_t addr = rte_pktmbuf_mtod(mb, uintptr_t);
-	struct mlx5_mr *mr;
-
-	assert(i < RTE_DIM(txq->mp2mr));
-	if (likely(txq->mp2mr[i]->start <= addr && txq->mp2mr[i]->end > addr))
-		return txq->mp2mr[i]->lkey;
-	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
-		if (unlikely(txq->mp2mr[i] == NULL ||
-		    txq->mp2mr[i]->mr == NULL)) {
-			/* Unknown MP, add a new MR for it. */
-			break;
-		}
-		if (txq->mp2mr[i]->start <= addr &&
-		    txq->mp2mr[i]->end > addr) {
-			assert(txq->mp2mr[i]->lkey != (uint32_t)-1);
-			txq->mr_cache_idx = i;
-			return txq->mp2mr[i]->lkey;
-		}
-	}
-	mr = mlx5_txq_mp2mr_reg(txq, mlx5_tx_mb2mp(mb), i);
-	/*
-	 * Request the reference to use in this queue, the original one is
-	 * kept by the control plane.
-	 */
-	if (mr) {
-		rte_atomic32_inc(&mr->refcnt);
-		txq->mr_cache_idx = i >= RTE_DIM(txq->mp2mr) ? i - 1 : i;
-		return mr->lkey;
-	} else {
-		struct rte_mempool *mp = mlx5_tx_mb2mp(mb);
-
-		DRV_LOG(WARNING, "failed to register mempool 0x%p(%s)",
-			(void *)mp, mp->name);
-	}
-	return (uint32_t)-1;
+	const uintptr_t addr = (uintptr_t)mb->buf_addr;
+	uint32_t lkey;
+
+	/* Linear search on MR cache array. */
+	lkey = mlx5_mr_lookup_cache(txq->mr_ctrl.cache,
+				    &txq->mr_ctrl.mru,
+				    MLX5_MR_CACHE_N, addr);
+	if (likely(lkey != UINT32_MAX))
+		return lkey;
+	DEBUG("No found in txq->mr_cache[], last-hit = %u, head = %u)",
+	      txq->mr_ctrl.mru, txq->mr_ctrl.head);
+	/* Take slower bottom-half (binary search) on miss. */
+	return mlx5_tx_mb2mr_bh(txq, addr);
 }
 
 /**
diff --git a/drivers/net/mlx5/mlx5_rxtx_vec.h b/drivers/net/mlx5/mlx5_rxtx_vec.h
index d504e2aee1..750559b8d1 100644
--- a/drivers/net/mlx5/mlx5_rxtx_vec.h
+++ b/drivers/net/mlx5/mlx5_rxtx_vec.h
@@ -115,9 +115,13 @@ mlx5_rx_replenish_bulk_mbuf(struct mlx5_rxq_data *rxq, uint16_t n)
 		rxq->stats.rx_nombuf += n;
 		return;
 	}
-	for (i = 0; i < n; ++i)
+	for (i = 0; i < n; ++i) {
 		wq[i].addr = rte_cpu_to_be_64((uintptr_t)elts[i]->buf_addr +
 					      RTE_PKTMBUF_HEADROOM);
+		/* If there's only one MR, no need to replace LKEY in WQEs. */
+		if (unlikely(!IS_SINGLE_MR(rxq->mr_ctrl.bh_n)))
+			wq[i].lkey = mlx5_rx_mb2mr(rxq, elts[i]);
+	}
 	rxq->rq_ci += n;
 	/* Prevent overflowing into consumed mbufs. */
 	elts_idx = rxq->rq_ci & q_mask;
diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c
index 9a1d6f954b..e6a29cb7fe 100644
--- a/drivers/net/mlx5/mlx5_trigger.c
+++ b/drivers/net/mlx5/mlx5_trigger.c
@@ -74,17 +74,10 @@ mlx5_txq_start(struct rte_eth_dev *dev)
 	int ret;
 
 	for (i = 0; i != priv->txqs_n; ++i) {
-		unsigned int idx = 0;
-		struct mlx5_mr *mr;
 		struct mlx5_txq_ctrl *txq_ctrl = mlx5_txq_get(dev, i);
 
 		if (!txq_ctrl)
 			continue;
-		LIST_FOREACH(mr, &priv->mr, next) {
-			mlx5_txq_mp2mr_reg(&txq_ctrl->txq, mr->mp, idx++);
-			if (idx == MLX5_PMD_TX_MP_CACHE)
-				break;
-		}
 		txq_alloc_elts(txq_ctrl);
 		txq_ctrl->ibv = mlx5_txq_ibv_new(dev, i);
 		if (!txq_ctrl->ibv) {
@@ -177,7 +170,6 @@ int
 mlx5_dev_start(struct rte_eth_dev *dev)
 {
 	struct priv *priv = dev->data->dev_private;
-	struct mlx5_mr *mr = NULL;
 	int ret;
 
 	DRV_LOG(DEBUG, "port %u starting device", dev->data->port_id);
@@ -187,7 +179,6 @@ mlx5_dev_start(struct rte_eth_dev *dev)
 			dev->data->port_id, strerror(rte_errno));
 		goto error;
 	}
-	rte_mempool_walk(mlx5_mp2mr_iter, priv);
 	ret = mlx5_txq_start(dev);
 	if (ret) {
 		DRV_LOG(ERR, "port %u Tx queue allocation failed: %s",
@@ -229,8 +220,6 @@ mlx5_dev_start(struct rte_eth_dev *dev)
 	ret = rte_errno; /* Save rte_errno before cleanup. */
 	/* Rollback. */
 	dev->data->dev_started = 0;
-	for (mr = LIST_FIRST(&priv->mr); mr; mr = LIST_FIRST(&priv->mr))
-		mlx5_mr_release(mr);
 	mlx5_flow_stop(dev, &priv->flows);
 	mlx5_traffic_disable(dev);
 	mlx5_txq_stop(dev);
@@ -252,7 +241,6 @@ void
 mlx5_dev_stop(struct rte_eth_dev *dev)
 {
 	struct priv *priv = dev->data->dev_private;
-	struct mlx5_mr *mr;
 
 	dev->data->dev_started = 0;
 	/* Prevent crashes when queues are still in use. */
@@ -267,8 +255,6 @@ mlx5_dev_stop(struct rte_eth_dev *dev)
 	mlx5_dev_interrupt_handler_uninstall(dev);
 	mlx5_txq_stop(dev);
 	mlx5_rxq_stop(dev);
-	for (mr = LIST_FIRST(&priv->mr); mr; mr = LIST_FIRST(&priv->mr))
-		mlx5_mr_release(mr);
 	mlx5_flow_delete_drop_queue(dev);
 }
 
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 760ac92d44..2ead2177fb 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -339,7 +339,6 @@ mlx5_txq_ibv_new(struct rte_eth_dev *dev, uint16_t idx)
 		return NULL;
 	}
 	memset(&tmpl, 0, sizeof(struct mlx5_txq_ibv));
-	/* MRs will be registered in mp2mr[] later. */
 	attr.cq = (struct ibv_cq_init_attr_ex){
 		.comp_mask = 0,
 	};
@@ -622,10 +621,12 @@ mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		((MLX5_MAX_TSO_HEADER + (RTE_CACHE_LINE_SIZE - 1)) /
 		 RTE_CACHE_LINE_SIZE);
 	struct mlx5_txq_ctrl *tmpl;
+	const unsigned int mr_n = MR_TABLE_SZ(priv->mr_n);
 
 	tmpl = rte_calloc_socket("TXQ", 1,
 				 sizeof(*tmpl) +
-				 desc * sizeof(struct rte_mbuf *),
+				 desc * sizeof(struct rte_mbuf *) +
+				 mr_n * sizeof(struct mlx5_mr_cache),
 				 0, socket);
 	if (!tmpl) {
 		rte_errno = ENOMEM;
@@ -639,7 +640,6 @@ mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 	tmpl->idx = idx;
 	if (priv->mps == MLX5_MPW_ENHANCED)
 		tmpl->txq.mpw_hdr_dseg = priv->mpw_hdr_dseg;
-	/* MRs will be registered in mp2mr[] later. */
 	DRV_LOG(DEBUG, "port %u priv->device_attr.max_qp_wr is %d",
 		dev->data->port_id, priv->device_attr.orig_attr.max_qp_wr);
 	DRV_LOG(DEBUG, "port %u priv->device_attr.max_sge is %d",
@@ -700,6 +700,9 @@ mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 		tmpl->txq.tunnel_en = 1;
 	tmpl->txq.elts =
 		(struct rte_mbuf *(*)[1 << tmpl->txq.elts_n])(tmpl + 1);
+	tmpl->txq.mr_ctrl.cache_bh =
+		(struct mlx5_mr_cache (*)[mr_n])
+		&(*tmpl->txq.elts)[1 << tmpl->txq.elts_n];
 	tmpl->txq.stats.idx = idx;
 	rte_atomic32_inc(&tmpl->refcnt);
 	DRV_LOG(DEBUG, "port %u Tx queue %u: refcnt %d", dev->data->port_id,
@@ -728,15 +731,8 @@ mlx5_txq_get(struct rte_eth_dev *dev, uint16_t idx)
 	if ((*priv->txqs)[idx]) {
 		ctrl = container_of((*priv->txqs)[idx], struct mlx5_txq_ctrl,
 				    txq);
-		unsigned int i;
 
 		mlx5_txq_ibv_get(dev, idx);
-		for (i = 0; i != MLX5_PMD_TX_MP_CACHE; ++i) {
-			if (ctrl->txq.mp2mr[i])
-				claim_nonzero
-					(mlx5_mr_get(dev,
-						     ctrl->txq.mp2mr[i]->mp));
-		}
 		rte_atomic32_inc(&ctrl->refcnt);
 		DRV_LOG(DEBUG, "port %u Tx queue %u refcnt %d",
 			dev->data->port_id,
@@ -760,7 +756,6 @@ int
 mlx5_txq_release(struct rte_eth_dev *dev, uint16_t idx)
 {
 	struct priv *priv = dev->data->dev_private;
-	unsigned int i;
 	struct mlx5_txq_ctrl *txq;
 	size_t page_size = sysconf(_SC_PAGESIZE);
 
@@ -771,12 +766,6 @@ mlx5_txq_release(struct rte_eth_dev *dev, uint16_t idx)
 		txq->idx, rte_atomic32_read(&txq->refcnt));
 	if (txq->ibv && !mlx5_txq_ibv_release(txq->ibv))
 		txq->ibv = NULL;
-	for (i = 0; i != MLX5_PMD_TX_MP_CACHE; ++i) {
-		if (txq->txq.mp2mr[i]) {
-			mlx5_mr_release(txq->txq.mp2mr[i]);
-			txq->txq.mp2mr[i] = NULL;
-		}
-	}
 	if (priv->uar_base)
 		munmap((void *)RTE_ALIGN_FLOOR((uintptr_t)txq->txq.bf_reg,
 		       page_size), page_size);
-- 
2.11.0


