[dpdk-dev] [PATCH v2] net/mlx5: mmap uar address around huge pages

Xueming Li xuemingl at mellanox.com
Thu Jan 25 16:00:24 CET 2018


Reserving the memory space for the UAR near huge pages helps to
**reduce** the cases where the secondary process cannot start. Those
pages being physical pages they must be mapped at the same virtual
address as in the primary process to have a
working secondary process.

As this remap is almost the latest being done by the processes
(libraries, heaps, stacks are already loaded), similar to huge pages,
there is **no guarantee** this mechanism will always work.

Signed-off-by: Xueming Li <xuemingl at mellanox.com>
---
 drivers/net/mlx5/mlx5.c         | 114 ++++++++++++++++++++++++++++++++++++++--
 drivers/net/mlx5/mlx5.h         |   1 +
 drivers/net/mlx5/mlx5_defs.h    |  10 ++++
 drivers/net/mlx5/mlx5_rxtx.h    |   3 +-
 drivers/net/mlx5/mlx5_trigger.c |   7 ++-
 drivers/net/mlx5/mlx5_txq.c     |  51 +++++++++++++-----
 6 files changed, 167 insertions(+), 19 deletions(-)

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index bc4b6ba..7c0cd28 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -39,6 +39,7 @@
 #include <stdlib.h>
 #include <errno.h>
 #include <net/if.h>
+#include <sys/mman.h>
 
 /* Verbs header. */
 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
@@ -56,6 +57,7 @@
 #include <rte_pci.h>
 #include <rte_bus_pci.h>
 #include <rte_common.h>
+#include <rte_eal_memconfig.h>
 #include <rte_kvargs.h>
 
 #include "mlx5.h"
@@ -478,6 +480,106 @@
 
 static struct rte_pci_driver mlx5_driver;
 
+/*
+ * Reserved UAR address space for TXQ UAR(hw doorbell) mapping, process
+ * local resource used by both primary and secondary to avoid duplicate
+ * reservation.
+ * The space has to be available on both primary and secondary process,
+ * TXQ UAR maps to this area using fixed mmap w/o double check.
+ */
+static void *uar_base;
+
+/**
+ * Reserve UAR address space for primary process.
+ *
+ * @param[in] priv
+ *   Pointer to private structure.
+ *
+ * @return
+ *   0 on success, errno value on failure.
+ */
+static int
+priv_uar_init_primary(struct priv *priv)
+{
+	void *addr = (void *)0;
+	int i;
+	const struct rte_mem_config *mcfg;
+	int ret;
+
+	if (uar_base) { /* UAR address space mapped. */
+		priv->uar_base = uar_base;
+		return 0;
+	}
+	/* find out lower bound of hugepage segments */
+	mcfg = rte_eal_get_configuration()->mem_config;
+	for (i = 0; i < RTE_MAX_MEMSEG && mcfg->memseg[i].addr; i++) {
+		if (addr)
+			addr = RTE_MIN(addr, mcfg->memseg[i].addr);
+		else
+			addr = mcfg->memseg[i].addr;
+	}
+	/* keep distance to hugepages to minimize potential conflicts. */
+	addr = RTE_PTR_SUB(addr, MLX5_UAR_OFFSET + MLX5_UAR_SIZE);
+	/* anonymous mmap, no real memory consumption. */
+	addr = mmap(addr, MLX5_UAR_SIZE,
+		    PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (addr == MAP_FAILED) {
+		ERROR("Failed to reserve UAR address space, please adjust "
+		      "MLX5_UAR_SIZE or try --base-virtaddr");
+		ret = ENOMEM;
+		return ret;
+	}
+	/* Accept either same addr or a new addr returned from mmap if target
+	 * range occupied.
+	 */
+	INFO("Reserved UAR address space: %p", addr);
+	priv->uar_base = addr; /* for primary and secondary UAR re-mmap. */
+	uar_base = addr; /* process local, don't reserve again. */
+	return 0;
+}
+
+/**
+ * Reserve UAR address space for secondary process, align with
+ * primary process.
+ *
+ * @param[in] priv
+ *   Pointer to private structure.
+ *
+ * @return
+ *   0 on success, errno value on failure.
+ */
+static int
+priv_uar_init_secondary(struct priv *priv)
+{
+	void *addr;
+	int ret;
+
+	assert(priv->uar_base);
+	if (uar_base) { /* already reserved. */
+		assert(uar_base == priv->uar_base);
+		return 0;
+	}
+	/* anonymous mmap, no real memory consumption. */
+	addr = mmap(priv->uar_base, MLX5_UAR_SIZE,
+		    PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+	if (addr == MAP_FAILED) {
+		ERROR("UAR mmap failed: %p size: %llu",
+		      priv->uar_base, MLX5_UAR_SIZE);
+		ret = ENXIO;
+		return ret;
+	}
+	if (priv->uar_base != addr) {
+		ERROR("UAR address %p size %llu occupied, please adjust "
+		      "MLX5_UAR_OFFSET or try EAL parameter --base-virtaddr",
+		      priv->uar_base, MLX5_UAR_SIZE);
+		ret = ENXIO;
+		return ret;
+	}
+	uar_base = addr; /* process local, don't reserve again */
+	INFO("Reserved UAR address space: %p", addr);
+	return 0;
+}
+
 /**
  * DPDK callback to register a PCI device.
  *
@@ -660,6 +762,11 @@
 			eth_dev->device = &pci_dev->device;
 			eth_dev->dev_ops = &mlx5_dev_sec_ops;
 			priv = eth_dev->data->dev_private;
+			err = priv_uar_init_secondary(priv);
+			if (err < 0) {
+				err = -err;
+				goto error;
+			}
 			/* Receive command fd from primary process */
 			err = priv_socket_connect(priv);
 			if (err < 0) {
@@ -668,10 +775,8 @@
 			}
 			/* Remap UAR for Tx queues. */
 			err = priv_tx_uar_remap(priv, err);
-			if (err < 0) {
-				err = -err;
+			if (err)
 				goto error;
-			}
 			/*
 			 * Ethdev pointer is still required as input since
 			 * the primary device is not accessible from the
@@ -817,6 +922,9 @@
 			WARN("Rx CQE compression isn't supported");
 			config.cqe_comp = 0;
 		}
+		err = priv_uar_init_primary(priv);
+		if (err)
+			goto port_error;
 		/* Configure the first MAC address by default. */
 		if (priv_get_mac(priv, &mac.addr_bytes)) {
 			ERROR("cannot get MAC address, is mlx5_en loaded?"
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index a7ec607..ab40914 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -180,6 +180,7 @@ struct priv {
 	struct mlx5_xstats_ctrl xstats_ctrl; /* Extended stats control. */
 	rte_spinlock_t lock; /* Lock for control functions. */
 	int primary_socket; /* Unix socket for primary process. */
+	void *uar_base; /* Reserved address space for UAR mapping */
 	struct rte_intr_handle intr_handle_socket; /* Interrupt handler. */
 	struct mlx5_dev_config config; /* Device configuration. */
 	struct mlx5_verbs_alloc_ctx verbs_alloc_ctx;
diff --git a/drivers/net/mlx5/mlx5_defs.h b/drivers/net/mlx5/mlx5_defs.h
index a71db28..7f6ae56 100644
--- a/drivers/net/mlx5/mlx5_defs.h
+++ b/drivers/net/mlx5/mlx5_defs.h
@@ -110,4 +110,14 @@
 /* Supported RSS */
 #define MLX5_RSS_HF_MASK (~(ETH_RSS_IP | ETH_RSS_UDP | ETH_RSS_TCP))
 
+/* Reserved address space for UAR mapping. */
+#define MLX5_UAR_SIZE (1ULL << 32)
+
+/* Offset of reserved UAR address space to hugepage memory. Offset is used here
+ * to minimize possibility of address next to hugepage being used by other code
+ * in either primary or secondary process, failing to map TX UAR would make TX
+ * packets invisible to HW.
+ */
+#define MLX5_UAR_OFFSET (1ULL << 32)
+
 #endif /* RTE_PMD_MLX5_DEFS_H_ */
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index 2eb2f05..278bcf2 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -205,7 +205,7 @@ struct mlx5_txq_data {
 	volatile void *wqes; /* Work queue (use volatile to write into). */
 	volatile uint32_t *qp_db; /* Work queue doorbell. */
 	volatile uint32_t *cq_db; /* Completion queue doorbell. */
-	volatile void *bf_reg; /* Blueflame register. */
+	volatile void *bf_reg; /* Blueflame register remapped. */
 	struct mlx5_mr *mp2mr[MLX5_PMD_TX_MP_CACHE]; /* MR translation table. */
 	struct rte_mbuf *(*elts)[]; /* TX elements. */
 	struct mlx5_txq_stats stats; /* TX queue counters. */
@@ -230,6 +230,7 @@ struct mlx5_txq_ctrl {
 	struct mlx5_txq_ibv *ibv; /* Verbs queue object. */
 	struct mlx5_txq_data txq; /* Data path structure. */
 	off_t uar_mmap_offset; /* UAR mmap offset for non-primary process. */
+	volatile void *bf_reg_orig; /* Blueflame register from verbs. */
 };
 
 /* mlx5_rxq.c */
diff --git a/drivers/net/mlx5/mlx5_trigger.c b/drivers/net/mlx5/mlx5_trigger.c
index 827db2e..54819ce 100644
--- a/drivers/net/mlx5/mlx5_trigger.c
+++ b/drivers/net/mlx5/mlx5_trigger.c
@@ -76,10 +76,13 @@
 			goto error;
 		}
 	}
-	return -ret;
+	ret = priv_tx_uar_remap(priv, priv->ctx->cmd_fd);
+	if (ret)
+		goto error;
+	return ret;
 error:
 	priv_txq_stop(priv);
-	return -ret;
+	return ret;
 }
 
 static void
diff --git a/drivers/net/mlx5/mlx5_txq.c b/drivers/net/mlx5/mlx5_txq.c
index 18321c3..65086d3 100644
--- a/drivers/net/mlx5/mlx5_txq.c
+++ b/drivers/net/mlx5/mlx5_txq.c
@@ -288,7 +288,9 @@
 
 
 /**
- * Map locally UAR used in Tx queues for BlueFlame doorbell.
+ * Mmap TX UAR(HW doorbell) pages into reserved UAR address space.
+ * Both primary and secondary process do mmap to make UAR address
+ * aligned.
  *
  * @param[in] priv
  *   Pointer to private structure.
@@ -305,11 +307,14 @@
 	uintptr_t pages[priv->txqs_n];
 	unsigned int pages_n = 0;
 	uintptr_t uar_va;
+	uintptr_t off;
 	void *addr;
+	void *ret;
 	struct mlx5_txq_data *txq;
 	struct mlx5_txq_ctrl *txq_ctrl;
 	int already_mapped;
 	size_t page_size = sysconf(_SC_PAGESIZE);
+	int r;
 
 	memset(pages, 0, priv->txqs_n * sizeof(uintptr_t));
 	/*
@@ -320,8 +325,10 @@
 	for (i = 0; i != priv->txqs_n; ++i) {
 		txq = (*priv->txqs)[i];
 		txq_ctrl = container_of(txq, struct mlx5_txq_ctrl, txq);
-		uar_va = (uintptr_t)txq_ctrl->txq.bf_reg;
-		uar_va = RTE_ALIGN_FLOOR(uar_va, page_size);
+		/* UAR addr form verbs used to find dup and offset in page. */
+		uar_va = (uintptr_t)txq_ctrl->bf_reg_orig;
+		off = uar_va & (page_size - 1); /* offset in page. */
+		uar_va = RTE_ALIGN_FLOOR(uar_va, page_size); /* page addr. */
 		already_mapped = 0;
 		for (j = 0; j != pages_n; ++j) {
 			if (pages[j] == uar_va) {
@@ -329,16 +336,30 @@
 				break;
 			}
 		}
-		if (already_mapped)
-			continue;
-		pages[pages_n++] = uar_va;
-		addr = mmap((void *)uar_va, page_size,
-			    PROT_WRITE, MAP_FIXED | MAP_SHARED, fd,
-			    txq_ctrl->uar_mmap_offset);
-		if (addr != (void *)uar_va) {
-			ERROR("call to mmap failed on UAR for txq %d\n", i);
-			return -1;
+		/* new address in reserved UAR address space. */
+		addr = RTE_PTR_ADD(priv->uar_base,
+				   uar_va & (MLX5_UAR_SIZE - 1));
+		if (!already_mapped) {
+			pages[pages_n++] = uar_va;
+			/* fixed mmap to specified address in reserved
+			 * address space.
+			 */
+			ret = mmap(addr, page_size,
+				   PROT_WRITE, MAP_FIXED | MAP_SHARED, fd,
+				   txq_ctrl->uar_mmap_offset);
+			if (ret != addr) {
+				/* fixed mmap have to return same address */
+				ERROR("call to mmap failed on UAR for txq %d\n",
+				      i);
+				r = ENXIO;
+				return r;
+			}
 		}
+		if (rte_eal_process_type() == RTE_PROC_PRIMARY) /* save once */
+			txq_ctrl->txq.bf_reg = RTE_PTR_ADD((void *)addr, off);
+		else
+			assert(txq_ctrl->txq.bf_reg ==
+			       RTE_PTR_ADD((void *)addr, off));
 	}
 	return 0;
 }
@@ -505,7 +526,7 @@ struct mlx5_txq_ibv*
 	txq_data->wqes = qp.sq.buf;
 	txq_data->wqe_n = log2above(qp.sq.wqe_cnt);
 	txq_data->qp_db = &qp.dbrec[MLX5_SND_DBR];
-	txq_data->bf_reg = qp.bf.reg;
+	txq_ctrl->bf_reg_orig = qp.bf.reg;
 	txq_data->cq_db = cq_info.dbrec;
 	txq_data->cqes =
 		(volatile struct mlx5_cqe (*)[])
@@ -840,6 +861,7 @@ struct mlx5_txq_ctrl*
 {
 	unsigned int i;
 	struct mlx5_txq_ctrl *txq;
+	size_t page_size = sysconf(_SC_PAGESIZE);
 
 	if (!(*priv->txqs)[idx])
 		return 0;
@@ -859,6 +881,9 @@ struct mlx5_txq_ctrl*
 			txq->txq.mp2mr[i] = NULL;
 		}
 	}
+	if (priv->uar_base)
+		munmap((void *)RTE_ALIGN_FLOOR((uintptr_t)txq->txq.bf_reg,
+		       page_size), page_size);
 	if (rte_atomic32_dec_and_test(&txq->refcnt)) {
 		txq_free_elts(txq);
 		LIST_REMOVE(txq, next);
-- 
1.8.3.1



More information about the dev mailing list