[dpdk-dev] [PATCH v2 06/13] mlx5: add MTU configuration support

Adrien Mazarguil adrien.mazarguil at 6wind.com
Fri Oct 30 19:52:35 CET 2015


Depending on the MTU and whether jumbo frames are enabled, RX queues may
switch between scatter/gather (SG) and non-SG modes for better performance.
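
The decision boils down to whether the maximum frame length implied by the
MTU still fits in a single mbuf. A minimal sketch of the check (needs_sg()
is a hypothetical helper, not part of this patch; the constant values are
taken from rte_ether.h and the default build configuration):

#include <stdint.h>

#define ETHER_HDR_LEN 14         /* Ethernet header. */
#define VLAN_TAG_LEN 4           /* ETHER_MAX_VLAN_FRAME_LEN - ETHER_MAX_LEN. */
#define RTE_PKTMBUF_HEADROOM 128 /* Default mbuf headroom. */

/* Return nonzero when a frame for this MTU cannot fit in the data room
 * of a single mbuf (mb_len bytes), i.e. when SG mode is required. */
static int
needs_sg(uint16_t mtu, unsigned int mb_len)
{
	unsigned int max_frame_len = mtu + ETHER_HDR_LEN + VLAN_TAG_LEN;

	return max_frame_len > mb_len - RTE_PKTMBUF_HEADROOM;
}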

Signed-off-by: Adrien Mazarguil <adrien.mazarguil at 6wind.com>
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro at 6wind.com>
---
 drivers/net/mlx5/mlx5.c        |   1 +
 drivers/net/mlx5/mlx5.h        |   1 +
 drivers/net/mlx5/mlx5_ethdev.c | 102 +++++++++++++++++++++++
 drivers/net/mlx5/mlx5_rxq.c    | 178 +++++++++++++++++++++++++++++++++++++++++
 drivers/net/mlx5/mlx5_rxtx.h   |   1 +
 5 files changed, 283 insertions(+)
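
Note the quiescing technique used in mlx5_dev_set_mtu() below: the RX burst
handler is swapped for a stub, a write barrier publishes the swap, and a
short sleep lets in-flight polls drain before the queues are touched. A
minimal sketch of the idea (quiesce_rx() is a hypothetical helper; the stub
body shown is an assumption based on its role):

#include <unistd.h>
#include <rte_atomic.h>
#include <rte_mbuf.h>
#include <rte_ethdev.h>

/* Stub burst function: receive nothing while queues are reconfigured. */
static uint16_t
removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
{
	(void)dpdk_rxq;
	(void)pkts;
	(void)pkts_n;
	return 0;
}

/* Divert RX polls away from the real handler and wait for stragglers. */
static void
quiesce_rx(struct rte_eth_dev *dev)
{
	dev->rx_pkt_burst = removed_rx_burst;
	rte_wmb();    /* Make the new handler visible to all lcores. */
	usleep(1000); /* Let calls already inside the old handler return. */
}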

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index aafa70b..ddd74d0 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -140,6 +140,7 @@ static const struct eth_dev_ops mlx5_dev_ops = {
 	.tx_queue_release = mlx5_tx_queue_release,
 	.mac_addr_remove = mlx5_mac_addr_remove,
 	.mac_addr_add = mlx5_mac_addr_add,
+	.mtu_set = mlx5_dev_set_mtu,
 };
 
 static struct {
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 3f47a15..0e2457a 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -162,6 +162,7 @@ int priv_get_mtu(struct priv *, uint16_t *);
 int priv_set_flags(struct priv *, unsigned int, unsigned int);
 int mlx5_dev_configure(struct rte_eth_dev *);
 void mlx5_dev_infos_get(struct rte_eth_dev *, struct rte_eth_dev_info *);
+int mlx5_dev_set_mtu(struct rte_eth_dev *, uint16_t);
 int mlx5_ibv_device_to_pci_addr(const struct ibv_device *,
 				struct rte_pci_addr *);
 
diff --git a/drivers/net/mlx5/mlx5_ethdev.c b/drivers/net/mlx5/mlx5_ethdev.c
index 6b13cec..0afc1bb 100644
--- a/drivers/net/mlx5/mlx5_ethdev.c
+++ b/drivers/net/mlx5/mlx5_ethdev.c
@@ -347,6 +347,23 @@ priv_get_mtu(struct priv *priv, uint16_t *mtu)
 }
 
 /**
+ * Set device MTU.
+ *
+ * @param priv
+ *   Pointer to private structure.
+ * @param mtu
+ *   MTU value to set.
+ *
+ * @return
+ *   0 on success, -1 on failure (errno is set).
+ */
+static int
+priv_set_mtu(struct priv *priv, uint16_t mtu)
+{
+	return priv_set_sysfs_ulong(priv, "mtu", mtu);
+}
+
+/**
  * Set device flags.
  *
  * @param priv
@@ -518,6 +535,91 @@ mlx5_dev_infos_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *info)
 }
 
 /**
+ * DPDK callback to change the MTU.
+ *
+ * Setting the MTU affects the hardware MRU (packets larger than the MTU
+ * cannot be received). Use it as a hint to enable or disable scattered
+ * packets support; disabling it when unnecessary improves performance.
+ * Since this operation must not fail, reconfiguring queues on the fly is
+ * not recommended.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param mtu
+ *   New MTU.
+ *
+ * @return
+ *   0 on success, negative errno value on failure.
+ */
+int
+mlx5_dev_set_mtu(struct rte_eth_dev *dev, uint16_t mtu)
+{
+	struct priv *priv = dev->data->dev_private;
+	int ret = 0;
+	unsigned int i;
+	uint16_t (*rx_func)(void *, struct rte_mbuf **, uint16_t) =
+		mlx5_rx_burst;
+
+	priv_lock(priv);
+	/* Set kernel interface MTU first. */
+	if (priv_set_mtu(priv, mtu)) {
+		ret = errno;
+		WARN("cannot set port %u MTU to %u: %s", priv->port, mtu,
+		     strerror(ret));
+		goto out;
+	} else
+		DEBUG("adapter port %u MTU set to %u", priv->port, mtu);
+	priv->mtu = mtu;
+	/* Temporarily replace RX handler with a fake one, assuming it has not
+	 * been copied elsewhere. */
+	dev->rx_pkt_burst = removed_rx_burst;
+	/* Make sure everyone has left mlx5_rx_burst() and uses
+	 * removed_rx_burst() instead. */
+	rte_wmb();
+	usleep(1000);
+	/* Reconfigure each RX queue. */
+	for (i = 0; (i != priv->rxqs_n); ++i) {
+		struct rxq *rxq = (*priv->rxqs)[i];
+		unsigned int max_frame_len;
+		int sp;
+
+		if (rxq == NULL)
+			continue;
+		/* Calculate new maximum frame length according to MTU and
+		 * toggle scattered support (sp) if necessary. */
+		max_frame_len = (priv->mtu + ETHER_HDR_LEN +
+				 (ETHER_MAX_VLAN_FRAME_LEN - ETHER_MAX_LEN));
+		sp = (max_frame_len > (rxq->mb_len - RTE_PKTMBUF_HEADROOM));
+		/* Provide new values to rxq_setup(). */
+		dev->data->dev_conf.rxmode.jumbo_frame = sp;
+		dev->data->dev_conf.rxmode.max_rx_pkt_len = max_frame_len;
+		ret = rxq_rehash(dev, rxq);
+		if (ret) {
+			/* Force SP RX if that queue requires it and abort. */
+			if (rxq->sp)
+				rx_func = mlx5_rx_burst_sp;
+			break;
+		}
+		/* Reenable non-RSS queue attributes. No need to check
+		 * for errors at this stage. */
+		if (!priv->rss) {
+			if (priv->started)
+				rxq_mac_addrs_add(rxq);
+		}
+		/* Scattered burst function takes priority. */
+		if (rxq->sp)
+			rx_func = mlx5_rx_burst_sp;
+	}
+	/* Burst functions can now be called again. */
+	rte_wmb();
+	dev->rx_pkt_burst = rx_func;
+out:
+	priv_unlock(priv);
+	assert(ret >= 0);
+	return -ret;
+}
+
+/**
  * Get PCI information from struct ibv_device.
  *
  * @param device
diff --git a/drivers/net/mlx5/mlx5_rxq.c b/drivers/net/mlx5/mlx5_rxq.c
index 1eddfc7..71d4470 100644
--- a/drivers/net/mlx5/mlx5_rxq.c
+++ b/drivers/net/mlx5/mlx5_rxq.c
@@ -526,6 +526,184 @@ rxq_setup_qp_rss(struct priv *priv, struct ibv_cq *cq, uint16_t desc,
 #endif /* RSS_SUPPORT */
 
 /**
+ * Reconfigure an RX queue with new parameters.
+ *
+ * rxq_rehash() does not allocate mbufs, since doing so from a thread other
+ * than the pool owner (such as a control thread) could corrupt the pool.
+ * In case of failure, the queue is left untouched.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param rxq
+ *   RX queue pointer.
+ *
+ * @return
+ *   0 on success, errno value on failure.
+ */
+int
+rxq_rehash(struct rte_eth_dev *dev, struct rxq *rxq)
+{
+	struct priv *priv = rxq->priv;
+	struct rxq tmpl = *rxq;
+	unsigned int mbuf_n;
+	unsigned int desc_n;
+	struct rte_mbuf **pool;
+	unsigned int i, k;
+	struct ibv_exp_qp_attr mod;
+	struct ibv_recv_wr *bad_wr;
+	int err;
+	int parent = (rxq == &priv->rxq_parent);
+
+	if (parent) {
+		ERROR("%p: cannot rehash parent queue %p",
+		      (void *)dev, (void *)rxq);
+		return EINVAL;
+	}
+	DEBUG("%p: rehashing queue %p", (void *)dev, (void *)rxq);
+	/* Number of descriptors and mbufs currently allocated. */
+	desc_n = (tmpl.elts_n * (tmpl.sp ? MLX5_PMD_SGE_WR_N : 1));
+	mbuf_n = desc_n;
+	/* Enable scattered packets support for this queue if necessary. */
+	if ((dev->data->dev_conf.rxmode.jumbo_frame) &&
+	    (dev->data->dev_conf.rxmode.max_rx_pkt_len >
+	     (tmpl.mb_len - RTE_PKTMBUF_HEADROOM))) {
+		tmpl.sp = 1;
+		desc_n /= MLX5_PMD_SGE_WR_N;
+	} else
+		tmpl.sp = 0;
+	DEBUG("%p: %s scattered packets support (%u WRs)",
+	      (void *)dev, (tmpl.sp ? "enabling" : "disabling"), desc_n);
+	/* If scatter mode is the same as before, nothing to do. */
+	if (tmpl.sp == rxq->sp) {
+		DEBUG("%p: nothing to do", (void *)dev);
+		return 0;
+	}
+	/* Remove attached flows if RSS is disabled (no parent queue). */
+	if (!priv->rss) {
+		rxq_mac_addrs_del(&tmpl);
+		/* Update original queue in case of failure. */
+		memcpy(rxq->mac_flow, tmpl.mac_flow, sizeof(rxq->mac_flow));
+	}
+	/* From now on, any failure will render the queue unusable.
+	 * Reinitialize QP. */
+	mod = (struct ibv_exp_qp_attr){ .qp_state = IBV_QPS_RESET };
+	err = ibv_exp_modify_qp(tmpl.qp, &mod, IBV_EXP_QP_STATE);
+	if (err) {
+		ERROR("%p: cannot reset QP: %s", (void *)dev, strerror(err));
+		assert(err > 0);
+		return err;
+	}
+	err = ibv_resize_cq(tmpl.cq, desc_n);
+	if (err) {
+		ERROR("%p: cannot resize CQ: %s", (void *)dev, strerror(err));
+		assert(err > 0);
+		return err;
+	}
+	mod = (struct ibv_exp_qp_attr){
+		/* Move the QP to this state. */
+		.qp_state = IBV_QPS_INIT,
+		/* Primary port number. */
+		.port_num = priv->port
+	};
+	err = ibv_exp_modify_qp(tmpl.qp, &mod,
+				(IBV_EXP_QP_STATE |
+#ifdef RSS_SUPPORT
+				 (parent ? IBV_EXP_QP_GROUP_RSS : 0) |
+#endif /* RSS_SUPPORT */
+				 IBV_EXP_QP_PORT));
+	if (err) {
+		ERROR("%p: QP state to IBV_QPS_INIT failed: %s",
+		      (void *)dev, strerror(err));
+		assert(err > 0);
+		return err;
+	}
+	/* Reconfigure flows. Do not care for errors. */
+	if (!priv->rss) {
+		if (priv->started)
+			rxq_mac_addrs_add(&tmpl);
+		/* Update original queue in case of failure. */
+		memcpy(rxq->mac_flow, tmpl.mac_flow, sizeof(rxq->mac_flow));
+	}
+	/* Allocate pool. */
+	pool = rte_malloc(__func__, (mbuf_n * sizeof(*pool)), 0);
+	if (pool == NULL) {
+		ERROR("%p: cannot allocate memory", (void *)dev);
+		return ENOBUFS;
+	}
+	/* Snatch mbufs from original queue. */
+	k = 0;
+	if (rxq->sp) {
+		struct rxq_elt_sp (*elts)[rxq->elts_n] = rxq->elts.sp;
+
+		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
+			struct rxq_elt_sp *elt = &(*elts)[i];
+			unsigned int j;
+
+			for (j = 0; (j != RTE_DIM(elt->bufs)); ++j) {
+				assert(elt->bufs[j] != NULL);
+				pool[k++] = elt->bufs[j];
+			}
+		}
+	} else {
+		struct rxq_elt (*elts)[rxq->elts_n] = rxq->elts.no_sp;
+
+		for (i = 0; (i != RTE_DIM(*elts)); ++i) {
+			struct rxq_elt *elt = &(*elts)[i];
+			struct rte_mbuf *buf = (void *)
+				((uintptr_t)elt->sge.addr -
+				 WR_ID(elt->wr.wr_id).offset);
+
+			assert(WR_ID(elt->wr.wr_id).id == i);
+			pool[k++] = buf;
+		}
+	}
+	assert(k == mbuf_n);
+	tmpl.elts_n = 0;
+	tmpl.elts.sp = NULL;
+	assert((void *)&tmpl.elts.sp == (void *)&tmpl.elts.no_sp);
+	err = ((tmpl.sp) ?
+	       rxq_alloc_elts_sp(&tmpl, desc_n, pool) :
+	       rxq_alloc_elts(&tmpl, desc_n, pool));
+	if (err) {
+		ERROR("%p: cannot reallocate WRs, aborting", (void *)dev);
+		rte_free(pool);
+		assert(err > 0);
+		return err;
+	}
+	assert(tmpl.elts_n == desc_n);
+	assert(tmpl.elts.sp != NULL);
+	rte_free(pool);
+	/* Clean up original data. */
+	rxq->elts_n = 0;
+	rte_free(rxq->elts.sp);
+	rxq->elts.sp = NULL;
+	/* Post WRs. */
+	err = ibv_post_recv(tmpl.qp,
+			    (tmpl.sp ?
+			     &(*tmpl.elts.sp)[0].wr :
+			     &(*tmpl.elts.no_sp)[0].wr),
+			    &bad_wr);
+	if (err) {
+		ERROR("%p: ibv_post_recv() failed for WR %p: %s",
+		      (void *)dev,
+		      (void *)bad_wr,
+		      strerror(err));
+		goto skip_rtr;
+	}
+	mod = (struct ibv_exp_qp_attr){
+		.qp_state = IBV_QPS_RTR
+	};
+	err = ibv_exp_modify_qp(tmpl.qp, &mod, IBV_EXP_QP_STATE);
+	if (err)
+		ERROR("%p: QP state to IBV_QPS_RTR failed: %s",
+		      (void *)dev, strerror(err));
+skip_rtr:
+	*rxq = tmpl;
+	assert(err >= 0);
+	return err;
+}
+
+/**
  * Configure a RX queue.
  *
  * @param dev
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index c7f634e..b6f2128 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -144,6 +144,7 @@ struct txq {
 /* mlx5_rxq.c */
 
 void rxq_cleanup(struct rxq *);
+int rxq_rehash(struct rte_eth_dev *, struct rxq *);
 int rxq_setup(struct rte_eth_dev *, struct rxq *, uint16_t, unsigned int,
 	      const struct rte_eth_rxconf *, struct rte_mempool *);
 int mlx5_rx_queue_setup(struct rte_eth_dev *, uint16_t, uint16_t, unsigned int,
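
For completeness, a hypothetical application-side view of the new callback
(once .mtu_set is registered, rte_eth_dev_set_mtu() dispatches to
mlx5_dev_set_mtu(); the port number and jumbo MTU value below are
illustrative assumptions):

#include <stdio.h>
#include <string.h>
#include <rte_ethdev.h>

/* Raise the MTU of a port; mlx5 RX queues may switch to SG mode. */
static void
set_jumbo_mtu(uint8_t port_id, uint16_t mtu)
{
	int ret = rte_eth_dev_set_mtu(port_id, mtu);

	if (ret < 0)
		printf("port %u: cannot set MTU to %u: %s\n",
		       port_id, mtu, strerror(-ret));
}

Called as set_jumbo_mtu(0, 9000), this triggers the SG/non-SG decision shown
in rxq_rehash() for every configured RX queue.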
-- 
2.1.0


