[dpdk-dev] [PATCH v11 09/11] net/failsafe: support device removal

Gaetan Rivet gaetan.rivet at 6wind.com
Tue Jul 18 14:48:20 CEST 2017


Listen to INTR_RMV events issued by slaves.
Add atomic flags on slave queues to detect use of the slave burst functions.
If a removal is detected, set the recollection flag on that slave.

During a slave upkeep round, if a slave's recollection flag is set and
its burst functions are not in use by any thread, remove that slave.
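
A minimal sketch of the per-queue guard described above, with the macro
layer unrolled (names such as fs_rx_queue and guarded_rx_burst are
illustrative only, not part of the patch):

#include <rte_atomic.h>
#include <rte_ethdev.h>

struct fs_rx_queue {
	uint16_t qid;          /* queue id on the sub-device */
	rte_atomic64_t refcnt; /* > 0 while a burst is in flight;
				* set up with rte_atomic64_init() */
};

static uint16_t
guarded_rx_burst(uint8_t sub_port, struct fs_rx_queue *q,
		 struct rte_mbuf **pkts, uint16_t n)
{
	uint16_t nb_rx;

	rte_atomic64_add(&q->refcnt, 1);  /* FS_ATOMIC_P */
	nb_rx = rte_eth_rx_burst(sub_port, q->qid, pkts, n);
	rte_atomic64_sub(&q->refcnt, 1);  /* FS_ATOMIC_V */
	return nb_rx;
}

/*
 * A slave may only be recollected once every one of its queues reports
 * rte_atomic64_read(&q->refcnt) == 0.
 */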

Signed-off-by: Gaetan Rivet <gaetan.rivet at 6wind.com>
Acked-by: Olga Shern <olgas at mellanox.com>
---
 doc/guides/nics/fail_safe.rst           | 15 ++++++
 drivers/net/failsafe/failsafe.c         |  1 +
 drivers/net/failsafe/failsafe_args.c    | 13 +++++
 drivers/net/failsafe/failsafe_eal.c     |  3 +-
 drivers/net/failsafe/failsafe_ether.c   | 96 ++++++++++++++++++++++++++++++---
 drivers/net/failsafe/failsafe_ops.c     | 37 +++++++++++--
 drivers/net/failsafe/failsafe_private.h | 72 ++++++++++++++++++++++---
 drivers/net/failsafe/failsafe_rxtx.c    | 17 +++++-
 8 files changed, 234 insertions(+), 20 deletions(-)
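
As context for the documentation and fs_dev_configure() hunks below, this is
roughly what a consumer of the ethdev removal-event API looks like. The
handler body is illustrative only; the fail-safe PMD's actual handler is
failsafe_eth_rmv_event_callback() in this patch.

#include <stdio.h>
#include <rte_common.h>
#include <rte_ethdev.h>

static int
rmv_event_cb(uint8_t port_id, enum rte_eth_event_type event __rte_unused,
	     void *cb_arg __rte_unused, void *ret_param __rte_unused)
{
	/* Invoked when the port reports its removal. */
	printf("port %u reported device removal\n", port_id);
	return 0;
}

static int
watch_port_removal(uint8_t port_id, struct rte_eth_conf *conf)
{
	/* Request RMV interrupts; set before rte_eth_dev_configure(). */
	conf->intr_conf.rmv = 1;
	return rte_eth_dev_callback_register(port_id,
					     RTE_ETH_EVENT_INTR_RMV,
					     rmv_event_cb, NULL);
}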

diff --git a/doc/guides/nics/fail_safe.rst b/doc/guides/nics/fail_safe.rst
index c81f06a..34ecae2 100644
--- a/doc/guides/nics/fail_safe.rst
+++ b/doc/guides/nics/fail_safe.rst
@@ -51,6 +51,12 @@ The Fail-safe PMD only supports a limited set of features. If you plan to use a
 device underneath the Fail-safe PMD with a specific feature, this feature must
 be supported by the Fail-safe PMD to avoid throwing any error.
 
+A notable exception is the device removal feature. As the fail-safe PMD is a
+virtual device, it cannot currently be removed in the sense of a specific bus
+hotplug event (as a PCI device can be, for example). It will however enable
+this feature for its sub-devices automatically, detecting those that support
+it and registering the relevant callback for this event.
+
 Check the feature matrix for the complete set of supported features.
 
 Compilation option
@@ -179,6 +185,14 @@ apply it upon the probing of its missing sub-device. After this configuration
 pass, the new sub-device will be synchronized with other sub-devices, i.e. be
 started if the fail-safe PMD has been started by the user before.
 
+Plug-out feature
+----------------
+
+A sub-device supporting the device removal event can be removed from its bus at
+any time. The fail-safe PMD will register a callback for this event and react
+accordingly. It will try to safely stop, close and uninit the sub-device that
+emitted the event, allowing it to free its resources.
+
 Fail-safe glossary
 ------------------
 
@@ -195,6 +209,7 @@ Preferred device : Primary device
 Upkeep round
     Periodical process when slaves are serviced. Each device having a state
     different from that of the fail-safe device itself is synchronized with it.
+    Additionally, each slave having the remove flag set is cleaned up.
 
 Slave
     In the context of the fail-safe PMD, synonymous to sub-device.
diff --git a/drivers/net/failsafe/failsafe.c b/drivers/net/failsafe/failsafe.c
index 12d79ac..7628c21 100644
--- a/drivers/net/failsafe/failsafe.c
+++ b/drivers/net/failsafe/failsafe.c
@@ -142,6 +142,7 @@ fs_hotplug_alarm(void *arg)
 		if (ret)
 			ERROR("Unable to synchronize sub_device state");
 	}
+	failsafe_dev_remove(dev);
 	ret = failsafe_hotplug_alarm_install(dev);
 	if (ret)
 		ERROR("Unable to set up next alarm");
diff --git a/drivers/net/failsafe/failsafe_args.c b/drivers/net/failsafe/failsafe_args.c
index ec3db55..932e371 100644
--- a/drivers/net/failsafe/failsafe_args.c
+++ b/drivers/net/failsafe/failsafe_args.c
@@ -436,6 +436,17 @@ failsafe_args_count_subdevice(struct rte_eth_dev *dev,
 				    dev, params);
 }
 
+static int
+fs_parse_sub_device(struct sub_device *sdev)
+{
+	struct rte_devargs *da;
+	char devstr[DEVARGS_MAXLEN] = "";
+
+	da = &sdev->devargs;
+	snprintf(devstr, sizeof(devstr), "%s,%s", da->name, da->args);
+	return fs_parse_device(sdev, devstr);
+}
+
 int
 failsafe_args_parse_subs(struct rte_eth_dev *dev)
 {
@@ -448,6 +459,8 @@ failsafe_args_parse_subs(struct rte_eth_dev *dev)
 			continue;
 		if (sdev->cmdline)
 			ret = fs_execute_cmd(sdev, sdev->cmdline);
+		else
+			ret = fs_parse_sub_device(sdev);
 		if (ret == 0)
 			sdev->state = DEV_PARSED;
 	}
diff --git a/drivers/net/failsafe/failsafe_eal.c b/drivers/net/failsafe/failsafe_eal.c
index d11ea67..c8f4318 100644
--- a/drivers/net/failsafe/failsafe_eal.c
+++ b/drivers/net/failsafe/failsafe_eal.c
@@ -63,6 +63,7 @@ fs_bus_init(struct rte_eth_dev *dev)
 			return -ENODEV;
 		}
 		SUB_ID(sdev) = i;
+		sdev->fs_dev = dev;
 		sdev->dev = ETH(sdev)->device;
 		ETH(sdev)->state = RTE_ETH_DEV_DEFERRED;
 		sdev->state = DEV_PROBED;
@@ -80,7 +81,7 @@ failsafe_eal_init(struct rte_eth_dev *dev)
 		return ret;
 	if (PRIV(dev)->state < DEV_PROBED)
 		PRIV(dev)->state = DEV_PROBED;
-	fs_switch_dev(dev);
+	fs_switch_dev(dev, NULL);
 	return 0;
 }
 
diff --git a/drivers/net/failsafe/failsafe_ether.c b/drivers/net/failsafe/failsafe_ether.c
index 2958207..359b8c9 100644
--- a/drivers/net/failsafe/failsafe_ether.c
+++ b/drivers/net/failsafe/failsafe_ether.c
@@ -250,6 +250,67 @@ fs_eth_dev_conf_apply(struct rte_eth_dev *dev,
 	return 0;
 }
 
+static void
+fs_dev_remove(struct sub_device *sdev)
+{
+	int ret;
+
+	if (sdev == NULL)
+		return;
+	switch (sdev->state) {
+	case DEV_STARTED:
+		rte_eth_dev_stop(PORT_ID(sdev));
+		sdev->state = DEV_ACTIVE;
+		/* fallthrough */
+	case DEV_ACTIVE:
+		rte_eth_dev_close(PORT_ID(sdev));
+		sdev->state = DEV_PROBED;
+		/* fallthrough */
+	case DEV_PROBED:
+		ret = rte_eal_hotplug_remove(sdev->bus->name,
+					     sdev->dev->name);
+		if (ret) {
+			ERROR("Bus detach failed for sub_device %u",
+			      SUB_ID(sdev));
+		} else {
+			ETH(sdev)->state = RTE_ETH_DEV_UNUSED;
+		}
+		sdev->state = DEV_PARSED;
+		/* fallthrough */
+	case DEV_PARSED:
+	case DEV_UNDEFINED:
+		sdev->state = DEV_UNDEFINED;
+		/* the end */
+		break;
+	}
+	failsafe_hotplug_alarm_install(sdev->fs_dev);
+}
+
+static inline int
+fs_rxtx_clean(struct sub_device *sdev)
+{
+	uint16_t i;
+
+	for (i = 0; i < ETH(sdev)->data->nb_rx_queues; i++)
+		if (FS_ATOMIC_RX(sdev, i))
+			return 0;
+	for (i = 0; i < ETH(sdev)->data->nb_tx_queues; i++)
+		if (FS_ATOMIC_TX(sdev, i))
+			return 0;
+	return 1;
+}
+
+void
+failsafe_dev_remove(struct rte_eth_dev *dev)
+{
+	struct sub_device *sdev;
+	uint8_t i;
+
+	FOREACH_SUBDEV_STATE(sdev, i, dev, DEV_ACTIVE)
+		if (sdev->remove && fs_rxtx_clean(sdev))
+			fs_dev_remove(sdev);
+}
+
 int
 failsafe_eth_dev_state_sync(struct rte_eth_dev *dev)
 {
@@ -263,13 +324,13 @@ failsafe_eth_dev_state_sync(struct rte_eth_dev *dev)
 
 	ret = failsafe_args_parse_subs(dev);
 	if (ret)
-		return ret;
+		goto err_remove;
 
 	if (PRIV(dev)->state < DEV_PROBED)
 		return 0;
 	ret = failsafe_eal_init(dev);
 	if (ret)
-		return ret;
+		goto err_remove;
 	if (PRIV(dev)->state < DEV_ACTIVE)
 		return 0;
 	inactive = 0;
@@ -278,15 +339,14 @@ failsafe_eth_dev_state_sync(struct rte_eth_dev *dev)
 			inactive |= UINT32_C(1) << i;
 	ret = dev->dev_ops->dev_configure(dev);
 	if (ret)
-		return ret;
+		goto err_remove;
 	FOREACH_SUBDEV(sdev, i, dev) {
 		if (inactive & (UINT32_C(1) << i)) {
 			ret = fs_eth_dev_conf_apply(dev, sdev);
 			if (ret) {
 				ERROR("Could not apply configuration to sub_device %d",
 				      i);
-				/* TODO: disable device */
-				return ret;
+				goto err_remove;
 			}
 		}
 	}
@@ -300,6 +360,30 @@ failsafe_eth_dev_state_sync(struct rte_eth_dev *dev)
 		return 0;
 	ret = dev->dev_ops->dev_start(dev);
 	if (ret)
-		return ret;
+		goto err_remove;
+	return 0;
+err_remove:
+	FOREACH_SUBDEV(sdev, i, dev)
+		if (sdev->state != PRIV(dev)->state)
+			sdev->remove = 1;
+	return ret;
+}
+
+int
+failsafe_eth_rmv_event_callback(uint8_t port_id __rte_unused,
+				enum rte_eth_event_type event __rte_unused,
+				void *cb_arg, void *out __rte_unused)
+{
+	struct sub_device *sdev = cb_arg;
+
+	/* Switch tx_dev as soon as possible. */
+	fs_switch_dev(sdev->fs_dev, sdev);
+	/* Use safe bursts in any case. */
+	set_burst_fn(sdev->fs_dev, 1);
+	/*
+	 * Async removal, the sub-PMD will try to unregister
+	 * the callback at the source of the current thread context.
+	 */
+	sdev->remove = 1;
 	return 0;
 }
diff --git a/drivers/net/failsafe/failsafe_ops.c b/drivers/net/failsafe/failsafe_ops.c
index ac3257e..e8edf15 100644
--- a/drivers/net/failsafe/failsafe_ops.c
+++ b/drivers/net/failsafe/failsafe_ops.c
@@ -34,6 +34,7 @@
 #include <stdint.h>
 
 #include <rte_debug.h>
+#include <rte_atomic.h>
 #include <rte_ethdev.h>
 #include <rte_malloc.h>
 #include <rte_flow.h>
@@ -204,9 +205,21 @@ fs_dev_configure(struct rte_eth_dev *dev)
 		}
 	}
 	FOREACH_SUBDEV(sdev, i, dev) {
+		int rmv_interrupt = 0;
+
 		if (sdev->state != DEV_PROBED)
 			continue;
+
+		rmv_interrupt = ETH(sdev)->data->dev_flags &
+				RTE_ETH_DEV_INTR_RMV;
+		if (rmv_interrupt) {
+			DEBUG("Enabling RMV interrupts for sub_device %d", i);
+			dev->data->dev_conf.intr_conf.rmv = 1;
+		} else {
+			DEBUG("sub_device %d does not support RMV event", i);
+		}
 		DEBUG("Configuring sub-device %d", i);
+		sdev->remove = 0;
 		ret = rte_eth_dev_configure(PORT_ID(sdev),
 					dev->data->nb_rx_queues,
 					dev->data->nb_tx_queues,
@@ -215,6 +228,16 @@ fs_dev_configure(struct rte_eth_dev *dev)
 			ERROR("Could not configure sub_device %d", i);
 			return ret;
 		}
+		if (rmv_interrupt) {
+			ret = rte_eth_dev_callback_register(PORT_ID(sdev),
+					RTE_ETH_EVENT_INTR_RMV,
+					failsafe_eth_rmv_event_callback,
+					sdev);
+			if (ret)
+				WARN("Failed to register RMV callback for sub_device %d",
+				     SUB_ID(sdev));
+		}
+		dev->data->dev_conf.intr_conf.rmv = 0;
 		sdev->state = DEV_ACTIVE;
 	}
 	if (PRIV(dev)->state < DEV_ACTIVE)
@@ -240,7 +263,7 @@ fs_dev_start(struct rte_eth_dev *dev)
 	}
 	if (PRIV(dev)->state < DEV_STARTED)
 		PRIV(dev)->state = DEV_STARTED;
-	fs_switch_dev(dev);
+	fs_switch_dev(dev, NULL);
 	return 0;
 }
 
@@ -351,10 +374,14 @@ fs_rx_queue_setup(struct rte_eth_dev *dev,
 		fs_rx_queue_release(rxq);
 		dev->data->rx_queues[rx_queue_id] = NULL;
 	}
-	rxq = rte_zmalloc(NULL, sizeof(*rxq),
+	rxq = rte_zmalloc(NULL,
+			  sizeof(*rxq) +
+			  sizeof(rte_atomic64_t) * PRIV(dev)->subs_tail,
 			  RTE_CACHE_LINE_SIZE);
 	if (rxq == NULL)
 		return -ENOMEM;
+	FOREACH_SUBDEV(sdev, i, dev)
+		rte_atomic64_init(&rxq->refcnt[i]);
 	rxq->qid = rx_queue_id;
 	rxq->socket_id = socket_id;
 	rxq->info.mp = mb_pool;
@@ -414,10 +441,14 @@ fs_tx_queue_setup(struct rte_eth_dev *dev,
 		fs_tx_queue_release(txq);
 		dev->data->tx_queues[tx_queue_id] = NULL;
 	}
-	txq = rte_zmalloc("ethdev TX queue", sizeof(*txq),
+	txq = rte_zmalloc("ethdev TX queue",
+			  sizeof(*txq) +
+			  sizeof(rte_atomic64_t) * PRIV(dev)->subs_tail,
 			  RTE_CACHE_LINE_SIZE);
 	if (txq == NULL)
 		return -ENOMEM;
+	FOREACH_SUBDEV(sdev, i, dev)
+		rte_atomic64_init(&txq->refcnt[i]);
 	txq->qid = tx_queue_id;
 	txq->socket_id = socket_id;
 	txq->info.conf = *tx_conf;
diff --git a/drivers/net/failsafe/failsafe_private.h b/drivers/net/failsafe/failsafe_private.h
index aac4754..80bbc61 100644
--- a/drivers/net/failsafe/failsafe_private.h
+++ b/drivers/net/failsafe/failsafe_private.h
@@ -36,6 +36,7 @@
 
 #include <sys/queue.h>
 
+#include <rte_atomic.h>
 #include <rte_dev.h>
 #include <rte_ethdev.h>
 #include <rte_devargs.h>
@@ -65,6 +66,7 @@ struct rxq {
 	uint8_t last_polled;
 	unsigned int socket_id;
 	struct rte_eth_rxq_info info;
+	rte_atomic64_t refcnt[];
 };
 
 struct txq {
@@ -72,6 +74,7 @@ struct txq {
 	uint16_t qid;
 	unsigned int socket_id;
 	struct rte_eth_txq_info info;
+	rte_atomic64_t refcnt[];
 };
 
 struct rte_flow {
@@ -101,6 +104,10 @@ struct sub_device {
 	enum dev_state state;
 	/* Some device are defined as a command line */
 	char *cmdline;
+	/* fail-safe device backreference */
+	struct rte_eth_dev *fs_dev;
+	/* flag calling for recollection */
+	volatile unsigned int remove:1;
 };
 
 struct fs_priv {
@@ -168,6 +175,10 @@ int failsafe_eal_uninit(struct rte_eth_dev *dev);
 /* ETH_DEV */
 
 int failsafe_eth_dev_state_sync(struct rte_eth_dev *dev);
+void failsafe_dev_remove(struct rte_eth_dev *dev);
+int failsafe_eth_rmv_event_callback(uint8_t port_id,
+				    enum rte_eth_event_type type,
+				    void *arg, void *out);
 
 /* GLOBALS */
 
@@ -233,6 +244,39 @@ extern int mac_from_arg;
 #define SUBOPS(s, ops) \
 	(ETH(s)->dev_ops->ops)
 
+/**
+ * Atomic guard
+ */
+
+/**
+ * a: (rte_atomic64_t)
+ */
+#define FS_ATOMIC_P(a) \
+	rte_atomic64_add(&(a), 1)
+
+/**
+ * a: (rte_atomic64_t)
+ */
+#define FS_ATOMIC_V(a) \
+	rte_atomic64_sub(&(a), 1)
+
+/**
+ * s: (struct sub_device *)
+ * i: uint16_t qid
+ */
+#define FS_ATOMIC_RX(s, i) \
+	rte_atomic64_read( \
+	 &((struct rxq *)((s)->fs_dev->data->rx_queues[i]))->refcnt[(s)->sid] \
+	)
+/**
+ * s: (struct sub_device *)
+ * i: uint16_t qid
+ */
+#define FS_ATOMIC_TX(s, i) \
+	rte_atomic64_read( \
+	 &((struct txq *)((s)->fs_dev->data->tx_queues[i]))->refcnt[(s)->sid] \
+	)
+
 #define LOG__(level, m, ...) \
 	RTE_LOG(level, PMD, "net_failsafe: " m "%c", __VA_ARGS__)
 #define LOG_(level, ...) LOG__(level, __VA_ARGS__, '\n')
@@ -257,33 +301,45 @@ fs_find_next(struct rte_eth_dev *dev, uint8_t sid,
 	return sid;
 }
 
+/*
+ * Switch emitting device.
+ * If banned is set, banned must not be considered for
+ * the role of emitting device.
+ */
 static inline void
-fs_switch_dev(struct rte_eth_dev *dev)
+fs_switch_dev(struct rte_eth_dev *dev,
+	      struct sub_device *banned)
 {
+	struct sub_device *txd;
 	enum dev_state req_state;
 
 	req_state = PRIV(dev)->state;
-	if (PREFERRED_SUBDEV(dev)->state >= req_state) {
-		if (TX_SUBDEV(dev) != PREFERRED_SUBDEV(dev) &&
-		    (TX_SUBDEV(dev) == NULL ||
+	txd = TX_SUBDEV(dev);
+	if (PREFERRED_SUBDEV(dev)->state >= req_state &&
+	    PREFERRED_SUBDEV(dev) != banned) {
+		if (txd != PREFERRED_SUBDEV(dev) &&
+		    (txd == NULL ||
 		     (req_state == DEV_STARTED) ||
-		     (TX_SUBDEV(dev) && TX_SUBDEV(dev)->state < DEV_STARTED))) {
+		     (txd && txd->state < DEV_STARTED))) {
 			DEBUG("Switching tx_dev to preferred sub_device");
 			PRIV(dev)->subs_tx = 0;
 		}
-	} else if ((TX_SUBDEV(dev) && TX_SUBDEV(dev)->state < req_state) ||
-		   TX_SUBDEV(dev) == NULL) {
+	} else if ((txd && txd->state < req_state) ||
+		   txd == NULL ||
+		   txd == banned) {
 		struct sub_device *sdev;
 		uint8_t i;
 
 		/* Using acceptable device */
 		FOREACH_SUBDEV_STATE(sdev, i, dev, req_state) {
+			if (sdev == banned)
+				continue;
 			DEBUG("Switching tx_dev to sub_device %d",
 			      i);
 			PRIV(dev)->subs_tx = i;
 			break;
 		}
-	} else if (TX_SUBDEV(dev) && TX_SUBDEV(dev)->state < req_state) {
+	} else if (txd && txd->state < req_state) {
 		DEBUG("No device ready, deactivating tx_dev");
 		PRIV(dev)->subs_tx = PRIV(dev)->subs_tail;
 	} else {
diff --git a/drivers/net/failsafe/failsafe_rxtx.c b/drivers/net/failsafe/failsafe_rxtx.c
index 5e380fe..7311421 100644
--- a/drivers/net/failsafe/failsafe_rxtx.c
+++ b/drivers/net/failsafe/failsafe_rxtx.c
@@ -31,6 +31,7 @@
  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
+#include <rte_atomic.h>
 #include <rte_debug.h>
 #include <rte_mbuf.h>
 #include <rte_ethdev.h>
@@ -112,8 +113,10 @@ failsafe_rx_burst(void *queue,
 		if (unlikely(fs_rx_unsafe(sdev)))
 			continue;
 		sub_rxq = ETH(sdev)->data->rx_queues[rxq->qid];
+		FS_ATOMIC_P(rxq->refcnt[sdev->sid]);
 		nb_rx = ETH(sdev)->
 			rx_pkt_burst(sub_rxq, rx_pkts, nb_pkts);
+		FS_ATOMIC_V(rxq->refcnt[sdev->sid]);
 		if (nb_rx) {
 			rxq->last_polled = i;
 			return nb_rx;
@@ -146,8 +149,10 @@ failsafe_rx_burst_fast(void *queue,
 		sdev = &priv->subs[i];
 		RTE_ASSERT(!fs_rx_unsafe(sdev));
 		sub_rxq = ETH(sdev)->data->rx_queues[rxq->qid];
+		FS_ATOMIC_P(rxq->refcnt[sdev->sid]);
 		nb_rx = ETH(sdev)->
 			rx_pkt_burst(sub_rxq, rx_pkts, nb_pkts);
+		FS_ATOMIC_V(rxq->refcnt[sdev->sid]);
 		if (nb_rx) {
 			rxq->last_polled = i;
 			return nb_rx;
@@ -164,13 +169,17 @@ failsafe_tx_burst(void *queue,
 	struct sub_device *sdev;
 	struct txq *txq;
 	void *sub_txq;
+	uint16_t nb_tx;
 
 	txq = queue;
 	sdev = TX_SUBDEV(txq->priv->dev);
 	if (unlikely(fs_tx_unsafe(sdev)))
 		return 0;
 	sub_txq = ETH(sdev)->data->tx_queues[txq->qid];
-	return ETH(sdev)->tx_pkt_burst(sub_txq, tx_pkts, nb_pkts);
+	FS_ATOMIC_P(txq->refcnt[sdev->sid]);
+	nb_tx = ETH(sdev)->tx_pkt_burst(sub_txq, tx_pkts, nb_pkts);
+	FS_ATOMIC_V(txq->refcnt[sdev->sid]);
+	return nb_tx;
 }
 
 uint16_t
@@ -181,10 +190,14 @@ failsafe_tx_burst_fast(void *queue,
 	struct sub_device *sdev;
 	struct txq *txq;
 	void *sub_txq;
+	uint16_t nb_tx;
 
 	txq = queue;
 	sdev = TX_SUBDEV(txq->priv->dev);
 	RTE_ASSERT(!fs_tx_unsafe(sdev));
 	sub_txq = ETH(sdev)->data->tx_queues[txq->qid];
-	return ETH(sdev)->tx_pkt_burst(sub_txq, tx_pkts, nb_pkts);
+	FS_ATOMIC_P(txq->refcnt[sdev->sid]);
+	nb_tx = ETH(sdev)->tx_pkt_burst(sub_txq, tx_pkts, nb_pkts);
+	FS_ATOMIC_V(txq->refcnt[sdev->sid]);
+	return nb_tx;
 }
-- 
2.1.4


