[dpdk-dev] [PATCH 05/14] net/mlx5: add multiport IB device support to probing

Slava Ovsiienko viacheslavo at mellanox.com
Thu Mar 21 13:54:19 CET 2019


> -----Original Message-----
> From: Shahaf Shuler
> Sent: Thursday, March 21, 2019 14:15
> To: Slava Ovsiienko <viacheslavo at mellanox.com>; dev at dpdk.org
> Subject: RE: [PATCH 05/14] net/mlx5: add multiport IB device support to
> probing
> 
> Thursday, March 21, 2019 10:11 AM, Viacheslav Ovsiienko:
> > Subject: [PATCH 05/14] net/mlx5: add multiport IB device support to
> > probing
> >
> > mlx5_pci_probe() routine is refactored to probe the ports of found
> > Infiniband devices. All active ports (with attached network
> > interface), belonging to the same Infiniband device will use the
> > signle shared Infiniband context of that device.
> >
> > Signed-off-by: Viacheslav Ovsiienko <viacheslavo at mellanox.com>
> > ---
> >  drivers/net/mlx5/mlx5.c | 302 +++++++++++++++++++++++++++++++++---
> > ------------
> >  1 file changed, 210 insertions(+), 92 deletions(-)
> >
> > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index
> > 89c30af..100e9f4 100644
> > --- a/drivers/net/mlx5/mlx5.c
> > +++ b/drivers/net/mlx5/mlx5.c
> > @@ -130,6 +130,16 @@
> >  /** Driver-specific log messages type. */  int mlx5_logtype;
> >
> > +/** Data associated with devices to spawn. */ struct
> > +mlx5_dev_spawn_data {
> > +	uint32_t ifindex; /**< Network interface index. */
> > +	uint32_t max_port; /**< IB device maximal port index. */
> > +	uint32_t ibv_port; /**< IB device physical port index. */
> > +	struct mlx5_switch_info info; /**< Switch information. */
> > +	struct ibv_device *ibv_dev; /**< Associated IB device. */
> > +	struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */ };
> > +
> >  /**
> >   * Prepare shared data between primary and secondary process.
> >   */
> > @@ -716,12 +726,10 @@
> >   *
> >   * @param dpdk_dev
> >   *   Backing DPDK device.
> > - * @param ibv_dev
> > - *   Verbs device.
> > + * @param spawn
> > + *   Verbs device parameters (name, port, switch_info) to spawn.
> >   * @param config
> >   *   Device configuration parameters.
> > - * @param[in] switch_info
> > - *   Switch properties of Ethernet device.
> >   *
> >   * @return
> >   *   A valid Ethernet device object on success, NULL otherwise and
> rte_errno
> > @@ -732,10 +740,11 @@
> >   */
> >  static struct rte_eth_dev *
> >  mlx5_dev_spawn(struct rte_device *dpdk_dev,
> > -	       struct ibv_device *ibv_dev,
> > -	       struct mlx5_dev_config config,
> > -	       const struct mlx5_switch_info *switch_info)
> > +	       struct mlx5_dev_spawn_data *spawn,
> > +	       struct mlx5_dev_config config)
> >  {
> > +	const struct mlx5_switch_info *switch_info = &spawn->info;
> > +	struct ibv_device *ibv_dev = spawn->ibv_dev;
> >  	struct ibv_context *ctx = NULL;
> >  	struct ibv_device_attr_ex attr;
> >  	struct ibv_port_attr port_attr;
> > @@ -952,7 +961,7 @@
> >  		return eth_dev;
> >  	}
> >  	/* Check port status. */
> > -	err = mlx5_glue->query_port(ctx, 1, &port_attr);
> > +	err = mlx5_glue->query_port(ctx, spawn->ibv_port, &port_attr);
> >  	if (err) {
> >  		DRV_LOG(ERR, "port query failed: %s", strerror(err));
> >  		goto error;
> > @@ -1316,14 +1325,6 @@
> >  	return NULL;
> >  }
> >
> > -/** Data associated with devices to spawn. */ -struct
> > mlx5_dev_spawn_data {
> > -	unsigned int ifindex; /**< Network interface index. */
> > -	struct mlx5_switch_info info; /**< Switch information. */
> > -	struct ibv_device *ibv_dev; /**< Associated IB device. */
> > -	struct rte_eth_dev *eth_dev; /**< Associated Ethernet device. */
> > -};
> > -
> >  /**
> >   * Comparison callback to sort device data.
> >   *
> > @@ -1380,7 +1381,9 @@ struct mlx5_dev_spawn_data {
> >  	       struct rte_pci_device *pci_dev)  {
> >  	struct ibv_device **ibv_list;
> > -	unsigned int n = 0;
> > +	unsigned int nd = 0;
> > +	unsigned int np = 0;
> > +	unsigned int ns = 0;
> 
> This fields names are not informative. Find a better ones.

Would the adding clarifying comments be enough ?

nd - Number of (PCI) Devices   (nd != 1 means we have multiple devices with the same BDF - old schema)
np - Number of (device) Ports (nd =1, np 1...n means we have new multiport device)
ns - Number to Spawn  (deduced index - number of iterations)

This names are used as indices, long names may make code less readable, IMHO.

> 
> >  	struct mlx5_dev_config dev_config;
> >  	int ret;
> >
> > @@ -1392,8 +1395,14 @@ struct mlx5_dev_spawn_data {
> >  		DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
> >  		return -rte_errno;
> >  	}
> > -
> > +	/*
> > +	 * First scan the list of all Infiniband devices to find
> > +	 * matching ones, gathering into the list.
> > +	 */
> >  	struct ibv_device *ibv_match[ret + 1];
> > +	int nl_route = -1;
> > +	int nl_rdma = -1;
> > +	unsigned int i;
> >
> >  	while (ret-- > 0) {
> >  		struct rte_pci_addr pci_addr;
> > @@ -1408,77 +1417,183 @@ struct mlx5_dev_spawn_data {
> >  			continue;
> >  		DRV_LOG(INFO, "PCI information matches for device \"%s\"",
> >  			ibv_list[ret]->name);
> > -		ibv_match[n++] = ibv_list[ret];
> > +		ibv_match[nd++] = ibv_list[ret];
> > +	}
> > +	ibv_match[nd] = NULL;
> > +	if (!nd) {
> > +		/* No device macthes, just complain and bail out. */
> > +		mlx5_glue->free_device_list(ibv_list);
> > +		DRV_LOG(WARNING,
> > +			"no Verbs device matches PCI device " PCI_PRI_FMT
> > ","
> > +			" are kernel drivers loaded?",
> > +			pci_dev->addr.domain, pci_dev->addr.bus,
> > +			pci_dev->addr.devid, pci_dev->addr.function);
> > +		rte_errno = ENOENT;
> > +		ret = -rte_errno;
> > +		return ret;
> > +	}
> > +	nl_route = mlx5_nl_init(NETLINK_ROUTE);
> > +	nl_rdma = mlx5_nl_init(NETLINK_RDMA);
> > +	if (nd == 1) {
> > +		/*
> > +		 * Found single matching device may have multiple ports.
> > +		 * Each port may be representor, we have to check the port
> > +		 * number and check the representors existence.
> > +		 */
> > +		if (nl_rdma >= 0)
> > +			np = mlx5_nl_portnum(nl_rdma, ibv_match[0]-
> > >name);
> > +		if (!np)
> > +			DRV_LOG(WARNING, "can not get IB device \"%s\""
> > +					 " ports number", ibv_match[0]-
> > >name);
> 
> This warning is misleading. On old kernels it is expected to have multiple IB
> devices instead of a single one w/ multiple ports.
> The level should be changed for debug, and the syntax to express it is not an
> error.
> 
> >  	}
> > -	ibv_match[n] = NULL;
> > -
> > -	struct mlx5_dev_spawn_data list[n];
> > -	int nl_route = n ? mlx5_nl_init(NETLINK_ROUTE) : -1;
> > -	int nl_rdma = n ? mlx5_nl_init(NETLINK_RDMA) : -1;
> > -	unsigned int i;
> > -	unsigned int u;
> > -
> >  	/*
> > -	 * The existence of several matching entries (n > 1) means port
> > -	 * representors have been instantiated. No existing Verbs call nor
> > -	 * /sys entries can tell them apart, this can only be done through
> > -	 * Netlink calls assuming kernel drivers are recent enough to
> > -	 * support them.
> > -	 *
> > -	 * In the event of identification failure through Netlink, try again
> > -	 * through sysfs, then either:
> > -	 *
> > -	 * 1. No device matches (n == 0), complain and bail out.
> > -	 * 2. A single IB device matches (n == 1) and is not a representor,
> > -	 *    assume no switch support.
> > -	 * 3. Otherwise no safe assumptions can be made; complain louder
> > and
> > -	 *    bail out.
> > +	 * Now we can determine the maximal
> > +	 * amount of devices to be spawned.
> >  	 */
> > -	for (i = 0; i != n; ++i) {
> > -		list[i].ibv_dev = ibv_match[i];
> > -		list[i].eth_dev = NULL;
> > -		if (nl_rdma < 0)
> > -			list[i].ifindex = 0;
> > -		else
> > -			list[i].ifindex = mlx5_nl_ifindex
> > -				(nl_rdma, list[i].ibv_dev->name, 1);
> > -		if (nl_route < 0 ||
> > -		    !list[i].ifindex ||
> > -		    mlx5_nl_switch_info(nl_route, list[i].ifindex,
> > -					&list[i].info) ||
> > -		    ((!list[i].info.representor && !list[i].info.master) &&
> > -		     mlx5_sysfs_switch_info(list[i].ifindex, &list[i].info))) {
> > -			list[i].ifindex = 0;
> > -			memset(&list[i].info, 0, sizeof(list[i].info));
> > -			continue;
> > +	struct mlx5_dev_spawn_data list[np ? np : nd];
> > +
> > +	if (np > 1) {
> > +		/*
> > +		 * Signle IB device with multiple ports found,
> > +		 * it may be E-Switch master device and representors.
> > +		 * We have to perform identification trough the ports.
> > +		 */
> > +		assert(nl_rdma >= 0);
> > +		assert(ns == 0);
> > +		assert(nd == 1);
> > +		for (i = 1; i <= np; ++i) {
> > +			list[ns].max_port = np;
> > +			list[ns].ibv_port = i;
> > +			list[ns].ibv_dev = ibv_match[0];
> > +			list[ns].eth_dev = NULL;
> > +			list[ns].ifindex = mlx5_nl_ifindex
> > +					(nl_rdma, list[ns].ibv_dev->name, i);
> > +			if (!list[ns].ifindex) {
> > +				/*
> > +				 * No network interface index found for the
> > +				 * specified port, it means there is no
> > +				 * representor on this port. It's OK,
> > +				 * there can be disabled ports, for example
> > +				 * if sriov_numvfs < sriov_totalvfs.
> > +				 */
> > +				continue;
> > +			}
> > +			ret = -1;
> > +			if (nl_route >= 0)
> > +				ret = mlx5_nl_switch_info
> > +					       (nl_route,
> > +						list[ns].ifindex,
> > +						&list[ns].info);
> > +			if (ret || (!list[ns].info.representor &&
> > +				    !list[ns].info.master)) {
> > +				/*
> > +				 * We failed to recognize representors with
> > +				 * Netlink, let's try to perform the task
> > +				 * with sysfs.
> > +				 */
> > +				ret =  mlx5_sysfs_switch_info
> > +						(list[ns].ifindex,
> > +						 &list[ns].info);
> > +			}
> > +			if (!ret && (list[ns].info.representor ^
> > +				     list[ns].info.master))
> > +				ns++;
> >  		}
> > -	}
> > -	if (nl_rdma >= 0)
> > -		close(nl_rdma);
> > -	if (nl_route >= 0)
> > -		close(nl_route);
> > -	/* Count unidentified devices. */
> > -	for (u = 0, i = 0; i != n; ++i)
> > -		if (!list[i].info.master && !list[i].info.representor)
> > -			++u;
> > -	if (u) {
> > -		if (n == 1 && u == 1) {
> > -			/* Case #2. */
> > -			DRV_LOG(INFO, "no switch support detected");
> > -		} else {
> > -			/* Case #3. */
> > +		if (!ns) {
> > +			DRV_LOG(ERR,
> > +				"unable to recognize master/representors"
> > +				" on the IB device with multiple ports");
> > +			rte_errno = ENOENT;
> > +			ret = -rte_errno;
> > +			goto exit;
> > +		}
> > +	} else {
> > +		/*
> > +		 * The existence of several matching entries (nd > 1) means
> > +		 * port representors have been instantiated. No existing
> > Verbs
> > +		 * call nor sysfs entries can tell them apart, this can only
> > +		 * be done through Netlink calls assuming kernel drivers are
> > +		 * recent enough to support them.
> > +		 *
> > +		 * In the event of identification failure through Netlink,
> > +		 * try again through sysfs, then:
> > +		 *
> > +		 * 1. A single IB device matches (nd == 1) with single
> > +		 *    port (np=0/1) and is not a representor, assume
> > +		 *    no switch support.
> > +		 *
> > +		 * 2. Otherwise no safe assumptions can be made;
> > +		 *    complain louder and bail out.
> > +		 */
> > +		np = 1;
> > +		for (i = 0; i != nd; ++i) {
> > +			memset(&list[ns].info, 0, sizeof(list[ns].info));
> > +			list[ns].max_port = 1;
> > +			list[ns].ibv_port = 1;
> > +			list[ns].ibv_dev = ibv_match[i];
> > +			list[ns].eth_dev = NULL;
> > +			list[ns].ifindex = 0;
> > +			if (nl_rdma >= 0)
> > +				list[ns].ifindex = mlx5_nl_ifindex
> > +					(nl_rdma, list[ns].ibv_dev->name, 1);
> > +			if (!list[ns].ifindex) {
> > +				/*
> > +				 * No network interface index found for the
> > +				 * specified device, it means there it is not
> > +				 * a representor/master.
> > +				 */
> > +				continue;
> > +			}
> > +			ret = -1;
> > +			if (nl_route >= 0)
> > +				ret = mlx5_nl_switch_info
> > +					       (nl_route,
> > +						list[ns].ifindex,
> > +						&list[ns].info);
> > +			if (ret || (!list[ns].info.representor &&
> > +				    !list[ns].info.master)) {
> > +				/*
> > +				 * We failed to recognize representors with
> > +				 * Netlink, let's try to perform the task
> > +				 * with sysfs.
> > +				 */
> > +				ret =  mlx5_sysfs_switch_info
> > +						(list[ns].ifindex,
> > +						 &list[ns].info);
> > +			}
> > +			if (!ret && (list[ns].info.representor ^
> > +				     list[ns].info.master)) {
> > +				ns++;
> > +			} else if ((nd == 1) &&
> > +				   !list[ns].info.representor &&
> > +				   !list[ns].info.master) {
> > +				/*
> > +				 * Single IB device with
> > +				 * one physical port and
> > +				 * attached network device.
> > +				 * May be SRIOV is not enabled
> > +				 * or there is no representors.
> > +				 */
> > +				DRV_LOG(INFO, "no E-Switch support
> > detected");
> > +				ns++;
> > +				break;
> > +			}
> > +		}
> > +		if (!ns) {
> >  			DRV_LOG(ERR,
> > -				"unable to tell which of the matching
> > devices"
> > -				" is the master (lack of kernel support?)");
> > -			n = 0;
> > +				"unable to recognize master/representors"
> > +				" on the multiple IB devices");
> > +			rte_errno = ENOENT;
> > +			ret = -rte_errno;
> > +			goto exit;
> >  		}
> >  	}
> > +	assert(ns);
> >  	/*
> >  	 * Sort list to probe devices in natural order for users convenience
> >  	 * (i.e. master first, then representors from lowest to highest ID).
> >  	 */
> > -	if (n)
> > -		qsort(list, n, sizeof(*list), mlx5_dev_spawn_data_cmp);
> > +	qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp);
> >  	/* Default configuration. */
> >  	dev_config = (struct mlx5_dev_config){
> >  		.hw_padding = 0,
> > @@ -1497,7 +1612,7 @@ struct mlx5_dev_spawn_data {
> >  			.min_rxqs_num = MLX5_MPRQ_MIN_RXQS,
> >  		},
> >  	};
> > -	/* Device speicific configuration. */
> > +	/* Device specific configuration. */
> >  	switch (pci_dev->id.device_id) {
> >  	case PCI_DEVICE_ID_MELLANOX_CONNECTX5BF:
> >  		dev_config.txqs_vec =
> > MLX5_VPMD_MAX_TXQS_BLUEFIELD; @@ -1514,12 +1629,12 @@ struct
> > mlx5_dev_spawn_data {
> >  	/* Set architecture-dependent default value if unset. */
> >  	if (dev_config.txqs_vec == MLX5_ARG_UNSET)
> >  		dev_config.txqs_vec = MLX5_VPMD_MAX_TXQS;
> > -	for (i = 0; i != n; ++i) {
> > +	for (i = 0; i != ns; ++i) {
> >  		uint32_t restore;
> >
> >  		list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device,
> > -						 list[i].ibv_dev, dev_config,
> > -						 &list[i].info);
> > +						 &list[i],
> > +						 dev_config);
> >  		if (!list[i].eth_dev) {
> >  			if (rte_errno != EBUSY && rte_errno != EEXIST)
> >  				break;
> > @@ -1532,16 +1647,7 @@ struct mlx5_dev_spawn_data {
> >  		list[i].eth_dev->data->dev_flags |= restore;
> >  		rte_eth_dev_probing_finish(list[i].eth_dev);
> >  	}
> > -	mlx5_glue->free_device_list(ibv_list);
> > -	if (!n) {
> > -		DRV_LOG(WARNING,
> > -			"no Verbs device matches PCI device " PCI_PRI_FMT
> > ","
> > -			" are kernel drivers loaded?",
> > -			pci_dev->addr.domain, pci_dev->addr.bus,
> > -			pci_dev->addr.devid, pci_dev->addr.function);
> > -		rte_errno = ENOENT;
> > -		ret = -rte_errno;
> > -	} else if (i != n) {
> > +	if (i != ns) {
> >  		DRV_LOG(ERR,
> >  			"probe of PCI device " PCI_PRI_FMT " aborted after"
> >  			" encountering an error: %s",
> > @@ -1563,6 +1669,18 @@ struct mlx5_dev_spawn_data {
> >  	} else {
> >  		ret = 0;
> >  	}
> > +exit:
> > +	/*
> > +	 * Do the routine cleanup:
> > +	 * - close opened Netlink sockets
> > +	 * - free the Infiniband device list
> > +	 */
> > +	if (nl_rdma >= 0)
> > +		close(nl_rdma);
> > +	if (nl_route >= 0)
> > +		close(nl_route);
> > +	assert(ibv_list);
> > +	mlx5_glue->free_device_list(ibv_list);
> >  	return ret;
> >  }
> >
> > --
> > 1.8.3.1



More information about the dev mailing list