[dpdk-dev,3/7] net/mlx5: split PCI from generic probing code

Message ID 20180525161814.13873-4-adrien.mazarguil@6wind.com (mailing list archive)
State Superseded, archived
Headers

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation fail apply patch file failure

Commit Message

Adrien Mazarguil May 25, 2018, 4:35 p.m. UTC
  All the generic probing code needs is an IB device. While this device is
currently supplied by a PCI lookup, other methods will be added soon.

This patch divides the original function, which has become huge over time,
as follows:

1. PCI-specific (mlx5_pci_probe()).
2. All ports of a Verbs device (mlx5_dev_spawn()).
3. A given port of a Verbs device (mlx5_dev_spawn_one()).

(Patch based on prior work from Yuanhan Liu)

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
---
 drivers/net/mlx5/mlx5.c | 332 ++++++++++++++++++++++++++-----------------
 1 file changed, 201 insertions(+), 131 deletions(-)
  

Comments

Xueming Li June 10, 2018, 12:59 p.m. UTC | #1
Hi Adrien,

The logic looks much more clear now with the split.

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Adrien Mazarguil
> Sent: Saturday, May 26, 2018 12:35 AM
> To: Shahaf Shuler <shahafs@mellanox.com>
> Cc: dev@dpdk.org
> Subject: [dpdk-dev] [PATCH 3/7] net/mlx5: split PCI from generic probing code
> 
> All the generic probing code needs is an IB device. While this device is currently supplied by a PCI
> lookup, other methods will be added soon.
> 
> This patch divides the original function, which has become huge over time, as follows:
> 
> 1. PCI-specific (mlx5_pci_probe()).
> 2. All ports of a Verbs device (mlx5_dev_spawn()).
> 3. A given port of a Verbs device (mlx5_dev_spawn_one()).
> 
> (Patch based on prior work from Yuanhan Liu)
> 
> Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
> ---
>  drivers/net/mlx5/mlx5.c | 332 ++++++++++++++++++++++++++-----------------
>  1 file changed, 201 insertions(+), 131 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c index 41a542ebc..7a812ef93 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -633,30 +633,34 @@ mlx5_uar_init_secondary(struct rte_eth_dev *dev)  }
> 
>  /**
> - * DPDK callback to register a PCI device.
> - *
> - * This function creates an Ethernet device for each port of a given
> - * PCI device.
> + * Spawn an Ethernet device from Verbs information.
>   *
> - * @param[in] pci_drv
> - *   PCI driver structure (mlx5_driver).
> - * @param[in] pci_dev
> - *   PCI device information.
> + * @param dpdk_dev
> + *   Backing DPDK device.
> + * @param ibv_dev
> + *   Verbs device.
> + * @param vf
> + *   If nonzero, enable VF-specific features.
> + * @param[in] attr
> + *   Verbs device attributes.
> + * @param port
> + *   Verbs port to use (indexed from 1).
>   *
>   * @return
> - *   0 on success, a negative errno value otherwise and rte_errno is set.
> + *   A valid Ethernet device object on success, NULL otherwise and rte_errno
> + *   is set.
>   */
> -static int
> -mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> -	       struct rte_pci_device *pci_dev)
> +static struct rte_eth_dev *
> +mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
> +		   struct ibv_device *ibv_dev,
> +		   int vf,
> +		   const struct ibv_device_attr_ex *attr,
> +		   unsigned int port)
>  {
> -	struct ibv_device **list = NULL;
> -	struct ibv_device *ibv_dev;
> -	struct ibv_context *ctx = NULL;
> -	struct ibv_device_attr_ex attr;
> +	struct ibv_context *ctx;
>  	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
> +	struct rte_eth_dev *eth_dev = NULL;
>  	int err = 0;
> -	unsigned int vf = 0;
>  	unsigned int mps;
>  	unsigned int cqe_comp;
>  	unsigned int tunnel_en = 0;
> @@ -668,71 +672,18 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  	unsigned int mprq_max_stride_size_n = 0;
>  	unsigned int mprq_min_stride_num_n = 0;
>  	unsigned int mprq_max_stride_num_n = 0;
> -	int i;
>  #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
>  	struct ibv_counter_set_description cs_desc;  #endif
> 
>  	/* Prepare shared data between primary and secondary process. */
>  	mlx5_prepare_shared_data();
> -	assert(pci_drv == &mlx5_driver);
> -	list = mlx5_glue->get_device_list(&i);
> -	if (list == NULL) {
> -		assert(errno);
> -		err = errno;
> -		if (errno == ENOSYS)
> -			DRV_LOG(ERR,
> -				"cannot list devices, is ib_uverbs loaded?");
> -		goto error;
> -	}
> -	assert(i >= 0);
> -	/*
> -	 * For each listed device, check related sysfs entry against
> -	 * the provided PCI ID.
> -	 */
> -	while (i != 0) {
> -		struct rte_pci_addr pci_addr;
> -
> -		--i;
> -		DRV_LOG(DEBUG, "checking device \"%s\"", list[i]->name);
> -		if (mlx5_ibv_device_to_pci_addr(list[i], &pci_addr))
> -			continue;
> -		if ((pci_dev->addr.domain != pci_addr.domain) ||
> -		    (pci_dev->addr.bus != pci_addr.bus) ||
> -		    (pci_dev->addr.devid != pci_addr.devid) ||
> -		    (pci_dev->addr.function != pci_addr.function))
> -			continue;
> -		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
> -			list[i]->name);
> -		vf = ((pci_dev->id.device_id ==
> -		       PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) ||
> -		      (pci_dev->id.device_id ==
> -		       PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) ||
> -		      (pci_dev->id.device_id ==
> -		       PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
> -		      (pci_dev->id.device_id ==
> -		       PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
> -		ctx = mlx5_glue->open_device(list[i]);
> -		rte_errno = errno;
> -		err = rte_errno;
> -		break;
> -	}
> -	if (ctx == NULL) {
> -		switch (err) {
> -		case 0:
> -			DRV_LOG(ERR,
> -				"cannot access device, is mlx5_ib loaded?");
> -			err = ENODEV;
> -			break;
> -		case EINVAL:
> -			DRV_LOG(ERR,
> -				"cannot use device, are drivers up to date?");
> -			break;
> -		}
> -		goto error;
> +	errno = 0;
> +	ctx = mlx5_glue->open_device(ibv_dev);
> +	if (!ctx) {
> +		rte_errno = errno ? errno : ENODEV;
> +		return NULL;
>  	}
> -	ibv_dev = list[i];
> -	DRV_LOG(DEBUG, "device opened");
>  #ifdef HAVE_IBV_MLX5_MOD_SWP
>  	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;  #endif @@ -820,20 +771,11 @@
> mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  	DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
>  		" old OFED/rdma-core version or firmware configuration");  #endif
> -	err = mlx5_glue->query_device_ex(ctx, NULL, &attr);
> -	if (err) {
> -		DEBUG("ibv_query_device_ex() failed");
> -		goto error;
> -	}
> -	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
> -	for (i = 0; i < attr.orig_attr.phys_port_cnt; i++) {
> +	{
>  		char name[RTE_ETH_NAME_MAX_LEN];
> -		int len;
> -		uint32_t port = i + 1; /* ports are indexed from one */
>  		struct ibv_port_attr port_attr;
>  		struct ibv_pd *pd = NULL;
>  		struct priv *priv = NULL;
> -		struct rte_eth_dev *eth_dev = NULL;
>  		struct ether_addr mac;
>  		struct mlx5_dev_config config = {
>  			.cqe_comp = cqe_comp,
> @@ -857,11 +799,11 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  			},
>  		};
> 
> -		len = snprintf(name, sizeof(name), PCI_PRI_FMT,
> -			 pci_dev->addr.domain, pci_dev->addr.bus,
> -			 pci_dev->addr.devid, pci_dev->addr.function);
> -		if (attr.orig_attr.phys_port_cnt > 1)
> -			snprintf(name + len, sizeof(name), " port %u", i);
> +		if (attr->orig_attr.phys_port_cnt > 1)
> +			snprintf(name, sizeof(name), "%s", dpdk_dev->name);
> +		else
> +			snprintf(name, sizeof(name), "%s port %u",
> +				 dpdk_dev->name, port);

Name contains port only if phys_port_cnt > 1 in previous logic, are you sure?

>  		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
>  			eth_dev = rte_eth_dev_attach_secondary(name);
>  			if (eth_dev == NULL) {
> @@ -870,7 +812,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  				err = rte_errno;
>  				goto error;
>  			}
> -			eth_dev->device = &pci_dev->device;
> +			eth_dev->device = dpdk_dev;
>  			eth_dev->dev_ops = &mlx5_dev_sec_ops;
>  			err = mlx5_uar_init_secondary(eth_dev);
>  			if (err) {
> @@ -898,16 +840,10 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  				mlx5_select_rx_function(eth_dev);
>  			eth_dev->tx_pkt_burst =
>  				mlx5_select_tx_function(eth_dev);
> -			rte_eth_dev_probing_finish(eth_dev);
> -			continue;
> +			mlx5_glue->close_device(ctx);
> +			return eth_dev;
>  		}
>  		DRV_LOG(DEBUG, "using port %u", port);
> -		if (!ctx)
> -			ctx = mlx5_glue->open_device(ibv_dev);
> -		if (ctx == NULL) {
> -			err = ENODEV;
> -			goto port_error;
> -		}
>  		/* Check port status. */
>  		err = mlx5_glue->query_port(ctx, port, &port_attr);
>  		if (err) {
> @@ -945,23 +881,23 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  		priv->ctx = ctx;
>  		strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
>  			sizeof(priv->ibdev_path));
> -		priv->device_attr = attr;
> +		priv->device_attr = *attr;
>  		priv->port = port;
>  		priv->pd = pd;
>  		priv->mtu = ETHER_MTU;
> -		err = mlx5_args(&config, pci_dev->device.devargs);
> +		err = mlx5_args(&config, dpdk_dev->devargs);
>  		if (err) {
>  			err = rte_errno;
>  			DRV_LOG(ERR, "failed to process device arguments: %s",
>  				strerror(rte_errno));
>  			goto port_error;
>  		}
> -		config.hw_csum = !!(attr.device_cap_flags_ex &
> +		config.hw_csum = !!(attr->device_cap_flags_ex &
>  				    IBV_DEVICE_RAW_IP_CSUM);
>  		DRV_LOG(DEBUG, "checksum offloading is %ssupported",
>  			(config.hw_csum ? "" : "not "));
>  #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
> -		config.flow_counter_en = !!attr.max_counter_sets;
> +		config.flow_counter_en = !!attr->max_counter_sets;
>  		mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
>  		DRV_LOG(DEBUG,
>  			"counter type = %d, num of cs = %ld, attributes = %d", @@ -969,7 +905,7 @@
> mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  			cs_desc.attributes);
>  #endif
>  		config.ind_table_max_size =
> -			attr.rss_caps.max_rwq_indirection_table_size;
> +			attr->rss_caps.max_rwq_indirection_table_size;
>  		/* Remove this check once DPDK supports larger/variable
>  		 * indirection tables. */
>  		if (config.ind_table_max_size >
> @@ -977,28 +913,28 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  			config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
>  		DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
>  			config.ind_table_max_size);
> -		config.hw_vlan_strip = !!(attr.raw_packet_caps &
> +		config.hw_vlan_strip = !!(attr->raw_packet_caps &
>  					 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
>  		DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
>  			(config.hw_vlan_strip ? "" : "not "));
> 
> -		config.hw_fcs_strip = !!(attr.raw_packet_caps &
> +		config.hw_fcs_strip = !!(attr->raw_packet_caps &
>  					 IBV_RAW_PACKET_CAP_SCATTER_FCS);
>  		DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
>  			(config.hw_fcs_strip ? "" : "not "));
> 
>  #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
> -		config.hw_padding = !!attr.rx_pad_end_addr_align;
> +		config.hw_padding = !!attr->rx_pad_end_addr_align;
>  #endif
>  		DRV_LOG(DEBUG,
>  			"hardware Rx end alignment padding is %ssupported",
>  			(config.hw_padding ? "" : "not "));
>  		config.vf = vf;
> -		config.tso = (attr.tso_caps.max_tso > 0 &&
> -			      (attr.tso_caps.supported_qpts &
> +		config.tso = (attr->tso_caps.max_tso > 0 &&
> +			      (attr->tso_caps.supported_qpts &
>  			      (1 << IBV_QPT_RAW_PACKET)));
>  		if (config.tso)
> -			config.tso_max_payload_sz = attr.tso_caps.max_tso;
> +			config.tso_max_payload_sz = attr->tso_caps.max_tso;
>  		if (config.mps && !mps) {
>  			DRV_LOG(ERR,
>  				"multi-packet send not supported on this device"
> @@ -1039,8 +975,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  		eth_dev->data->dev_private = priv;
>  		priv->dev_data = eth_dev->data;
>  		eth_dev->data->mac_addrs = priv->mac;
> -		eth_dev->device = &pci_dev->device;
> -		rte_eth_copy_pci_info(eth_dev, pci_dev);
> +		eth_dev->device = dpdk_dev;
>  		eth_dev->device->driver = &mlx5_driver.driver;
>  		err = mlx5_uar_init_primary(eth_dev);
>  		if (err) {
> @@ -1145,13 +1080,7 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  				 priv, mem_event_cb);
>  		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
>  		rte_eth_dev_probing_finish(eth_dev);
> -		/*
> -		 * Each eth_dev instance is assigned its own Verbs context,
> -		 * since this one is consumed, let the next iteration open
> -		 * another.
> -		 */
> -		ctx = NULL;
> -		continue;
> +		return eth_dev;
>  port_error:
>  		if (priv)
>  			rte_free(priv);
> @@ -1159,24 +1088,165 @@ mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
>  			claim_zero(mlx5_glue->dealloc_pd(pd));
>  		if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY)
>  			rte_eth_dev_release_port(eth_dev);
> -		break;
>  	}
> -	/*
> -	 * XXX if something went wrong in the loop above, there is a resource
> -	 * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as
> -	 * long as the dpdk does not provide a way to deallocate a ethdev and a
> -	 * way to enumerate the registered ethdevs to free the previous ones.
> -	 */
>  error:
>  	if (ctx)
>  		claim_zero(mlx5_glue->close_device(ctx));
> -	if (list)
> -		mlx5_glue->free_device_list(list);
> -	if (err) {
> -		rte_errno = err;
> +	assert(err > 0);
> +	rte_errno = err;
> +	return NULL;
> +}
> +
> +/**
> + * Spawn Ethernet devices from Verbs information, one per detected port.
> + *
> + * @param dpdk_dev
> + *   Backing DPDK device.
> + * @param ibv_dev
> + *   Verbs device.
> + * @param vf
> + *   If nonzero, enable VF-specific features.
> + *
> + * @return
> + *   A NULL-terminated list of Ethernet device objects on success, NULL
> + *   otherwise and rte_errno is set. Caller is expected to release list
> + *   memory through free().
> + */
> +static struct rte_eth_dev **
> +mlx5_dev_spawn(struct rte_device *dpdk_dev,
> +	       struct ibv_device *ibv_dev,
> +	       int vf)
> +{
> +	struct rte_eth_dev **eth_list = NULL;
> +	struct ibv_context *ctx;
> +	struct ibv_device_attr_ex attr;
> +	unsigned int i;
> +	int ret;
> +
> +	errno = 0;
> +	ctx = mlx5_glue->open_device(ibv_dev);
> +	if (!ctx) {
> +		rte_errno = errno ? errno : ENODEV;
> +		if (rte_errno == ENODEV)
> +			DRV_LOG(ERR,
> +				"cannot access device, is mlx5_ib loaded?");
> +		else
> +			DRV_LOG(ERR,
> +				"cannot use device, are drivers up to date?");
> +		return NULL;
> +	}
> +	ret = mlx5_glue->query_device_ex(ctx, NULL, &attr);
> +	mlx5_glue->close_device(ctx);
> +	if (ret) {
> +		rte_errno = ret;
> +		DRV_LOG(ERR, "unable to query device information: %s",
> +			strerror(rte_errno));
> +		return NULL;
> +	}
> +	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
> +	eth_list = malloc(sizeof(*eth_list) *
> +			  (attr.orig_attr.phys_port_cnt + 1));
> +	if (!eth_list) {
> +		rte_errno = errno;
> +		return NULL;
> +	}
> +	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> +		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
> +						 &attr, i + 1);
> +		if (eth_list[i])
> +			continue;
> +		/* Save rte_errno and roll back in case of failure. */
> +		ret = rte_errno;
> +		while (i--) {
> +			mlx5_dev_close(eth_list[i]);
> +			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> +				rte_free(eth_list[i]->data->dev_private);
> +			claim_zero(rte_eth_dev_release_port(eth_list[i]));
> +		}
> +		free(eth_list);
> +		rte_errno = ret;
> +		return NULL;

The code is correct, but I personally prefer to move complex error handling to 
dedicate "error:" block to make the code clear.

> +	}
> +	eth_list[i] = NULL;
> +	return eth_list;
> +}
> +
> +/**
> + * DPDK callback to register a PCI device.
> + *
> + * This function creates an Ethernet device for each port of a given
> + * PCI device.
> + *
> + * @param[in] pci_drv
> + *   PCI driver structure (mlx5_driver).
> + * @param[in] pci_dev
> + *   PCI device information.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> +	       struct rte_pci_device *pci_dev) {
> +	struct ibv_device **ibv_list;
> +	struct rte_eth_dev **eth_list = NULL;
> +	int vf;
> +	int ret;
> +
> +	assert(pci_drv == &mlx5_driver);
> +	switch (pci_dev->id.device_id) {
> +	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
> +	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
> +	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
> +	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
> +		vf = 1;
> +		break;
> +	default:
> +		vf = 0;
> +	}

How about use a macro for vf detection and invoke in mlx5_dev_spawn_one().
Seems it not used in outer callers.
	
> +	errno = 0;
> +	ibv_list = mlx5_glue->get_device_list(&ret);
> +	if (!ibv_list) {
> +		rte_errno = errno ? errno : ENOSYS;
> +		DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
>  		return -rte_errno;
>  	}
> -	return 0;
> +	while (ret-- > 0) {
> +		struct rte_pci_addr pci_addr;
> +
> +		DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
> +		if (mlx5_ibv_device_to_pci_addr(ibv_list[ret], &pci_addr))
> +			continue;
> +		if (pci_dev->addr.domain != pci_addr.domain ||
> +		    pci_dev->addr.bus != pci_addr.bus ||
> +		    pci_dev->addr.devid != pci_addr.devid ||
> +		    pci_dev->addr.function != pci_addr.function)
> +			continue;
> +		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
> +			ibv_list[ret]->name);
> +		break;
> +	}
> +	if (ret >= 0)
> +		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_list[ret], vf);
> +	mlx5_glue->free_device_list(ibv_list);
> +	if (!eth_list || !*eth_list) {
> +		DRV_LOG(WARNING,
> +			"no Verbs device matches PCI " PCI_PRI_FMT ","
> +			" are kernel drivers loaded?",
> +			pci_dev->addr.domain, pci_dev->addr.bus,
> +			pci_dev->addr.devid, pci_dev->addr.function);
> +		rte_errno = ENOENT;
> +		ret = -rte_errno;
> +	} else {
> +		for (ret = 0; eth_list[ret]; ++ret) {
> +			rte_eth_copy_pci_info(eth_list[ret], pci_dev);
> +			rte_eth_dev_probing_finish(eth_list[ret]);
> +		}
> +		ret = 0;
> +	}
> +	free(eth_list);
> +	return ret;
>  }
> 
>  static const struct rte_pci_id mlx5_pci_id_map[] = {
> --
> 2.11.0
  
Adrien Mazarguil June 12, 2018, 1:20 p.m. UTC | #2
On Sun, Jun 10, 2018 at 12:59:06PM +0000, Xueming(Steven) Li wrote:
> Hi Adrien,
> 
> The logic looks much more clear now with the split.
<snip>
> > -		len = snprintf(name, sizeof(name), PCI_PRI_FMT,
> > -			 pci_dev->addr.domain, pci_dev->addr.bus,
> > -			 pci_dev->addr.devid, pci_dev->addr.function);
> > -		if (attr.orig_attr.phys_port_cnt > 1)
> > -			snprintf(name + len, sizeof(name), " port %u", i);
> > +		if (attr->orig_attr.phys_port_cnt > 1)
> > +			snprintf(name, sizeof(name), "%s", dpdk_dev->name);
> > +		else
> > +			snprintf(name, sizeof(name), "%s port %u",
> > +				 dpdk_dev->name, port);
> 
> Name contains port only if phys_port_cnt > 1 in previous logic, are you sure?

Nice catch, will fix it for v2. This wasn't noticed because this code is
replaced in a subsequent patch of the series.

<snip>
> > +	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
> > +		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
> > +						 &attr, i + 1);
> > +		if (eth_list[i])
> > +			continue;
> > +		/* Save rte_errno and roll back in case of failure. */
> > +		ret = rte_errno;
> > +		while (i--) {
> > +			mlx5_dev_close(eth_list[i]);
> > +			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
> > +				rte_free(eth_list[i]->data->dev_private);
> > +			claim_zero(rte_eth_dev_release_port(eth_list[i]));
> > +		}
> > +		free(eth_list);
> > +		rte_errno = ret;
> > +		return NULL;
> 
> The code is correct, but I personally prefer to move complex error handling to 
> dedicate "error:" block to make the code clear.

Since it's the only place where this failure can occur, I'll leave it as is
on the basis that doing so saves a goto statement. Those should be avoided
where possible. It would have been a different story if the same error
handling code was called from multiple places.

<snip>
> > +static int
> > +mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
> > +	       struct rte_pci_device *pci_dev) {
> > +	struct ibv_device **ibv_list;
> > +	struct rte_eth_dev **eth_list = NULL;
> > +	int vf;
> > +	int ret;
> > +
> > +	assert(pci_drv == &mlx5_driver);
> > +	switch (pci_dev->id.device_id) {
> > +	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
> > +	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
> > +	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
> > +	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
> > +		vf = 1;
> > +		break;
> > +	default:
> > +		vf = 0;
> > +	}
> 
> How about use a macro for vf detection and invoke in mlx5_dev_spawn_one().
> Seems it not used in outer callers.

mlx5_dev_spawn_one() can be invoked with IB devices not backed by PCI
(e.g. vdevs), for which the caller may still knowingly ask for VF behavior,
either by user request or through other means.

In this case, the caller happens to be a PCI probing function which, based
on the device ID, easily determines whether VF behavior shall be requested.

This is basically the only place where PCI device ID can be checked. Adding
a macro here would only obfuscate this check. Adding it in
mlx5_dev_spawn_one() would entangle PCI and generic code again, the opposite
of the purpose of this patch, therefore I'll leave it unmodified for v2.
  

Patch

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index 41a542ebc..7a812ef93 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -633,30 +633,34 @@  mlx5_uar_init_secondary(struct rte_eth_dev *dev)
 }
 
 /**
- * DPDK callback to register a PCI device.
- *
- * This function creates an Ethernet device for each port of a given
- * PCI device.
+ * Spawn an Ethernet device from Verbs information.
  *
- * @param[in] pci_drv
- *   PCI driver structure (mlx5_driver).
- * @param[in] pci_dev
- *   PCI device information.
+ * @param dpdk_dev
+ *   Backing DPDK device.
+ * @param ibv_dev
+ *   Verbs device.
+ * @param vf
+ *   If nonzero, enable VF-specific features.
+ * @param[in] attr
+ *   Verbs device attributes.
+ * @param port
+ *   Verbs port to use (indexed from 1).
  *
  * @return
- *   0 on success, a negative errno value otherwise and rte_errno is set.
+ *   A valid Ethernet device object on success, NULL otherwise and rte_errno
+ *   is set.
  */
-static int
-mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
-	       struct rte_pci_device *pci_dev)
+static struct rte_eth_dev *
+mlx5_dev_spawn_one(struct rte_device *dpdk_dev,
+		   struct ibv_device *ibv_dev,
+		   int vf,
+		   const struct ibv_device_attr_ex *attr,
+		   unsigned int port)
 {
-	struct ibv_device **list = NULL;
-	struct ibv_device *ibv_dev;
-	struct ibv_context *ctx = NULL;
-	struct ibv_device_attr_ex attr;
+	struct ibv_context *ctx;
 	struct mlx5dv_context dv_attr = { .comp_mask = 0 };
+	struct rte_eth_dev *eth_dev = NULL;
 	int err = 0;
-	unsigned int vf = 0;
 	unsigned int mps;
 	unsigned int cqe_comp;
 	unsigned int tunnel_en = 0;
@@ -668,71 +672,18 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	unsigned int mprq_max_stride_size_n = 0;
 	unsigned int mprq_min_stride_num_n = 0;
 	unsigned int mprq_max_stride_num_n = 0;
-	int i;
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
 	struct ibv_counter_set_description cs_desc;
 #endif
 
 	/* Prepare shared data between primary and secondary process. */
 	mlx5_prepare_shared_data();
-	assert(pci_drv == &mlx5_driver);
-	list = mlx5_glue->get_device_list(&i);
-	if (list == NULL) {
-		assert(errno);
-		err = errno;
-		if (errno == ENOSYS)
-			DRV_LOG(ERR,
-				"cannot list devices, is ib_uverbs loaded?");
-		goto error;
-	}
-	assert(i >= 0);
-	/*
-	 * For each listed device, check related sysfs entry against
-	 * the provided PCI ID.
-	 */
-	while (i != 0) {
-		struct rte_pci_addr pci_addr;
-
-		--i;
-		DRV_LOG(DEBUG, "checking device \"%s\"", list[i]->name);
-		if (mlx5_ibv_device_to_pci_addr(list[i], &pci_addr))
-			continue;
-		if ((pci_dev->addr.domain != pci_addr.domain) ||
-		    (pci_dev->addr.bus != pci_addr.bus) ||
-		    (pci_dev->addr.devid != pci_addr.devid) ||
-		    (pci_dev->addr.function != pci_addr.function))
-			continue;
-		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
-			list[i]->name);
-		vf = ((pci_dev->id.device_id ==
-		       PCI_DEVICE_ID_MELLANOX_CONNECTX4VF) ||
-		      (pci_dev->id.device_id ==
-		       PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF) ||
-		      (pci_dev->id.device_id ==
-		       PCI_DEVICE_ID_MELLANOX_CONNECTX5VF) ||
-		      (pci_dev->id.device_id ==
-		       PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF));
-		ctx = mlx5_glue->open_device(list[i]);
-		rte_errno = errno;
-		err = rte_errno;
-		break;
-	}
-	if (ctx == NULL) {
-		switch (err) {
-		case 0:
-			DRV_LOG(ERR,
-				"cannot access device, is mlx5_ib loaded?");
-			err = ENODEV;
-			break;
-		case EINVAL:
-			DRV_LOG(ERR,
-				"cannot use device, are drivers up to date?");
-			break;
-		}
-		goto error;
+	errno = 0;
+	ctx = mlx5_glue->open_device(ibv_dev);
+	if (!ctx) {
+		rte_errno = errno ? errno : ENODEV;
+		return NULL;
 	}
-	ibv_dev = list[i];
-	DRV_LOG(DEBUG, "device opened");
 #ifdef HAVE_IBV_MLX5_MOD_SWP
 	dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
 #endif
@@ -820,20 +771,11 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 	DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
 		" old OFED/rdma-core version or firmware configuration");
 #endif
-	err = mlx5_glue->query_device_ex(ctx, NULL, &attr);
-	if (err) {
-		DEBUG("ibv_query_device_ex() failed");
-		goto error;
-	}
-	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
-	for (i = 0; i < attr.orig_attr.phys_port_cnt; i++) {
+	{
 		char name[RTE_ETH_NAME_MAX_LEN];
-		int len;
-		uint32_t port = i + 1; /* ports are indexed from one */
 		struct ibv_port_attr port_attr;
 		struct ibv_pd *pd = NULL;
 		struct priv *priv = NULL;
-		struct rte_eth_dev *eth_dev = NULL;
 		struct ether_addr mac;
 		struct mlx5_dev_config config = {
 			.cqe_comp = cqe_comp,
@@ -857,11 +799,11 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			},
 		};
 
-		len = snprintf(name, sizeof(name), PCI_PRI_FMT,
-			 pci_dev->addr.domain, pci_dev->addr.bus,
-			 pci_dev->addr.devid, pci_dev->addr.function);
-		if (attr.orig_attr.phys_port_cnt > 1)
-			snprintf(name + len, sizeof(name), " port %u", i);
+		if (attr->orig_attr.phys_port_cnt > 1)
+			snprintf(name, sizeof(name), "%s", dpdk_dev->name);
+		else
+			snprintf(name, sizeof(name), "%s port %u",
+				 dpdk_dev->name, port);
 		if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
 			eth_dev = rte_eth_dev_attach_secondary(name);
 			if (eth_dev == NULL) {
@@ -870,7 +812,7 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				err = rte_errno;
 				goto error;
 			}
-			eth_dev->device = &pci_dev->device;
+			eth_dev->device = dpdk_dev;
 			eth_dev->dev_ops = &mlx5_dev_sec_ops;
 			err = mlx5_uar_init_secondary(eth_dev);
 			if (err) {
@@ -898,16 +840,10 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				mlx5_select_rx_function(eth_dev);
 			eth_dev->tx_pkt_burst =
 				mlx5_select_tx_function(eth_dev);
-			rte_eth_dev_probing_finish(eth_dev);
-			continue;
+			mlx5_glue->close_device(ctx);
+			return eth_dev;
 		}
 		DRV_LOG(DEBUG, "using port %u", port);
-		if (!ctx)
-			ctx = mlx5_glue->open_device(ibv_dev);
-		if (ctx == NULL) {
-			err = ENODEV;
-			goto port_error;
-		}
 		/* Check port status. */
 		err = mlx5_glue->query_port(ctx, port, &port_attr);
 		if (err) {
@@ -945,23 +881,23 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		priv->ctx = ctx;
 		strncpy(priv->ibdev_path, priv->ctx->device->ibdev_path,
 			sizeof(priv->ibdev_path));
-		priv->device_attr = attr;
+		priv->device_attr = *attr;
 		priv->port = port;
 		priv->pd = pd;
 		priv->mtu = ETHER_MTU;
-		err = mlx5_args(&config, pci_dev->device.devargs);
+		err = mlx5_args(&config, dpdk_dev->devargs);
 		if (err) {
 			err = rte_errno;
 			DRV_LOG(ERR, "failed to process device arguments: %s",
 				strerror(rte_errno));
 			goto port_error;
 		}
-		config.hw_csum = !!(attr.device_cap_flags_ex &
+		config.hw_csum = !!(attr->device_cap_flags_ex &
 				    IBV_DEVICE_RAW_IP_CSUM);
 		DRV_LOG(DEBUG, "checksum offloading is %ssupported",
 			(config.hw_csum ? "" : "not "));
 #ifdef HAVE_IBV_DEVICE_COUNTERS_SET_SUPPORT
-		config.flow_counter_en = !!attr.max_counter_sets;
+		config.flow_counter_en = !!attr->max_counter_sets;
 		mlx5_glue->describe_counter_set(ctx, 0, &cs_desc);
 		DRV_LOG(DEBUG,
 			"counter type = %d, num of cs = %ld, attributes = %d",
@@ -969,7 +905,7 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			cs_desc.attributes);
 #endif
 		config.ind_table_max_size =
-			attr.rss_caps.max_rwq_indirection_table_size;
+			attr->rss_caps.max_rwq_indirection_table_size;
 		/* Remove this check once DPDK supports larger/variable
 		 * indirection tables. */
 		if (config.ind_table_max_size >
@@ -977,28 +913,28 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			config.ind_table_max_size = ETH_RSS_RETA_SIZE_512;
 		DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
 			config.ind_table_max_size);
-		config.hw_vlan_strip = !!(attr.raw_packet_caps &
+		config.hw_vlan_strip = !!(attr->raw_packet_caps &
 					 IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
 		DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
 			(config.hw_vlan_strip ? "" : "not "));
 
-		config.hw_fcs_strip = !!(attr.raw_packet_caps &
+		config.hw_fcs_strip = !!(attr->raw_packet_caps &
 					 IBV_RAW_PACKET_CAP_SCATTER_FCS);
 		DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
 			(config.hw_fcs_strip ? "" : "not "));
 
 #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
-		config.hw_padding = !!attr.rx_pad_end_addr_align;
+		config.hw_padding = !!attr->rx_pad_end_addr_align;
 #endif
 		DRV_LOG(DEBUG,
 			"hardware Rx end alignment padding is %ssupported",
 			(config.hw_padding ? "" : "not "));
 		config.vf = vf;
-		config.tso = (attr.tso_caps.max_tso > 0 &&
-			      (attr.tso_caps.supported_qpts &
+		config.tso = (attr->tso_caps.max_tso > 0 &&
+			      (attr->tso_caps.supported_qpts &
 			      (1 << IBV_QPT_RAW_PACKET)));
 		if (config.tso)
-			config.tso_max_payload_sz = attr.tso_caps.max_tso;
+			config.tso_max_payload_sz = attr->tso_caps.max_tso;
 		if (config.mps && !mps) {
 			DRV_LOG(ERR,
 				"multi-packet send not supported on this device"
@@ -1039,8 +975,7 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 		eth_dev->data->dev_private = priv;
 		priv->dev_data = eth_dev->data;
 		eth_dev->data->mac_addrs = priv->mac;
-		eth_dev->device = &pci_dev->device;
-		rte_eth_copy_pci_info(eth_dev, pci_dev);
+		eth_dev->device = dpdk_dev;
 		eth_dev->device->driver = &mlx5_driver.driver;
 		err = mlx5_uar_init_primary(eth_dev);
 		if (err) {
@@ -1145,13 +1080,7 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 				 priv, mem_event_cb);
 		rte_rwlock_write_unlock(&mlx5_shared_data->mem_event_rwlock);
 		rte_eth_dev_probing_finish(eth_dev);
-		/*
-		 * Each eth_dev instance is assigned its own Verbs context,
-		 * since this one is consumed, let the next iteration open
-		 * another.
-		 */
-		ctx = NULL;
-		continue;
+		return eth_dev;
 port_error:
 		if (priv)
 			rte_free(priv);
@@ -1159,24 +1088,165 @@  mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
 			claim_zero(mlx5_glue->dealloc_pd(pd));
 		if (eth_dev && rte_eal_process_type() == RTE_PROC_PRIMARY)
 			rte_eth_dev_release_port(eth_dev);
-		break;
 	}
-	/*
-	 * XXX if something went wrong in the loop above, there is a resource
-	 * leak (ctx, pd, priv, dpdk ethdev) but we can do nothing about it as
-	 * long as the dpdk does not provide a way to deallocate a ethdev and a
-	 * way to enumerate the registered ethdevs to free the previous ones.
-	 */
 error:
 	if (ctx)
 		claim_zero(mlx5_glue->close_device(ctx));
-	if (list)
-		mlx5_glue->free_device_list(list);
-	if (err) {
-		rte_errno = err;
+	assert(err > 0);
+	rte_errno = err;
+	return NULL;
+}
+
+/**
+ * Spawn Ethernet devices from Verbs information, one per detected port.
+ *
+ * @param dpdk_dev
+ *   Backing DPDK device.
+ * @param ibv_dev
+ *   Verbs device.
+ * @param vf
+ *   If nonzero, enable VF-specific features.
+ *
+ * @return
+ *   A NULL-terminated list of Ethernet device objects on success, NULL
+ *   otherwise and rte_errno is set. Caller is expected to release list
+ *   memory through free().
+ */
+static struct rte_eth_dev **
+mlx5_dev_spawn(struct rte_device *dpdk_dev,
+	       struct ibv_device *ibv_dev,
+	       int vf)
+{
+	struct rte_eth_dev **eth_list = NULL;
+	struct ibv_context *ctx;
+	struct ibv_device_attr_ex attr;
+	unsigned int i;
+	int ret;
+
+	errno = 0;
+	ctx = mlx5_glue->open_device(ibv_dev);
+	if (!ctx) {
+		rte_errno = errno ? errno : ENODEV;
+		if (rte_errno == ENODEV)
+			DRV_LOG(ERR,
+				"cannot access device, is mlx5_ib loaded?");
+		else
+			DRV_LOG(ERR,
+				"cannot use device, are drivers up to date?");
+		return NULL;
+	}
+	ret = mlx5_glue->query_device_ex(ctx, NULL, &attr);
+	mlx5_glue->close_device(ctx);
+	if (ret) {
+		rte_errno = ret;
+		DRV_LOG(ERR, "unable to query device information: %s",
+			strerror(rte_errno));
+		return NULL;
+	}
+	DRV_LOG(INFO, "%u port(s) detected", attr.orig_attr.phys_port_cnt);
+	eth_list = malloc(sizeof(*eth_list) *
+			  (attr.orig_attr.phys_port_cnt + 1));
+	if (!eth_list) {
+		rte_errno = errno;
+		return NULL;
+	}
+	for (i = 0; i < attr.orig_attr.phys_port_cnt; ++i) {
+		eth_list[i] = mlx5_dev_spawn_one(dpdk_dev, ibv_dev, vf,
+						 &attr, i + 1);
+		if (eth_list[i])
+			continue;
+		/* Save rte_errno and roll back in case of failure. */
+		ret = rte_errno;
+		while (i--) {
+			mlx5_dev_close(eth_list[i]);
+			if (rte_eal_process_type() == RTE_PROC_PRIMARY)
+				rte_free(eth_list[i]->data->dev_private);
+			claim_zero(rte_eth_dev_release_port(eth_list[i]));
+		}
+		free(eth_list);
+		rte_errno = ret;
+		return NULL;
+	}
+	eth_list[i] = NULL;
+	return eth_list;
+}
+
+/**
+ * DPDK callback to register a PCI device.
+ *
+ * This function creates an Ethernet device for each port of a given
+ * PCI device.
+ *
+ * @param[in] pci_drv
+ *   PCI driver structure (mlx5_driver).
+ * @param[in] pci_dev
+ *   PCI device information.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
+	       struct rte_pci_device *pci_dev)
+{
+	struct ibv_device **ibv_list;
+	struct rte_eth_dev **eth_list = NULL;
+	int vf;
+	int ret;
+
+	assert(pci_drv == &mlx5_driver);
+	switch (pci_dev->id.device_id) {
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
+	case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
+		vf = 1;
+		break;
+	default:
+		vf = 0;
+	}
+	errno = 0;
+	ibv_list = mlx5_glue->get_device_list(&ret);
+	if (!ibv_list) {
+		rte_errno = errno ? errno : ENOSYS;
+		DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
 		return -rte_errno;
 	}
-	return 0;
+	while (ret-- > 0) {
+		struct rte_pci_addr pci_addr;
+
+		DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
+		if (mlx5_ibv_device_to_pci_addr(ibv_list[ret], &pci_addr))
+			continue;
+		if (pci_dev->addr.domain != pci_addr.domain ||
+		    pci_dev->addr.bus != pci_addr.bus ||
+		    pci_dev->addr.devid != pci_addr.devid ||
+		    pci_dev->addr.function != pci_addr.function)
+			continue;
+		DRV_LOG(INFO, "PCI information matches, using device \"%s\"",
+			ibv_list[ret]->name);
+		break;
+	}
+	if (ret >= 0)
+		eth_list = mlx5_dev_spawn(&pci_dev->device, ibv_list[ret], vf);
+	mlx5_glue->free_device_list(ibv_list);
+	if (!eth_list || !*eth_list) {
+		DRV_LOG(WARNING,
+			"no Verbs device matches PCI " PCI_PRI_FMT ","
+			" are kernel drivers loaded?",
+			pci_dev->addr.domain, pci_dev->addr.bus,
+			pci_dev->addr.devid, pci_dev->addr.function);
+		rte_errno = ENOENT;
+		ret = -rte_errno;
+	} else {
+		for (ret = 0; eth_list[ret]; ++ret) {
+			rte_eth_copy_pci_info(eth_list[ret], pci_dev);
+			rte_eth_dev_probing_finish(eth_list[ret]);
+		}
+		ret = 0;
+	}
+	free(eth_list);
+	return ret;
 }
 
 static const struct rte_pci_id mlx5_pci_id_map[] = {