[5/6] net/mlx5: support PCI device DMA map and unmap

Message ID a6e88d434eea7d2d8407008bb94298c2d74a2bc9.1550048188.git.shahafs@mellanox.com (mailing list archive)
State Superseded, archived
Series introduce DMA memory mapping for external memory

Checks

Context               Check     Description
ci/checkpatch         warning   coding style issues
ci/Intel-compilation  success   Compilation OK

Commit Message

Shahaf Shuler Feb. 13, 2019, 9:10 a.m. UTC
The implementation reuses the external memory registration work done by
commit [1].

Note about representors:

The current representor design will not work with these map and unmap
functions. The reason is that for representors we have multiple IB
devices sharing the same PCI function, so mapping will happen only on
one of the representors and not all of them.

While it is possible to implement such support, the IB representor
design is going to be changed during DPDK 19.05. The new design will
have a single IB device for all representors, hence sharing a single
memory region between all representors will be possible.

[1]
commit 7e43a32ee060
("net/mlx5: support externally allocated static memory")

Signed-off-by: Shahaf Shuler <shahafs@mellanox.com>
---
 drivers/net/mlx5/mlx5.c      |   2 +
 drivers/net/mlx5/mlx5_mr.c   | 146 ++++++++++++++++++++++++++++++++++++++
 drivers/net/mlx5/mlx5_rxtx.h |   5 ++
 3 files changed, 153 insertions(+)
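
For context, a minimal sketch of how an application might drive these callbacks (illustrative only: it assumes the rte_extmem_register() and rte_dev_dma_map() APIs introduced earlier in this series; the helper name and error handling are hypothetical):

    #include <rte_dev.h>
    #include <rte_ethdev.h>
    #include <rte_memory.h>

    /* 'buf', 'len' and 'pg_sz' describe externally allocated memory;
     * 'len' is assumed to be a multiple of 'pg_sz'. */
    static int
    extmem_dma_map_port(uint16_t port_id, void *buf, size_t len, size_t pg_sz)
    {
            struct rte_eth_dev_info info;
            int ret;

            rte_eth_dev_info_get(port_id, &info);
            /* Make the memory known to EAL; a NULL IOVA table marks the
             * pages RTE_BAD_IOVA, which is fine for IOVA-as-VA setups. */
            ret = rte_extmem_register(buf, len, NULL, 0, pg_sz);
            if (ret != 0)
                    return ret;
            /* Routes to the PCI driver's .map callback, i.e.
             * mlx5_dma_map() for an mlx5 port. */
            return rte_dev_dma_map(info.device, buf, (uintptr_t)buf, len);
    }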
  

Comments

Gaëtan Rivet Feb. 13, 2019, 11:35 a.m. UTC | #1
On Wed, Feb 13, 2019 at 11:10:25AM +0200, Shahaf Shuler wrote:
> The implementation reuses the external memory registration work done by
> commit [1].

[..]

>  /**
> + * Finds the first ethdev that matches the PCI device.
> + * Multiple ethdevs per PCI device exist only with representors.
> + * In such a case, it is enough to get only one of the ports, as they all
> + * share the same ibv context.
> + *
> + * @param pdev
> + *   Pointer to the PCI device.
> + *
> + * @return
> + *   Pointer to the ethdev if found, NULL otherwise.
> + */
> +static struct rte_eth_dev *
> +pci_dev_to_eth_dev(struct rte_pci_device *pdev)
> +{
> +	struct rte_eth_dev *dev;
> +	const char *drv_name;
> +	uint16_t port_id = 0;
> +
> +	/**
> +	 *  We really need to iterate all eth devices regardless of
> +	 *  their owner.
> +	 */
> +	while (port_id < RTE_MAX_ETHPORTS) {
> +		port_id = rte_eth_find_next(port_id);
> +		if (port_id >= RTE_MAX_ETHPORTS)
> +			break;
> +		dev = &rte_eth_devices[port_id];
> +		drv_name = dev->device->driver->name;
> +		if (!strncmp(drv_name, MLX5_DRIVER_NAME,
> +			     sizeof(MLX5_DRIVER_NAME) + 1) &&
> +		    pdev == RTE_DEV_TO_PCI(dev->device)) {
> +			/* found the PCI device. */
> +			return dev;
> +		}
> +	}
> +	return NULL;
> +}

Might I interest you in the new API?

   {
           struct rte_dev_iterator it;
           struct rte_device *dev;
   
           RTE_DEV_FOREACH(dev, "class=eth", &it)
                   if (dev == &pdev->device)
                           return it.class_device;
           return NULL;
   }

[..]
  
Gaëtan Rivet Feb. 13, 2019, 11:44 a.m. UTC | #2
On Wed, Feb 13, 2019 at 12:35:04PM +0100, Gaëtan Rivet wrote:
> On Wed, Feb 13, 2019 at 11:10:25AM +0200, Shahaf Shuler wrote:

[..]

> Might I interest you in the new API?
> 
>    {
>            struct rte_dev_iterator it;
>            struct rte_device *dev;
>    
>            RTE_DEV_FOREACH(dev, "class=eth", &it)
>                    if (dev == &pdev->device)
>                            return it.class_device;
>            return NULL;
>    }
> 

On that note, this could be in the PCI bus instead?

[..]
  
Shahaf Shuler Feb. 13, 2019, 7:11 p.m. UTC | #3
Wednesday, February 13, 2019 1:44 PM, Gaëtan Rivet:
> Subject: Re: [PATCH 5/6] net/mlx5: support PCI device DMA map and unmap
> 
> On Wed, Feb 13, 2019 at 12:35:04PM +0100, Gaëtan Rivet wrote:
> > On Wed, Feb 13, 2019 at 11:10:25AM +0200, Shahaf Shuler wrote:
> > > The implementation reuses the external memory registration work done

[..]

> > > +
> > > +	/**
> > > +	 *  We really need to iterate all eth devices regardless of
> > > +	 *  their owner.
> > > +	 */
> > > +	while (port_id < RTE_MAX_ETHPORTS) {
> > > +		port_id = rte_eth_find_next(port_id);
> > > +		if (port_id >= RTE_MAX_ETHPORTS)
> > > +			break;
> > > +		dev = &rte_eth_devices[port_id];
> > > +		drv_name = dev->device->driver->name;
> > > +		if (!strncmp(drv_name, MLX5_DRIVER_NAME,
> > > +			     sizeof(MLX5_DRIVER_NAME) + 1) &&
> > > +		    pdev == RTE_DEV_TO_PCI(dev->device)) {
> > > +			/* found the PCI device. */
> > > +			return dev;
> > > +		}
> > > +	}
> > > +	return NULL;
> > > +}
> >
> > Might I interest you in the new API?

Good suggestion, I will have a look at it in depth.

> >
> >    {
> >            struct rte_dev_iterator it;
> >            struct rte_device *dev;
> >
> >            RTE_DEV_FOREACH(dev, "class=eth", &it)
> >                    if (dev == &pdev->device)
> >                            return it.class_device;
> >            return NULL;
> >    }
> >
> 
> On that note, this could be in the PCI bus instead?

We can put it on the PCI bus, but it would mean the PCI bus will no longer be device-agnostic.
Currently, I couldn't find any reference to eth_dev on the PCI bus, besides a single macro which converts to a PCI device without really doing type checks.

Having it in would mean the PCI bus will need to start distinguishing between ethdev, crypto dev and whatever other devices exist on its bus.

[..]
  
Gaëtan Rivet Feb. 14, 2019, 10:21 a.m. UTC | #4
On Wed, Feb 13, 2019 at 07:11:35PM +0000, Shahaf Shuler wrote:
> Wednesday, February 13, 2019 1:44 PM, Gaëtan Rivet:
> > Subject: Re: [PATCH 5/6] net/mlx5: support PCI device DMA map and unmap
> > 
> > On Wed, Feb 13, 2019 at 12:35:04PM +0100, Gaëtan Rivet wrote:
> > > On Wed, Feb 13, 2019 at 11:10:25AM +0200, Shahaf Shuler wrote:
> > > > The implementation reuses the external memory registration work done
> 
> [..]
> 
> > >
> > > Might I interest you in the new API?
> 
> > Good suggestion, I will have a look at it in depth.
> 
> > >
> > >    {
> > >            struct rte_dev_iterator it;
> > >            struct rte_device *dev;
> > >
> > >            RTE_DEV_FOREACH(dev, "class=eth", &it)
> > >                    if (dev == &pdev->device)
> > >                            return it.class_device;
> > >            return NULL;
> > >    }
> > >
> > 
> > On that note, this could be in the PCI bus instead?
> 
> > We can put it on the PCI bus, but it would mean the PCI bus will no longer be device-agnostic.
> > Currently, I couldn't find any reference to eth_dev on the PCI bus, besides a single macro which converts to a PCI device without really doing type checks.
> > 
> > Having it in would mean the PCI bus will need to start distinguishing between ethdev, crypto dev and whatever other devices exist on its bus.
> 

I think it's worth thinking about it.
It can stay class-agnostic:

    void *
    rte_pci_device_class(struct rte_pci_device *pdev, const char *class)
    {
            char devstr[15+strlen(class)];
            struct rte_dev_iterator it;
            struct rte_device *dev;
    
            snprintf(devstr, sizeof(devstr), "bus=pci/class=%s", class);
            RTE_DEV_FOREACH(dev, devstr, &it)
                    if (dev == &pdev->device)
                            return it.class_device;
            return NULL;
    }

(not a fan of the stack VLA but whatever.)
then:

    eth_dev = rte_pci_device_class(pdev, "eth");

Whichever type of device could be returned. The only limit is that you
have to know beforehand the device type of the PCI device you are
querying about, but that's necessary anyway.

And if it was instead a crypto dev, it would return NULL.
  
Shahaf Shuler Feb. 21, 2019, 9:21 a.m. UTC | #5
Thursday, February 14, 2019 12:22 PM, Gaëtan Rivet:
> Subject: Re: [PATCH 5/6] net/mlx5: support PCI device DMA map and unmap
> 
> On Wed, Feb 13, 2019 at 07:11:35PM +0000, Shahaf Shuler wrote:
> > Wednesday, February 13, 2019 1:44 PM, Gaëtan Rivet:
> > > Subject: Re: [PATCH 5/6] net/mlx5: support PCI device DMA map and
> > > unmap
> > >
> > > On Wed, Feb 13, 2019 at 12:35:04PM +0100, Gaëtan Rivet wrote:
> > > > On Wed, Feb 13, 2019 at 11:10:25AM +0200, Shahaf Shuler wrote:
> > > > > The implementation reuses the external memory registration work
> > > > > done
> >
> > [..]
> >
> > > >
> > > > Might I interest you in the new API?
> >
> > Good suggestion, I will have a look at it in depth.
> >
> > > >
> > > >    {
> > > >            struct rte_dev_iterator it;
> > > >            struct rte_device *dev;
> > > >
> > > >            RTE_DEV_FOREACH(dev, "class=eth", &it)
> > > >                    if (dev == &pdev->device)
> > > >                            return it.class_device;
> > > >            return NULL;
> > > >    }
> > > >
> > >
> > > On that note, this could be in the PCI bus instead?

Looking into it in more depth, it looks like ethdev is the only device class
that registers its type. So putting a generic iterator for every class on the
PCI bus looks like overkill to me at this point.

I think I will take the above suggestion to replace the internal PMD code.
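
A minimal sketch of what that rework might look like (illustrative only, based on the iterator suggestion above; it assumes it.class_device points at the matching ethdev and that the "class=eth" filter visits every port regardless of owner):

    #include <rte_bus_pci.h>
    #include <rte_dev.h>
    #include <rte_ethdev.h>

    static struct rte_eth_dev *
    pci_dev_to_eth_dev(struct rte_pci_device *pdev)
    {
            struct rte_dev_iterator it;
            struct rte_device *dev;

            /* Walk ethdev-class devices and match on the bus-level
             * rte_device backing each port. */
            RTE_DEV_FOREACH(dev, "class=eth", &it)
                    if (dev == &pdev->device)
                            return it.class_device;
            return NULL;
    }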

[..]
  

Patch

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index a913a5955f..7c91701713 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -1626,6 +1626,8 @@  static struct rte_pci_driver mlx5_driver = {
 	.id_table = mlx5_pci_id_map,
 	.probe = mlx5_pci_probe,
 	.remove = mlx5_pci_remove,
+	.map = mlx5_dma_map,
+	.unmap = mlx5_dma_unmap,
 	.drv_flags = (RTE_PCI_DRV_INTR_LSC | RTE_PCI_DRV_INTR_RMV |
 		      RTE_PCI_DRV_PROBE_AGAIN),
 };
diff --git a/drivers/net/mlx5/mlx5_mr.c b/drivers/net/mlx5/mlx5_mr.c
index 32be6a5445..7059181959 100644
--- a/drivers/net/mlx5/mlx5_mr.c
+++ b/drivers/net/mlx5/mlx5_mr.c
@@ -14,6 +14,7 @@ 
 #include <rte_mempool.h>
 #include <rte_malloc.h>
 #include <rte_rwlock.h>
+#include <rte_bus_pci.h>
 
 #include "mlx5.h"
 #include "mlx5_mr.h"
@@ -1215,6 +1216,151 @@  mlx5_mr_update_ext_mp_cb(struct rte_mempool *mp, void *opaque,
 }
 
 /**
+ * Finds the first ethdev that matches the PCI device.
+ * Multiple ethdevs per PCI device exist only with representors.
+ * In such a case, it is enough to get only one of the ports, as they all
+ * share the same ibv context.
+ *
+ * @param pdev
+ *   Pointer to the PCI device.
+ *
+ * @return
+ *   Pointer to the ethdev if found, NULL otherwise.
+ */
+static struct rte_eth_dev *
+pci_dev_to_eth_dev(struct rte_pci_device *pdev)
+{
+	struct rte_eth_dev *dev;
+	const char *drv_name;
+	uint16_t port_id = 0;
+
+	/**
+	 *  We really need to iterate all eth devices regardless of
+	 *  their owner.
+	 */
+	while (port_id < RTE_MAX_ETHPORTS) {
+		port_id = rte_eth_find_next(port_id);
+		if (port_id >= RTE_MAX_ETHPORTS)
+			break;
+		dev = &rte_eth_devices[port_id];
+		drv_name = dev->device->driver->name;
+		if (!strncmp(drv_name, MLX5_DRIVER_NAME,
+			     sizeof(MLX5_DRIVER_NAME) + 1) &&
+		    pdev == RTE_DEV_TO_PCI(dev->device)) {
+			/* found the PCI device. */
+			return dev;
+		}
+	}
+	return NULL;
+}
+
+/**
+ * DPDK callback to DMA map external memory to a PCI device.
+ *
+ * @param pdev
+ *   Pointer to the PCI device.
+ * @param addr
+ *   Starting virtual address of memory to be mapped.
+ * @param iova
+ *   Starting IOVA address of memory to be mapped.
+ * @param len
+ *   Length of memory segment being mapped.
+ *
+ * @return
+ *   0 on success, negative value on error.
+ */
+int
+mlx5_dma_map(struct rte_pci_device *pdev, void *addr,
+	     uint64_t iova __rte_unused, size_t len)
+{
+	struct rte_eth_dev *dev;
+	struct mlx5_mr *mr;
+	struct priv *priv;
+
+	dev = pci_dev_to_eth_dev(pdev);
+	if (!dev) {
+		DRV_LOG(WARNING, "unable to find matching ethdev "
+				 "to PCI device %p", (void *)pdev);
+		return -1;
+	}
+	priv = dev->data->dev_private;
+	mr = mlx5_create_mr_ext(dev, (uintptr_t)addr, len, SOCKET_ID_ANY);
+	if (!mr) {
+		DRV_LOG(WARNING,
+			"port %u unable to dma map", dev->data->port_id);
+		return -1;
+	}
+	rte_rwlock_write_lock(&priv->mr.rwlock);
+	LIST_INSERT_HEAD(&priv->mr.mr_list, mr, mr);
+	/* Insert to the global cache table. */
+	mr_insert_dev_cache(dev, mr);
+	rte_rwlock_write_unlock(&priv->mr.rwlock);
+	return 0;
+}
+
+/**
+ * DPDK callback to DMA unmap external memory from a PCI device.
+ *
+ * @param pdev
+ *   Pointer to the PCI device.
+ * @param addr
+ *   Starting virtual address of memory to be unmapped.
+ * @param iova
+ *   Starting IOVA address of memory to be unmapped.
+ * @param len
+ *   Length of memory segment being unmapped.
+ *
+ * @return
+ *   0 on success, negative value on error.
+ */
+int
+mlx5_dma_unmap(struct rte_pci_device *pdev, void *addr,
+	       uint64_t iova __rte_unused, size_t len __rte_unused)
+{
+	struct rte_eth_dev *dev;
+	struct priv *priv;
+	struct mlx5_mr *mr;
+	struct mlx5_mr_cache entry;
+
+	dev = pci_dev_to_eth_dev(pdev);
+	if (!dev) {
+		DRV_LOG(WARNING, "unable to find matching ethdev "
+				 "to PCI device %p", (void *)pdev);
+		return -1;
+	}
+	priv = dev->data->dev_private;
+	rte_rwlock_read_lock(&priv->mr.rwlock);
+	mr = mr_lookup_dev_list(dev, &entry, (uintptr_t)addr);
+	if (!mr) {
+		DRV_LOG(WARNING, "address 0x%" PRIxPTR " wasn't registered "
+				 "to PCI device %p", (uintptr_t)addr,
+				 (void *)pdev);
+		rte_rwlock_read_unlock(&priv->mr.rwlock);
+		return -1;
+	}
+	LIST_REMOVE(mr, mr);
+	LIST_INSERT_HEAD(&priv->mr.mr_free_list, mr, mr);
+	DEBUG("port %u remove MR(%p) from list", dev->data->port_id,
+	      (void *)mr);
+	mr_rebuild_dev_cache(dev);
+	/*
+	 * Flush local caches by propagating invalidation across cores.
+	 * rte_smp_wmb() is enough to synchronize this event. If one of
+	 * freed memsegs is seen by other core, that means the memseg
+	 * has been allocated by allocator, which will come after this
+	 * free call. Therefore, this store instruction (incrementing
+	 * generation below) will be guaranteed to be seen by other core
+	 * before the core sees the newly allocated memory.
+	 */
+	++priv->mr.dev_gen;
+	DEBUG("broadcasting local cache flush, gen=%d",
+			priv->mr.dev_gen);
+	rte_smp_wmb();
+	rte_rwlock_read_unlock(&priv->mr.rwlock);
+	return 0;
+}
+
+/**
  * Register MR for entire memory chunks in a Mempool having externally allocated
  * memory and fill in local cache.
  *
diff --git a/drivers/net/mlx5/mlx5_rxtx.h b/drivers/net/mlx5/mlx5_rxtx.h
index c2529f96bc..f3f84dbac3 100644
--- a/drivers/net/mlx5/mlx5_rxtx.h
+++ b/drivers/net/mlx5/mlx5_rxtx.h
@@ -28,6 +28,7 @@ 
 #include <rte_atomic.h>
 #include <rte_spinlock.h>
 #include <rte_io.h>
+#include <rte_bus_pci.h>
 
 #include "mlx5_utils.h"
 #include "mlx5.h"
@@ -367,6 +368,10 @@  uint32_t mlx5_rx_addr2mr_bh(struct mlx5_rxq_data *rxq, uintptr_t addr);
 uint32_t mlx5_tx_mb2mr_bh(struct mlx5_txq_data *txq, struct rte_mbuf *mb);
 uint32_t mlx5_tx_update_ext_mp(struct mlx5_txq_data *txq, uintptr_t addr,
 			       struct rte_mempool *mp);
+int mlx5_dma_map(struct rte_pci_device *pdev, void *addr, uint64_t iova,
+		 size_t len);
+int mlx5_dma_unmap(struct rte_pci_device *pdev, void *addr, uint64_t iova,
+		   size_t len);
 
 /**
  * Provide safe 64bit store operation to mlx5 UAR region for both 32bit and