[dpdk-dev,RFC] eal/memory: introducing an option to set iova as va

Message ID 20170524161101.22863-1-santosh.shukla@caviumnetworks.com (mailing list archive)
State Superseded, archived
Headers

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Santosh Shukla May 24, 2017, 4:11 p.m. UTC
  Some NPU hardware, like OCTEONTX, follows a push model to get
packets from the pktio device, where packet allocation
and freeing are done by the HW. Since the HW can operate only on
IOVA with the help of the SMMU/IOMMU, when a packet is received from the
Ethernet device, its address is an IOVA (which is the PA in the existing scheme).

Mapping IOVA as PA is expensive on such HW, because every
packet address needs to be converted from PA/IOVA to VA.

This patch proposes the scheme where the user can set IOVA
as VA by using an eal command line argument. That helps to
avoid costly lookup for VA in SW by leveraging the SMMU
translation feature.

Signed-off-by: Santosh Shukla <santosh.shukla@caviumnetworks.com>
---
 lib/librte_eal/bsdapp/eal/eal_memory.c          |  6 ++++++
 lib/librte_eal/bsdapp/eal/rte_eal_version.map   |  7 +++++++
 lib/librte_eal/common/eal_common_options.c      |  6 ++++++
 lib/librte_eal/common/eal_internal_cfg.h        |  1 +
 lib/librte_eal/common/eal_options.h             |  2 ++
 lib/librte_eal/common/include/rte_memory.h      |  3 +++
 lib/librte_eal/linuxapp/eal/eal_memory.c        | 12 ++++++++++++
 lib/librte_eal/linuxapp/eal/rte_eal_version.map |  7 +++++++
 8 files changed, 44 insertions(+)
  

Comments

Santosh Shukla June 2, 2017, 4:24 a.m. UTC | #1
Ping?

On Wednesday 24 May 2017 09:41 PM, Santosh Shukla wrote:

> Some NPU hardware like OCTEONTX follows push model to get
> the packet from the pktio device. Where packet allocation
> and freeing done by the HW. Since HW can operate only on
> IOVA with help of SMMU/IOMMU, When packet receives from the
> Ethernet device, It is the IOVA address(which is PA in existing scheme).
>
> Mapping IOVA as PA is expensive on those HW, where every
> packet needs to be converted to VA from PA/IOVA.
>
> This patch proposes the scheme where the user can set IOVA
> as VA by using an eal command line argument. That helps to
> avoid costly lookup for VA in SW by leveraging the SMMU
> translation feature.
>
> Signed-off-by: Santosh Shukla <santosh.shukla@caviumnetworks.com>
> ---
>  lib/librte_eal/bsdapp/eal/eal_memory.c          |  6 ++++++
>  lib/librte_eal/bsdapp/eal/rte_eal_version.map   |  7 +++++++
>  lib/librte_eal/common/eal_common_options.c      |  6 ++++++
>  lib/librte_eal/common/eal_internal_cfg.h        |  1 +
>  lib/librte_eal/common/eal_options.h             |  2 ++
>  lib/librte_eal/common/include/rte_memory.h      |  3 +++
>  lib/librte_eal/linuxapp/eal/eal_memory.c        | 12 ++++++++++++
>  lib/librte_eal/linuxapp/eal/rte_eal_version.map |  7 +++++++
>  8 files changed, 44 insertions(+)
>
> diff --git a/lib/librte_eal/bsdapp/eal/eal_memory.c b/lib/librte_eal/bsdapp/eal/eal_memory.c
> index 3614da8db..6c8c2c96e 100644
> --- a/lib/librte_eal/bsdapp/eal/eal_memory.c
> +++ b/lib/librte_eal/bsdapp/eal/eal_memory.c
> @@ -47,6 +47,12 @@
>  
>  #define EAL_PAGE_SIZE (sysconf(_SC_PAGESIZE))
>  
> +int __rte_unused
> +rte_mem_is_iova_as_va(void)
> +{
> +	return internal_config.iova_va;
> +}
> +
>  /*
>   * Get physical address of any mapped virtual address in the current process.
>   */
> diff --git a/lib/librte_eal/bsdapp/eal/rte_eal_version.map b/lib/librte_eal/bsdapp/eal/rte_eal_version.map
> index 2e48a7366..6e020ca7f 100644
> --- a/lib/librte_eal/bsdapp/eal/rte_eal_version.map
> +++ b/lib/librte_eal/bsdapp/eal/rte_eal_version.map
> @@ -193,3 +193,10 @@ DPDK_17.05 {
>  	vfio_get_group_no;
>  
>  } DPDK_17.02;
> +
> +DPDK_17.08 {
> +	global:
> +
> +	rte_mem_is_iova_as_va;
> +
> +} DPDK_17.05;
> diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
> index f470195f3..164123ef0 100644
> --- a/lib/librte_eal/common/eal_common_options.c
> +++ b/lib/librte_eal/common/eal_common_options.c
> @@ -95,6 +95,7 @@ eal_long_options[] = {
>  	{OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
>  	{OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
>  	{OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
> +	{OPT_IOVA_AS_VA,        0, NULL, OPT_IOVA_AS_VA_NUM       },
>  	{0,                     0, NULL, 0                        }
>  };
>  
> @@ -876,6 +877,10 @@ eal_parse_common_option(int opt, const char *optarg,
>  		conf->no_pci = 1;
>  		break;
>  
> +	case OPT_IOVA_AS_VA_NUM:
> +		conf->iova_va = 1;
> +		break;
> +
>  	case OPT_NO_HPET_NUM:
>  		conf->no_hpet = 1;
>  		break;
> @@ -1083,5 +1088,6 @@ eal_common_usage(void)
>  	       "  --"OPT_NO_PCI"            Disable PCI\n"
>  	       "  --"OPT_NO_HPET"           Disable HPET\n"
>  	       "  --"OPT_NO_SHCONF"         No shared config (mmap'd files)\n"
> +	       "  --"OPT_IOVA_AS_VA"        Use va addr as iova\n"
>  	       "\n", RTE_MAX_LCORE);
>  }
> diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
> index 7b7e8c887..6293ed0aa 100644
> --- a/lib/librte_eal/common/eal_internal_cfg.h
> +++ b/lib/librte_eal/common/eal_internal_cfg.h
> @@ -84,6 +84,7 @@ struct internal_config {
>  	const char *hugepage_dir;         /**< specific hugetlbfs directory to use */
>  
>  	unsigned num_hugepage_sizes;      /**< how many sizes on this system */
> +	volatile unsigned iova_va;	  /**< use va addr as iova */
>  	struct hugepage_info hugepage_info[MAX_HUGEPAGE_SIZES];
>  };
>  extern struct internal_config internal_config; /**< Global EAL configuration. */
> diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
> index a881c62e2..50f98edea 100644
> --- a/lib/librte_eal/common/eal_options.h
> +++ b/lib/librte_eal/common/eal_options.h
> @@ -83,6 +83,8 @@ enum {
>  	OPT_VMWARE_TSC_MAP_NUM,
>  #define OPT_XEN_DOM0          "xen-dom0"
>  	OPT_XEN_DOM0_NUM,
> +#define OPT_IOVA_AS_VA        "iova-va"
> +	OPT_IOVA_AS_VA_NUM,
>  	OPT_LONG_MAX_NUM
>  };
>  
> diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h
> index 4aa5d1f77..c6c57e138 100644
> --- a/lib/librte_eal/common/include/rte_memory.h
> +++ b/lib/librte_eal/common/include/rte_memory.h
> @@ -258,6 +258,9 @@ rte_mem_phy2mch(int32_t memseg_id __rte_unused, const phys_addr_t phy_addr)
>  }
>  #endif
>  
> +int
> +rte_mem_is_iova_as_va(void);
> +
>  #ifdef __cplusplus
>  }
>  #endif
> diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
> index ebe068334..d7a373ba2 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_memory.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
> @@ -109,6 +109,12 @@ int rte_xen_dom0_supported(void)
>  }
>  #endif
>  
> +int
> +rte_mem_is_iova_as_va(void)
> +{
> +	return internal_config.iova_va;
> +}
> +
>  /**
>   * @file
>   * Huge page mapping under linux
> @@ -169,6 +175,9 @@ rte_mem_virt2phy(const void *virtaddr)
>  	int page_size;
>  	off_t offset;
>  
> +	if (rte_mem_is_iova_as_va())
> +		return (uintptr_t)virtaddr;
> +
>  	/* when using dom0, /proc/self/pagemap always returns 0, check in
>  	 * dpdk memory by browsing the memsegs */
>  	if (rte_xen_dom0_supported()) {
> @@ -480,6 +489,9 @@ map_all_hugepages(struct hugepage_file *hugepg_tbl,
>  		}
>  		else {
>  			hugepg_tbl[i].final_va = virtaddr;
> +
> +			if (rte_mem_is_iova_as_va())
> +				hugepg_tbl[i].physaddr = (uintptr_t)virtaddr;
>  		}
>  
>  		if (orig) {
> diff --git a/lib/librte_eal/linuxapp/eal/rte_eal_version.map b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
> index 670bab3a5..b0ba2233f 100644
> --- a/lib/librte_eal/linuxapp/eal/rte_eal_version.map
> +++ b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
> @@ -198,3 +198,10 @@ DPDK_17.05 {
>  	vfio_get_group_no;
>  
>  } DPDK_17.02;
> +
> +DPDK_17.08 {
> +	global:
> +
> +	rte_mem_is_iova_as_va;
> +
> +} DPDK_17.05;
  
Bruce Richardson June 2, 2017, 9:27 a.m. UTC | #2
On Fri, Jun 02, 2017 at 09:54:46AM +0530, santosh wrote:
> Ping?
> 
> On Wednesday 24 May 2017 09:41 PM, Santosh Shukla wrote:
> 
> > Some NPU hardware like OCTEONTX follows push model to get
> > the packet from the pktio device. Where packet allocation
> > and freeing done by the HW. Since HW can operate only on
> > IOVA with help of SMMU/IOMMU, When packet receives from the
> > Ethernet device, It is the IOVA address(which is PA in existing scheme).
> >
> > Mapping IOVA as PA is expensive on those HW, where every
> > packet needs to be converted to VA from PA/IOVA.
> >
> > This patch proposes the scheme where the user can set IOVA
> > as VA by using an eal command line argument. That helps to
> > avoid costly lookup for VA in SW by leveraging the SMMU
> > translation feature.
> >
> > Signed-off-by: Santosh Shukla <santosh.shukla@caviumnetworks.com>
> > ---
Hi,

I agree this is a problem that needs to be solved, but this doesn't look
like a particularly future-proofed solution. Given that we should
use the IOMMU on as many platforms as possible for protection, we
probably need to find an automatic way for DPDK to use IO addresses
correctly. Is this therefore better done as part of the VFIO and
UIO-specific code in EAL - as that is the part that knows how the memory
mapping is done, and in the VFIO case, what address ranges were
programmed in. The mempool driver was something else I considered but it
is probably too high a level to implement this.

So, in short, I don't particularly like this solution, but I could live
with it as a short-term option. Longer term though, I think we need a
better way to support using IO addresses rather than physical addresses
- I just don't know what that would look like or where it would sit/live.

/Bruce
  
Santosh Shukla June 5, 2017, 4:54 a.m. UTC | #3
Hi Bruce,


On Friday 02 June 2017 02:57 PM, Bruce Richardson wrote:
> On Fri, Jun 02, 2017 at 09:54:46AM +0530, santosh wrote:
>> Ping?
>>
>> On Wednesday 24 May 2017 09:41 PM, Santosh Shukla wrote:
>>
>>> Some NPU hardware like OCTEONTX follows push model to get
>>> the packet from the pktio device. Where packet allocation
>>> and freeing done by the HW. Since HW can operate only on
>>> IOVA with help of SMMU/IOMMU, When packet receives from the
>>> Ethernet device, It is the IOVA address(which is PA in existing scheme).
>>>
>>> Mapping IOVA as PA is expensive on those HW, where every
>>> packet needs to be converted to VA from PA/IOVA.
>>>
>>> This patch proposes the scheme where the user can set IOVA
>>> as VA by using an eal command line argument. That helps to
>>> avoid costly lookup for VA in SW by leveraging the SMMU
>>> translation feature.
>>>
>>> Signed-off-by: Santosh Shukla <santosh.shukla@caviumnetworks.com>
>>> ---
> Hi,
>
> I agree this is a problem that needs to be solved, but this doesn't look
> like a particularly future-proofed solution. Given that we should
> use the IOMMU on as many platforms as possible for protection, we
> probably need to find an automatic way for DPDK to use IO addresses
> correctly. Is this therefore better done as part of the VFIO and
> UIO-specific code in EAL - as that is the part that knows how the memory
> mapping is done, and in the VFIO case, what address ranges were
> programmed in. The mempool driver was something else I considered but it
> is probably too high a level to implement this.

The other approach which we evaluated, in detail:
0) Introduce a new bus API whose job is to detect iommu-capable devices on that
bus (i.e. are those devices bound to an iommu-capable driver or not?). Let's call that
API rte_bus_chk_iommu_dev();

1) The scheme: if _all_ the devices are bound to an iommu kdrv, then return iova=va.
2) Otherwise, switch to the default mode, i.e. iova=pa.
3) Based on the rte_bus_chk_iommu_dev() return value,
program iova=va or iova=pa accordingly in vfio_type1/spapr_map().

4) The user can always override iova=va from the command line
if they want the default scheme (iova=pa mode). For that purpose, introduce an EAL
option, something like --iova-pa, --override-iova or --iova-default,
or some better name.

Proposed API snap:

/* IOVA addressing mode reported by the proposed bus-level check. */
enum iova_mode {
    iova_va,      /* iova=va */
    iova_pa,      /* iova=pa */
    iova_unknown  /* mode could not be determined */
};

/**
 * Look for iommu devices on the bus and find out whether those
 * devices are bound to an iommu-capable driver, for example vfio.
 *
 * @return
 *      On success, a valid iova mode (iova_va or iova_pa).
 *      On failure, iova_unknown.
 */
typedef int (*rte_bus_chk_iommu_dev_t)(void);


With this approach:
- We can automatically detect whether iova is va or pa
and then program accordingly.
- Also, the user can always switch to the default iova mode.
- Drivers like dpaa2 can use this API to detect the iova mode and then
program dma_map accordingly. Currently they do this with #ifdefs.

Comments? thoughts? Or if anyone has better proposal then, please
suggest.

> So, in short, I don't particularly like this solution, but I could live
> with it as a short-term option. Longer term though, I think we need a
> better way to support using IO addresses rather than physical addresses
> - I just don't know what that would look like or where it would sit/live.
>
> /Bruce

Thanks,.
  
Bruce Richardson June 6, 2017, 9:57 a.m. UTC | #4
On Mon, Jun 05, 2017 at 10:24:11AM +0530, santosh wrote:
> Hi Bruce,
> 
> 
> On Friday 02 June 2017 02:57 PM, Bruce Richardson wrote:
> > On Fri, Jun 02, 2017 at 09:54:46AM +0530, santosh wrote:
> >> Ping?
> >>
> >> On Wednesday 24 May 2017 09:41 PM, Santosh Shukla wrote:
> >>
> >>> Some NPU hardware like OCTEONTX follows push model to get
> >>> the packet from the pktio device. Where packet allocation
> >>> and freeing done by the HW. Since HW can operate only on
> >>> IOVA with help of SMMU/IOMMU, When packet receives from the
> >>> Ethernet device, It is the IOVA address(which is PA in existing scheme).
> >>>
> >>> Mapping IOVA as PA is expensive on those HW, where every
> >>> packet needs to be converted to VA from PA/IOVA.
> >>>
> >>> This patch proposes the scheme where the user can set IOVA
> >>> as VA by using an eal command line argument. That helps to
> >>> avoid costly lookup for VA in SW by leveraging the SMMU
> >>> translation feature.
> >>>
> >>> Signed-off-by: Santosh Shukla <santosh.shukla@caviumnetworks.com>
> >>> ---
> > Hi,
> >
> > I agree this is a problem that needs to be solved, but this doesn't look
> > like a particularly future-proofed solution. Given that we should
> > use the IOMMU on as many platforms as possible for protection, we
> > probably need to find an automatic way for DPDK to use IO addresses
> > correctly. Is this therefore better done as part of the VFIO and
> > UIO-specific code in EAL - as that is the part that knows how the memory
> > mapping is done, and in the VFIO case, what address ranges were
> > programmed in. The mempool driver was something else I considered but it
> > is probably too high a level to implement this.
> 
> The other approach which we evaluated, Its detail:
> 0) Introduce a new bus api whose job is to detect iommu capable devices on that
> bus {/ are those devices bind to iommu capable driver or not?}. Let's call that
> api rte_bus_chk_iommu_dev();
> 
> 1) The scheme is like If _all_ the devices bind to iommu kdrv then return iova=va
> 2) Otherwise switch to default mode i.e.. iova=pa.
> 3) Based on rte_bus_chk_iommu_dev() return value, 
> accordingly program iova=va Or iova=pa in vfio_type1/spapr_map(). 
> 
> 4) User from the command line can always override iova=va, 
> in case if he wants to default scheme( iova=pa mode). For that purpose - Introduce eal
> option something like --iova-pa Or --override-iova Or --iova-default 
> or some better name.
> 
> Proposed API snap:
> 
> enum iova_mode {
>     iova_va;
>     iova_pa;
>     iova_unknown;
> };
> 
> /**
>  * Look for iommu devices on that Bus.
>  * And find out that those devices bind to iommu
>  * capable driver example vfio.
>  *
>  *
>  * @return
>  *      On success return valid iova mode (iova_va or iova_pa)
>  *      On failure return iova_unkown.
>  */
> typedef int (*rte_bus_chk_iommu_dev_t)(void);
> 
> 
> By this approach, 
> - We can automatically detect iova is va or pa
> and then program accordingly. 
> - Also, the user can always switch to default iova mode.
> - Drivers like dpaa2 can use this API to detect iova mode then 
> program dma_map accordingly. Currently they are doing in ifdef-way.
> 
> Comments? thoughts? Or if anyone has better proposal then, please
> suggest.
> 

That sounds a more complete solution. However, it's probably a lot of
work to implement. :-)

I also wonder if we want to simplify things a little and disallow
mixed-mode operation i.e. all devices have to use UIO or all use VFIO?
Would that help to allow simplification or other options. Having a whole
new bus type seems strange for this. Can each bus just report whether
its members require physical addresses? Then the EAL can manage a
single flag to report whether we are using VA or PA.

/Bruce
  
Gaëtan Rivet June 6, 2017, 10:13 a.m. UTC | #5
On Tue, Jun 06, 2017 at 10:57:20AM +0100, Bruce Richardson wrote:
> On Mon, Jun 05, 2017 at 10:24:11AM +0530, santosh wrote:
> > Hi Bruce,
> > 
> > 
> > On Friday 02 June 2017 02:57 PM, Bruce Richardson wrote:
> > > On Fri, Jun 02, 2017 at 09:54:46AM +0530, santosh wrote:
> > >> Ping?
> > >>
> > >> On Wednesday 24 May 2017 09:41 PM, Santosh Shukla wrote:
> > >>
> > >>> Some NPU hardware like OCTEONTX follows push model to get
> > >>> the packet from the pktio device. Where packet allocation
> > >>> and freeing done by the HW. Since HW can operate only on
> > >>> IOVA with help of SMMU/IOMMU, When packet receives from the
> > >>> Ethernet device, It is the IOVA address(which is PA in existing scheme).
> > >>>
> > >>> Mapping IOVA as PA is expensive on those HW, where every
> > >>> packet needs to be converted to VA from PA/IOVA.
> > >>>
> > >>> This patch proposes the scheme where the user can set IOVA
> > >>> as VA by using an eal command line argument. That helps to
> > >>> avoid costly lookup for VA in SW by leveraging the SMMU
> > >>> translation feature.
> > >>>
> > >>> Signed-off-by: Santosh Shukla <santosh.shukla@caviumnetworks.com>
> > >>> ---
> > > Hi,
> > >
> > > I agree this is a problem that needs to be solved, but this doesn't look
> > > like a particularly future-proofed solution. Given that we should
> > > use the IOMMU on as many platforms as possible for protection, we
> > > probably need to find an automatic way for DPDK to use IO addresses
> > > correctly. Is this therefore better done as part of the VFIO and
> > > UIO-specific code in EAL - as that is the part that knows how the memory
> > > mapping is done, and in the VFIO case, what address ranges were
> > > programmed in. The mempool driver was something else I considered but it
> > > is probably too high a level to implement this.
> > 
> > The other approach which we evaluated, Its detail:
> > 0) Introduce a new bus api whose job is to detect iommu capable devices on that
> > bus {/ are those devices bind to iommu capable driver or not?}. Let's call that
> > api rte_bus_chk_iommu_dev();
> > 
> > 1) The scheme is like If _all_ the devices bind to iommu kdrv then return iova=va
> > 2) Otherwise switch to default mode i.e.. iova=pa.
> > 3) Based on rte_bus_chk_iommu_dev() return value, 
> > accordingly program iova=va Or iova=pa in vfio_type1/spapr_map(). 
> > 
> > 4) User from the command line can always override iova=va, 
> > in case if he wants to default scheme( iova=pa mode). For that purpose - Introduce eal
> > option something like --iova-pa Or --override-iova Or --iova-default 
> > or some better name.
> > 
> > Proposed API snap:
> > 
> > enum iova_mode {
> >     iova_va;
> >     iova_pa;
> >     iova_unknown;
> > };
> > 
> > /**
> >  * Look for iommu devices on that Bus.
> >  * And find out that those devices bind to iommu
> >  * capable driver example vfio.
> >  *
> >  *
> >  * @return
> >  *      On success return valid iova mode (iova_va or iova_pa)
> >  *      On failure return iova_unkown.
> >  */
> > typedef int (*rte_bus_chk_iommu_dev_t)(void);
> > 
> > 
> > By this approach, 
> > - We can automatically detect iova is va or pa
> > and then program accordingly. 
> > - Also, the user can always switch to default iova mode.
> > - Drivers like dpaa2 can use this API to detect iova mode then 
> > program dma_map accordingly. Currently they are doing in ifdef-way.
> > 
> > Comments? thoughts? Or if anyone has better proposal then, please
> > suggest.
> > 
> 
> That sounds a more complete solution. However, it's probably a lot of
> work to implement. :-)
> 
> I also wonder if we want to simplify things a little and disallow
> mixed-mode operation i.e. all devices have to use UIO or all use VFIO?
> Would that help to allow simplification or other options. Having a whole
> new bus type seems strange for this. Can each bus just report whether
> it's members require physical addresses. Then the EAL can manage a
> single flag to report whether we are using VA or PA?
> 

Implementing this at a bus level requires all buses to have driver
iterators, which are currently not exposed, or forces all buses to
actively report driver capabilities upon successful probing. The former
is a sizeable evolution while the latter leads to having duplicated code
in all bus->probe() implementations, which seems unsound.

I may be mistaken, but is this iova mode not currently limited to
VFIO? Should this API be made generic for all buses or is it only
relevant to the PCI bus?

If it can stay specific to the PCI bus, then it should simplify greatly
the implementation.
  
Jerin Jacob June 6, 2017, 10:38 a.m. UTC | #6
-----Original Message-----
> Date: Tue, 6 Jun 2017 10:57:20 +0100
> From: Bruce Richardson <bruce.richardson@intel.com>
> To: santosh <santosh.shukla@caviumnetworks.com>
> CC: thomas@monjalon.net, dev@dpdk.org, jerin.jacob@caviumnetworks.com,
>  hemant.agrawal@nxp.com
> Subject: Re: [dpdk-dev] [RFC] eal/memory: introducing an option to set iova
>  as va
> User-Agent: Mutt/1.8.1 (2017-04-11)
> 
> On Mon, Jun 05, 2017 at 10:24:11AM +0530, santosh wrote:
> > Hi Bruce,
> > 
> > 
> > On Friday 02 June 2017 02:57 PM, Bruce Richardson wrote:
> > > On Fri, Jun 02, 2017 at 09:54:46AM +0530, santosh wrote:
> > >> Ping?
> > >>
> > >> On Wednesday 24 May 2017 09:41 PM, Santosh Shukla wrote:
> > >>
> > >>> Some NPU hardware like OCTEONTX follows push model to get
> > >>> the packet from the pktio device. Where packet allocation
> > >>> and freeing done by the HW. Since HW can operate only on
> > >>> IOVA with help of SMMU/IOMMU, When packet receives from the
> > >>> Ethernet device, It is the IOVA address(which is PA in existing scheme).
> > >>>
> > >>> Mapping IOVA as PA is expensive on those HW, where every
> > >>> packet needs to be converted to VA from PA/IOVA.
> > >>>
> > >>> This patch proposes the scheme where the user can set IOVA
> > >>> as VA by using an eal command line argument. That helps to
> > >>> avoid costly lookup for VA in SW by leveraging the SMMU
> > >>> translation feature.
> > >>>
> > >>> Signed-off-by: Santosh Shukla <santosh.shukla@caviumnetworks.com>
> > >>> ---
> > > Hi,
> > >
> > > I agree this is a problem that needs to be solved, but this doesn't look
> > > like a particularly future-proofed solution. Given that we should
> > > use the IOMMU on as many platforms as possible for protection, we
> > > probably need to find an automatic way for DPDK to use IO addresses
> > > correctly. Is this therefore better done as part of the VFIO and
> > > UIO-specific code in EAL - as that is the part that knows how the memory
> > > mapping is done, and in the VFIO case, what address ranges were
> > > programmed in. The mempool driver was something else I considered but it
> > > is probably too high a level to implement this.
> > 
> > The other approach which we evaluated, Its detail:
> > 0) Introduce a new bus api whose job is to detect iommu capable devices on that
> > bus {/ are those devices bind to iommu capable driver or not?}. Let's call that
> > api rte_bus_chk_iommu_dev();
> > 
> > 1) The scheme is like If _all_ the devices bind to iommu kdrv then return iova=va
> > 2) Otherwise switch to default mode i.e.. iova=pa.
> > 3) Based on rte_bus_chk_iommu_dev() return value, 
> > accordingly program iova=va Or iova=pa in vfio_type1/spapr_map(). 
> > 
> > 4) User from the command line can always override iova=va, 
> > in case if he wants to default scheme( iova=pa mode). For that purpose - Introduce eal
> > option something like --iova-pa Or --override-iova Or --iova-default 
> > or some better name.
> > 
> > Proposed API snap:
> > 
> > enum iova_mode {
> >     iova_va;
> >     iova_pa;
> >     iova_unknown;
> > };
> > 
> > /**
> >  * Look for iommu devices on that Bus.
> >  * And find out that those devices bind to iommu
> >  * capable driver example vfio.
> >  *
> >  *
> >  * @return
> >  *      On success return valid iova mode (iova_va or iova_pa)
> >  *      On failure return iova_unkown.
> >  */
> > typedef int (*rte_bus_chk_iommu_dev_t)(void);
> > 
> > 
> > By this approach, 
> > - We can automatically detect iova is va or pa
> > and then program accordingly. 
> > - Also, the user can always switch to default iova mode.
> > - Drivers like dpaa2 can use this API to detect iova mode then 
> > program dma_map accordingly. Currently they are doing in ifdef-way.
> > 
> > Comments? thoughts? Or if anyone has better proposal then, please
> > suggest.
> > 
> 
> That sounds a more complete solution. However, it's probably a lot of
> work to implement. :-)
> 
> I also wonder if we want to simplify things a little and disallow
> mixed-mode operation i.e. all devices have to use UIO or all use VFIO?
> Would that help to allow simplification or other options. Having a whole
> new bus type seems strange for this. Can each bus just report whether
> it's members require physical addresses. Then the EAL can manage a
> single flag to report whether we are using VA or PA?

That's the plan. Each bus op can say VA, PA or Don't care (in the
case of vdev), and an rte_bus aggregation function checks all the buses'
preferred address schemes and decides the mode of operation. Yes, we will
keep the aggregation logic simple for now: when all buses say to go with VA
or Don't care, we will go with VA, else PA.

> /Bruce
  
Jerin Jacob June 6, 2017, 10:41 a.m. UTC | #7
-----Original Message-----
> Date: Tue, 6 Jun 2017 12:13:08 +0200
> From: Gaëtan Rivet <gaetan.rivet@6wind.com>
> To: Bruce Richardson <bruce.richardson@intel.com>
> Cc: santosh <santosh.shukla@caviumnetworks.com>, thomas@monjalon.net,
>  dev@dpdk.org, jerin.jacob@caviumnetworks.com, hemant.agrawal@nxp.com
> Subject: Re: [dpdk-dev] [RFC] eal/memory: introducing an option to set iova
>  as va
> User-Agent: Mutt/1.5.23 (2014-03-12)
> 
> > 
> > That sounds a more complete solution. However, it's probably a lot of
> > work to implement. :-)
> > 
> > I also wonder if we want to simplify things a little and disallow
> > mixed-mode operation i.e. all devices have to use UIO or all use VFIO?
> > Would that help to allow simplification or other options. Having a whole
> > new bus type seems strange for this. Can each bus just report whether
> > it's members require physical addresses. Then the EAL can manage a
> > single flag to report whether we are using VA or PA?
> > 
> 
> Implementing this at a bus level requires all buses to have drivers
> iterators, which are currently not exposed, or force all buses to
> actively report drivers capabilities upon successful probing. The former
> is a sizeable evolution while the latter leads to having duplicated code
> in all bus->probe() implementation, which seems unsound.
> 
> I may be mistaken, but is this iova mode not currently limited to
> VFIO? Should this API be made generic for all buses or is it only
> relevant to the PCI bus?
> 
> If it can stay specific to the PCI bus, then it should simplify greatly
> the implementation.

It is not PCI bus specific. We can have a VFIO platform bus too. The NXP bus is a
VFIO platform bus. I think this will help the NXP bus as well, as currently
they are using an #ifdef scheme to select PA vs VA.
  

Patch

diff --git a/lib/librte_eal/bsdapp/eal/eal_memory.c b/lib/librte_eal/bsdapp/eal/eal_memory.c
index 3614da8db..6c8c2c96e 100644
--- a/lib/librte_eal/bsdapp/eal/eal_memory.c
+++ b/lib/librte_eal/bsdapp/eal/eal_memory.c
@@ -47,6 +47,12 @@ 
 
 #define EAL_PAGE_SIZE (sysconf(_SC_PAGESIZE))
 
+int __rte_unused
+rte_mem_is_iova_as_va(void)
+{
+	return internal_config.iova_va;
+}
+
 /*
  * Get physical address of any mapped virtual address in the current process.
  */
diff --git a/lib/librte_eal/bsdapp/eal/rte_eal_version.map b/lib/librte_eal/bsdapp/eal/rte_eal_version.map
index 2e48a7366..6e020ca7f 100644
--- a/lib/librte_eal/bsdapp/eal/rte_eal_version.map
+++ b/lib/librte_eal/bsdapp/eal/rte_eal_version.map
@@ -193,3 +193,10 @@  DPDK_17.05 {
 	vfio_get_group_no;
 
 } DPDK_17.02;
+
+DPDK_17.08 {
+	global:
+
+	rte_mem_is_iova_as_va;
+
+} DPDK_17.05;
diff --git a/lib/librte_eal/common/eal_common_options.c b/lib/librte_eal/common/eal_common_options.c
index f470195f3..164123ef0 100644
--- a/lib/librte_eal/common/eal_common_options.c
+++ b/lib/librte_eal/common/eal_common_options.c
@@ -95,6 +95,7 @@  eal_long_options[] = {
 	{OPT_VFIO_INTR,         1, NULL, OPT_VFIO_INTR_NUM        },
 	{OPT_VMWARE_TSC_MAP,    0, NULL, OPT_VMWARE_TSC_MAP_NUM   },
 	{OPT_XEN_DOM0,          0, NULL, OPT_XEN_DOM0_NUM         },
+	{OPT_IOVA_AS_VA,        0, NULL, OPT_IOVA_AS_VA_NUM       },
 	{0,                     0, NULL, 0                        }
 };
 
@@ -876,6 +877,10 @@  eal_parse_common_option(int opt, const char *optarg,
 		conf->no_pci = 1;
 		break;
 
+	case OPT_IOVA_AS_VA_NUM:
+		conf->iova_va = 1;
+		break;
+
 	case OPT_NO_HPET_NUM:
 		conf->no_hpet = 1;
 		break;
@@ -1083,5 +1088,6 @@  eal_common_usage(void)
 	       "  --"OPT_NO_PCI"            Disable PCI\n"
 	       "  --"OPT_NO_HPET"           Disable HPET\n"
 	       "  --"OPT_NO_SHCONF"         No shared config (mmap'd files)\n"
+	       "  --"OPT_IOVA_AS_VA"        Use va addr as iova\n"
 	       "\n", RTE_MAX_LCORE);
 }
diff --git a/lib/librte_eal/common/eal_internal_cfg.h b/lib/librte_eal/common/eal_internal_cfg.h
index 7b7e8c887..6293ed0aa 100644
--- a/lib/librte_eal/common/eal_internal_cfg.h
+++ b/lib/librte_eal/common/eal_internal_cfg.h
@@ -84,6 +84,7 @@  struct internal_config {
 	const char *hugepage_dir;         /**< specific hugetlbfs directory to use */
 
 	unsigned num_hugepage_sizes;      /**< how many sizes on this system */
+	volatile unsigned iova_va;	  /**< use va addr as iova */
 	struct hugepage_info hugepage_info[MAX_HUGEPAGE_SIZES];
 };
 extern struct internal_config internal_config; /**< Global EAL configuration. */
diff --git a/lib/librte_eal/common/eal_options.h b/lib/librte_eal/common/eal_options.h
index a881c62e2..50f98edea 100644
--- a/lib/librte_eal/common/eal_options.h
+++ b/lib/librte_eal/common/eal_options.h
@@ -83,6 +83,8 @@  enum {
 	OPT_VMWARE_TSC_MAP_NUM,
 #define OPT_XEN_DOM0          "xen-dom0"
 	OPT_XEN_DOM0_NUM,
+#define OPT_IOVA_AS_VA        "iova-va"
+	OPT_IOVA_AS_VA_NUM,
 	OPT_LONG_MAX_NUM
 };
 
diff --git a/lib/librte_eal/common/include/rte_memory.h b/lib/librte_eal/common/include/rte_memory.h
index 4aa5d1f77..c6c57e138 100644
--- a/lib/librte_eal/common/include/rte_memory.h
+++ b/lib/librte_eal/common/include/rte_memory.h
@@ -258,6 +258,9 @@  rte_mem_phy2mch(int32_t memseg_id __rte_unused, const phys_addr_t phy_addr)
 }
 #endif
 
+int
+rte_mem_is_iova_as_va(void);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_eal/linuxapp/eal/eal_memory.c b/lib/librte_eal/linuxapp/eal/eal_memory.c
index ebe068334..d7a373ba2 100644
--- a/lib/librte_eal/linuxapp/eal/eal_memory.c
+++ b/lib/librte_eal/linuxapp/eal/eal_memory.c
@@ -109,6 +109,12 @@  int rte_xen_dom0_supported(void)
 }
 #endif
 
+int
+rte_mem_is_iova_as_va(void)
+{
+	return internal_config.iova_va;
+}
+
 /**
  * @file
  * Huge page mapping under linux
@@ -169,6 +175,9 @@  rte_mem_virt2phy(const void *virtaddr)
 	int page_size;
 	off_t offset;
 
+	if (rte_mem_is_iova_as_va())
+		return (uintptr_t)virtaddr;
+
 	/* when using dom0, /proc/self/pagemap always returns 0, check in
 	 * dpdk memory by browsing the memsegs */
 	if (rte_xen_dom0_supported()) {
@@ -480,6 +489,9 @@  map_all_hugepages(struct hugepage_file *hugepg_tbl,
 		}
 		else {
 			hugepg_tbl[i].final_va = virtaddr;
+
+			if (rte_mem_is_iova_as_va())
+				hugepg_tbl[i].physaddr = (uintptr_t)virtaddr;
 		}
 
 		if (orig) {
diff --git a/lib/librte_eal/linuxapp/eal/rte_eal_version.map b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
index 670bab3a5..b0ba2233f 100644
--- a/lib/librte_eal/linuxapp/eal/rte_eal_version.map
+++ b/lib/librte_eal/linuxapp/eal/rte_eal_version.map
@@ -198,3 +198,10 @@  DPDK_17.05 {
 	vfio_get_group_no;
 
 } DPDK_17.02;
+
+DPDK_17.08 {
+	global:
+
+	rte_mem_is_iova_as_va;
+
+} DPDK_17.05;