[dpdk-dev] vfio: fix sPAPR IOMMU DMA window size

Message ID 1502118665-27439-1-git-send-email-jpf@zurich.ibm.com (mailing list archive)
State Superseded, archived
Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Jonas Pfefferle Aug. 7, 2017, 3:11 p.m. UTC
DMA window size needs to be big enough to span all memory segments'
physical addresses. We do not need multiple levels of IOMMU tables,
as a single level already spans ~70TB of physical memory with 16MB hugepages.

Signed-off-by: Jonas Pfefferle <jpf@zurich.ibm.com>
---
 lib/librte_eal/linuxapp/eal/eal_vfio.c | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)
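
(A quick sanity check of the single-level claim, using illustrative
arithmetic that is not spelled out in the patch and assuming the usual
8-byte sPAPR TCE entry: a 2^46-byte window, i.e. 64TiB or roughly the
~70TB above, mapped with 2^24-byte (16MB) pages needs 2^46 / 2^24 = 2^22
TCE entries, i.e. a 32MB translation table, which fits in a single level.)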
  

Comments

Alexey Kardashevskiy Aug. 8, 2017, 7:38 a.m. UTC | #1
On 08/08/17 01:11, Jonas Pfefferle wrote:
> DMA window size needs to be big enough to span all memory segments'
> physical addresses. We do not need multiple levels of IOMMU tables,
> as a single level already spans ~70TB of physical memory with 16MB hugepages.
> 
> Signed-off-by: Jonas Pfefferle <jpf@zurich.ibm.com>
> ---
>  lib/librte_eal/linuxapp/eal/eal_vfio.c | 25 ++++++++++++++++++++++---
>  1 file changed, 22 insertions(+), 3 deletions(-)
> 
> diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> index 946df7e..8502216 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> @@ -722,6 +722,18 @@ vfio_type1_dma_map(int vfio_container_fd)
>  	return 0;
>  }
>  
> +static uint64_t
> +roundup_next_pow2(uint64_t n)
> +{
> +	uint32_t i;
> +
> +	n--;
> +	for (i = 1; i < sizeof(n) * CHAR_BIT; i += i)
> +		n |= n >> i;
> +
> +	return ++n;
> +}
> +

wow :)

QEMU does it using __builtin_ctzll() (used below for the page_shift)
without a loop:

https://git.qemu.org/gitweb.cgi?p=qemu.git;a=blob;f=include/qemu/host-utils.h;h=95cf4f4163e50457cdf808263065ca5ef3f935da;hb=f22ab6cb0c47bd2a2785b7d58130949bd7d8d9af#l382


Anyway, seems working.
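
As a reference point, here is a loop-free round-up in the spirit of the
pow2ceil() linked above (a sketch, not QEMU's verbatim code; it is built
on __builtin_clzll() rather than the __builtin_ctzll() used for
page_shift, and the n <= 1 guard plus the assumption n <= 1ULL << 63 are
added here):

    /* Round n up to the next power of 2 without a loop. */
    static uint64_t
    roundup_next_pow2_clz(uint64_t n)
    {
    	/* __builtin_clzll(0) is undefined, so handle n <= 1 first */
    	if (n <= 1)
    		return 1;
    	/* 64 - clz(n - 1) is the bit position of the smallest power
    	 * of 2 >= n; assumes n <= 1ULL << 63 so the shift cannot
    	 * overflow */
    	return 1ULL << (64 - __builtin_clzll(n - 1));
    }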


Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>

> [...]
  
Jonas Pfefferle Aug. 8, 2017, 7:56 a.m. UTC | #2
Alexey Kardashevskiy <aik@ozlabs.ru> wrote on 08/08/2017 09:38:00 AM:

> On 08/08/17 01:11, Jonas Pfefferle wrote:
> > [...]
>
> wow :)
>
> QEMU does it using __builtin_ctzll() (used below for the page_shift)
> without a loop:
>
> https://git.qemu.org/gitweb.cgi?p=qemu.git;a=blob;f=include/qemu/host-utils.h;h=95cf4f4163e50457cdf808263065ca5ef3f935da;hb=f22ab6cb0c47bd2a2785b7d58130949bd7d8d9af#l382
>
>
> Anyway, seems working.

Ok let me fix that :)

>
> Reviewed-by: Alexey Kardashevskiy <aik@ozlabs.ru>
>
> [...]
  
Ananyev, Konstantin Aug. 8, 2017, 8:27 a.m. UTC | #3
> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Alexey Kardashevskiy
> Sent: Tuesday, August 8, 2017 10:38 AM
> Subject: Re: [dpdk-dev] [PATCH] vfio: fix sPAPR IOMMU DMA window size
>
> On 08/08/17 01:11, Jonas Pfefferle wrote:
> > [...]
> > +static uint64_t
> > +roundup_next_pow2(uint64_t n)
> > +{
> > +	uint32_t i;
> > +
> > +	n--;
> > +	for (i = 1; i < sizeof(n) * CHAR_BIT; i += i)
> > +		n |= n >> i;
> > +
> > +	return ++n;
> > +}
> > +
>
> wow :)
>
> QEMU does it using __builtin_ctzll() (used below for the page_shift)
> without a loop:
>
> https://git.qemu.org/gitweb.cgi?p=qemu.git;a=blob;f=include/qemu/host-utils.h;h=95cf4f4163e50457cdf808263065ca5ef3f935da;hb=f22ab6cb0c47bd2a2785b7d58130949bd7d8d9af#l382
>
> Anyway, seems working.

As I remember, there already exists rte_align64pow2().
Konstantin
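
For reference, rte_align64pow2() in rte_common.h unrolls the same
bit-smearing idea as the patch's loop; sketched from memory below, so
treat rte_common.h as the authoritative version:

    static inline uint64_t
    rte_align64pow2(uint64_t v)
    {
    	/* smear the highest set bit of v - 1 into all lower bits,
    	 * then add 1 to reach the next power of 2 */
    	v--;
    	v |= v >> 1;
    	v |= v >> 2;
    	v |= v >> 4;
    	v |= v >> 8;
    	v |= v >> 16;
    	v |= v >> 32;

    	return v + 1;
    }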

  
Jonas Pfefferle Aug. 8, 2017, 8:47 a.m. UTC | #4
"Ananyev, Konstantin" <konstantin.ananyev@intel.com> wrote on 08/08/2017
10:27:28 AM:

> From: "Ananyev, Konstantin" <konstantin.ananyev@intel.com>
> To: Alexey Kardashevskiy <aik@ozlabs.ru>, Jonas Pfefferle
> <jpf@zurich.ibm.com>, "Burakov, Anatoly" <anatoly.burakov@intel.com>
> Cc: "dev@dpdk.org" <dev@dpdk.org>
> Date: 08/08/2017 10:27 AM
> Subject: RE: [dpdk-dev] [PATCH] vfio: fix sPAPR IOMMU DMA window size
>
>
>
> > [...]
>
> As I remember, there already exists rte_align64pow2().
> Konstantin

Thanks. Fixed it.

  

Patch

diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 946df7e..8502216 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -722,6 +722,18 @@  vfio_type1_dma_map(int vfio_container_fd)
 	return 0;
 }
 
+static uint64_t
+roundup_next_pow2(uint64_t n)
+{
+	uint32_t i;
+
+	n--;
+	for (i = 1; i < sizeof(n) * CHAR_BIT; i += i)
+		n |= n >> i;
+
+	return ++n;
+}
+
 static int
 vfio_spapr_dma_map(int vfio_container_fd)
 {
@@ -759,10 +771,12 @@  vfio_spapr_dma_map(int vfio_container_fd)
 		return -1;
 	}
 
-	/* calculate window size based on number of hugepages configured */
-	create.window_size = rte_eal_get_physmem_size();
+	/* physical pages are sorted descending, i.e. ms[0].phys_addr is max */
+	/* create DMA window from 0 to max(phys_addr + len) */
+	/* sPAPR requires window size to be a power of 2 */
+	create.window_size = roundup_next_pow2(ms[0].phys_addr + ms[0].len);
 	create.page_shift = __builtin_ctzll(ms->hugepage_sz);
-	create.levels = 2;
+	create.levels = 1;
 
 	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
 	if (ret) {
@@ -771,6 +785,11 @@  vfio_spapr_dma_map(int vfio_container_fd)
 		return -1;
 	}
 
+	if (create.start_addr != 0) {
+		RTE_LOG(ERR, EAL, "  DMA window start address != 0\n");
+		return -1;
+	}
+
 	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
 	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
 		struct vfio_iommu_type1_dma_map dma_map;
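
To see why sizing the window with rte_eal_get_physmem_size() could fail,
consider a hypothetical layout (illustrative values only, not taken from
the thread): a single 1GB hugepage segment living high in physical
memory.

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Same bit-smearing round-up as the patch, loop unrolled. */
    static uint64_t
    roundup_next_pow2(uint64_t n)
    {
    	n--;
    	n |= n >> 1;
    	n |= n >> 2;
    	n |= n >> 4;
    	n |= n >> 8;
    	n |= n >> 16;
    	n |= n >> 32;
    	return n + 1;
    }

    int main(void)
    {
    	/* hypothetical segment: 1GB at physical 0x3f80000000 */
    	uint64_t phys_addr = 0x3f80000000ULL;
    	uint64_t len = 1ULL << 30;

    	/* old sizing: total hugepage memory = 1GB, a window that
    	 * ends far below the segment's physical address */
    	printf("old window size: 0x%" PRIx64 "\n", len);

    	/* new sizing: round max(phys_addr + len) up to a power of 2
    	 * -> 0x4000000000 (256GB), so the 1:1 PA-to-IOVA mapping of
    	 * the segment fits inside the window */
    	printf("new window size: 0x%" PRIx64 "\n",
    	       roundup_next_pow2(phys_addr + len));
    	return 0;
    }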