[dpdk-dev] eal: sPAPR IOMMU support in pci probing for vfio-pci in ppc64le

Message ID b89545d7530c68e4ab0fb4de05b4bf143e5c99a6.1486707309.git.gowrishankar.m@linux.vnet.ibm.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel compilation success Compilation OK

Commit Message

Gowrishankar Feb. 10, 2017, 6:18 a.m. UTC
  From: Gowrishankar Muthukrishnan <gowrishankar.m@linux.vnet.ibm.com>

The changes below add PCI probing support for vfio-pci devices on POWER8.

Signed-off-by: Gowrishankar Muthukrishnan <gowrishankar.m@linux.vnet.ibm.com>
Acked-by: Chao Zhu <chaozhu@linux.vnet.ibm.com>
---
 lib/librte_eal/linuxapp/eal/eal_vfio.c | 88 ++++++++++++++++++++++++++++++++++
 lib/librte_eal/linuxapp/eal/eal_vfio.h |  1 +
 2 files changed, 89 insertions(+)
  

Comments

Gowrishankar Feb. 11, 2017, 3:26 a.m. UTC | #1
Hi Thomas,
I see rc3 is out. Could this patch also go into 17.02 (rc4?).

This patch is ppc64le-specific (it does not affect other architectures) and it
makes PMDs over vfio-pci usable on this architecture.

Thanks,
Gowrishankar

On Friday 10 February 2017 11:48 AM, Gowrishankar wrote:
> From: Gowrishankar Muthukrishnan <gowrishankar.m@linux.vnet.ibm.com>
>
> Below changes adds pci probing support for vfio-pci devices in power8.
>
> Signed-off-by: Gowrishankar Muthukrishnan <gowrishankar.m@linux.vnet.ibm.com>
> Acked-by: Chao Zhu <chaozhu@linux.vnet.ibm.com>
> ---
>   lib/librte_eal/linuxapp/eal/eal_vfio.c | 88 ++++++++++++++++++++++++++++++++++
>   lib/librte_eal/linuxapp/eal/eal_vfio.h |  1 +
>   2 files changed, 89 insertions(+)
>
> diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> index 702f7a2..1d4fea6 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> @@ -50,12 +50,15 @@
>   static struct vfio_config vfio_cfg;
>
>   static int vfio_type1_dma_map(int);
> +static int vfio_spapr_dma_map(int);
>   static int vfio_noiommu_dma_map(int);
>
>   /* IOMMU types we support */
>   static const struct vfio_iommu_type iommu_types[] = {
>   	/* x86 IOMMU, otherwise known as type 1 */
>   	{ RTE_VFIO_TYPE1, "Type 1", &vfio_type1_dma_map},
> +	/* ppc64 IOMMU, otherwise known as spapr */
> +	{ RTE_VFIO_SPAPR, "sPAPR", &vfio_spapr_dma_map},
>   	/* IOMMU-less mode */
>   	{ RTE_VFIO_NOIOMMU, "No-IOMMU", &vfio_noiommu_dma_map},
>   };
> @@ -540,6 +543,91 @@ int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
>   }
>
>   static int
> +vfio_spapr_dma_map(int vfio_container_fd)
> +{
> +	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
> +	int i, ret;
> +
> +	struct vfio_iommu_spapr_register_memory reg = {
> +		.argsz = sizeof(reg),
> +		.flags = 0
> +	};
> +	struct vfio_iommu_spapr_tce_info info = {
> +		.argsz = sizeof(info),
> +	};
> +	struct vfio_iommu_spapr_tce_create create = {
> +		.argsz = sizeof(create),
> +	};
> +	struct vfio_iommu_spapr_tce_remove remove = {
> +		.argsz = sizeof(remove),
> +	};
> +
> +	/* query spapr iommu info */
> +	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
> +	if (ret) {
> +		RTE_LOG(ERR, EAL, "  cannot get iommu info, "
> +				"error %i (%s)\n", errno, strerror(errno));
> +		return -1;
> +	}
> +
> +	/* remove default DMA of 32 bit window */
> +	remove.start_addr = info.dma32_window_start;
> +	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
> +	if (ret) {
> +		RTE_LOG(ERR, EAL, "  cannot remove default DMA window, "
> +				"error %i (%s)\n", errno, strerror(errno));
> +		return -1;
> +	}
> +
> +	/* calculate window size based on number of hugepages configured */
> +	create.window_size = rte_eal_get_physmem_size();
> +	create.page_shift = __builtin_ctzll(ms->hugepage_sz);
> +	create.levels = 2;
> +
> +	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
> +	if (ret) {
> +		RTE_LOG(ERR, EAL, "  cannot create new DMA window, "
> +				"error %i (%s)\n", errno, strerror(errno));
> +		return -1;
> +	}
> +
> +	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
> +	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
> +		struct vfio_iommu_type1_dma_map dma_map;
> +
> +		if (ms[i].addr == NULL)
> +			break;
> +
> +		reg.vaddr = (uintptr_t) ms[i].addr;
> +		reg.size = ms[i].len;
> +		ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
> +		if (ret) {
> +			RTE_LOG(ERR, EAL, "  cannot register vaddr for IOMMU, "
> +					"error %i (%s)\n", errno, strerror(errno));
> +			return -1;
> +		}
> +
> +		memset(&dma_map, 0, sizeof(dma_map));
> +		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
> +		dma_map.vaddr = ms[i].addr_64;
> +		dma_map.size = ms[i].len;
> +		dma_map.iova = ms[i].phys_addr;
> +		dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
> +
> +		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
> +
> +		if (ret) {
> +			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, "
> +					"error %i (%s)\n", errno, strerror(errno));
> +			return -1;
> +		}
> +
> +	}
> +
> +	return 0;
> +}
> +
> +static int
>   vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
>   {
>   	/* No-IOMMU mode does not need DMA mapping */
> diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
> index 29f7f3e..533b854 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
> +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
> @@ -53,6 +53,7 @@
>   #endif
>
>   #define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU
> +#define RTE_VFIO_SPAPR VFIO_SPAPR_TCE_v2_IOMMU
>
>   #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0)
>   #define RTE_VFIO_NOIOMMU 8
  
Thomas Monjalon Feb. 11, 2017, 8:18 a.m. UTC | #2
2017-02-11 08:56, gowrishankar muthukrishnan:
> Hi Thomas,
> I see rc3 out. Could this patch also go in 17.02 (rc4 ?).
> 
> This patch is ppc64le specific (w/o affecting other arch) and it enables 
> pmd over vfio-pci be useful for this arch.

You sent this patch yesterday. We must wait a few days to allow
others to comment.

And it is really too late to add such a big patch that is not a fix.
We really need to close this release ASAP without taking any risks, sorry.
  
Gowrishankar Feb. 23, 2017, 5:27 a.m. UTC | #3
Hi,
Could this be reviewed to gather a few more acks (though the changes are only for
ppc64le)?

If needed, I can send a release notes update separately for this support.

Regards,
Gowrishankar

On Friday 10 February 2017 11:48 AM, Gowrishankar wrote:
> From: Gowrishankar Muthukrishnan <gowrishankar.m@linux.vnet.ibm.com>
>
> Below changes adds pci probing support for vfio-pci devices in power8.
>
> Signed-off-by: Gowrishankar Muthukrishnan <gowrishankar.m@linux.vnet.ibm.com>
> Acked-by: Chao Zhu <chaozhu@linux.vnet.ibm.com>
> ---
>   lib/librte_eal/linuxapp/eal/eal_vfio.c | 88 ++++++++++++++++++++++++++++++++++
>   lib/librte_eal/linuxapp/eal/eal_vfio.h |  1 +
>   2 files changed, 89 insertions(+)
>
> diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> index 702f7a2..1d4fea6 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> @@ -50,12 +50,15 @@
>   static struct vfio_config vfio_cfg;
>
>   static int vfio_type1_dma_map(int);
> +static int vfio_spapr_dma_map(int);
>   static int vfio_noiommu_dma_map(int);
>
>   /* IOMMU types we support */
>   static const struct vfio_iommu_type iommu_types[] = {
>   	/* x86 IOMMU, otherwise known as type 1 */
>   	{ RTE_VFIO_TYPE1, "Type 1", &vfio_type1_dma_map},
> +	/* ppc64 IOMMU, otherwise known as spapr */
> +	{ RTE_VFIO_SPAPR, "sPAPR", &vfio_spapr_dma_map},
>   	/* IOMMU-less mode */
>   	{ RTE_VFIO_NOIOMMU, "No-IOMMU", &vfio_noiommu_dma_map},
>   };
> @@ -540,6 +543,91 @@ int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
>   }
>
>   static int
> +vfio_spapr_dma_map(int vfio_container_fd)
> +{
> +	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
> +	int i, ret;
> +
> +	struct vfio_iommu_spapr_register_memory reg = {
> +		.argsz = sizeof(reg),
> +		.flags = 0
> +	};
> +	struct vfio_iommu_spapr_tce_info info = {
> +		.argsz = sizeof(info),
> +	};
> +	struct vfio_iommu_spapr_tce_create create = {
> +		.argsz = sizeof(create),
> +	};
> +	struct vfio_iommu_spapr_tce_remove remove = {
> +		.argsz = sizeof(remove),
> +	};
> +
> +	/* query spapr iommu info */
> +	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
> +	if (ret) {
> +		RTE_LOG(ERR, EAL, "  cannot get iommu info, "
> +				"error %i (%s)\n", errno, strerror(errno));
> +		return -1;
> +	}
> +
> +	/* remove default DMA of 32 bit window */
> +	remove.start_addr = info.dma32_window_start;
> +	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
> +	if (ret) {
> +		RTE_LOG(ERR, EAL, "  cannot remove default DMA window, "
> +				"error %i (%s)\n", errno, strerror(errno));
> +		return -1;
> +	}
> +
> +	/* calculate window size based on number of hugepages configured */
> +	create.window_size = rte_eal_get_physmem_size();
> +	create.page_shift = __builtin_ctzll(ms->hugepage_sz);
> +	create.levels = 2;
> +
> +	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
> +	if (ret) {
> +		RTE_LOG(ERR, EAL, "  cannot create new DMA window, "
> +				"error %i (%s)\n", errno, strerror(errno));
> +		return -1;
> +	}
> +
> +	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
> +	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
> +		struct vfio_iommu_type1_dma_map dma_map;
> +
> +		if (ms[i].addr == NULL)
> +			break;
> +
> +		reg.vaddr = (uintptr_t) ms[i].addr;
> +		reg.size = ms[i].len;
> +		ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
> +		if (ret) {
> +			RTE_LOG(ERR, EAL, "  cannot register vaddr for IOMMU, "
> +					"error %i (%s)\n", errno, strerror(errno));
> +			return -1;
> +		}
> +
> +		memset(&dma_map, 0, sizeof(dma_map));
> +		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
> +		dma_map.vaddr = ms[i].addr_64;
> +		dma_map.size = ms[i].len;
> +		dma_map.iova = ms[i].phys_addr;
> +		dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
> +
> +		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
> +
> +		if (ret) {
> +			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, "
> +					"error %i (%s)\n", errno, strerror(errno));
> +			return -1;
> +		}
> +
> +	}
> +
> +	return 0;
> +}
> +
> +static int
>   vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
>   {
>   	/* No-IOMMU mode does not need DMA mapping */
> diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
> index 29f7f3e..533b854 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
> +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
> @@ -53,6 +53,7 @@
>   #endif
>
>   #define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU
> +#define RTE_VFIO_SPAPR VFIO_SPAPR_TCE_v2_IOMMU
>
>   #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0)
>   #define RTE_VFIO_NOIOMMU 8
  
Anatoly Burakov March 2, 2017, 3:18 p.m. UTC | #4
> From: Gowrishankar [mailto:gowrishankar.m@linux.vnet.ibm.com]
> Sent: Friday, February 10, 2017 6:18 AM
> To: dev@dpdk.org
> Cc: Chao Zhu <chaozhu@linux.vnet.ibm.com>; Thomas Monjalon
> <thomas.monjalon@6wind.com>; Burakov, Anatoly
> <anatoly.burakov@intel.com>; Pradeep <pradeep@us.ibm.com>;
> Gowrishankar Muthukrishnan <gowrishankar.m@linux.vnet.ibm.com>
> Subject: [PATCH] eal: sPAPR IOMMU support in pci probing for vfio-pci in
> ppc64le
> 
> From: Gowrishankar Muthukrishnan <gowrishankar.m@linux.vnet.ibm.com>
> 
> Below changes adds pci probing support for vfio-pci devices in power8.
> 
> Signed-off-by: Gowrishankar Muthukrishnan
> <gowrishankar.m@linux.vnet.ibm.com>
> Acked-by: Chao Zhu <chaozhu@linux.vnet.ibm.com>
> ---
>  lib/librte_eal/linuxapp/eal/eal_vfio.c | 88
> ++++++++++++++++++++++++++++++++++
>  lib/librte_eal/linuxapp/eal/eal_vfio.h |  1 +
>  2 files changed, 89 insertions(+)
> 
> diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c
> b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> index 702f7a2..1d4fea6 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> @@ -50,12 +50,15 @@
>  static struct vfio_config vfio_cfg;
> 
>  static int vfio_type1_dma_map(int);
> +static int vfio_spapr_dma_map(int);
>  static int vfio_noiommu_dma_map(int);
> 
>  /* IOMMU types we support */
>  static const struct vfio_iommu_type iommu_types[] = {
>  	/* x86 IOMMU, otherwise known as type 1 */
>  	{ RTE_VFIO_TYPE1, "Type 1", &vfio_type1_dma_map},
> +	/* ppc64 IOMMU, otherwise known as spapr */
> +	{ RTE_VFIO_SPAPR, "sPAPR", &vfio_spapr_dma_map},
>  	/* IOMMU-less mode */
>  	{ RTE_VFIO_NOIOMMU, "No-IOMMU",
> &vfio_noiommu_dma_map},  }; @@ -540,6 +543,91 @@ int
> vfio_setup_device(const char *sysfs_base, const char *dev_addr,  }
> 
>  static int
> +vfio_spapr_dma_map(int vfio_container_fd) {
> +	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
> +	int i, ret;
> +
> +	struct vfio_iommu_spapr_register_memory reg = {
> +		.argsz = sizeof(reg),
> +		.flags = 0
> +	};
> +	struct vfio_iommu_spapr_tce_info info = {
> +		.argsz = sizeof(info),
> +	};
> +	struct vfio_iommu_spapr_tce_create create = {
> +		.argsz = sizeof(create),
> +	};
> +	struct vfio_iommu_spapr_tce_remove remove = {
> +		.argsz = sizeof(remove),
> +	};
> +
> +	/* query spapr iommu info */
> +	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO,
> &info);
> +	if (ret) {
> +		RTE_LOG(ERR, EAL, "  cannot get iommu info, "
> +				"error %i (%s)\n", errno, strerror(errno));
> +		return -1;
> +	}
> +
> +	/* remove default DMA of 32 bit window */
> +	remove.start_addr = info.dma32_window_start;
> +	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE,
> &remove);
> +	if (ret) {
> +		RTE_LOG(ERR, EAL, "  cannot remove default DMA window, "
> +				"error %i (%s)\n", errno, strerror(errno));
> +		return -1;
> +	}
> +
> +	/* calculate window size based on number of hugepages configured
> */
> +	create.window_size = rte_eal_get_physmem_size();
> +	create.page_shift = __builtin_ctzll(ms->hugepage_sz);
> +	create.levels = 2;
> +
> +	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE,
> &create);
> +	if (ret) {
> +		RTE_LOG(ERR, EAL, "  cannot create new DMA window, "
> +				"error %i (%s)\n", errno, strerror(errno));
> +		return -1;
> +	}
> +
> +	/* map all DPDK segments for DMA. use 1:1 PA to IOVA mapping */
> +	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
> +		struct vfio_iommu_type1_dma_map dma_map;
> +
> +		if (ms[i].addr == NULL)
> +			break;
> +
> +		reg.vaddr = (uintptr_t) ms[i].addr;
> +		reg.size = ms[i].len;
> +		ret = ioctl(vfio_container_fd,
> VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
> +		if (ret) {
> +			RTE_LOG(ERR, EAL, "  cannot register vaddr for
> IOMMU, "
> +					"error %i (%s)\n", errno,
> strerror(errno));
> +			return -1;
> +		}
> +
> +		memset(&dma_map, 0, sizeof(dma_map));
> +		dma_map.argsz = sizeof(struct
> vfio_iommu_type1_dma_map);
> +		dma_map.vaddr = ms[i].addr_64;
> +		dma_map.size = ms[i].len;
> +		dma_map.iova = ms[i].phys_addr;
> +		dma_map.flags = VFIO_DMA_MAP_FLAG_READ |
> VFIO_DMA_MAP_FLAG_WRITE;
> +
> +		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA,
> &dma_map);
> +
> +		if (ret) {
> +			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping,
> "
> +					"error %i (%s)\n", errno,
> strerror(errno));
> +			return -1;
> +		}
> +
> +	}
> +
> +	return 0;
> +}
> +
> +static int
>  vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)  {
>  	/* No-IOMMU mode does not need DMA mapping */ diff --git
> a/lib/librte_eal/linuxapp/eal/eal_vfio.h
> b/lib/librte_eal/linuxapp/eal/eal_vfio.h
> index 29f7f3e..533b854 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
> +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
> @@ -53,6 +53,7 @@
>  #endif
> 
>  #define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU
> +#define RTE_VFIO_SPAPR VFIO_SPAPR_TCE_v2_IOMMU

Does this VFIO type exist for all kernel versions starting with 3.6? It may be worth adding a kernel version check, like the NOIOMMU type has.

> 
>  #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0)  #define
> RTE_VFIO_NOIOMMU 8
> --
> 1.9.1
  

Patch

diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 702f7a2..1d4fea6 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -50,12 +50,15 @@ 
 static struct vfio_config vfio_cfg;
 
 static int vfio_type1_dma_map(int);
+static int vfio_spapr_dma_map(int);
 static int vfio_noiommu_dma_map(int);
 
 /* IOMMU types we support; each entry is { type id, printable name, DMA map callback } */
 static const struct vfio_iommu_type iommu_types[] = {
 	/* x86 IOMMU, otherwise known as type 1 */
 	{ RTE_VFIO_TYPE1, "Type 1", &vfio_type1_dma_map},
+	/* ppc64 IOMMU, otherwise known as sPAPR */
+	{ RTE_VFIO_SPAPR, "sPAPR", &vfio_spapr_dma_map},
 	/* IOMMU-less mode */
 	{ RTE_VFIO_NOIOMMU, "No-IOMMU", &vfio_noiommu_dma_map},
 };
@@ -540,6 +543,91 @@  int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 }
 
 static int
+vfio_spapr_dma_map(int vfio_container_fd) /* sPAPR DMA setup on the container fd; returns 0 on success, -1 if any ioctl fails */
+{
+	const struct rte_memseg *ms = rte_eal_get_physmem_layout();
+	int i, ret;
+
+	struct vfio_iommu_spapr_register_memory reg = {
+		.argsz = sizeof(reg),
+		.flags = 0
+	};
+	struct vfio_iommu_spapr_tce_info info = {
+		.argsz = sizeof(info),
+	};
+	struct vfio_iommu_spapr_tce_create create = {
+		.argsz = sizeof(create),
+	};
+	struct vfio_iommu_spapr_tce_remove remove = {
+		.argsz = sizeof(remove),
+	};
+
+	/* query sPAPR IOMMU info; info.dma32_window_start is used below */
+	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_GET_INFO, &info);
+	if (ret) {
+		RTE_LOG(ERR, EAL, "  cannot get iommu info, "
+				"error %i (%s)\n", errno, strerror(errno));
+		return -1;
+	}
+
+	/* remove the default 32-bit DMA window, identified by the start address GET_INFO reported */
+	remove.start_addr = info.dma32_window_start;
+	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_REMOVE, &remove);
+	if (ret) {
+		RTE_LOG(ERR, EAL, "  cannot remove default DMA window, "
+				"error %i (%s)\n", errno, strerror(errno));
+		return -1;
+	}
+
+	/* describe the replacement window: size from rte_eal_get_physmem_size(), page shift from the first memseg */
+	create.window_size = rte_eal_get_physmem_size(); /* NOTE(review): the kernel may require a power-of-two size - confirm */
+	create.page_shift = __builtin_ctzll(ms->hugepage_sz); /* assumes every segment uses ms[0]'s hugepage size - TODO confirm */
+	create.levels = 2;
+
+	ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_TCE_CREATE, &create);
+	if (ret) {
+		RTE_LOG(ERR, EAL, "  cannot create new DMA window, "
+				"error %i (%s)\n", errno, strerror(errno));
+		return -1;
+	}
+
+	/* register, then map, each DPDK segment for DMA using a 1:1 PA to IOVA mapping */
+	for (i = 0; i < RTE_MAX_MEMSEG; i++) {
+		struct vfio_iommu_type1_dma_map dma_map; /* type1 map struct, passed to VFIO_IOMMU_MAP_DMA below */
+
+		if (ms[i].addr == NULL) /* a NULL addr ends the walk over the segment array */
+			break;
+
+		reg.vaddr = (uintptr_t) ms[i].addr;
+		reg.size = ms[i].len;
+		ret = ioctl(vfio_container_fd, VFIO_IOMMU_SPAPR_REGISTER_MEMORY, &reg);
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  cannot register vaddr for IOMMU, "
+					"error %i (%s)\n", errno, strerror(errno));
+			return -1;
+		}
+
+		memset(&dma_map, 0, sizeof(dma_map));
+		dma_map.argsz = sizeof(struct vfio_iommu_type1_dma_map);
+		dma_map.vaddr = ms[i].addr_64;
+		dma_map.size = ms[i].len;
+		dma_map.iova = ms[i].phys_addr; /* IOVA is set to the segment's physical address (the 1:1 mapping) */
+		dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
+
+		ret = ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
+
+		if (ret) {
+			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, "
+					"error %i (%s)\n", errno, strerror(errno));
+			return -1;
+		}
+
+	}
+
+	return 0;
+}
+
+static int
 vfio_noiommu_dma_map(int __rte_unused vfio_container_fd)
 {
 	/* No-IOMMU mode does not need DMA mapping */
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index 29f7f3e..533b854 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -53,6 +53,7 @@ 
 #endif
 
 #define RTE_VFIO_TYPE1 VFIO_TYPE1_IOMMU
+#define RTE_VFIO_SPAPR VFIO_SPAPR_TCE_v2_IOMMU
 
 #if LINUX_VERSION_CODE < KERNEL_VERSION(4, 5, 0)
 #define RTE_VFIO_NOIOMMU 8