[dpdk-dev] vfio: add hotplug support

Message ID 1489661496-12818-1-git-send-email-alejandro.lucero@netronome.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers

Checks

Context Check Description
ci/Intel-compilation success Compilation OK
ci/checkpatch success coding style OK

Commit Message

Alejandro Lucero March 16, 2017, 10:51 a.m. UTC
  Current device hotplug is just when using UIO. This patch adds
same functionality with VFIO.

It has been validated through tests using IOMMU and also with VFIO
and no-iommu mode.

Signed-off-by: Alejandro Lucero <alejandro.lucero@netronome.com>
---
 lib/librte_eal/common/eal_common_pci.c         |   2 +-
 lib/librte_eal/linuxapp/eal/eal_pci.c          |   5 +-
 lib/librte_eal/linuxapp/eal/eal_pci_init.h     |   1 +
 lib/librte_eal/linuxapp/eal/eal_pci_vfio.c     |  82 ++++++++++-
 lib/librte_eal/linuxapp/eal/eal_vfio.c         | 193 +++++++++++++++++++------
 lib/librte_eal/linuxapp/eal/eal_vfio.h         |  11 +-
 lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c |  23 ++-
 lib/librte_ether/rte_ethdev.c                  |   2 +-
 8 files changed, 263 insertions(+), 56 deletions(-)
  

Comments

Eelco Chaudron March 17, 2017, 11:07 a.m. UTC | #1
On 16/03/17 11:51, Alejandro Lucero wrote:
> Current device hotplug is just when using UIO. This patch adds
> same functionality with VFIO.
>
> It has been validated through tests using IOMMU and also with VFIO
> and no-iommu mode.
>
> Signed-off-by: Alejandro Lucero <alejandro.lucero@netronome.com>
> ---
>   lib/librte_eal/common/eal_common_pci.c         |   2 +-
>   lib/librte_eal/linuxapp/eal/eal_pci.c          |   5 +-
>   lib/librte_eal/linuxapp/eal/eal_pci_init.h     |   1 +
>   lib/librte_eal/linuxapp/eal/eal_pci_vfio.c     |  82 ++++++++++-
>   lib/librte_eal/linuxapp/eal/eal_vfio.c         | 193 +++++++++++++++++++------
>   lib/librte_eal/linuxapp/eal/eal_vfio.h         |  11 +-
>   lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c |  23 ++-
>   lib/librte_ether/rte_ethdev.c                  |   2 +-
>   8 files changed, 263 insertions(+), 56 deletions(-)
Thanks Alex for providing this patch to add the VFIO hotplug 
functionality, and to incorporate my offline review comments.

I also tested this patch in combination with Open vSwitch 2.70's hotplug 
feature and did not see any problems.

//Eelco

Reviewed-by: Eelco Chaudron <echaudro@redhat.com>
Tested-by: Eelco Chaudron <echaudro@redhat.com>
  
Burakov, Anatoly March 22, 2017, 1:49 p.m. UTC | #2
Hi Alejandro,

Just a general comment, I think there's now a rule to not split error messages onto multiple lines
even if going above the 80 char limit, to make grepping easier.

> Current device hotplug is just when using UIO. This patch adds
> same functionality with VFIO.
> 
> It has been validated through tests using IOMMU and also with VFIO
> and no-iommu mode.
> 

<...>

>  /* map VFIO resource prototype */
>  int pci_vfio_map_resource(struct rte_pci_device *dev);
> +int pci_vfio_unmap_resource(struct rte_pci_device *dev);

Probably worth it to fix the comment here?

> @@ -517,7 +521,7 @@
>  		}
> 
>  		/* set bus mastering for the device */
> -		if (pci_vfio_set_bus_master(vfio_dev_fd)) {
> +		if (pci_vfio_set_bus_master(vfio_dev_fd, 1)) {

"true"? Doesn't make a difference, but if you're accepting bool, you might as well
use bool values when calling :)

>  			RTE_LOG(ERR, EAL, "  %s cannot set up bus
> mastering!\n", pci_addr);
>  			close(vfio_dev_fd);
>  			rte_free(vfio_res);
> @@ -535,6 +539,76 @@
>  }
> 
>  int
> +pci_vfio_unmap_resource(struct rte_pci_device *dev)
> +{
> +	char pci_addr[PATH_MAX] = {0};
> +	struct rte_pci_addr *loc = &dev->addr;
> +	int i, ret;
> +	struct mapped_pci_resource *vfio_res = NULL;
> +	struct mapped_pci_res_list *vfio_res_list;
> +
> +	struct pci_map *maps;
> +
> +	/* store PCI address string */
> +	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
> +			loc->domain, loc->bus, loc->devid, loc->function);
> +
> +
> +	if (close(dev->intr_handle.fd) < 0) {
> +		RTE_LOG(INFO, EAL, "Error when Closing eventfd file descriptor for %s\n",

"closing"

> +			pci_addr);
> +		return -1;
> +	}
> +
> +	if (pci_vfio_set_bus_master(dev->intr_handle.vfio_dev_fd, 0)) {

"false"?

> +		RTE_LOG(ERR, EAL, "  %s cannot unset bus mastering for PCI
> device!\n",
> +				pci_addr);
> +		return -1;
> +	}
> +
> +	ret = vfio_release_device(pci_get_sysfs_path(), pci_addr,
> +				  dev->intr_handle.vfio_dev_fd);
> +	if (ret < 0) {
> +		RTE_LOG(ERR, EAL,
> +			"%s(): cannot release device\n", __func__);
> +		return ret;
> +	}
> +
> +	vfio_res_list = RTE_TAILQ_CAST(rte_vfio_tailq.head,
> mapped_pci_res_list);
> +	/* Get vfio_res */
> +	TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
> +		if (memcmp(&vfio_res->pci_addr, &dev->addr, sizeof(dev-
> >addr)))
> +			continue;
> +		break;
> +	}
> +	/* if we haven't found our tailq entry, something's wrong */
> +	if (vfio_res == NULL) {
> +		RTE_LOG(ERR, EAL, "  %s cannot find TAILQ entry for PCI
> device!\n",
> +				pci_addr);
> +		return -1;
> +	}
> +
> +	/* map BARs */
> +	maps = vfio_res->maps;

Fix comment

> +
> +	RTE_LOG(INFO, EAL, "Releasing pci mapped resource for %s\n",
> +		pci_addr);
> +	for (i = 0; i < (int) vfio_res->nb_maps; i++) {
> +
> +		/* TODO: what about the maps offset field? */

You probably need to perform the same calculations mapping function does if you
are unmapping an MSI-X BAR. As far as I understand, the address space is mapped
in whole (i.e. the maps[i].addr covers the whole BAR), but the things that map into
that address space are mapped using two separate mmap calls. So it's probably a
matter of 1) figuring out if maps[i] points to an MSI-X BAR, 2) figuring out if there
is other stuff in the register that can be mapped, and 3) unmapping that. We pretty
much map those unconditionally, so if the device is up, they're guaranteed to be mapped.

> +		if (maps[i].addr) {
> +			RTE_LOG(INFO, EAL, "Calling pci_unmap_resource
> for %s at %p\n",
> +				pci_addr, maps[i].addr);
> +			pci_unmap_resource(maps[i].addr, maps[i].size);
> +		}
> +	}
> +
> +	TAILQ_REMOVE(vfio_res_list, vfio_res, next);
> +
> +	return 0;
> +}
> +
> +int
>  pci_vfio_ioport_map(struct rte_pci_device *dev, int bar,
>  		    struct rte_pci_ioport *p)
>  {
> diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c
> b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> index 9377a66..42e0f62 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> @@ -68,13 +68,32 @@
>  {
>  	int i;
>  	int vfio_group_fd;
> +	int group_idx = -1;
>  	char filename[PATH_MAX];
> 
>  	/* check if we already have the group descriptor open */
> -	for (i = 0; i < vfio_cfg.vfio_group_idx; i++)
> +	for (i = 0; i < VFIO_MAX_GROUPS; i++)
>  		if (vfio_cfg.vfio_groups[i].group_no == iommu_group_no)
>  			return vfio_cfg.vfio_groups[i].fd;
> 
> +	/* Lets see first if there is room for a new group */
> +	if (vfio_cfg.vfio_active_groups == VFIO_MAX_GROUPS) {
> +		RTE_LOG(ERR, EAL, "Maximum number of VFIO groups
> reached!\n");
> +		return -1;
> +	}
> +
> +	/* Now lets get an index for the new group */
> +	for (i = 0; i < VFIO_MAX_GROUPS; i++)
> +		if (vfio_cfg.vfio_groups[i].group_no == -1) {
> +			group_idx = i;
> +			break;
> +		}
> +
> +	/* This should not happen */
> +	if (group_idx == -1) {
> +		RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
> +		return -1;
> +	}
>  	/* if primary, try to open the group */
>  	if (internal_config.process_type == RTE_PROC_PRIMARY) {
>  		/* try regular group format */
> @@ -104,14 +123,9 @@
>  			/* noiommu group found */
>  		}
> 
> -		/* if the fd is valid, create a new group for it */
> -		if (vfio_cfg.vfio_group_idx == VFIO_MAX_GROUPS) {
> -			RTE_LOG(ERR, EAL, "Maximum number of VFIO
> groups reached!\n");
> -			close(vfio_group_fd);
> -			return -1;
> -		}
> -		vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no =
> iommu_group_no;
> -		vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd =
> vfio_group_fd;
> +		vfio_cfg.vfio_groups[group_idx].group_no =
> iommu_group_no;
> +		vfio_cfg.vfio_groups[group_idx].fd = vfio_group_fd;
> +		vfio_cfg.vfio_active_groups++;
>  		return vfio_group_fd;
>  	}
>  	/* if we're in a secondary process, request group fd from the primary
> @@ -158,14 +172,62 @@
>  	return -1;
>  }
> 
> -static void
> -clear_current_group(void)
> +int
> +clear_group(int vfio_group_fd)
>  {
> -	vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = 0;
> -	vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = -1;
> +	int i;
> +	int socket_fd, ret;
> +
> +	if (internal_config.process_type == RTE_PROC_PRIMARY) {
> +
> +		for (i = 0; i < VFIO_MAX_GROUPS; i++)
> +			if (vfio_cfg.vfio_groups[i].fd == vfio_group_fd) {
> +				vfio_cfg.vfio_groups[i].group_no = -1;
> +				vfio_cfg.vfio_groups[i].fd = -1;
> +				vfio_cfg.vfio_active_groups--;
> +				return 0;
> +			}
> +		return -1;
> +	}
> +
> +	/* This is just for SECONDARY processes */
> +	socket_fd = vfio_mp_sync_connect_to_primary();
> +
> +	if (socket_fd < 0) {
> +		RTE_LOG(ERR, EAL, "  cannot connect to primary
> process!\n");
> +		return -1;
> +	}
> +
> +	if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) <
> 0) {
> +		RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
> +		close(socket_fd);
> +		return -1;
> +	}
> +
> +	if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
> +		RTE_LOG(ERR, EAL, "  cannot send group fd!\n");
> +		close(socket_fd);
> +		return -1;
> +	}
> +
> +	ret = vfio_mp_sync_receive_request(socket_fd);
> +	switch (ret) {
> +	case SOCKET_NO_FD:
> +		RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
> +		close(socket_fd);
> +		break;
> +	case SOCKET_OK:
> +		close(socket_fd);
> +		return 0;
> +	default:
> +		RTE_LOG(ERR, EAL, "  UNKNOWN reply, %d\n", ret);
> +		close(socket_fd);

Technically, SOCKET_ERR isn't an "UNKNOWN" reply, so maybe handle it explicitly?

That said, please forgive me my ignorance on hotplug, but does secondary process
need to notify primary about group clearing at all? I mean, doesn't primary process
also get the hotplug event? I would guess secondary process just needs to unmap
the device and that's it, no?

> +	}
> +	return -1;
>  }
> 
> -int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
> +int
> +vfio_setup_device(const char *sysfs_base, const char *dev_addr,
>  		int *vfio_dev_fd, struct vfio_device_info *device_info)
>  {
>  	struct vfio_group_status group_status = {

<...>

> +		 */
> +
> +		RTE_LOG(WARNING, EAL, "Getting a vfio_dev_fd for %s
> failed\n",
>  				dev_addr);
> -		return 1;
> +		close(vfio_group_fd);
> +		clear_group(vfio_group_fd);
> +		return -1;

I've had suspicions that it would be a bug, but I tested it on both modern and old kernels (where
ports on the same NIC are assigned same IOMMU group), and it works as it should. I think there wasn't
a "which kernel driver" EAL PCI code back when this was written, so returning 1 was necessary to not fail
when multiple ports shared the same IOMMU group, but only one of them was bound to VFIO.

>  	}
> 
>  	/* test and setup the device */
>  	ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
>  	if (ret) {
>  		RTE_LOG(ERR, EAL, "  %s cannot get device info, "
> -				"error %i (%s)\n", dev_addr, errno,
> strerror(errno));

<...>

> @@ -155,6 +154,10 @@ struct vfio_iommu_type {
>  int
>  vfio_get_group_fd(int iommu_group_no);
> 
> +/* remove group fd from internal VFIO group fd array */
> +int
> +clear_group(int vfio_group_fd);
> +
>  /**
>   * Setup vfio_cfg for the device identified by its address. It discovers
>   * the configured I/O MMU groups or sets a new one for the device. If a new
> @@ -165,6 +168,9 @@ struct vfio_iommu_type {
>  int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
>  		int *vfio_dev_fd, struct vfio_device_info *device_info);
> 
> +int vfio_release_device(const char *sysfs_base, const char *dev_addr, int
> fd);
> +
> +int vfio_enable(const char *modname);
>  int vfio_enable(const char *modname);

Duplicate vfio_enable declaration?

>  int vfio_is_enabled(const char *modname);
> 
> @@ -175,6 +181,7 @@ int vfio_setup_device(const char *sysfs_base, const
> char *dev_addr,
> 
>  #define SOCKET_REQ_CONTAINER 0x100
>  #define SOCKET_REQ_GROUP 0x200
> +#define SOCKET_CLR_GROUP 0x300
>  #define SOCKET_OK 0x0
>  #define SOCKET_NO_FD 0x1
>  #define SOCKET_ERR 0xFF

Thanks,
Anatoly
  
Burakov, Anatoly March 23, 2017, 12:08 p.m. UTC | #3
Hi Alejandro,

I think the documentation should also be updated to reflect the fact that this patch adds VFIO support for hotplug.

Thanks,
Anatoly
  
Alejandro Lucero March 24, 2017, 4:38 p.m. UTC | #4
Hi Anatoly,

On Wed, Mar 22, 2017 at 1:49 PM, Burakov, Anatoly <anatoly.burakov@intel.com
> wrote:

> Hi Alejandro,
>
> Just a general comment, I think there's now a rule to not split error
> messages onto multiple lines
> even if going above the 80 char limit, to make grepping easier.
>
>
Ok. Thanks.


> > Current device hotplug is just when using UIO. This patch adds
> > same functionality with VFIO.
> >
> > It has been validated through tests using IOMMU and also with VFIO
> > and no-iommu mode.
> >
>
> <...>
>
>
Those lines are not error messages. Are you suggesting not splitting lines
in the commit comment?


> >  /* map VFIO resource prototype */
> >  int pci_vfio_map_resource(struct rte_pci_device *dev);
> > +int pci_vfio_unmap_resource(struct rte_pci_device *dev);
>
> Probably worth it to fix the comment here?
>

Yes. Thanks.


>
> > @@ -517,7 +521,7 @@
> >               }
> >
> >               /* set bus mastering for the device */
> > -             if (pci_vfio_set_bus_master(vfio_dev_fd)) {
> > +             if (pci_vfio_set_bus_master(vfio_dev_fd, 1)) {
>
> "true"? Doesn't make a difference, but if you're accepting bool, you might
> as well
> use bool values when calling :)
>
>
Right.


> >                       RTE_LOG(ERR, EAL, "  %s cannot set up bus
> > mastering!\n", pci_addr);
> >                       close(vfio_dev_fd);
> >                       rte_free(vfio_res);
> > @@ -535,6 +539,76 @@
> >  }
> >
> >  int
> > +pci_vfio_unmap_resource(struct rte_pci_device *dev)
> > +{
> > +     char pci_addr[PATH_MAX] = {0};
> > +     struct rte_pci_addr *loc = &dev->addr;
> > +     int i, ret;
> > +     struct mapped_pci_resource *vfio_res = NULL;
> > +     struct mapped_pci_res_list *vfio_res_list;
> > +
> > +     struct pci_map *maps;
> > +
> > +     /* store PCI address string */
> > +     snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
> > +                     loc->domain, loc->bus, loc->devid, loc->function);
> > +
> > +
> > +     if (close(dev->intr_handle.fd) < 0) {
> > +             RTE_LOG(INFO, EAL, "Error when Closing eventfd file
> descriptor for %s\n",
>
> "closing"
>
>
Well spotted. Thanks.


> > +                     pci_addr);
> > +             return -1;
> > +     }
> > +
> > +     if (pci_vfio_set_bus_master(dev->intr_handle.vfio_dev_fd, 0)) {
>
> "false"?
>
>
Right.


> > +             RTE_LOG(ERR, EAL, "  %s cannot unset bus mastering for PCI
> > device!\n",
> > +                             pci_addr);
> > +             return -1;
> > +     }
> > +
> > +     ret = vfio_release_device(pci_get_sysfs_path(), pci_addr,
> > +                               dev->intr_handle.vfio_dev_fd);
> > +     if (ret < 0) {
> > +             RTE_LOG(ERR, EAL,
> > +                     "%s(): cannot release device\n", __func__);
> > +             return ret;
> > +     }
> > +
> > +     vfio_res_list = RTE_TAILQ_CAST(rte_vfio_tailq.head,
> > mapped_pci_res_list);
> > +     /* Get vfio_res */
> > +     TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
> > +             if (memcmp(&vfio_res->pci_addr, &dev->addr, sizeof(dev-
> > >addr)))
> > +                     continue;
> > +             break;
> > +     }
> > +     /* if we haven't found our tailq entry, something's wrong */
> > +     if (vfio_res == NULL) {
> > +             RTE_LOG(ERR, EAL, "  %s cannot find TAILQ entry for PCI
> > device!\n",
> > +                             pci_addr);
> > +             return -1;
> > +     }
> > +
> > +     /* map BARs */
> > +     maps = vfio_res->maps;
>
> Fix comment
>
>
What do you refer to? The TODO line?


> > +
> > +     RTE_LOG(INFO, EAL, "Releasing pci mapped resource for %s\n",
> > +             pci_addr);
> > +     for (i = 0; i < (int) vfio_res->nb_maps; i++) {
> > +
> > +             /* TODO: what about the maps offset field? */
>
> You probably need to perform the same calculations mapping function does
> if you
> are unmapping an MSI-X BAR. As far as I understand, the address space is
> mapped
> in whole (i.e. the maps[i].addr covers the whole BAR), but the things that
> map into
> that address space are mapped using two separate mmap calls. So it's
> probably a
> matter of 1) figuring out if maps[i] points to an MSI-X BAR, 2) figuring
> out if there
> is other stuff in the register that can be mapped, and 3) unmapping that.
> We pretty
> much map those unconditionally, so if the device is up, they're guaranteed
> to be mapped.
>

Maybe because I did not remove that TODO comment, this is all a bit
confusing.

When mapping, a BAR with the msix table needs to be handled specifically,
doing two mmap calls leaving out the msix table addresses. But although
there are two mmap calls, there is just one mmap region (from the process
point of view), because the address to use for the second call is just the
end of the first mmap. Note this is about the virtual addresses to be used
by the app and not the physical addresses really used at the end. So inside
the maps array, just one mmap region is saved. Doing the unmap is just a
matter of using that maps array and use the maps[x].size accordingly.



>
> > +             if (maps[i].addr) {
> > +                     RTE_LOG(INFO, EAL, "Calling pci_unmap_resource
> > for %s at %p\n",
> > +                             pci_addr, maps[i].addr);
> > +                     pci_unmap_resource(maps[i].addr, maps[i].size);
> > +             }
> > +     }
> > +
> > +     TAILQ_REMOVE(vfio_res_list, vfio_res, next);
> > +
> > +     return 0;
> > +}
> > +
> > +int
> >  pci_vfio_ioport_map(struct rte_pci_device *dev, int bar,
> >                   struct rte_pci_ioport *p)
> >  {
> > diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c
> > b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> > index 9377a66..42e0f62 100644
> > --- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
> > +++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
> > @@ -68,13 +68,32 @@
> >  {
> >       int i;
> >       int vfio_group_fd;
> > +     int group_idx = -1;
> >       char filename[PATH_MAX];
> >
> >       /* check if we already have the group descriptor open */
> > -     for (i = 0; i < vfio_cfg.vfio_group_idx; i++)
> > +     for (i = 0; i < VFIO_MAX_GROUPS; i++)
> >               if (vfio_cfg.vfio_groups[i].group_no == iommu_group_no)
> >                       return vfio_cfg.vfio_groups[i].fd;
> >
> > +     /* Lets see first if there is room for a new group */
> > +     if (vfio_cfg.vfio_active_groups == VFIO_MAX_GROUPS) {
> > +             RTE_LOG(ERR, EAL, "Maximum number of VFIO groups
> > reached!\n");
> > +             return -1;
> > +     }
> > +
> > +     /* Now lets get an index for the new group */
> > +     for (i = 0; i < VFIO_MAX_GROUPS; i++)
> > +             if (vfio_cfg.vfio_groups[i].group_no == -1) {
> > +                     group_idx = i;
> > +                     break;
> > +             }
> > +
> > +     /* This should not happen */
> > +     if (group_idx == -1) {
> > +             RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
> > +             return -1;
> > +     }
> >       /* if primary, try to open the group */
> >       if (internal_config.process_type == RTE_PROC_PRIMARY) {
> >               /* try regular group format */
> > @@ -104,14 +123,9 @@
> >                       /* noiommu group found */
> >               }
> >
> > -             /* if the fd is valid, create a new group for it */
> > -             if (vfio_cfg.vfio_group_idx == VFIO_MAX_GROUPS) {
> > -                     RTE_LOG(ERR, EAL, "Maximum number of VFIO
> > groups reached!\n");
> > -                     close(vfio_group_fd);
> > -                     return -1;
> > -             }
> > -             vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no =
> > iommu_group_no;
> > -             vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd =
> > vfio_group_fd;
> > +             vfio_cfg.vfio_groups[group_idx].group_no =
> > iommu_group_no;
> > +             vfio_cfg.vfio_groups[group_idx].fd = vfio_group_fd;
> > +             vfio_cfg.vfio_active_groups++;
> >               return vfio_group_fd;
> >       }
> >       /* if we're in a secondary process, request group fd from the
> primary
> > @@ -158,14 +172,62 @@
> >       return -1;
> >  }
> >
> > -static void
> > -clear_current_group(void)
> > +int
> > +clear_group(int vfio_group_fd)
> >  {
> > -     vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = 0;
> > -     vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = -1;
> > +     int i;
> > +     int socket_fd, ret;
> > +
> > +     if (internal_config.process_type == RTE_PROC_PRIMARY) {
> > +
> > +             for (i = 0; i < VFIO_MAX_GROUPS; i++)
> > +                     if (vfio_cfg.vfio_groups[i].fd == vfio_group_fd) {
> > +                             vfio_cfg.vfio_groups[i].group_no = -1;
> > +                             vfio_cfg.vfio_groups[i].fd = -1;
> > +                             vfio_cfg.vfio_active_groups--;
> > +                             return 0;
> > +                     }
> > +             return -1;
> > +     }
> > +
> > +     /* This is just for SECONDARY processes */
> > +     socket_fd = vfio_mp_sync_connect_to_primary();
> > +
> > +     if (socket_fd < 0) {
> > +             RTE_LOG(ERR, EAL, "  cannot connect to primary
> > process!\n");
> > +             return -1;
> > +     }
> > +
> > +     if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) <
> > 0) {
> > +             RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
> > +             close(socket_fd);
> > +             return -1;
> > +     }
> > +
> > +     if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
> > +             RTE_LOG(ERR, EAL, "  cannot send group fd!\n");
> > +             close(socket_fd);
> > +             return -1;
> > +     }
> > +
> > +     ret = vfio_mp_sync_receive_request(socket_fd);
> > +     switch (ret) {
> > +     case SOCKET_NO_FD:
> > +             RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
> > +             close(socket_fd);
> > +             break;
> > +     case SOCKET_OK:
> > +             close(socket_fd);
> > +             return 0;
> > +     default:
> > +             RTE_LOG(ERR, EAL, "  UNKNOWN reply, %d\n", ret);
> > +             close(socket_fd);
>
> Technically, SOCKET_ERR isn't an "UNKNOWN" reply, so maybe handle it
> explicitly?
>

Yes. Good point.


>
> That said, please forgive me my ignorance on hotplug, but does secondary
> process
> need to notify primary about group clearing at all? I mean, doesn't
> primary process
> also get the hotplug event? I would guess secondary process just needs to
> unmap
> the device and that's it, no?
>
>
Uhmm, I think a secondary process can invoke rte_eth_dev_detach, so not
sure what event are you referring to.
I'm afraid I have not done any test regarding secondary processes calling
that function, but being honest, documentations is clear about how unsafe
is to use hotplug API from multiple threads, and I would add same
uncertainty when used from secondary processes.



> > +     }
> > +     return -1;
> >  }
> >
> > -int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
> > +int
> > +vfio_setup_device(const char *sysfs_base, const char *dev_addr,
> >               int *vfio_dev_fd, struct vfio_device_info *device_info)
> >  {
> >       struct vfio_group_status group_status = {
>
> <...>
>
> > +              */
> > +
> > +             RTE_LOG(WARNING, EAL, "Getting a vfio_dev_fd for %s
> > failed\n",
> >                               dev_addr);
> > -             return 1;
> > +             close(vfio_group_fd);
> > +             clear_group(vfio_group_fd);
> > +             return -1;
>
> I've had suspicions that it would be a bug, but I tested it on both modern
> and old kernels (where
> ports on the same NIC are assigned same IOMMU group), and it works as it
> should. I think there wasn't
> a "which kernel driver" EAL PCI code back when this was written, so
> returning 1 was necessary to not fail
> when multiple ports shared the same IOMMU group, but only one of them was
> bound to VFIO.
>
>
Not sure I understand your concern, but I will take your comment as a thumb
up ;-)

    >       }

> >
> >       /* test and setup the device */
> >       ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
> >       if (ret) {
> >               RTE_LOG(ERR, EAL, "  %s cannot get device info, "
> > -                             "error %i (%s)\n", dev_addr, errno,
> > strerror(errno));
>
> <...>
>
> > @@ -155,6 +154,10 @@ struct vfio_iommu_type {
> >  int
> >  vfio_get_group_fd(int iommu_group_no);
> >
> > +/* remove group fd from internal VFIO group fd array */
> > +int
> > +clear_group(int vfio_group_fd);
> > +
> >  /**
> >   * Setup vfio_cfg for the device identified by its address. It discovers
> >   * the configured I/O MMU groups or sets a new one for the device. If a
> new
> > @@ -165,6 +168,9 @@ struct vfio_iommu_type {
> >  int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
> >               int *vfio_dev_fd, struct vfio_device_info *device_info);
> >
> > +int vfio_release_device(const char *sysfs_base, const char *dev_addr,
> int
> > fd);
> > +
> > +int vfio_enable(const char *modname);
> >  int vfio_enable(const char *modname);
>
> Duplicate vfio_enable declaration?
>

Wow. I will fix that. Thanks


>
> >  int vfio_is_enabled(const char *modname);
> >
> > @@ -175,6 +181,7 @@ int vfio_setup_device(const char *sysfs_base, const
> > char *dev_addr,
> >
> >  #define SOCKET_REQ_CONTAINER 0x100
> >  #define SOCKET_REQ_GROUP 0x200
> > +#define SOCKET_CLR_GROUP 0x300
> >  #define SOCKET_OK 0x0
> >  #define SOCKET_NO_FD 0x1
> >  #define SOCKET_ERR 0xFF
>
> Thanks,
> Anatoly
>
>
  
Alejandro Lucero March 24, 2017, 4:39 p.m. UTC | #5
On Thu, Mar 23, 2017 at 12:08 PM, Burakov, Anatoly <
anatoly.burakov@intel.com> wrote:

> Hi Alejandro,
>
> I think the documentation should also be updated to reflect the fact that
> this patch adds VFIO support for hotplug.
>

Yes. Thanks.

I will submit a new version with all the fixes from your previous comments
and adding changes to the docs.


>
> Thanks,
> Anatoly
>
  
Burakov, Anatoly March 24, 2017, 4:56 p.m. UTC | #6
Hi Alejandro,

>Those lines are not error messages. Are you suggesting not splitting lines in the commit comment?


Apologies, that (and other instances) was supposed to mean "snipped" :)
 
> What do you refer to? The TODO line?


No, I was referring to "map BARS" comment in an unmapping function. Should probably say "unmap"?

> When mapping, a BAR with the msix table needs to be handled specifically, doing two mmap calls leaving out the msix table addresses. But although there are two mmap calls, there is just one mmap region (from the process point of view), because the address to use for the second call is just the end of the first mmap. Note this is about the virtual addresses to be used by the app and not the physical addresses really used at the end. So inside the maps array, just one mmap region is saved. Doing the unmap is just a matter of using that maps array and use the maps[x].size accordingly.


Yep, you're right.

> Uhmm, I think a secondary process can invoke rte_eth_dev_detach, so not sure what event are you referring to.

> I'm afraid I have not done any test regarding secondary processes calling that function, but being honest, documentations is clear about how unsafe is to use hotplug API from multiple threads, and I would add same uncertainty when used from secondary processes.


I think hotplug never worked for secondary processes. So really, I'm inclined to leave this out entirely. That would at least make it consistent in its brokenness :) Plus, this doesn't work the other way around (i.e. if a primary process unplugs a NIC, secondary process isn't notified), so we might as well just not bother supporting secondary processes at all. Opinions?
 
> Not sure I understand your concern, but I will take your comment as a thumb up ;-)


Yes, that's a thumbs up :)

Thanks,
Anatoly
  

Patch

diff --git a/lib/librte_eal/common/eal_common_pci.c b/lib/librte_eal/common/eal_common_pci.c
index 72547bd..4278126 100644
--- a/lib/librte_eal/common/eal_common_pci.c
+++ b/lib/librte_eal/common/eal_common_pci.c
@@ -199,7 +199,7 @@  static struct rte_devargs *pci_devargs_lookup(struct rte_pci_device *dev)
 				dev->id.device_id, dr->driver.name);
 
 		if (dr->drv_flags & RTE_PCI_DRV_NEED_MAPPING) {
-			/* map resources for devices that use igb_uio */
+			/* map resources for devices that use igb_uio or VFIO */
 			ret = rte_eal_pci_map_device(dev);
 			if (ret != 0)
 				return ret;
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci.c b/lib/librte_eal/linuxapp/eal/eal_pci.c
index 61d55b9..7d6f50a 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci.c
@@ -121,7 +121,10 @@ 
 	/* try unmapping the NIC resources using VFIO if it exists */
 	switch (dev->kdrv) {
 	case RTE_KDRV_VFIO:
-		RTE_LOG(ERR, EAL, "Hotplug doesn't support vfio yet\n");
+#ifdef VFIO_PRESENT
+		if (pci_vfio_is_enabled())
+			pci_vfio_unmap_resource(dev);
+#endif
 		break;
 	case RTE_KDRV_IGB_UIO:
 	case RTE_KDRV_UIO_GENERIC:
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_init.h b/lib/librte_eal/linuxapp/eal/eal_pci_init.h
index 6a960d1..bb1fabe 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_init.h
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_init.h
@@ -90,6 +90,7 @@  void pci_vfio_ioport_write(struct rte_pci_ioport *p,
 
 /* map VFIO resource prototype */
 int pci_vfio_map_resource(struct rte_pci_device *dev);
+int pci_vfio_unmap_resource(struct rte_pci_device *dev);
 
 #endif
 
diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
index 5f478c5..f02bf53 100644
--- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
@@ -38,6 +38,7 @@ 
 #include <sys/socket.h>
 #include <sys/ioctl.h>
 #include <sys/mman.h>
+#include <stdbool.h>
 
 #include <rte_log.h>
 #include <rte_pci.h>
@@ -172,7 +173,7 @@ 
 
 /* set PCI bus mastering */
 static int
-pci_vfio_set_bus_master(int dev_fd)
+pci_vfio_set_bus_master(int dev_fd, bool op)
 {
 	uint16_t reg;
 	int ret;
@@ -185,8 +186,11 @@ 
 		return -1;
 	}
 
-	/* set the master bit */
-	reg |= PCI_COMMAND_MASTER;
+	if (op)
+		/* set the master bit */
+		reg |= PCI_COMMAND_MASTER;
+	else
+		reg &= ~(PCI_COMMAND_MASTER);
 
 	ret = pwrite64(dev_fd, &reg, sizeof(reg),
 			VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
@@ -517,7 +521,7 @@ 
 		}
 
 		/* set bus mastering for the device */
-		if (pci_vfio_set_bus_master(vfio_dev_fd)) {
+		if (pci_vfio_set_bus_master(vfio_dev_fd, 1)) {
 			RTE_LOG(ERR, EAL, "  %s cannot set up bus mastering!\n", pci_addr);
 			close(vfio_dev_fd);
 			rte_free(vfio_res);
@@ -535,6 +539,76 @@ 
 }
 
 int
+pci_vfio_unmap_resource(struct rte_pci_device *dev)
+{
+	char pci_addr[PATH_MAX] = {0};
+	struct rte_pci_addr *loc = &dev->addr;
+	int i, ret;
+	struct mapped_pci_resource *vfio_res = NULL;
+	struct mapped_pci_res_list *vfio_res_list;
+
+	struct pci_map *maps;
+
+	/* store PCI address string */
+	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
+			loc->domain, loc->bus, loc->devid, loc->function);
+
+
+	if (close(dev->intr_handle.fd) < 0) {
+		RTE_LOG(INFO, EAL, "Error when Closing eventfd file descriptor for %s\n",
+			pci_addr);
+		return -1;
+	}
+
+	if (pci_vfio_set_bus_master(dev->intr_handle.vfio_dev_fd, 0)) {
+		RTE_LOG(ERR, EAL, "  %s cannot unset bus mastering for PCI device!\n",
+				pci_addr);
+		return -1;
+	}
+
+	ret = vfio_release_device(pci_get_sysfs_path(), pci_addr,
+				  dev->intr_handle.vfio_dev_fd);
+	if (ret < 0) {
+		RTE_LOG(ERR, EAL,
+			"%s(): cannot release device\n", __func__);
+		return ret;
+	}
+
+	vfio_res_list = RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
+	/* Get vfio_res */
+	TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
+		if (memcmp(&vfio_res->pci_addr, &dev->addr, sizeof(dev->addr)))
+			continue;
+		break;
+	}
+	/* if we haven't found our tailq entry, something's wrong */
+	if (vfio_res == NULL) {
+		RTE_LOG(ERR, EAL, "  %s cannot find TAILQ entry for PCI device!\n",
+				pci_addr);
+		return -1;
+	}
+
+	/* map BARs */
+	maps = vfio_res->maps;
+
+	RTE_LOG(INFO, EAL, "Releasing pci mapped resource for %s\n",
+		pci_addr);
+	for (i = 0; i < (int) vfio_res->nb_maps; i++) {
+
+		/* TODO: what about the maps offset field? */
+		if (maps[i].addr) {
+			RTE_LOG(INFO, EAL, "Calling pci_unmap_resource for %s at %p\n",
+				pci_addr, maps[i].addr);
+			pci_unmap_resource(maps[i].addr, maps[i].size);
+		}
+	}
+
+	TAILQ_REMOVE(vfio_res_list, vfio_res, next);
+
+	return 0;
+}
+
+int
 pci_vfio_ioport_map(struct rte_pci_device *dev, int bar,
 		    struct rte_pci_ioport *p)
 {
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.c b/lib/librte_eal/linuxapp/eal/eal_vfio.c
index 9377a66..42e0f62 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.c
@@ -68,13 +68,32 @@ 
 {
 	int i;
 	int vfio_group_fd;
+	int group_idx = -1;
 	char filename[PATH_MAX];
 
 	/* check if we already have the group descriptor open */
-	for (i = 0; i < vfio_cfg.vfio_group_idx; i++)
+	for (i = 0; i < VFIO_MAX_GROUPS; i++)
 		if (vfio_cfg.vfio_groups[i].group_no == iommu_group_no)
 			return vfio_cfg.vfio_groups[i].fd;
 
+	/* Lets see first if there is room for a new group */
+	if (vfio_cfg.vfio_active_groups == VFIO_MAX_GROUPS) {
+		RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
+		return -1;
+	}
+
+	/* Now lets get an index for the new group */
+	for (i = 0; i < VFIO_MAX_GROUPS; i++)
+		if (vfio_cfg.vfio_groups[i].group_no == -1) {
+			group_idx = i;
+			break;
+		}
+
+	/* This should not happen */
+	if (group_idx == -1) {
+		RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
+		return -1;
+	}
 	/* if primary, try to open the group */
 	if (internal_config.process_type == RTE_PROC_PRIMARY) {
 		/* try regular group format */
@@ -104,14 +123,9 @@ 
 			/* noiommu group found */
 		}
 
-		/* if the fd is valid, create a new group for it */
-		if (vfio_cfg.vfio_group_idx == VFIO_MAX_GROUPS) {
-			RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
-			close(vfio_group_fd);
-			return -1;
-		}
-		vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = iommu_group_no;
-		vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = vfio_group_fd;
+		vfio_cfg.vfio_groups[group_idx].group_no = iommu_group_no;
+		vfio_cfg.vfio_groups[group_idx].fd = vfio_group_fd;
+		vfio_cfg.vfio_active_groups++;
 		return vfio_group_fd;
 	}
 	/* if we're in a secondary process, request group fd from the primary
@@ -158,14 +172,62 @@ 
 	return -1;
 }
 
-static void
-clear_current_group(void)
+int
+clear_group(int vfio_group_fd)
 {
-	vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = 0;
-	vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = -1;
+	int i;
+	int socket_fd, ret;
+
+	if (internal_config.process_type == RTE_PROC_PRIMARY) {
+
+		for (i = 0; i < VFIO_MAX_GROUPS; i++)
+			if (vfio_cfg.vfio_groups[i].fd == vfio_group_fd) {
+				vfio_cfg.vfio_groups[i].group_no = -1;
+				vfio_cfg.vfio_groups[i].fd = -1;
+				vfio_cfg.vfio_active_groups--;
+				return 0;
+			}
+		return -1;
+	}
+
+	/* This is just for SECONDARY processes */
+	socket_fd = vfio_mp_sync_connect_to_primary();
+
+	if (socket_fd < 0) {
+		RTE_LOG(ERR, EAL, "  cannot connect to primary process!\n");
+		return -1;
+	}
+
+	if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
+		RTE_LOG(ERR, EAL, "  cannot request container fd!\n");
+		close(socket_fd);
+		return -1;
+	}
+
+	if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
+		RTE_LOG(ERR, EAL, "  cannot send group fd!\n");
+		close(socket_fd);
+		return -1;
+	}
+
+	ret = vfio_mp_sync_receive_request(socket_fd);
+	switch (ret) {
+	case SOCKET_NO_FD:
+		RTE_LOG(ERR, EAL, "  BAD VFIO group fd!\n");
+		close(socket_fd);
+		break;
+	case SOCKET_OK:
+		close(socket_fd);
+		return 0;
+	default:
+		RTE_LOG(ERR, EAL, "  UNKNOWN reply, %d\n", ret);
+		close(socket_fd);
+	}
+	return -1;
 }
 
-int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
+int
+vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 		int *vfio_dev_fd, struct vfio_device_info *device_info)
 {
 	struct vfio_group_status group_status = {
@@ -192,18 +254,10 @@  int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 	if (vfio_group_fd < 0)
 		return -1;
 
-	/* store group fd */
-	vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = iommu_group_no;
-	vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = vfio_group_fd;
-
 	/* if group_fd == 0, that means the device isn't managed by VFIO */
 	if (vfio_group_fd == 0) {
-		RTE_LOG(WARNING, EAL, "  %s not managed by VFIO driver, skipping\n",
+		RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n",
 				dev_addr);
-		/* we store 0 as group fd to distinguish between existing but
-		 * unbound VFIO groups, and groups that don't exist at all.
-		 */
-		vfio_cfg.vfio_group_idx++;
 		return 1;
 	}
 
@@ -218,12 +272,12 @@  int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 		RTE_LOG(ERR, EAL, "  %s cannot get group status, "
 				"error %i (%s)\n", dev_addr, errno, strerror(errno));
 		close(vfio_group_fd);
-		clear_current_group();
+		clear_group(vfio_group_fd);
 		return -1;
 	} else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
 		RTE_LOG(ERR, EAL, "  %s VFIO group is not viable!\n", dev_addr);
 		close(vfio_group_fd);
-		clear_current_group();
+		clear_group(vfio_group_fd);
 		return -1;
 	}
 
@@ -237,58 +291,62 @@  int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 			RTE_LOG(ERR, EAL, "  %s cannot add VFIO group to container, "
 					"error %i (%s)\n", dev_addr, errno, strerror(errno));
 			close(vfio_group_fd);
-			clear_current_group();
+			clear_group(vfio_group_fd);
 			return -1;
 		}
-		/*
-		 * at this point we know that this group has been successfully
-		 * initialized, so we increment vfio_group_idx to indicate that we can
-		 * add new groups.
-		 */
-		vfio_cfg.vfio_group_idx++;
 	}
 
 	/*
 	 * pick an IOMMU type and set up DMA mappings for container
 	 *
-	 * needs to be done only once, only when at least one group is assigned to
-	 * a container and only in primary process
+	 * needs to be done only once, only when first group is assigned to
+	 * a container and only in primary process. Note this can happen several
+	 * times with the hotplug functionality.
 	 */
 	if (internal_config.process_type == RTE_PROC_PRIMARY &&
-			vfio_cfg.vfio_container_has_dma == 0) {
+			vfio_cfg.vfio_active_groups == 1) {
 		/* select an IOMMU type which we will be using */
 		const struct vfio_iommu_type *t =
 				vfio_set_iommu_type(vfio_cfg.vfio_container_fd);
 		if (!t) {
 			RTE_LOG(ERR, EAL, "  %s failed to select IOMMU type\n", dev_addr);
+			close(vfio_group_fd);
+			clear_group(vfio_group_fd);
 			return -1;
 		}
 		ret = t->dma_map_func(vfio_cfg.vfio_container_fd);
 		if (ret) {
 			RTE_LOG(ERR, EAL, "  %s DMA remapping failed, "
 					"error %i (%s)\n", dev_addr, errno, strerror(errno));
+			close(vfio_group_fd);
+			clear_group(vfio_group_fd);
 			return -1;
 		}
-		vfio_cfg.vfio_container_has_dma = 1;
 	}
 
 	/* get a file descriptor for the device */
 	*vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr);
 	if (*vfio_dev_fd < 0) {
-		/* if we cannot get a device fd, this simply means that this
-		* particular port is not bound to VFIO
-		*/
-		RTE_LOG(WARNING, EAL, "  %s not managed by VFIO driver, skipping\n",
+		/* if we cannot get a device fd, this implies a problem with
+		 * the VFIO group or the container not having IOMMU configured.
+		 */
+
+		RTE_LOG(WARNING, EAL, "Getting a vfio_dev_fd for %s failed\n",
 				dev_addr);
-		return 1;
+		close(vfio_group_fd);
+		clear_group(vfio_group_fd);
+		return -1;
 	}
 
 	/* test and setup the device */
 	ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
 	if (ret) {
 		RTE_LOG(ERR, EAL, "  %s cannot get device info, "
-				"error %i (%s)\n", dev_addr, errno, strerror(errno));
+				"error %i (%s)\n", dev_addr, errno,
+				strerror(errno));
 		close(*vfio_dev_fd);
+		close(vfio_group_fd);
+		clear_group(vfio_group_fd);
 		return -1;
 	}
 
@@ -296,6 +354,54 @@  int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 }
 
 int
+vfio_release_device(const char *sysfs_base, const char *dev_addr,
+		    int vfio_dev_fd)
+{
+	struct vfio_group_status group_status = {
+			.argsz = sizeof(group_status)
+	};
+	int vfio_group_fd;
+	int iommu_group_no;
+	int ret;
+
+	/* get group number */
+	ret = vfio_get_group_no(sysfs_base, dev_addr, &iommu_group_no);
+	if (ret <= 0) {
+		RTE_LOG(WARNING, EAL, "  %s not managed by VFIO driver\n",
+			dev_addr);
+		/* This is an error at this point. */
+		return -1;
+	}
+
+	/* get the actual group fd */
+	vfio_group_fd = vfio_get_group_fd(iommu_group_no);
+	if (vfio_group_fd <= 0) {
+		RTE_LOG(INFO, EAL, "vfio_get_group_fd failed for %s\n",
+				   dev_addr);
+		return -1;
+	}
+
+	/* At this point we got an active group. Closing it will make the
+	 * container detachment. If this is the last active group, VFIO kernel
+	 * code will unset the container and the IOMMU mappings.
+	 */
+
+	if (close(vfio_group_fd) < 0)
+		RTE_LOG(INFO, EAL, "Error when closing vfio_group_fd for %s\n",
+				   dev_addr);
+
+	if (close(vfio_dev_fd) < 0)
+		RTE_LOG(INFO, EAL, "Error when closing vfio_dev_fd for %s\n",
+				   dev_addr);
+
+	if (clear_group(vfio_group_fd) < 0)
+		RTE_LOG(INFO, EAL, "Error when clearing group for %s\n",
+				   dev_addr);
+
+	return 0;
+}
+
+int
 vfio_enable(const char *modname)
 {
 	/* initialize group list */
@@ -534,7 +640,8 @@  int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 
 		if (ret) {
 			RTE_LOG(ERR, EAL, "  cannot set up DMA remapping, "
-					"error %i (%s)\n", errno, strerror(errno));
+					  "error %i (%s)\n", errno,
+					  strerror(errno));
 			return -1;
 		}
 	}
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio.h b/lib/librte_eal/linuxapp/eal/eal_vfio.h
index ac31a4f..3482c50 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio.h
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio.h
@@ -108,8 +108,7 @@  struct vfio_group {
 struct vfio_config {
 	int vfio_enabled;
 	int vfio_container_fd;
-	int vfio_container_has_dma;
-	int vfio_group_idx;
+	int vfio_active_groups;
 	struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
 };
 
@@ -155,6 +154,10 @@  struct vfio_iommu_type {
 int
 vfio_get_group_fd(int iommu_group_no);
 
+/* remove group fd from internal VFIO group fd array */
+int
+clear_group(int vfio_group_fd);
+
 /**
  * Setup vfio_cfg for the device identified by its address. It discovers
  * the configured I/O MMU groups or sets a new one for the device. If a new
@@ -165,6 +168,9 @@  struct vfio_iommu_type {
 int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 		int *vfio_dev_fd, struct vfio_device_info *device_info);
 
+int vfio_release_device(const char *sysfs_base, const char *dev_addr, int fd);
+
+int vfio_enable(const char *modname);
 int vfio_enable(const char *modname);
 int vfio_is_enabled(const char *modname);
 
@@ -175,6 +181,7 @@  int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
 
 #define SOCKET_REQ_CONTAINER 0x100
 #define SOCKET_REQ_GROUP 0x200
+#define SOCKET_CLR_GROUP 0x300
 #define SOCKET_OK 0x0
 #define SOCKET_NO_FD 0x1
 #define SOCKET_ERR 0xFF
diff --git a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
index fb4a2f8..7e8095c 100644
--- a/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
+++ b/lib/librte_eal/linuxapp/eal/eal_vfio_mp_sync.c
@@ -267,7 +267,7 @@ 
 static __attribute__((noreturn)) void *
 vfio_mp_sync_thread(void __rte_unused * arg)
 {
-	int ret, fd, vfio_group_no;
+	int ret, fd, vfio_data;
 
 	/* wait for requests on the socket */
 	for (;;) {
@@ -305,13 +305,13 @@  static __attribute__((noreturn)) void *
 			break;
 		case SOCKET_REQ_GROUP:
 			/* wait for group number */
-			vfio_group_no = vfio_mp_sync_receive_request(conn_sock);
-			if (vfio_group_no < 0) {
+			vfio_data = vfio_mp_sync_receive_request(conn_sock);
+			if (vfio_data < 0) {
 				close(conn_sock);
 				continue;
 			}
 
-			fd = vfio_get_group_fd(vfio_group_no);
+			fd = vfio_get_group_fd(vfio_data);
 
 			if (fd < 0)
 				vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
@@ -324,6 +324,21 @@  static __attribute__((noreturn)) void *
 				vfio_mp_sync_send_fd(conn_sock, fd);
 			}
 			break;
+		case SOCKET_CLR_GROUP:
+			/* wait for group fd */
+			vfio_data = vfio_mp_sync_receive_request(conn_sock);
+			if (vfio_data < 0) {
+				close(conn_sock);
+				continue;
+			}
+
+			ret = clear_group(vfio_data);
+
+			if (ret < 0)
+				vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
+			else
+				vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
+			break;
 		default:
 			vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
 			break;
diff --git a/lib/librte_ether/rte_ethdev.c b/lib/librte_ether/rte_ethdev.c
index eb0a94a..d844f42 100644
--- a/lib/librte_ether/rte_ethdev.c
+++ b/lib/librte_ether/rte_ethdev.c
@@ -455,8 +455,8 @@  struct rte_eth_dev *
 	case RTE_KDRV_UIO_GENERIC:
 	case RTE_KDRV_NIC_UIO:
 	case RTE_KDRV_NONE:
-		break;
 	case RTE_KDRV_VFIO:
+		break;
 	default:
 		return -ENOTSUP;
 	}