@@ -35,5 +35,6 @@ DIRS-$(CONFIG_RTE_TEST_PMD) += test-pmd
DIRS-$(CONFIG_RTE_APP_CRYPTO_PERF) += test-crypto-perf
DIRS-$(CONFIG_RTE_EXEC_ENV_LINUXAPP) += proc_info
DIRS-$(CONFIG_RTE_LIBRTE_PDUMP) += pdump
+DIRS-y += multivf
include $(RTE_SDK)/mk/rte.subdir.mk
@@ -102,7 +102,7 @@ Limitations
* The framework can only be enabled with Linux. BSD is not supported.
* To detach a port, the port should be backed by a device that igb_uio
- manages. VFIO is not supported.
+ or VFIO manages.
* Not all PMDs support detaching feature.
To know whether a PMD can support detaching, search for the
@@ -49,6 +49,10 @@ New Features
sPAPR IOMMU based pci probing enabled for vfio-pci devices.
+* **Added VFIO hotplug support.**
+
+ Hotplug is now supported with both UIO and VFIO drivers.
+
Resolved Issues
---------------
@@ -199,7 +199,7 @@ static struct rte_devargs *pci_devargs_lookup(struct rte_pci_device *dev)
dev->id.device_id, dr->driver.name);
if (dr->drv_flags & RTE_PCI_DRV_NEED_MAPPING) {
- /* map resources for devices that use igb_uio */
+ /* map resources for devices that use igb_uio or VFIO */
ret = rte_eal_pci_map_device(dev);
if (ret != 0)
return ret;
@@ -121,7 +121,10 @@
/* try unmapping the NIC resources using VFIO if it exists */
switch (dev->kdrv) {
case RTE_KDRV_VFIO:
- RTE_LOG(ERR, EAL, "Hotplug doesn't support vfio yet\n");
+#ifdef VFIO_PRESENT
+ if (pci_vfio_is_enabled())
+ pci_vfio_unmap_resource(dev);
+#endif
break;
case RTE_KDRV_IGB_UIO:
case RTE_KDRV_UIO_GENERIC:
@@ -88,8 +88,9 @@ void pci_vfio_ioport_write(struct rte_pci_ioport *p,
const void *data, size_t len, off_t offset);
int pci_vfio_ioport_unmap(struct rte_pci_ioport *p);
-/* map VFIO resource prototype */
+/* map/unmap VFIO resource prototype */
int pci_vfio_map_resource(struct rte_pci_device *dev);
+int pci_vfio_unmap_resource(struct rte_pci_device *dev);
#endif
@@ -38,6 +38,7 @@
#include <sys/socket.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
+#include <stdbool.h>
#include <rte_log.h>
#include <rte_pci.h>
@@ -172,7 +173,7 @@
/* set PCI bus mastering */
static int
-pci_vfio_set_bus_master(int dev_fd)
+pci_vfio_set_bus_master(int dev_fd, bool op)
{
uint16_t reg;
int ret;
@@ -185,8 +186,11 @@
return -1;
}
- /* set the master bit */
- reg |= PCI_COMMAND_MASTER;
+ if (op)
+ /* set the master bit */
+ reg |= PCI_COMMAND_MASTER;
+ else
+ reg &= ~(PCI_COMMAND_MASTER);
ret = pwrite64(dev_fd, &reg, sizeof(reg),
VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX) +
@@ -517,7 +521,7 @@
}
/* set bus mastering for the device */
- if (pci_vfio_set_bus_master(vfio_dev_fd)) {
+ if (pci_vfio_set_bus_master(vfio_dev_fd, true)) {
RTE_LOG(ERR, EAL, " %s cannot set up bus mastering!\n", pci_addr);
close(vfio_dev_fd);
rte_free(vfio_res);
@@ -535,6 +539,93 @@
}
+/*
+ * Counterpart of pci_vfio_map_resource(), used when a port is detached.
+ *
+ * Closes the interrupt eventfd, clears the bus-master bit, releases the
+ * VFIO device through vfio_release_device(), unmaps every BAR recorded in
+ * the device's tailq entry and finally unlinks and frees that entry.
+ *
+ * Returns 0 on success, or a negative value as soon as one step fails
+ * (steps already performed are not rolled back).
+ */
int
+pci_vfio_unmap_resource(struct rte_pci_device *dev)
+{
+	char pci_addr[PATH_MAX] = {0};
+	struct rte_pci_addr *loc = &dev->addr;
+	int i, ret;
+	struct mapped_pci_resource *vfio_res = NULL;
+	struct mapped_pci_res_list *vfio_res_list;
+	struct pci_map *maps;
+
+	/* store PCI address string */
+	snprintf(pci_addr, sizeof(pci_addr), PCI_PRI_FMT,
+			loc->domain, loc->bus, loc->devid, loc->function);
+
+	if (close(dev->intr_handle.fd) < 0) {
+		RTE_LOG(INFO, EAL, "Error when closing eventfd file descriptor for %s\n",
+			pci_addr);
+		return -1;
+	}
+
+	/* op == false clears PCI_COMMAND_MASTER in the command register */
+	if (pci_vfio_set_bus_master(dev->intr_handle.vfio_dev_fd, false)) {
+		RTE_LOG(ERR, EAL, " %s cannot unset bus mastering for PCI device!\n",
+				pci_addr);
+		return -1;
+	}
+
+	ret = vfio_release_device(pci_get_sysfs_path(), pci_addr,
+				  dev->intr_handle.vfio_dev_fd);
+	if (ret < 0) {
+		RTE_LOG(ERR, EAL,
+			"%s(): cannot release device\n", __func__);
+		return ret;
+	}
+
+	vfio_res_list = RTE_TAILQ_CAST(rte_vfio_tailq.head, mapped_pci_res_list);
+	/* Get vfio_res */
+	TAILQ_FOREACH(vfio_res, vfio_res_list, next) {
+		if (memcmp(&vfio_res->pci_addr, &dev->addr, sizeof(dev->addr)))
+			continue;
+		break;
+	}
+	/* if we haven't found our tailq entry, something's wrong */
+	if (vfio_res == NULL) {
+		RTE_LOG(ERR, EAL, " %s cannot find TAILQ entry for PCI device!\n",
+				pci_addr);
+		return -1;
+	}
+
+	/* unmap BARs */
+	maps = vfio_res->maps;
+
+	RTE_LOG(INFO, EAL, "Releasing pci mapped resource for %s\n",
+		pci_addr);
+	for (i = 0; i < (int) vfio_res->nb_maps; i++) {
+
+		/*
+		 * We do not need to be aware of MSI-X table BAR mappings as
+		 * when mapping. Just using current maps array is enough
+		 */
+		if (maps[i].addr) {
+			RTE_LOG(INFO, EAL, "Calling pci_unmap_resource for %s at %p\n",
+				pci_addr, maps[i].addr);
+			pci_unmap_resource(maps[i].addr, maps[i].size);
+		}
+	}
+
+	TAILQ_REMOVE(vfio_res_list, vfio_res, next);
+	/*
+	 * The entry was unlinked above and is not referenced again here;
+	 * free it so that detaching a device does not leak it.
+	 */
+	rte_free(vfio_res);
+
+	return 0;
+}
+
+int
pci_vfio_ioport_map(struct rte_pci_device *dev, int bar,
struct rte_pci_ioport *p)
{
@@ -68,13 +68,32 @@
{
int i;
int vfio_group_fd;
+ int group_idx = -1;
char filename[PATH_MAX];
/* check if we already have the group descriptor open */
- for (i = 0; i < vfio_cfg.vfio_group_idx; i++)
+ for (i = 0; i < VFIO_MAX_GROUPS; i++)
if (vfio_cfg.vfio_groups[i].group_no == iommu_group_no)
return vfio_cfg.vfio_groups[i].fd;
+ /* Lets see first if there is room for a new group */
+ if (vfio_cfg.vfio_active_groups == VFIO_MAX_GROUPS) {
+ RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
+ return -1;
+ }
+
+ /* Now lets get an index for the new group */
+ for (i = 0; i < VFIO_MAX_GROUPS; i++)
+ if (vfio_cfg.vfio_groups[i].group_no == -1) {
+ group_idx = i;
+ break;
+ }
+
+ /* This should not happen */
+ if (group_idx == -1) {
+ RTE_LOG(ERR, EAL, "No VFIO group free slot found\n");
+ return -1;
+ }
/* if primary, try to open the group */
if (internal_config.process_type == RTE_PROC_PRIMARY) {
/* try regular group format */
@@ -104,14 +123,9 @@
/* noiommu group found */
}
- /* if the fd is valid, create a new group for it */
- if (vfio_cfg.vfio_group_idx == VFIO_MAX_GROUPS) {
- RTE_LOG(ERR, EAL, "Maximum number of VFIO groups reached!\n");
- close(vfio_group_fd);
- return -1;
- }
- vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = iommu_group_no;
- vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = vfio_group_fd;
+ vfio_cfg.vfio_groups[group_idx].group_no = iommu_group_no;
+ vfio_cfg.vfio_groups[group_idx].fd = vfio_group_fd;
+ vfio_cfg.vfio_active_groups++;
return vfio_group_fd;
}
/* if we're in a secondary process, request group fd from the primary
@@ -158,14 +172,80 @@
return -1;
}
-static void
-clear_current_group(void)
+/*
+ * Forget a VFIO group: mark the vfio_cfg slot that holds vfio_group_fd as
+ * free again. The descriptor itself is not closed here.
+ *
+ * In the primary process the slot is looked up and reset directly. A
+ * secondary process asks the primary to do so with a SOCKET_CLR_GROUP
+ * request followed by the fd value, then waits for the reply.
+ *
+ * NOTE(review): the secondary sends its own descriptor number and the
+ * primary compares it with the numbers stored in its vfio_cfg; confirm
+ * that these are comparable across processes.
+ *
+ * Returns 0 when the slot was cleared, -1 otherwise.
+ */
+int
+clear_group(int vfio_group_fd)
{
- vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = 0;
- vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = -1;
+	int i;
+	int socket_fd, ret;
+
+	if (internal_config.process_type == RTE_PROC_PRIMARY) {
+		for (i = 0; i < VFIO_MAX_GROUPS; i++) {
+			if (vfio_cfg.vfio_groups[i].fd != vfio_group_fd)
+				continue;
+			/* group_no == -1 marks the slot as free */
+			vfio_cfg.vfio_groups[i].group_no = -1;
+			vfio_cfg.vfio_groups[i].fd = -1;
+			vfio_cfg.vfio_active_groups--;
+			return 0;
+		}
+		return -1;
+	}
+
+	/* This is just for SECONDARY processes */
+	socket_fd = vfio_mp_sync_connect_to_primary();
+	if (socket_fd < 0) {
+		RTE_LOG(ERR, EAL, " cannot connect to primary process!\n");
+		return -1;
+	}
+
+	if (vfio_mp_sync_send_request(socket_fd, SOCKET_CLR_GROUP) < 0) {
+		RTE_LOG(ERR, EAL, " cannot request group clear!\n");
+		close(socket_fd);
+		return -1;
+	}
+
+	if (vfio_mp_sync_send_request(socket_fd, vfio_group_fd) < 0) {
+		RTE_LOG(ERR, EAL, " cannot send group fd!\n");
+		close(socket_fd);
+		return -1;
+	}
+
+	ret = vfio_mp_sync_receive_request(socket_fd);
+	/* the reply is the last message of the exchange */
+	close(socket_fd);
+
+	switch (ret) {
+	case SOCKET_OK:
+		return 0;
+	case SOCKET_NO_FD:
+		RTE_LOG(ERR, EAL, " BAD VFIO group fd!\n");
+		break;
+	case SOCKET_ERR:
+		RTE_LOG(ERR, EAL, " Socket error\n");
+		break;
+	default:
+		RTE_LOG(ERR, EAL, " UNKNOWN reply, %d\n", ret);
+		break;
+	}
+	return -1;
}
-int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
+int
+vfio_setup_device(const char *sysfs_base, const char *dev_addr,
int *vfio_dev_fd, struct vfio_device_info *device_info)
{
struct vfio_group_status group_status = {
@@ -192,18 +258,10 @@ int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
if (vfio_group_fd < 0)
return -1;
- /* store group fd */
- vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].group_no = iommu_group_no;
- vfio_cfg.vfio_groups[vfio_cfg.vfio_group_idx].fd = vfio_group_fd;
-
/* if group_fd == 0, that means the device isn't managed by VFIO */
if (vfio_group_fd == 0) {
- RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n",
+ RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n",
dev_addr);
- /* we store 0 as group fd to distinguish between existing but
- * unbound VFIO groups, and groups that don't exist at all.
- */
- vfio_cfg.vfio_group_idx++;
return 1;
}
@@ -218,12 +276,12 @@ int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
RTE_LOG(ERR, EAL, " %s cannot get group status, "
"error %i (%s)\n", dev_addr, errno, strerror(errno));
close(vfio_group_fd);
- clear_current_group();
+ clear_group(vfio_group_fd);
return -1;
} else if (!(group_status.flags & VFIO_GROUP_FLAGS_VIABLE)) {
RTE_LOG(ERR, EAL, " %s VFIO group is not viable!\n", dev_addr);
close(vfio_group_fd);
- clear_current_group();
+ clear_group(vfio_group_fd);
return -1;
}
@@ -237,58 +295,62 @@ int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
RTE_LOG(ERR, EAL, " %s cannot add VFIO group to container, "
"error %i (%s)\n", dev_addr, errno, strerror(errno));
close(vfio_group_fd);
- clear_current_group();
+ clear_group(vfio_group_fd);
return -1;
}
- /*
- * at this point we know that this group has been successfully
- * initialized, so we increment vfio_group_idx to indicate that we can
- * add new groups.
- */
- vfio_cfg.vfio_group_idx++;
}
/*
* pick an IOMMU type and set up DMA mappings for container
*
- * needs to be done only once, only when at least one group is assigned to
- * a container and only in primary process
+ * needs to be done only once, only when first group is assigned to
+ * a container and only in primary process. Note this can happen several
+ * times with the hotplug functionality.
*/
if (internal_config.process_type == RTE_PROC_PRIMARY &&
- vfio_cfg.vfio_container_has_dma == 0) {
+ vfio_cfg.vfio_active_groups == 1) {
/* select an IOMMU type which we will be using */
const struct vfio_iommu_type *t =
vfio_set_iommu_type(vfio_cfg.vfio_container_fd);
if (!t) {
RTE_LOG(ERR, EAL, " %s failed to select IOMMU type\n", dev_addr);
+ close(vfio_group_fd);
+ clear_group(vfio_group_fd);
return -1;
}
ret = t->dma_map_func(vfio_cfg.vfio_container_fd);
if (ret) {
RTE_LOG(ERR, EAL, " %s DMA remapping failed, "
"error %i (%s)\n", dev_addr, errno, strerror(errno));
+ close(vfio_group_fd);
+ clear_group(vfio_group_fd);
return -1;
}
- vfio_cfg.vfio_container_has_dma = 1;
}
/* get a file descriptor for the device */
*vfio_dev_fd = ioctl(vfio_group_fd, VFIO_GROUP_GET_DEVICE_FD, dev_addr);
if (*vfio_dev_fd < 0) {
- /* if we cannot get a device fd, this simply means that this
- * particular port is not bound to VFIO
- */
- RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver, skipping\n",
+ /* if we cannot get a device fd, this implies a problem with
+ * the VFIO group or the container not having IOMMU configured.
+ */
+
+ RTE_LOG(WARNING, EAL, "Getting a vfio_dev_fd for %s failed\n",
dev_addr);
- return 1;
+ close(vfio_group_fd);
+ clear_group(vfio_group_fd);
+ return -1;
}
/* test and setup the device */
ret = ioctl(*vfio_dev_fd, VFIO_DEVICE_GET_INFO, device_info);
if (ret) {
RTE_LOG(ERR, EAL, " %s cannot get device info, "
- "error %i (%s)\n", dev_addr, errno, strerror(errno));
+ "error %i (%s)\n", dev_addr, errno,
+ strerror(errno));
close(*vfio_dev_fd);
+ close(vfio_group_fd);
+ clear_group(vfio_group_fd);
return -1;
}
@@ -296,6 +358,63 @@ int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
}
+/*
+ * Release what vfio_setup_device() acquired for one device: close the fd of
+ * the IOMMU group the device belongs to, close the device fd and drop the
+ * group from the internal group table.
+ *
+ * NOTE(review): the group is closed and cleared unconditionally; if several
+ * attached devices can share one IOMMU group, confirm that releasing one of
+ * them must not tear the group down for the others.
+ *
+ * Returns 0 once the group was resolved (close/clear failures are only
+ * logged), -1 when the group number or group fd cannot be obtained.
+ */
int
+vfio_release_device(const char *sysfs_base, const char *dev_addr,
+		int vfio_dev_fd)
+{
+	int vfio_group_fd;
+	int iommu_group_no;
+	int ret;
+
+	/* get group number */
+	ret = vfio_get_group_no(sysfs_base, dev_addr, &iommu_group_no);
+	if (ret <= 0) {
+		RTE_LOG(WARNING, EAL, " %s not managed by VFIO driver\n",
+			dev_addr);
+		/* This is an error at this point. */
+		return -1;
+	}
+
+	/* get the actual group fd */
+	vfio_group_fd = vfio_get_group_fd(iommu_group_no);
+	if (vfio_group_fd <= 0) {
+		RTE_LOG(INFO, EAL, "vfio_get_group_fd failed for %s\n",
+			dev_addr);
+		return -1;
+	}
+
+	/* At this point we got an active group. Closing it will make the
+	 * container detachment. If this is the last active group, VFIO kernel
+	 * code will unset the container and the IOMMU mappings.
+	 */
+
+	if (close(vfio_group_fd) < 0)
+		RTE_LOG(INFO, EAL, "Error when closing vfio_group_fd for %s\n",
+			dev_addr);
+
+	if (close(vfio_dev_fd) < 0)
+		RTE_LOG(INFO, EAL, "Error when closing vfio_dev_fd for %s\n",
+			dev_addr);
+
+	if (clear_group(vfio_group_fd) < 0)
+		RTE_LOG(INFO, EAL, "Error when clearing group for %s\n",
+			dev_addr);
+
+	return 0;
+}
+
+int
vfio_enable(const char *modname)
{
/* initialize group list */
@@ -534,7 +644,8 @@ int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
if (ret) {
RTE_LOG(ERR, EAL, " cannot set up DMA remapping, "
- "error %i (%s)\n", errno, strerror(errno));
+ "error %i (%s)\n", errno,
+ strerror(errno));
return -1;
}
}
@@ -108,8 +108,7 @@ struct vfio_group {
struct vfio_config {
int vfio_enabled;
int vfio_container_fd;
- int vfio_container_has_dma;
- int vfio_group_idx;
+ int vfio_active_groups;
struct vfio_group vfio_groups[VFIO_MAX_GROUPS];
};
@@ -155,6 +154,10 @@ struct vfio_iommu_type {
int
vfio_get_group_fd(int iommu_group_no);
+/* remove group fd from internal VFIO group fd array */
+int
+clear_group(int vfio_group_fd);
+
/**
* Setup vfio_cfg for the device identified by its address. It discovers
* the configured I/O MMU groups or sets a new one for the device. If a new
@@ -165,6 +168,8 @@ struct vfio_iommu_type {
int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
int *vfio_dev_fd, struct vfio_device_info *device_info);
+int vfio_release_device(const char *sysfs_base, const char *dev_addr, int fd);
+
int vfio_enable(const char *modname);
int vfio_is_enabled(const char *modname);
@@ -175,6 +180,7 @@ int vfio_setup_device(const char *sysfs_base, const char *dev_addr,
#define SOCKET_REQ_CONTAINER 0x100
#define SOCKET_REQ_GROUP 0x200
+#define SOCKET_CLR_GROUP 0x300
#define SOCKET_OK 0x0
#define SOCKET_NO_FD 0x1
#define SOCKET_ERR 0xFF
@@ -267,7 +267,7 @@
static __attribute__((noreturn)) void *
vfio_mp_sync_thread(void __rte_unused * arg)
{
- int ret, fd, vfio_group_no;
+ int ret, fd, vfio_data;
/* wait for requests on the socket */
for (;;) {
@@ -305,13 +305,13 @@ static __attribute__((noreturn)) void *
break;
case SOCKET_REQ_GROUP:
/* wait for group number */
- vfio_group_no = vfio_mp_sync_receive_request(conn_sock);
- if (vfio_group_no < 0) {
+ vfio_data = vfio_mp_sync_receive_request(conn_sock);
+ if (vfio_data < 0) {
close(conn_sock);
continue;
}
- fd = vfio_get_group_fd(vfio_group_no);
+ fd = vfio_get_group_fd(vfio_data);
if (fd < 0)
vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
@@ -324,6 +324,21 @@ static __attribute__((noreturn)) void *
vfio_mp_sync_send_fd(conn_sock, fd);
}
break;
+ case SOCKET_CLR_GROUP:
+ /* wait for group fd */
+ vfio_data = vfio_mp_sync_receive_request(conn_sock);
+ if (vfio_data < 0) {
+ close(conn_sock);
+ continue;
+ }
+
+ ret = clear_group(vfio_data);
+
+ if (ret < 0)
+ vfio_mp_sync_send_request(conn_sock, SOCKET_NO_FD);
+ else
+ vfio_mp_sync_send_request(conn_sock, SOCKET_OK);
+ break;
default:
vfio_mp_sync_send_request(conn_sock, SOCKET_ERR);
break;
@@ -455,8 +455,8 @@ struct rte_eth_dev *
case RTE_KDRV_UIO_GENERIC:
case RTE_KDRV_NIC_UIO:
case RTE_KDRV_NONE:
- break;
case RTE_KDRV_VFIO:
+ break;
default:
return -ENOTSUP;
}