[dpdk-dev] [PATCH] vfio: refactor PCI BAR mapping

Burakov, Anatoly anatoly.burakov at intel.com
Tue Sep 19 13:40:51 CEST 2017


Hi Jonas,

On 17-Aug-17 12:35 PM, Jonas Pfefferle wrote:
> Split pci_vfio_map_resource for primary and secondary processes.
> Save all relevant mapping data in primary process to allow
> the secondary process to perform mappings.
> 
> Signed-off-by: Jonas Pfefferle <jpf at zurich.ibm.com>
> ---
>   lib/librte_eal/common/include/rte_pci.h    |   7 +
>   lib/librte_eal/linuxapp/eal/eal_pci_vfio.c | 447 +++++++++++++++++------------
>   2 files changed, 271 insertions(+), 183 deletions(-)
> 
> diff --git a/lib/librte_eal/common/include/rte_pci.h b/lib/librte_eal/common/include/rte_pci.h
> index 8b12339..0821af9 100644
> --- a/lib/librte_eal/common/include/rte_pci.h
> +++ b/lib/librte_eal/common/include/rte_pci.h
> @@ -214,6 +214,12 @@ struct pci_map {
>   	uint64_t phaddr;
>   };
>   
> +struct pci_msix_table {
> +	int bar_index;
> +	uint32_t offset;
> +	uint32_t size;
> +};
> +
>   /**
>    * A structure describing a mapped PCI resource.
>    * For multi-process we need to reproduce all PCI mappings in secondary
> @@ -226,6 +232,7 @@ struct mapped_pci_resource {
>   	char path[PATH_MAX];
>   	int nb_maps;
>   	struct pci_map maps[PCI_MAX_RESOURCE];
> +	struct pci_msix_table msix_table;
>   };
>   
>   /** mapped pci device list */
> diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> index aa9d96e..f37552a 100644
> --- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> @@ -88,8 +88,7 @@ pci_vfio_write_config(const struct rte_intr_handle *intr_handle,
>   
>   /* get PCI BAR number where MSI-X interrupts are */
>   static int
> -pci_vfio_get_msix_bar(int fd, int *msix_bar, uint32_t *msix_table_offset,
> -		      uint32_t *msix_table_size)
> +pci_vfio_get_msix_bar(int fd, struct pci_msix_table *msix_table)
>   {
>   	int ret;
>   	uint32_t reg;
> @@ -161,9 +160,10 @@ pci_vfio_get_msix_bar(int fd, int *msix_bar, uint32_t *msix_table_offset,
>   				return -1;
>   			}
>   
> -			*msix_bar = reg & RTE_PCI_MSIX_TABLE_BIR;
> -			*msix_table_offset = reg & RTE_PCI_MSIX_TABLE_OFFSET;
> -			*msix_table_size = 16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE));
> +			msix_table->bar_index = reg & RTE_PCI_MSIX_TABLE_BIR;
> +			msix_table->offset = reg & RTE_PCI_MSIX_TABLE_OFFSET;
> +			msix_table->size =
> +				16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE));
>   
>   			return 0;
>   		}
> @@ -300,25 +300,152 @@ pci_vfio_setup_interrupts(struct rte_pci_device *dev, int vfio_dev_fd)
>   	return -1;
>   }
>   
> -/*
> - * map the PCI resources of a PCI device in virtual memory (VFIO version).
> - * primary and secondary processes follow almost exactly the same path
> - */
> -int
> -pci_vfio_map_resource(struct rte_pci_device *dev)
> +static int
> +pci_vfio_is_ioport_bar(int vfio_dev_fd, int bar_index)
> +{
> +	uint32_t ioport_bar;
> +	int ret;
> +
> +	ret = pread64(vfio_dev_fd, &ioport_bar, sizeof(ioport_bar),
> +			  VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX)
> +			  + PCI_BASE_ADDRESS_0 + bar_index*4);
> +	if (ret != sizeof(ioport_bar)) {
> +		RTE_LOG(ERR, EAL, "Cannot read command (%x) from config space!\n",
> +			PCI_BASE_ADDRESS_0 + bar_index*4);
> +		return -1;
> +	}
> +
> +	if (ioport_bar & PCI_BASE_ADDRESS_SPACE_IO) {
> +		RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d) addr: %x\n",
> +			 bar_index, ioport_bar);

This log message should probably go to the "continue" portion of the 
calling code, it looks out of place here.

> +		return 1;
> +	}
> +	return 0;
> +}
> +
> +static int
> +pci_vfio_setup_device(struct rte_pci_device *dev, int vfio_dev_fd)
> +{
> +	if (pci_vfio_setup_interrupts(dev, vfio_dev_fd) != 0) {
> +		RTE_LOG(ERR, EAL, "Error setting up interrupts!\n");
> +		return -1;
> +	}
> +
> +	/* set bus mastering for the device */
> +	if (pci_vfio_set_bus_master(vfio_dev_fd, true)) {
> +		RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n");
> +		return -1;
> +	}
> +
> +	/* Reset the device */
> +	ioctl(vfio_dev_fd, VFIO_DEVICE_RESET);
> +
> +	return 0;
> +}
> +
> +static int
> +pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource *vfio_res,
> +		int bar_index, int additional_flags)
> +{
> +	struct memreg {
> +		unsigned long offset, size;
> +	} memreg[2] = {};
> +	void *bar_addr;
> +	struct pci_msix_table *msix_table = &vfio_res->msix_table;
> +	struct pci_map *bar = &vfio_res->maps[bar_index];
> +
> +	if (bar->size == 0)
> +		/* Skip this BAR */
> +		return 0;
> +
> +	if (msix_table->bar_index == bar_index) {
> +		/*
> +		 * VFIO will not let us map the MSI-X table,
> +		 * but we can map around it.
> +		 */
> +		uint32_t table_start = msix_table->offset;
> +		uint32_t table_end = table_start + msix_table->size;
> +		table_end = (table_end + ~PAGE_MASK) & PAGE_MASK;
> +		table_start &= PAGE_MASK;
> +
> +		if (table_start == 0 && table_end >= bar->size) {
> +			/* Cannot map this BAR */
> +			RTE_LOG(DEBUG, EAL, "Skipping BAR%d\n", bar_index);
> +			bar->size = 0;
> +			bar->addr = 0;
> +			return 0;
> +		}
> +
> +		memreg[0].offset = bar->offset;
> +		memreg[0].size = table_start;
> +		memreg[1].offset = bar->offset + table_end;
> +		memreg[1].size = bar->size - table_end;
> +
> +		RTE_LOG(DEBUG, EAL,
> +			"Trying to map BAR%d that contains the MSI-X "
> +			"table. Trying offsets: "
> +			"0x%04lx:0x%04lx, 0x%04lx:0x%04lx\n", bar_index,
> +			memreg[0].offset, memreg[0].size,
> +			memreg[1].offset, memreg[1].size);
> +	}

I believe you forgot the "else" part. memreg is, by default, initialized 
to zeroes, and if bar_index is not equal to MSI-X bar index, memreg does 
not get filled with any values, and therefore all of the following 
checks for memreg.size etc. will return false and you'll end up with 
failed BAR mappings.

Confirmed with testing:

EAL: PCI device 0000:08:00.0 on NUMA socket 0
EAL:   probe driver: 8086:10fb net_ixgbe
EAL:   using IOMMU type 1 (Type 1)
EAL: Failed to map pci BAR0
EAL:   0000:08:00.0 mapping BAR0 failed: Success
EAL: Requested device 0000:08:00.0 cannot be used

--
Thanks,
Anatoly


More information about the dev mailing list