[dpdk-dev] [PATCH] vfio: refactor PCI BAR mapping
Jonas Pfefferle1
JPF at zurich.ibm.com
Wed Sep 20 16:38:47 CEST 2017
Hi Anatoly,
"Burakov, Anatoly" <anatoly.burakov at intel.com> wrote on 09/19/2017 01:40:51 PM:
> From: "Burakov, Anatoly" <anatoly.burakov at intel.com>
> To: Jonas Pfefferle <jpf at zurich.ibm.com>, dev at dpdk.org
> Date: 09/19/2017 01:41 PM
> Subject: Re: [dpdk-dev] [PATCH] vfio: refactor PCI BAR mapping
>
> Hi Jonas,
>
> On 17-Aug-17 12:35 PM, Jonas Pfefferle wrote:
> > Split pci_vfio_map_resource for primary and secondary processes.
> > Save all relevant mapping data in primary process to allow
> > the secondary process to perform mappings.
> >
> > Signed-off-by: Jonas Pfefferle <jpf at zurich.ibm.com>
> > ---
> > lib/librte_eal/common/include/rte_pci.h | 7 +
> > lib/librte_eal/linuxapp/eal/eal_pci_vfio.c | 447 +++++++++++++++
> ++------------
> > 2 files changed, 271 insertions(+), 183 deletions(-)
> >
> > diff --git a/lib/librte_eal/common/include/rte_pci.h b/lib/
> librte_eal/common/include/rte_pci.h
> > index 8b12339..0821af9 100644
> > --- a/lib/librte_eal/common/include/rte_pci.h
> > +++ b/lib/librte_eal/common/include/rte_pci.h
> > @@ -214,6 +214,12 @@ struct pci_map {
> > uint64_t phaddr;
> > };
> >
> > +struct pci_msix_table {
> > + int bar_index;
> > + uint32_t offset;
> > + uint32_t size;
> > +};
> > +
> > /**
> > * A structure describing a mapped PCI resource.
> > * For multi-process we need to reproduce all PCI mappings in
secondary
> > @@ -226,6 +232,7 @@ struct mapped_pci_resource {
> > char path[PATH_MAX];
> > int nb_maps;
> > struct pci_map maps[PCI_MAX_RESOURCE];
> > + struct pci_msix_table msix_table;
> > };
> >
> > /** mapped pci device list */
> > diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/
> librte_eal/linuxapp/eal/eal_pci_vfio.c
> > index aa9d96e..f37552a 100644
> > --- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> > +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> > @@ -88,8 +88,7 @@ pci_vfio_write_config(const struct
> rte_intr_handle *intr_handle,
> >
> > /* get PCI BAR number where MSI-X interrupts are */
> > static int
> > -pci_vfio_get_msix_bar(int fd, int *msix_bar, uint32_t
*msix_table_offset,
> > - uint32_t *msix_table_size)
> > +pci_vfio_get_msix_bar(int fd, struct pci_msix_table *msix_table)
> > {
> > int ret;
> > uint32_t reg;
> > @@ -161,9 +160,10 @@ pci_vfio_get_msix_bar(int fd, int *msix_bar,
> uint32_t *msix_table_offset,
> > return -1;
> > }
> >
> > - *msix_bar = reg & RTE_PCI_MSIX_TABLE_BIR;
> > - *msix_table_offset = reg & RTE_PCI_MSIX_TABLE_OFFSET;
> > - *msix_table_size = 16 * (1 + (flags &
RTE_PCI_MSIX_FLAGS_QSIZE));
> > + msix_table->bar_index = reg & RTE_PCI_MSIX_TABLE_BIR;
> > + msix_table->offset = reg & RTE_PCI_MSIX_TABLE_OFFSET;
> > + msix_table->size =
> > + 16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE));
> >
> > return 0;
> > }
> > @@ -300,25 +300,152 @@ pci_vfio_setup_interrupts(struct
> rte_pci_device *dev, int vfio_dev_fd)
> > return -1;
> > }
> >
> > -/*
> > - * map the PCI resources of a PCI device in virtual memory (VFIO
version).
> > - * primary and secondary processes follow almost exactly the same path
> > - */
> > -int
> > -pci_vfio_map_resource(struct rte_pci_device *dev)
> > +static int
> > +pci_vfio_is_ioport_bar(int vfio_dev_fd, int bar_index)
> > +{
> > + uint32_t ioport_bar;
> > + int ret;
> > +
> > + ret = pread64(vfio_dev_fd, &ioport_bar, sizeof(ioport_bar),
> > + VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX)
> > + + PCI_BASE_ADDRESS_0 + bar_index*4);
> > + if (ret != sizeof(ioport_bar)) {
> > + RTE_LOG(ERR, EAL, "Cannot read command (%x) from config
space!\n",
> > + PCI_BASE_ADDRESS_0 + bar_index*4);
> > + return -1;
> > + }
> > +
> > + if (ioport_bar & PCI_BASE_ADDRESS_SPACE_IO) {
> > + RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d) addr: %x\n",
> > + bar_index, ioport_bar);
>
> This log message should probably go to the "continue" portion of the
> calling code, it looks out of place here.
Agreed. I will move it.
>
> > + return 1;
> > + }
> > + return 0;
> > +}
> > +
> > +static int
> > +pci_vfio_setup_device(struct rte_pci_device *dev, int vfio_dev_fd)
> > +{
> > + if (pci_vfio_setup_interrupts(dev, vfio_dev_fd) != 0) {
> > + RTE_LOG(ERR, EAL, "Error setting up interrupts!\n");
> > + return -1;
> > + }
> > +
> > + /* set bus mastering for the device */
> > + if (pci_vfio_set_bus_master(vfio_dev_fd, true)) {
> > + RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n");
> > + return -1;
> > + }
> > +
> > + /* Reset the device */
> > + ioctl(vfio_dev_fd, VFIO_DEVICE_RESET);
> > +
> > + return 0;
> > +}
> > +
> > +static int
> > +pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource
*vfio_res,
> > + int bar_index, int additional_flags)
> > +{
> > + struct memreg {
> > + unsigned long offset, size;
> > + } memreg[2] = {};
> > + void *bar_addr;
> > + struct pci_msix_table *msix_table = &vfio_res->msix_table;
> > + struct pci_map *bar = &vfio_res->maps[bar_index];
> > +
> > + if (bar->size == 0)
> > + /* Skip this BAR */
> > + return 0;
> > +
> > + if (msix_table->bar_index == bar_index) {
> > + /*
> > + * VFIO will not let us map the MSI-X table,
> > + * but we can map around it.
> > + */
> > + uint32_t table_start = msix_table->offset;
> > + uint32_t table_end = table_start + msix_table->size;
> > + table_end = (table_end + ~PAGE_MASK) & PAGE_MASK;
> > + table_start &= PAGE_MASK;
> > +
> > + if (table_start == 0 && table_end >= bar->size) {
> > + /* Cannot map this BAR */
> > + RTE_LOG(DEBUG, EAL, "Skipping BAR%d\n", bar_index);
> > + bar->size = 0;
> > + bar->addr = 0;
> > + return 0;
> > + }
> > +
> > + memreg[0].offset = bar->offset;
> > + memreg[0].size = table_start;
> > + memreg[1].offset = bar->offset + table_end;
> > + memreg[1].size = bar->size - table_end;
> > +
> > + RTE_LOG(DEBUG, EAL,
> > + "Trying to map BAR%d that contains the MSI-X "
> > + "table. Trying offsets: "
> > + "0x%04lx:0x%04lx, 0x%04lx:0x%04lx\n", bar_index,
> > + memreg[0].offset, memreg[0].size,
> > + memreg[1].offset, memreg[1].size);
> > + }
>
> I believe you forgot the "else" part. memreg is, by default, initialized
> to zeroes, and if bar_index is not equal to MSI-X bar index, memreg does
> not get filled with any values, and therefore all of the following
> checks for memreg.size etc. will return false and you'll end up with
> failed BAR mappings.
>
> Confirmed with testing:
>
> EAL: PCI device 0000:08:00.0 on NUMA socket 0
> EAL: probe driver: 8086:10fb net_ixgbe
> EAL: using IOMMU type 1 (Type 1)
> EAL: Failed to map pci BAR0
> EAL: 0000:08:00.0 mapping BAR0 failed: Success
> EAL: Requested device 0000:08:00.0 cannot be used
You are correct. I am not sure how this happened... I remember testing this
successfully on x86 and POWER.
I will create a new version with the fixes.
The whole reason I started this refactoring effort is to make it possible (in
a later patch) to mmap the MSI-X table if the kernel allows it (e.g. via this
patch: https://lkml.org/lkml/2017/8/7/98). The problem on POWER is that the
default page size is 64K, i.e. you will not be able to map around the MSI-X
table, which makes most NVMe devices (at least to my knowledge) unusable
with SPDK on POWER.
>
> --
> Thanks,
> Anatoly
>
Thanks,
Jonas
More information about the dev
mailing list