[dpdk-dev] [PATCH] vfio: refactor PCI BAR mapping

Jonas Pfefferle1 JPF at zurich.ibm.com
Wed Sep 20 16:38:47 CEST 2017


Hi Anatoly,

"Burakov, Anatoly" <anatoly.burakov at intel.com> wrote on 09/19/2017 01:40:51
PM:

> From: "Burakov, Anatoly" <anatoly.burakov at intel.com>
> To: Jonas Pfefferle <jpf at zurich.ibm.com>, dev at dpdk.org
> Date: 09/19/2017 01:41 PM
> Subject: Re: [dpdk-dev] [PATCH] vfio: refactor PCI BAR mapping
>
> Hi Jonas,
>
> On 17-Aug-17 12:35 PM, Jonas Pfefferle wrote:
> > Split pci_vfio_map_resource for primary and secondary processes.
> > Save all relevant mapping data in primary process to allow
> > the secondary process to perform mappings.
> >
> > Signed-off-by: Jonas Pfefferle <jpf at zurich.ibm.com>
> > ---
> >   lib/librte_eal/common/include/rte_pci.h    |   7 +
> >   lib/librte_eal/linuxapp/eal/eal_pci_vfio.c | 447 +++++++++++++++
> ++------------
> >   2 files changed, 271 insertions(+), 183 deletions(-)
> >
> > diff --git a/lib/librte_eal/common/include/rte_pci.h b/lib/
> librte_eal/common/include/rte_pci.h
> > index 8b12339..0821af9 100644
> > --- a/lib/librte_eal/common/include/rte_pci.h
> > +++ b/lib/librte_eal/common/include/rte_pci.h
> > @@ -214,6 +214,12 @@ struct pci_map {
> >      uint64_t phaddr;
> >   };
> >
> > +struct pci_msix_table {
> > +   int bar_index;
> > +   uint32_t offset;
> > +   uint32_t size;
> > +};
> > +
> >   /**
> >    * A structure describing a mapped PCI resource.
> >    * For multi-process we need to reproduce all PCI mappings in
secondary
> > @@ -226,6 +232,7 @@ struct mapped_pci_resource {
> >      char path[PATH_MAX];
> >      int nb_maps;
> >      struct pci_map maps[PCI_MAX_RESOURCE];
> > +   struct pci_msix_table msix_table;
> >   };
> >
> >   /** mapped pci device list */
> > diff --git a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c b/lib/
> librte_eal/linuxapp/eal/eal_pci_vfio.c
> > index aa9d96e..f37552a 100644
> > --- a/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> > +++ b/lib/librte_eal/linuxapp/eal/eal_pci_vfio.c
> > @@ -88,8 +88,7 @@ pci_vfio_write_config(const struct
> rte_intr_handle *intr_handle,
> >
> >   /* get PCI BAR number where MSI-X interrupts are */
> >   static int
> > -pci_vfio_get_msix_bar(int fd, int *msix_bar, uint32_t
*msix_table_offset,
> > -            uint32_t *msix_table_size)
> > +pci_vfio_get_msix_bar(int fd, struct pci_msix_table *msix_table)
> >   {
> >      int ret;
> >      uint32_t reg;
> > @@ -161,9 +160,10 @@ pci_vfio_get_msix_bar(int fd, int *msix_bar,
> uint32_t *msix_table_offset,
> >               return -1;
> >            }
> >
> > -         *msix_bar = reg & RTE_PCI_MSIX_TABLE_BIR;
> > -         *msix_table_offset = reg & RTE_PCI_MSIX_TABLE_OFFSET;
> > -         *msix_table_size = 16 * (1 + (flags &
RTE_PCI_MSIX_FLAGS_QSIZE));
> > +         msix_table->bar_index = reg & RTE_PCI_MSIX_TABLE_BIR;
> > +         msix_table->offset = reg & RTE_PCI_MSIX_TABLE_OFFSET;
> > +         msix_table->size =
> > +            16 * (1 + (flags & RTE_PCI_MSIX_FLAGS_QSIZE));
> >
> >            return 0;
> >         }
> > @@ -300,25 +300,152 @@ pci_vfio_setup_interrupts(struct
> rte_pci_device *dev, int vfio_dev_fd)
> >      return -1;
> >   }
> >
> > -/*
> > - * map the PCI resources of a PCI device in virtual memory (VFIO
version).
> > - * primary and secondary processes follow almost exactly the same path
> > - */
> > -int
> > -pci_vfio_map_resource(struct rte_pci_device *dev)
> > +static int
> > +pci_vfio_is_ioport_bar(int vfio_dev_fd, int bar_index)
> > +{
> > +   uint32_t ioport_bar;
> > +   int ret;
> > +
> > +   ret = pread64(vfio_dev_fd, &ioport_bar, sizeof(ioport_bar),
> > +           VFIO_GET_REGION_ADDR(VFIO_PCI_CONFIG_REGION_INDEX)
> > +           + PCI_BASE_ADDRESS_0 + bar_index*4);
> > +   if (ret != sizeof(ioport_bar)) {
> > +      RTE_LOG(ERR, EAL, "Cannot read command (%x) from config
space!\n",
> > +         PCI_BASE_ADDRESS_0 + bar_index*4);
> > +      return -1;
> > +   }
> > +
> > +   if (ioport_bar & PCI_BASE_ADDRESS_SPACE_IO) {
> > +      RTE_LOG(INFO, EAL, "Ignore mapping IO port bar(%d) addr: %x\n",
> > +          bar_index, ioport_bar);
>
> This log message should probably go to the "continue" portion of the
> calling code, it looks out of place here.

Agreed. I will move it.

>
> > +      return 1;
> > +   }
> > +   return 0;
> > +}
> > +
> > +static int
> > +pci_vfio_setup_device(struct rte_pci_device *dev, int vfio_dev_fd)
> > +{
> > +   if (pci_vfio_setup_interrupts(dev, vfio_dev_fd) != 0) {
> > +      RTE_LOG(ERR, EAL, "Error setting up interrupts!\n");
> > +      return -1;
> > +   }
> > +
> > +   /* set bus mastering for the device */
> > +   if (pci_vfio_set_bus_master(vfio_dev_fd, true)) {
> > +      RTE_LOG(ERR, EAL, "Cannot set up bus mastering!\n");
> > +      return -1;
> > +   }
> > +
> > +   /* Reset the device */
> > +   ioctl(vfio_dev_fd, VFIO_DEVICE_RESET);
> > +
> > +   return 0;
> > +}
> > +
> > +static int
> > +pci_vfio_mmap_bar(int vfio_dev_fd, struct mapped_pci_resource
*vfio_res,
> > +      int bar_index, int additional_flags)
> > +{
> > +   struct memreg {
> > +      unsigned long offset, size;
> > +   } memreg[2] = {};
> > +   void *bar_addr;
> > +   struct pci_msix_table *msix_table = &vfio_res->msix_table;
> > +   struct pci_map *bar = &vfio_res->maps[bar_index];
> > +
> > +   if (bar->size == 0)
> > +      /* Skip this BAR */
> > +      return 0;
> > +
> > +   if (msix_table->bar_index == bar_index) {
> > +      /*
> > +       * VFIO will not let us map the MSI-X table,
> > +       * but we can map around it.
> > +       */
> > +      uint32_t table_start = msix_table->offset;
> > +      uint32_t table_end = table_start + msix_table->size;
> > +      table_end = (table_end + ~PAGE_MASK) & PAGE_MASK;
> > +      table_start &= PAGE_MASK;
> > +
> > +      if (table_start == 0 && table_end >= bar->size) {
> > +         /* Cannot map this BAR */
> > +         RTE_LOG(DEBUG, EAL, "Skipping BAR%d\n", bar_index);
> > +         bar->size = 0;
> > +         bar->addr = 0;
> > +         return 0;
> > +      }
> > +
> > +      memreg[0].offset = bar->offset;
> > +      memreg[0].size = table_start;
> > +      memreg[1].offset = bar->offset + table_end;
> > +      memreg[1].size = bar->size - table_end;
> > +
> > +      RTE_LOG(DEBUG, EAL,
> > +         "Trying to map BAR%d that contains the MSI-X "
> > +         "table. Trying offsets: "
> > +         "0x%04lx:0x%04lx, 0x%04lx:0x%04lx\n", bar_index,
> > +         memreg[0].offset, memreg[0].size,
> > +         memreg[1].offset, memreg[1].size);
> > +   }
>
> I believe you forgot the "else" part. memreg is, by default, initialized
> to zeroes, and if bar_index is not equal to MSI-X bar index, memreg does
> not get filled with any values, and therefore all of the following
> checks for memreg.size etc. will return false and you'll end up with
> failed BAR mappings.
>
> Confirmed with testing:
>
> EAL: PCI device 0000:08:00.0 on NUMA socket 0
> EAL:   probe driver: 8086:10fb net_ixgbe
> EAL:   using IOMMU type 1 (Type 1)
> EAL: Failed to map pci BAR0
> EAL:   0000:08:00.0 mapping BAR0 failed: Success
> EAL: Requested device 0000:08:00.0 cannot be used

You are correct. Not sure how this happened... I remember testing this
successfully on x86 and POWER.
I will create a new version with the fixes.

The whole reason I started this refactoring effort is to make it possible
(in a later patch) to mmap the MSI-X table if the kernel allows it (e.g. via
this patch https://lkml.org/lkml/2017/8/7/98). The problem on POWER is that
the default page size is 64K, so you will not be able to map around the MSI-X
table, which makes most NVMe devices (at least to my knowledge)
unusable with SPDK on POWER.

>
> --
> Thanks,
> Anatoly
>

Thanks,
Jonas


More information about the dev mailing list