[dpdk-dev] [PATCH 2/2] uio: new driver to support PCI MSI-X

Alexander Duyck alexander.duyck at gmail.com
Fri Oct 2 01:40:10 CEST 2015


On 09/30/2015 03:28 PM, Stephen Hemminger wrote:
> This driver allows using PCI device with Message Signalled Interrupt
> from userspace. The API is similar to the igb_uio driver used by the DPDK.
> Via ioctl it provides a mechanism to map MSI-X interrupts into event
> file descriptors similar to VFIO.
>
> VFIO is a better choice if IOMMU is available, but often userspace drivers
> have to work in environments where IOMMU support (real or emulated) is
> not available.  All UIO drivers that support DMA are not secure against
> rogue userspace applications programming DMA hardware to access
> private memory; this driver is no less secure than existing code.
>
> Signed-off-by: Stephen Hemminger <stephen at networkplumber.org>
> ---
>   drivers/uio/Kconfig          |   9 ++
>   drivers/uio/Makefile         |   1 +
>   drivers/uio/uio_msi.c        | 378 +++++++++++++++++++++++++++++++++++++++++++
>   include/uapi/linux/Kbuild    |   1 +
>   include/uapi/linux/uio_msi.h |  22 +++
>   5 files changed, 411 insertions(+)
>   create mode 100644 drivers/uio/uio_msi.c
>   create mode 100644 include/uapi/linux/uio_msi.h
>
> diff --git a/drivers/uio/Kconfig b/drivers/uio/Kconfig
> index 52c98ce..04adfa0 100644
> --- a/drivers/uio/Kconfig
> +++ b/drivers/uio/Kconfig
> @@ -93,6 +93,15 @@ config UIO_PCI_GENERIC
>   	  primarily, for virtualization scenarios.
>   	  If you compile this as a module, it will be called uio_pci_generic.
>
> +config UIO_PCI_MSI
> +       tristate "Generic driver supporting MSI-x on PCI Express cards"
> +	depends on PCI
> +	help
> +	  Generic driver that provides Message Signalled IRQ events
> +	  similar to VFIO. If IOMMMU is available please use VFIO
> +	  instead since it provides more security.
> +	  If you compile this as a module, it will be called uio_msi.
> +
>   config UIO_NETX
>   	tristate "Hilscher NetX Card driver"
>   	depends on PCI

Should you maybe instead depend on CONFIG_PCI_MSI.  Without MSI this is 
essentially just uio_pci_generic with a bit more greedy mapping setup.

> diff --git a/drivers/uio/Makefile b/drivers/uio/Makefile
> index 8560dad..62fc44b 100644
> --- a/drivers/uio/Makefile
> +++ b/drivers/uio/Makefile
> @@ -9,3 +9,4 @@ obj-$(CONFIG_UIO_NETX)	+= uio_netx.o
>   obj-$(CONFIG_UIO_PRUSS)         += uio_pruss.o
>   obj-$(CONFIG_UIO_MF624)         += uio_mf624.o
>   obj-$(CONFIG_UIO_FSL_ELBC_GPCM)	+= uio_fsl_elbc_gpcm.o
> +obj-$(CONFIG_UIO_PCI_MSI)	+= uio_msi.o
> diff --git a/drivers/uio/uio_msi.c b/drivers/uio/uio_msi.c
> new file mode 100644
> index 0000000..802b5c4
> --- /dev/null
> +++ b/drivers/uio/uio_msi.c
> @@ -0,0 +1,378 @@
> +/*-
> + *
> + * Copyright (c) 2015 by Brocade Communications Systems, Inc.
> + * Author: Stephen Hemminger <stephen at networkplumber.org>
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 only.
> + */
> +
> +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
> +
> +#include <linux/device.h>
> +#include <linux/interrupt.h>
> +#include <linux/eventfd.h>
> +#include <linux/module.h>
> +#include <linux/pci.h>
> +#include <linux/uio_driver.h>
> +#include <linux/msi.h>
> +#include <linux/uio_msi.h>
> +
> +#define DRIVER_VERSION		"0.1.1"
> +#define MAX_MSIX_VECTORS	64
> +
> +/* MSI-X vector information */
> +struct uio_msi_pci_dev {
> +	struct uio_info info;		/* UIO driver info */
> +	struct pci_dev *pdev;		/* PCI device */
> +	struct mutex	mutex;		/* open/release/ioctl mutex */
> +	int		ref_cnt;	/* references to device */
> +	unsigned int	max_vectors;	/* MSI-X slots available */
> +	struct msix_entry *msix;	/* MSI-X vector table */
> +	struct uio_msi_irq_ctx {
> +		struct eventfd_ctx *trigger; /* vector to eventfd */
> +		char *name;		/* name in /proc/interrupts */
> +	} *ctx;
> +};
> +

I would move the definition of uio_msi_irq_ctx out of uio_msi_pci_dev. 
It would help to make this a bit more readable.

> +static irqreturn_t uio_intx_irqhandler(int irq, void *arg)
> +{
> +	struct uio_msi_pci_dev *udev = arg;
> +
> +	if (pci_check_and_mask_intx(udev->pdev)) {
> +		eventfd_signal(udev->ctx->trigger, 1);
> +		return IRQ_HANDLED;
> +	}
> +
> +	return IRQ_NONE;
> +}
> +

I would really prefer to see the intx handling dropped since there are 
already 2 different UIO drivers setup for handling INTx style 
interrupts.  Lets focus on the parts from the last decade and drop 
support for INTx now in favor of MSI-X and maybe MSI.  If we _REALLY_ 
need it we can always come back later and add it.

> +static irqreturn_t uio_msi_irqhandler(int irq, void *arg)
> +{
> +	struct eventfd_ctx *trigger = arg;
> +
> +	eventfd_signal(trigger, 1);
> +	return IRQ_HANDLED;
> +}
> +
> +/* set the mapping between vector # and existing eventfd. */
> +static int set_irq_eventfd(struct uio_msi_pci_dev *udev, u32 vec, int fd)
> +{
> +	struct eventfd_ctx *trigger;
> +	int irq, err;
> +
> +	if (vec >= udev->max_vectors) {
> +		dev_notice(&udev->pdev->dev, "vec %u >= num_vec %u\n",
> +			   vec, udev->max_vectors);
> +		return -ERANGE;
> +	}
> +
> +	irq = udev->msix[vec].vector;
> +	trigger = udev->ctx[vec].trigger;
> +	if (trigger) {
> +		/* Clearup existing irq mapping */

Minor spelling issue here, "Clear up" should be 2 words, "Cleanup" can 
be one.

> +		free_irq(irq, trigger);
> +		eventfd_ctx_put(trigger);
> +		udev->ctx[vec].trigger = NULL;
> +	}
> +
> +	/* Passing -1 is used to disable interrupt */
> +	if (fd < 0)
> +		return 0;
> +
> +	trigger = eventfd_ctx_fdget(fd);
> +	if (IS_ERR(trigger)) {
> +		err = PTR_ERR(trigger);
> +		dev_notice(&udev->pdev->dev,
> +			   "eventfd ctx get failed: %d\n", err);
> +		return err;
> +	}
> +
> +	if (udev->msix)
> +		err = request_irq(irq, uio_msi_irqhandler, 0,
> +				  udev->ctx[vec].name, trigger);
> +	else
> +		err = request_irq(irq, uio_intx_irqhandler, IRQF_SHARED,
> +				  udev->ctx[vec].name, udev);
> +
> +	if (err) {
> +		dev_notice(&udev->pdev->dev,
> +			   "request irq failed: %d\n", err);
> +		eventfd_ctx_put(trigger);
> +		return err;
> +	}
> +
> +	udev->ctx[vec].trigger = trigger;
> +	return 0;
> +}
> +
> +static int
> +uio_msi_ioctl(struct uio_info *info, unsigned int cmd, unsigned long arg)
> +{
> +	struct uio_msi_pci_dev *udev
> +		= container_of(info, struct uio_msi_pci_dev, info);
> +	struct uio_msi_irq_set hdr;
> +	int err;
> +
> +	switch (cmd) {
> +	case UIO_MSI_IRQ_SET:
> +		if (copy_from_user(&hdr, (void __user *)arg, sizeof(hdr)))
> +			return -EFAULT;
> +
> +		mutex_lock(&udev->mutex);
> +		err = set_irq_eventfd(udev, hdr.vec, hdr.fd);
> +		mutex_unlock(&udev->mutex);
> +		break;
> +	default:
> +		err = -EOPNOTSUPP;
> +	}
> +	return err;
> +}
> +
> +/* Opening the UIO device for first time enables MSI-X */
> +static int
> +uio_msi_open(struct uio_info *info, struct inode *inode)
> +{
> +	struct uio_msi_pci_dev *udev
> +		= container_of(info, struct uio_msi_pci_dev, info);
> +	int err = 0;
> +
> +	mutex_lock(&udev->mutex);
> +	if (udev->ref_cnt++ == 0) {
> +		if (udev->msix)
> +			err = pci_enable_msix(udev->pdev, udev->msix,
> +					      udev->max_vectors);
> +	}
> +	mutex_unlock(&udev->mutex);
> +
> +	return err;
> +}
> +

I agree with some other reviewers.  Why call pci_enable_msix in open? 
It seems like it would make much more sense to do this on probe, and 
then disable MSI-X on free.  I can only assume you are trying to do it 
to save on resources but the fact is this is a driver you have to 
explicitly force onto a device so you would probably be safe to assume 
that they plan to use it in the near future.

> +/* Last close of the UIO device releases/disables all IRQ's */
> +static int
> +uio_msi_release(struct uio_info *info, struct inode *inode)
> +{
> +	struct uio_msi_pci_dev *udev
> +		= container_of(info, struct uio_msi_pci_dev, info);
> +	int i;
> +
> +	mutex_lock(&udev->mutex);
> +	if (--udev->ref_cnt == 0) {
> +		for (i = 0; i < udev->max_vectors; i++) {
> +			int irq = udev->msix[i].vector;
> +			struct eventfd_ctx *trigger = udev->ctx[i].trigger;
> +
> +			if (!trigger)
> +				continue;
> +
> +			free_irq(irq, trigger);
> +			eventfd_ctx_put(trigger);
> +			udev->ctx[i].trigger = NULL;
> +		}
> +
> +		if (udev->msix)
> +			pci_disable_msix(udev->pdev);
> +	}
> +	mutex_unlock(&udev->mutex);
> +
> +	return 0;
> +}
> +
> +/* Unmap previously ioremap'd resources */
> +static void
> +release_iomaps(struct uio_mem *mem)
> +{
> +	int i;
> +
> +	for (i = 0; i < MAX_UIO_MAPS; i++, mem++) {
> +		if (mem->internal_addr)
> +			iounmap(mem->internal_addr);
> +	}
> +}
> +
> +static int
> +setup_maps(struct pci_dev *pdev, struct uio_info *info)
> +{
> +	int i, m = 0, p = 0, err;
> +	static const char * const bar_names[] = {
> +		"BAR0",	"BAR1",	"BAR2",	"BAR3",	"BAR4",	"BAR5",
> +	};
> +
> +	for (i = 0; i < ARRAY_SIZE(bar_names); i++) {
> +		unsigned long start = pci_resource_start(pdev, i);
> +		unsigned long flags = pci_resource_flags(pdev, i);
> +		unsigned long len = pci_resource_len(pdev, i);
> +
> +		if (start == 0 || len == 0)
> +			continue;
> +
> +		if (flags & IORESOURCE_MEM) {
> +			void __iomem *addr;
> +
> +			if (m >= MAX_UIO_MAPS)
> +				continue;
> +
> +			addr = ioremap(start, len);
> +			if (addr == NULL) {
> +				err = -EINVAL;
> +				goto fail;
> +			}
> +
> +			info->mem[m].name = bar_names[i];
> +			info->mem[m].addr = start;
> +			info->mem[m].internal_addr = addr;
> +			info->mem[m].size = len;
> +			info->mem[m].memtype = UIO_MEM_PHYS;
> +			++m;
> +		} else if (flags & IORESOURCE_IO) {
> +			if (p >= MAX_UIO_PORT_REGIONS)
> +				continue;
> +
> +			info->port[p].name = bar_names[i];
> +			info->port[p].start = start;
> +			info->port[p].size = len;
> +			info->port[p].porttype = UIO_PORT_X86;
> +			++p;
> +		}
> +	}
> +
> +	return 0;
> + fail:
> +	for (i = 0; i < m; i++)
> +		iounmap(info->mem[i].internal_addr);
> +	return err;
> +}
> +

Do you really need to map IORESOURCE bars?  Most drivers I can think of 
don't use IO BARs anymore.  Maybe we could look at just dropping the 
code and adding it back later if we have a use case that absolutely 
needs it.

Also how many devices actually need resources beyond BAR 0?  I'm just 
curious as I know BAR 2 on many of the Intel devices is the register 
space related to MSI-X so now we have both the PCIe subsystem and user 
space with access to this region.

> +static int uio_msi_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> +{
> +	struct uio_msi_pci_dev *udev;
> +	int i, err, vectors;
> +
> +	udev = kzalloc(sizeof(struct uio_msi_pci_dev), GFP_KERNEL);
> +	if (!udev)
> +		return -ENOMEM;
> +
> +	err = pci_enable_device(pdev);
> +	if (err != 0) {
> +		dev_err(&pdev->dev, "cannot enable PCI device\n");
> +		goto fail_free;
> +	}
> +
> +	err = pci_request_regions(pdev, "uio_msi");
> +	if (err != 0) {
> +		dev_err(&pdev->dev, "Cannot request regions\n");
> +		goto fail_disable;
> +	}
> +
> +	pci_set_master(pdev);
> +
> +	/* remap resources */
> +	err = setup_maps(pdev, &udev->info);
> +	if (err)
> +		goto fail_release_iomem;
> +
> +	/* fill uio infos */
> +	udev->info.name = "uio_msi";
> +	udev->info.version = DRIVER_VERSION;
> +	udev->info.priv = udev;
> +	udev->pdev = pdev;
> +	udev->info.ioctl = uio_msi_ioctl;
> +	udev->info.open = uio_msi_open;
> +	udev->info.release = uio_msi_release;
> +	udev->info.irq = UIO_IRQ_CUSTOM;
> +	mutex_init(&udev->mutex);
> +
> +	vectors = pci_msix_vec_count(pdev);
> +	if (vectors > 0) {
> +		udev->max_vectors = min_t(u16, vectors, MAX_MSIX_VECTORS);
> +		dev_info(&pdev->dev, "using up to %u MSI-X vectors\n",
> +			udev->max_vectors);
> +
> +		err = -ENOMEM;
> +		udev->msix = kcalloc(udev->max_vectors,
> +				     sizeof(struct msix_entry), GFP_KERNEL);
> +		if (!udev->msix)
> +			goto fail_release_iomem;
> +	} else if (!pci_intx_mask_supported(pdev)) {
> +		dev_err(&pdev->dev,
> +			"device does not support MSI-X or INTX\n");
> +		err = -EINVAL;
> +		goto fail_release_iomem;
> +	} else {
> +		dev_notice(&pdev->dev, "using INTX\n");
> +		udev->info.irq_flags = IRQF_SHARED;
> +		udev->max_vectors = 1;
> +	}
> +
> +	udev->ctx = kcalloc(udev->max_vectors,
> +			    sizeof(struct uio_msi_irq_ctx), GFP_KERNEL);
> +	if (!udev->ctx)
> +		goto fail_free_msix;
> +
> +	for (i = 0; i < udev->max_vectors; i++) {
> +		udev->msix[i].entry = i;
> +
> +		udev->ctx[i].name = kasprintf(GFP_KERNEL,
> +					      KBUILD_MODNAME "[%d](%s)",
> +					      i, pci_name(pdev));
> +		if (!udev->ctx[i].name)
> +			goto fail_free_ctx;
> +	}
> +
> +	/* register uio driver */
> +	err = uio_register_device(&pdev->dev, &udev->info);
> +	if (err != 0)
> +		goto fail_free_ctx;
> +
> +	pci_set_drvdata(pdev, udev);
> +	return 0;
> +
> +fail_free_ctx:
> +	for (i = 0; i < udev->max_vectors; i++)
> +		kfree(udev->ctx[i].name);
> +	kfree(udev->ctx);
> +fail_free_msix:
> +	kfree(udev->msix);
> +fail_release_iomem:
> +	release_iomaps(udev->info.mem);
> +	pci_release_regions(pdev);
> +fail_disable:
> +	pci_disable_device(pdev);
> +fail_free:
> +	kfree(udev);
> +
> +	pr_notice("%s ret %d\n", __func__, err);
> +	return err;
> +}
> +
> +static void uio_msi_remove(struct pci_dev *pdev)
> +{
> +	struct uio_info *info = pci_get_drvdata(pdev);
> +	struct uio_msi_pci_dev *udev
> +		= container_of(info, struct uio_msi_pci_dev, info);
> +	int i;
> +
> +	uio_unregister_device(info);
> +	release_iomaps(info->mem);
> +
> +	pci_release_regions(pdev);
> +	for (i = 0; i < udev->max_vectors; i++)
> +		kfree(udev->ctx[i].name);
> +	kfree(udev->ctx);
> +	kfree(udev->msix);
> +	pci_disable_device(pdev);
> +
> +	pci_set_drvdata(pdev, NULL);
> +	kfree(udev);
> +}
> +
> +static struct pci_driver uio_msi_pci_driver = {
> +	.name = "uio_msi",
> +	.probe = uio_msi_probe,
> +	.remove = uio_msi_remove,
> +};
> +
> +module_pci_driver(uio_msi_pci_driver);
> +MODULE_VERSION(DRIVER_VERSION);
> +MODULE_LICENSE("GPL v2");
> +MODULE_AUTHOR("Stephen Hemminger <stephen at networkplumber.org>");
> +MODULE_DESCRIPTION("UIO driver for MSI PCI devices");
> diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild
> index f7b2db4..d9497691 100644
> --- a/include/uapi/linux/Kbuild
> +++ b/include/uapi/linux/Kbuild
> @@ -411,6 +411,7 @@ header-y += udp.h
>   header-y += uhid.h
>   header-y += uinput.h
>   header-y += uio.h
> +header-y += uio_msi.h
>   header-y += ultrasound.h
>   header-y += un.h
>   header-y += unistd.h
> diff --git a/include/uapi/linux/uio_msi.h b/include/uapi/linux/uio_msi.h
> new file mode 100644
> index 0000000..297de00
> --- /dev/null
> +++ b/include/uapi/linux/uio_msi.h
> @@ -0,0 +1,22 @@
> +/*
> + * UIO_MSI API definition
> + *
> + * Copyright (c) 2015 by Brocade Communications Systems, Inc.
> + * All rights reserved.
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +#ifndef _UIO_PCI_MSI_H
> +#define _UIO_PCI_MSI_H
> +
> +struct uio_msi_irq_set {
> +	u32 vec;
> +	int fd;
> +};
> +
> +#define UIO_MSI_BASE	0x86
> +#define UIO_MSI_IRQ_SET	_IOW('I', UIO_MSI_BASE+1, struct uio_msi_irq_set)
> +
> +#endif
>



More information about the dev mailing list