[dpdk-dev] [RFC 15/24] vhost: add virtio pci framework

Stefan Hajnoczi stefanha at redhat.com
Fri Jan 19 14:44:35 CET 2018


The virtio-vhost-user transport will involve a virtio pci device driver.
There is currently no librte_virtio API that we can reusable.

This commit is a hack that duplicates the virtio pci code from
drivers/net/.  A better solution would be to extract the code cleanly
from drivers/net/ and share it.  Or perhaps we could backport SPDK's
lib/virtio.  I don't have time to do either right now so I've just
copied the code, removed virtio-net and ethdev parts, and renamed
symbols to avoid link errors.

Signed-off-by: Stefan Hajnoczi <stefanha at redhat.com>
---
 drivers/librte_vhost/Makefile     |   4 +-
 drivers/librte_vhost/virtio_pci.h | 267 ++++++++++++++++++++
 drivers/librte_vhost/virtqueue.h  | 181 ++++++++++++++
 drivers/librte_vhost/virtio_pci.c | 504 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 955 insertions(+), 1 deletion(-)
 create mode 100644 drivers/librte_vhost/virtio_pci.h
 create mode 100644 drivers/librte_vhost/virtqueue.h
 create mode 100644 drivers/librte_vhost/virtio_pci.c

diff --git a/drivers/librte_vhost/Makefile b/drivers/librte_vhost/Makefile
index ccbbce3af..8a56c32af 100644
--- a/drivers/librte_vhost/Makefile
+++ b/drivers/librte_vhost/Makefile
@@ -21,7 +21,9 @@ LDLIBS += -lrte_eal -lrte_mempool -lrte_mbuf -lrte_ethdev -lrte_net
 
 # all source are stored in SRCS-y
 SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c iotlb.c socket.c vhost.c \
-					vhost_user.c virtio_net.c trans_af_unix.c
+					vhost_user.c virtio_net.c \
+					trans_af_unix.c \
+					virtio_pci.c
 
 # install includes
 SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h
diff --git a/drivers/librte_vhost/virtio_pci.h b/drivers/librte_vhost/virtio_pci.h
new file mode 100644
index 000000000..7afc24853
--- /dev/null
+++ b/drivers/librte_vhost/virtio_pci.h
@@ -0,0 +1,267 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+/* XXX This file is based on drivers/net/virtio/virtio_pci.h.  It would be
+ * better to create a shared rte_virtio library instead of duplicating this
+ * code.
+ */
+
+#ifndef _VIRTIO_PCI_H_
+#define _VIRTIO_PCI_H_
+
+#include <stdint.h>
+
+#include <rte_log.h>
+#include <rte_pci.h>
+#include <rte_bus_pci.h>
+#include <rte_spinlock.h>
+
+/* Macros for printing using RTE_LOG */
+#define RTE_LOGTYPE_VIRTIO_PCI_CONFIG RTE_LOGTYPE_USER2
+
+struct virtqueue;
+
+/* VirtIO PCI vendor/device ID. */
+#define VIRTIO_PCI_VENDORID     0x1AF4
+#define VIRTIO_PCI_LEGACY_DEVICEID_VHOST_USER 0x1017
+#define VIRTIO_PCI_MODERN_DEVICEID_VHOST_USER 0x1058
+
+/* VirtIO ABI version, this must match exactly. */
+#define VIRTIO_PCI_ABI_VERSION 0
+
+/*
+ * VirtIO Header, located in BAR 0.
+ */
+#define VIRTIO_PCI_HOST_FEATURES  0  /* host's supported features (32bit, RO)*/
+#define VIRTIO_PCI_GUEST_FEATURES 4  /* guest's supported features (32, RW) */
+#define VIRTIO_PCI_QUEUE_PFN      8  /* physical address of VQ (32, RW) */
+#define VIRTIO_PCI_QUEUE_NUM      12 /* number of ring entries (16, RO) */
+#define VIRTIO_PCI_QUEUE_SEL      14 /* current VQ selection (16, RW) */
+#define VIRTIO_PCI_QUEUE_NOTIFY   16 /* notify host regarding VQ (16, RW) */
+#define VIRTIO_PCI_STATUS         18 /* device status register (8, RW) */
+#define VIRTIO_PCI_ISR		  19 /* interrupt status register, reading
+				      * also clears the register (8, RO) */
+/* Only if MSIX is enabled: */
+#define VIRTIO_MSI_CONFIG_VECTOR  20 /* configuration change vector (16, RW) */
+#define VIRTIO_MSI_QUEUE_VECTOR	  22 /* vector for selected VQ notifications
+				      (16, RW) */
+
+/* The bit of the ISR which indicates a device has an interrupt. */
+#define VIRTIO_PCI_ISR_INTR   0x1
+/* The bit of the ISR which indicates a device configuration change. */
+#define VIRTIO_PCI_ISR_CONFIG 0x2
+/* Vector value used to disable MSI for queue. */
+#define VIRTIO_MSI_NO_VECTOR 0xFFFF
+
+/* VirtIO device IDs. */
+#define VIRTIO_ID_VHOST_USER  0x18
+
+/* Status byte for guest to report progress. */
+#define VIRTIO_CONFIG_STATUS_RESET     0x00
+#define VIRTIO_CONFIG_STATUS_ACK       0x01
+#define VIRTIO_CONFIG_STATUS_DRIVER    0x02
+#define VIRTIO_CONFIG_STATUS_DRIVER_OK 0x04
+#define VIRTIO_CONFIG_STATUS_FEATURES_OK 0x08
+#define VIRTIO_CONFIG_STATUS_FAILED    0x80
+
+/*
+ * Each virtqueue indirect descriptor list must be physically contiguous.
+ * To allow us to malloc(9) each list individually, limit the number
+ * supported to what will fit in one page. With 4KB pages, this is a limit
+ * of 256 descriptors. If there is ever a need for more, we can switch to
+ * contigmalloc(9) for the larger allocations, similar to what
+ * bus_dmamem_alloc(9) does.
+ *
+ * Note the sizeof(struct vring_desc) is 16 bytes.
+ */
+#define VIRTIO_MAX_INDIRECT ((int) (PAGE_SIZE / 16))
+
+/* Do we get callbacks when the ring is completely used, even if we've
+ * suppressed them? */
+#define VIRTIO_F_NOTIFY_ON_EMPTY	24
+
+/* Can the device handle any descriptor layout? */
+#define VIRTIO_F_ANY_LAYOUT		27
+
+/* We support indirect buffer descriptors */
+#define VIRTIO_RING_F_INDIRECT_DESC	28
+
+#define VIRTIO_F_VERSION_1		32
+#define VIRTIO_F_IOMMU_PLATFORM	33
+
+/*
+ * Some VirtIO feature bits (currently bits 28 through 31) are
+ * reserved for the transport being used (eg. virtio_ring), the
+ * rest are per-device feature bits.
+ */
+#define VIRTIO_TRANSPORT_F_START 28
+#define VIRTIO_TRANSPORT_F_END   34
+
+/* The Guest publishes the used index for which it expects an interrupt
+ * at the end of the avail ring. Host should ignore the avail->flags field. */
+/* The Host publishes the avail index for which it expects a kick
+ * at the end of the used ring. Guest should ignore the used->flags field. */
+#define VIRTIO_RING_F_EVENT_IDX		29
+
+/* Common configuration */
+#define VIRTIO_PCI_CAP_COMMON_CFG	1
+/* Notifications */
+#define VIRTIO_PCI_CAP_NOTIFY_CFG	2
+/* ISR Status */
+#define VIRTIO_PCI_CAP_ISR_CFG		3
+/* Device specific configuration */
+#define VIRTIO_PCI_CAP_DEVICE_CFG	4
+/* PCI configuration access */
+#define VIRTIO_PCI_CAP_PCI_CFG		5
+
+/* This is the PCI capability header: */
+struct virtio_pci_cap {
+	uint8_t cap_vndr;		/* Generic PCI field: PCI_CAP_ID_VNDR */
+	uint8_t cap_next;		/* Generic PCI field: next ptr. */
+	uint8_t cap_len;		/* Generic PCI field: capability length */
+	uint8_t cfg_type;		/* Identifies the structure. */
+	uint8_t bar;			/* Where to find it. */
+	uint8_t padding[3];		/* Pad to full dword. */
+	uint32_t offset;		/* Offset within bar. */
+	uint32_t length;		/* Length of the structure, in bytes. */
+};
+
+struct virtio_pci_notify_cap {
+	struct virtio_pci_cap cap;
+	uint32_t notify_off_multiplier;	/* Multiplier for queue_notify_off. */
+};
+
+/* Fields in VIRTIO_PCI_CAP_COMMON_CFG: */
+struct virtio_pci_common_cfg {
+	/* About the whole device. */
+	uint32_t device_feature_select;	/* read-write */
+	uint32_t device_feature;	/* read-only */
+	uint32_t guest_feature_select;	/* read-write */
+	uint32_t guest_feature;		/* read-write */
+	uint16_t msix_config;		/* read-write */
+	uint16_t num_queues;		/* read-only */
+	uint8_t device_status;		/* read-write */
+	uint8_t config_generation;	/* read-only */
+
+	/* About a specific virtqueue. */
+	uint16_t queue_select;		/* read-write */
+	uint16_t queue_size;		/* read-write, power of 2. */
+	uint16_t queue_msix_vector;	/* read-write */
+	uint16_t queue_enable;		/* read-write */
+	uint16_t queue_notify_off;	/* read-only */
+	uint32_t queue_desc_lo;		/* read-write */
+	uint32_t queue_desc_hi;		/* read-write */
+	uint32_t queue_avail_lo;	/* read-write */
+	uint32_t queue_avail_hi;	/* read-write */
+	uint32_t queue_used_lo;		/* read-write */
+	uint32_t queue_used_hi;		/* read-write */
+};
+
+struct virtio_hw;
+
+struct virtio_pci_ops {
+	void (*read_dev_cfg)(struct virtio_hw *hw, size_t offset,
+			     void *dst, int len);
+	void (*write_dev_cfg)(struct virtio_hw *hw, size_t offset,
+			      const void *src, int len);
+	void (*reset)(struct virtio_hw *hw);
+
+	uint8_t (*get_status)(struct virtio_hw *hw);
+	void    (*set_status)(struct virtio_hw *hw, uint8_t status);
+
+	uint64_t (*get_features)(struct virtio_hw *hw);
+	void     (*set_features)(struct virtio_hw *hw, uint64_t features);
+
+	uint8_t (*get_isr)(struct virtio_hw *hw);
+
+	uint16_t (*set_config_irq)(struct virtio_hw *hw, uint16_t vec);
+
+	uint16_t (*set_queue_irq)(struct virtio_hw *hw, struct virtqueue *vq,
+			uint16_t vec);
+
+	uint16_t (*get_queue_num)(struct virtio_hw *hw, uint16_t queue_id);
+	int (*setup_queue)(struct virtio_hw *hw, struct virtqueue *vq);
+	void (*del_queue)(struct virtio_hw *hw, struct virtqueue *vq);
+	void (*notify_queue)(struct virtio_hw *hw, struct virtqueue *vq);
+};
+
+struct virtio_hw {
+	uint64_t    guest_features;
+	uint32_t    max_queue_pairs;
+	uint16_t    started;
+	uint8_t	    use_msix;
+	uint16_t    internal_id;
+	uint32_t    notify_off_multiplier;
+	uint8_t     *isr;
+	uint16_t    *notify_base;
+	struct virtio_pci_common_cfg *common_cfg;
+	void	    *dev_cfg;
+	/*
+	 * App management thread and virtio interrupt handler thread
+	 * both can change device state, this lock is meant to avoid
+	 * such a contention.
+	 */
+	rte_spinlock_t state_lock;
+
+	struct virtqueue **vqs;
+};
+
+/*
+ * While virtio_hw is stored in shared memory, this structure stores
+ * some infos that may vary in the multiple process model locally.
+ * For example, the vtpci_ops pointer.
+ */
+struct virtio_hw_internal {
+	const struct virtio_pci_ops *vtpci_ops;
+};
+
+#define VTPCI_OPS(hw)	(virtio_pci_hw_internal[(hw)->internal_id].vtpci_ops)
+
+extern struct virtio_hw_internal virtio_pci_hw_internal[8];
+
+/*
+ * How many bits to shift physical queue address written to QUEUE_PFN.
+ * 12 is historical, and due to x86 page size.
+ */
+#define VIRTIO_PCI_QUEUE_ADDR_SHIFT 12
+
+/* The alignment to use between consumer and producer parts of vring. */
+#define VIRTIO_PCI_VRING_ALIGN 4096
+
+enum virtio_msix_status {
+	VIRTIO_MSIX_NONE = 0,
+	VIRTIO_MSIX_DISABLED = 1,
+	VIRTIO_MSIX_ENABLED = 2
+};
+
+static inline int
+virtio_pci_with_feature(struct virtio_hw *hw, uint64_t bit)
+{
+	return (hw->guest_features & (1ULL << bit)) != 0;
+}
+
+/*
+ * Function declaration from virtio_pci.c
+ */
+int virtio_pci_init(struct rte_pci_device *dev, struct virtio_hw *hw);
+void virtio_pci_reset(struct virtio_hw *);
+
+void virtio_pci_reinit_complete(struct virtio_hw *);
+
+uint8_t virtio_pci_get_status(struct virtio_hw *);
+void virtio_pci_set_status(struct virtio_hw *, uint8_t);
+
+uint64_t virtio_pci_negotiate_features(struct virtio_hw *, uint64_t);
+
+void virtio_pci_write_dev_config(struct virtio_hw *, size_t, const void *, int);
+
+void virtio_pci_read_dev_config(struct virtio_hw *, size_t, void *, int);
+
+uint8_t virtio_pci_isr(struct virtio_hw *);
+
+enum virtio_msix_status virtio_pci_msix_detect(struct rte_pci_device *dev);
+
+extern const struct virtio_pci_ops virtio_pci_modern_ops;
+
+#endif /* _VIRTIO_PCI_H_ */
diff --git a/drivers/librte_vhost/virtqueue.h b/drivers/librte_vhost/virtqueue.h
new file mode 100644
index 000000000..e2ac78eef
--- /dev/null
+++ b/drivers/librte_vhost/virtqueue.h
@@ -0,0 +1,181 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+
+/* XXX This file is based on drivers/net/virtio/virtqueue.h.  It would be
+ * better to create a shared rte_virtio library instead of duplicating this
+ * code.
+ */
+
+#ifndef _VIRTQUEUE_H_
+#define _VIRTQUEUE_H_
+
+#include <stdint.h>
+#include <linux/virtio_ring.h>
+
+#include <rte_atomic.h>
+#include <rte_memory.h>
+#include <rte_mempool.h>
+
+#include "virtio_pci.h"
+
+/*
+ * Per virtio_config.h in Linux.
+ *     For virtio_pci on SMP, we don't need to order with respect to MMIO
+ *     accesses through relaxed memory I/O windows, so smp_mb() et al are
+ *     sufficient.
+ *
+ */
+#define virtio_mb()	rte_smp_mb()
+#define virtio_rmb()	rte_smp_rmb()
+#define virtio_wmb()	rte_smp_wmb()
+
+#define VIRTQUEUE_MAX_NAME_SZ 32
+
+/**
+ * The maximum virtqueue size is 2^15. Use that value as the end of
+ * descriptor chain terminator since it will never be a valid index
+ * in the descriptor table. This is used to verify we are correctly
+ * handling vq_free_cnt.
+ */
+#define VQ_RING_DESC_CHAIN_END 32768
+
+struct vq_desc_extra {
+	void *cookie;
+	uint16_t ndescs;
+};
+
+struct virtqueue {
+	struct virtio_hw  *hw; /**< virtio_hw structure pointer. */
+	struct vring vq_ring;  /**< vring keeping desc, used and avail */
+	/**
+	 * Last consumed descriptor in the used table,
+	 * trails vq_ring.used->idx.
+	 */
+	uint16_t vq_used_cons_idx;
+	uint16_t vq_nentries;  /**< vring desc numbers */
+	uint16_t vq_free_cnt;  /**< num of desc available */
+	uint16_t vq_avail_idx; /**< sync until needed */
+	uint16_t vq_free_thresh; /**< free threshold */
+
+	void *vq_ring_virt_mem;  /**< linear address of vring*/
+	unsigned int vq_ring_size;
+
+	rte_iova_t vq_ring_mem; /**< physical address of vring */
+
+	const struct rte_memzone *mz; /**< memzone backing vring */
+
+	/**
+	 * Head of the free chain in the descriptor table. If
+	 * there are no free descriptors, this will be set to
+	 * VQ_RING_DESC_CHAIN_END.
+	 */
+	uint16_t  vq_desc_head_idx;
+	uint16_t  vq_desc_tail_idx;
+	uint16_t  vq_queue_index;   /**< PCI queue index */
+	uint16_t  *notify_addr;
+	struct vq_desc_extra vq_descx[0];
+};
+
+/* Chain all the descriptors in the ring with an END */
+static inline void
+vring_desc_init(struct vring_desc *dp, uint16_t n)
+{
+	uint16_t i;
+
+	for (i = 0; i < n - 1; i++)
+		dp[i].next = (uint16_t)(i + 1);
+	dp[i].next = VQ_RING_DESC_CHAIN_END;
+}
+
+/**
+ * Tell the backend not to interrupt us.
+ */
+static inline void
+virtqueue_disable_intr(struct virtqueue *vq)
+{
+	vq->vq_ring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
+}
+
+/**
+ * Tell the backend to interrupt us.
+ */
+static inline void
+virtqueue_enable_intr(struct virtqueue *vq)
+{
+	vq->vq_ring.avail->flags &= (~VRING_AVAIL_F_NO_INTERRUPT);
+}
+
+/**
+ *  Dump virtqueue internal structures, for debug purpose only.
+ */
+void virtqueue_dump(struct virtqueue *vq);
+
+static inline int
+virtqueue_full(const struct virtqueue *vq)
+{
+	return vq->vq_free_cnt == 0;
+}
+
+#define VIRTQUEUE_NUSED(vq) ((uint16_t)((vq)->vq_ring.used->idx - (vq)->vq_used_cons_idx))
+
+static inline void
+vq_update_avail_idx(struct virtqueue *vq)
+{
+	virtio_wmb();
+	vq->vq_ring.avail->idx = vq->vq_avail_idx;
+}
+
+static inline void
+vq_update_avail_ring(struct virtqueue *vq, uint16_t desc_idx)
+{
+	uint16_t avail_idx;
+	/*
+	 * Place the head of the descriptor chain into the next slot and make
+	 * it usable to the host. The chain is made available now rather than
+	 * deferring to virtqueue_notify() in the hopes that if the host is
+	 * currently running on another CPU, we can keep it processing the new
+	 * descriptor.
+	 */
+	avail_idx = (uint16_t)(vq->vq_avail_idx & (vq->vq_nentries - 1));
+	if (unlikely(vq->vq_ring.avail->ring[avail_idx] != desc_idx))
+		vq->vq_ring.avail->ring[avail_idx] = desc_idx;
+	vq->vq_avail_idx++;
+}
+
+static inline int
+virtqueue_kick_prepare(struct virtqueue *vq)
+{
+	return !(vq->vq_ring.used->flags & VRING_USED_F_NO_NOTIFY);
+}
+
+static inline void
+virtqueue_notify(struct virtqueue *vq)
+{
+	/*
+	 * Ensure updated avail->idx is visible to host.
+	 * For virtio on IA, the notificaiton is through io port operation
+	 * which is a serialization instruction itself.
+	 */
+	VTPCI_OPS(vq->hw)->notify_queue(vq->hw, vq);
+}
+
+#ifdef RTE_LIBRTE_VIRTIO_DEBUG_DUMP
+#define VIRTQUEUE_DUMP(vq) do { \
+	uint16_t used_idx, nused; \
+	used_idx = (vq)->vq_ring.used->idx; \
+	nused = (uint16_t)(used_idx - (vq)->vq_used_cons_idx); \
+	RTE_LOG(DEBUG, VIRTIO_PCI_CONFIG, \
+	  "VQ: - size=%d; free=%d; used=%d; desc_head_idx=%d;" \
+	  " avail.idx=%d; used_cons_idx=%d; used.idx=%d;" \
+	  " avail.flags=0x%x; used.flags=0x%x\n", \
+	  (vq)->vq_nentries, (vq)->vq_free_cnt, nused, \
+	  (vq)->vq_desc_head_idx, (vq)->vq_ring.avail->idx, \
+	  (vq)->vq_used_cons_idx, (vq)->vq_ring.used->idx, \
+	  (vq)->vq_ring.avail->flags, (vq)->vq_ring.used->flags); \
+} while (0)
+#else
+#define VIRTQUEUE_DUMP(vq) do { } while (0)
+#endif
+
+#endif /* _VIRTQUEUE_H_ */
diff --git a/drivers/librte_vhost/virtio_pci.c b/drivers/librte_vhost/virtio_pci.c
new file mode 100644
index 000000000..f1a23bbbf
--- /dev/null
+++ b/drivers/librte_vhost/virtio_pci.c
@@ -0,0 +1,504 @@
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright(c) 2010-2014 Intel Corporation
+ */
+#include <stdint.h>
+
+/* XXX This file is based on drivers/net/virtio/virtio_pci.c.  It would be
+ * better to create a shared rte_virtio library instead of duplicating this
+ * code.
+ */
+
+#ifdef RTE_EXEC_ENV_LINUXAPP
+ #include <dirent.h>
+ #include <fcntl.h>
+#endif
+
+#include <rte_io.h>
+#include <rte_bus.h>
+
+#include "virtio_pci.h"
+#include "virtqueue.h"
+
+/*
+ * Following macros are derived from linux/pci_regs.h, however,
+ * we can't simply include that header here, as there is no such
+ * file for non-Linux platform.
+ */
+#define PCI_CAPABILITY_LIST	0x34
+#define PCI_CAP_ID_VNDR		0x09
+#define PCI_CAP_ID_MSIX		0x11
+
+/*
+ * The remaining space is defined by each driver as the per-driver
+ * configuration space.
+ */
+#define VIRTIO_PCI_CONFIG(hw) \
+		(((hw)->use_msix == VIRTIO_MSIX_ENABLED) ? 24 : 20)
+
+static inline int
+check_vq_phys_addr_ok(struct virtqueue *vq)
+{
+	/* Virtio PCI device VIRTIO_PCI_QUEUE_PF register is 32bit,
+	 * and only accepts 32 bit page frame number.
+	 * Check if the allocated physical memory exceeds 16TB.
+	 */
+	if ((vq->vq_ring_mem + vq->vq_ring_size - 1) >>
+			(VIRTIO_PCI_QUEUE_ADDR_SHIFT + 32)) {
+		RTE_LOG(ERR, VIRTIO_PCI_CONFIG, "vring address shouldn't be above 16TB!\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+static inline void
+io_write64_twopart(uint64_t val, uint32_t *lo, uint32_t *hi)
+{
+	rte_write32(val & ((1ULL << 32) - 1), lo);
+	rte_write32(val >> 32,		     hi);
+}
+
+static void
+modern_read_dev_config(struct virtio_hw *hw, size_t offset,
+		       void *dst, int length)
+{
+	int i;
+	uint8_t *p;
+	uint8_t old_gen, new_gen;
+
+	do {
+		old_gen = rte_read8(&hw->common_cfg->config_generation);
+
+		p = dst;
+		for (i = 0;  i < length; i++)
+			*p++ = rte_read8((uint8_t *)hw->dev_cfg + offset + i);
+
+		new_gen = rte_read8(&hw->common_cfg->config_generation);
+	} while (old_gen != new_gen);
+}
+
+static void
+modern_write_dev_config(struct virtio_hw *hw, size_t offset,
+			const void *src, int length)
+{
+	int i;
+	const uint8_t *p = src;
+
+	for (i = 0;  i < length; i++)
+		rte_write8((*p++), (((uint8_t *)hw->dev_cfg) + offset + i));
+}
+
+static uint64_t
+modern_get_features(struct virtio_hw *hw)
+{
+	uint32_t features_lo, features_hi;
+
+	rte_write32(0, &hw->common_cfg->device_feature_select);
+	features_lo = rte_read32(&hw->common_cfg->device_feature);
+
+	rte_write32(1, &hw->common_cfg->device_feature_select);
+	features_hi = rte_read32(&hw->common_cfg->device_feature);
+
+	return ((uint64_t)features_hi << 32) | features_lo;
+}
+
+static void
+modern_set_features(struct virtio_hw *hw, uint64_t features)
+{
+	rte_write32(0, &hw->common_cfg->guest_feature_select);
+	rte_write32(features & ((1ULL << 32) - 1),
+		    &hw->common_cfg->guest_feature);
+
+	rte_write32(1, &hw->common_cfg->guest_feature_select);
+	rte_write32(features >> 32,
+		    &hw->common_cfg->guest_feature);
+}
+
+static uint8_t
+modern_get_status(struct virtio_hw *hw)
+{
+	return rte_read8(&hw->common_cfg->device_status);
+}
+
+static void
+modern_set_status(struct virtio_hw *hw, uint8_t status)
+{
+	rte_write8(status, &hw->common_cfg->device_status);
+}
+
+static void
+modern_reset(struct virtio_hw *hw)
+{
+	modern_set_status(hw, VIRTIO_CONFIG_STATUS_RESET);
+	modern_get_status(hw);
+}
+
+static uint8_t
+modern_get_isr(struct virtio_hw *hw)
+{
+	return rte_read8(hw->isr);
+}
+
+static uint16_t
+modern_set_config_irq(struct virtio_hw *hw, uint16_t vec)
+{
+	rte_write16(vec, &hw->common_cfg->msix_config);
+	return rte_read16(&hw->common_cfg->msix_config);
+}
+
+static uint16_t
+modern_set_queue_irq(struct virtio_hw *hw, struct virtqueue *vq, uint16_t vec)
+{
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+	rte_write16(vec, &hw->common_cfg->queue_msix_vector);
+	return rte_read16(&hw->common_cfg->queue_msix_vector);
+}
+
+static uint16_t
+modern_get_queue_num(struct virtio_hw *hw, uint16_t queue_id)
+{
+	rte_write16(queue_id, &hw->common_cfg->queue_select);
+	return rte_read16(&hw->common_cfg->queue_size);
+}
+
+static int
+modern_setup_queue(struct virtio_hw *hw, struct virtqueue *vq)
+{
+	uint64_t desc_addr, avail_addr, used_addr;
+	uint16_t notify_off;
+
+	if (!check_vq_phys_addr_ok(vq))
+		return -1;
+
+	desc_addr = vq->vq_ring_mem;
+	avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc);
+	used_addr = RTE_ALIGN_CEIL(avail_addr + offsetof(struct vring_avail,
+							 ring[vq->vq_nentries]),
+				   VIRTIO_PCI_VRING_ALIGN);
+
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+
+	io_write64_twopart(desc_addr, &hw->common_cfg->queue_desc_lo,
+				      &hw->common_cfg->queue_desc_hi);
+	io_write64_twopart(avail_addr, &hw->common_cfg->queue_avail_lo,
+				       &hw->common_cfg->queue_avail_hi);
+	io_write64_twopart(used_addr, &hw->common_cfg->queue_used_lo,
+				      &hw->common_cfg->queue_used_hi);
+
+	notify_off = rte_read16(&hw->common_cfg->queue_notify_off);
+	vq->notify_addr = (void *)((uint8_t *)hw->notify_base +
+				notify_off * hw->notify_off_multiplier);
+
+	rte_write16(1, &hw->common_cfg->queue_enable);
+
+	RTE_LOG(DEBUG, VIRTIO_PCI_CONFIG, "queue %u addresses:\n", vq->vq_queue_index);
+	RTE_LOG(DEBUG, VIRTIO_PCI_CONFIG, "\t desc_addr: %" PRIx64 "\n", desc_addr);
+	RTE_LOG(DEBUG, VIRTIO_PCI_CONFIG, "\t aval_addr: %" PRIx64 "\n", avail_addr);
+	RTE_LOG(DEBUG, VIRTIO_PCI_CONFIG, "\t used_addr: %" PRIx64 "\n", used_addr);
+	RTE_LOG(DEBUG, VIRTIO_PCI_CONFIG, "\t notify addr: %p (notify offset: %u)\n",
+		vq->notify_addr, notify_off);
+
+	return 0;
+}
+
+static void
+modern_del_queue(struct virtio_hw *hw, struct virtqueue *vq)
+{
+	rte_write16(vq->vq_queue_index, &hw->common_cfg->queue_select);
+
+	io_write64_twopart(0, &hw->common_cfg->queue_desc_lo,
+				  &hw->common_cfg->queue_desc_hi);
+	io_write64_twopart(0, &hw->common_cfg->queue_avail_lo,
+				  &hw->common_cfg->queue_avail_hi);
+	io_write64_twopart(0, &hw->common_cfg->queue_used_lo,
+				  &hw->common_cfg->queue_used_hi);
+
+	rte_write16(0, &hw->common_cfg->queue_enable);
+}
+
+static void
+modern_notify_queue(struct virtio_hw *hw __rte_unused, struct virtqueue *vq)
+{
+	rte_write16(vq->vq_queue_index, vq->notify_addr);
+}
+
+const struct virtio_pci_ops virtio_pci_modern_ops = {
+	.read_dev_cfg	= modern_read_dev_config,
+	.write_dev_cfg	= modern_write_dev_config,
+	.reset		= modern_reset,
+	.get_status	= modern_get_status,
+	.set_status	= modern_set_status,
+	.get_features	= modern_get_features,
+	.set_features	= modern_set_features,
+	.get_isr	= modern_get_isr,
+	.set_config_irq	= modern_set_config_irq,
+	.set_queue_irq  = modern_set_queue_irq,
+	.get_queue_num	= modern_get_queue_num,
+	.setup_queue	= modern_setup_queue,
+	.del_queue	= modern_del_queue,
+	.notify_queue	= modern_notify_queue,
+};
+
+
+void
+virtio_pci_read_dev_config(struct virtio_hw *hw, size_t offset,
+		      void *dst, int length)
+{
+	VTPCI_OPS(hw)->read_dev_cfg(hw, offset, dst, length);
+}
+
+void
+virtio_pci_write_dev_config(struct virtio_hw *hw, size_t offset,
+		       const void *src, int length)
+{
+	VTPCI_OPS(hw)->write_dev_cfg(hw, offset, src, length);
+}
+
+uint64_t
+virtio_pci_negotiate_features(struct virtio_hw *hw, uint64_t host_features)
+{
+	uint64_t features;
+
+	/*
+	 * Limit negotiated features to what the driver, virtqueue, and
+	 * host all support.
+	 */
+	features = host_features & hw->guest_features;
+	VTPCI_OPS(hw)->set_features(hw, features);
+
+	return features;
+}
+
+void
+virtio_pci_reset(struct virtio_hw *hw)
+{
+	VTPCI_OPS(hw)->set_status(hw, VIRTIO_CONFIG_STATUS_RESET);
+	/* flush status write */
+	VTPCI_OPS(hw)->get_status(hw);
+}
+
+void
+virtio_pci_reinit_complete(struct virtio_hw *hw)
+{
+	virtio_pci_set_status(hw, VIRTIO_CONFIG_STATUS_DRIVER_OK);
+}
+
+void
+virtio_pci_set_status(struct virtio_hw *hw, uint8_t status)
+{
+	if (status != VIRTIO_CONFIG_STATUS_RESET)
+		status |= VTPCI_OPS(hw)->get_status(hw);
+
+	VTPCI_OPS(hw)->set_status(hw, status);
+}
+
+uint8_t
+virtio_pci_get_status(struct virtio_hw *hw)
+{
+	return VTPCI_OPS(hw)->get_status(hw);
+}
+
+uint8_t
+virtio_pci_isr(struct virtio_hw *hw)
+{
+	return VTPCI_OPS(hw)->get_isr(hw);
+}
+
+static void *
+get_cfg_addr(struct rte_pci_device *dev, struct virtio_pci_cap *cap)
+{
+	uint8_t  bar    = cap->bar;
+	uint32_t length = cap->length;
+	uint32_t offset = cap->offset;
+	uint8_t *base;
+
+	if (bar >= PCI_MAX_RESOURCE) {
+		RTE_LOG(ERR, VIRTIO_PCI_CONFIG, "invalid bar: %u\n", bar);
+		return NULL;
+	}
+
+	if (offset + length < offset) {
+		RTE_LOG(ERR, VIRTIO_PCI_CONFIG, "offset(%u) + length(%u) overflows\n",
+			offset, length);
+		return NULL;
+	}
+
+	if (offset + length > dev->mem_resource[bar].len) {
+		RTE_LOG(ERR, VIRTIO_PCI_CONFIG,
+			"invalid cap: overflows bar space: %u > %" PRIu64 "\n",
+			offset + length, dev->mem_resource[bar].len);
+		return NULL;
+	}
+
+	base = dev->mem_resource[bar].addr;
+	if (base == NULL) {
+		RTE_LOG(ERR, VIRTIO_PCI_CONFIG, "bar %u base addr is NULL\n", bar);
+		return NULL;
+	}
+
+	return base + offset;
+}
+
+#define PCI_MSIX_ENABLE 0x8000
+
+static int
+virtio_read_caps(struct rte_pci_device *dev, struct virtio_hw *hw)
+{
+	uint8_t pos;
+	struct virtio_pci_cap cap;
+	int ret;
+
+	if (rte_pci_map_device(dev)) {
+		RTE_LOG(DEBUG, VIRTIO_PCI_CONFIG, "failed to map pci device!\n");
+		return -1;
+	}
+
+	ret = rte_pci_read_config(dev, &pos, 1, PCI_CAPABILITY_LIST);
+	if (ret < 0) {
+		RTE_LOG(DEBUG, VIRTIO_PCI_CONFIG, "failed to read pci capability list\n");
+		return -1;
+	}
+
+	while (pos) {
+		ret = rte_pci_read_config(dev, &cap, sizeof(cap), pos);
+		if (ret < 0) {
+			RTE_LOG(ERR, VIRTIO_PCI_CONFIG,
+				"failed to read pci cap at pos: %x\n", pos);
+			break;
+		}
+
+		if (cap.cap_vndr == PCI_CAP_ID_MSIX) {
+			/* Transitional devices would also have this capability,
+			 * that's why we also check if msix is enabled.
+			 * 1st byte is cap ID; 2nd byte is the position of next
+			 * cap; next two bytes are the flags.
+			 */
+			uint16_t flags = ((uint16_t *)&cap)[1];
+
+			if (flags & PCI_MSIX_ENABLE)
+				hw->use_msix = VIRTIO_MSIX_ENABLED;
+			else
+				hw->use_msix = VIRTIO_MSIX_DISABLED;
+		}
+
+		if (cap.cap_vndr != PCI_CAP_ID_VNDR) {
+			RTE_LOG(DEBUG, VIRTIO_PCI_CONFIG,
+				"[%2x] skipping non VNDR cap id: %02x\n",
+				pos, cap.cap_vndr);
+			goto next;
+		}
+
+		RTE_LOG(DEBUG, VIRTIO_PCI_CONFIG,
+			"[%2x] cfg type: %u, bar: %u, offset: %04x, len: %u\n",
+			pos, cap.cfg_type, cap.bar, cap.offset, cap.length);
+
+		switch (cap.cfg_type) {
+		case VIRTIO_PCI_CAP_COMMON_CFG:
+			hw->common_cfg = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_NOTIFY_CFG:
+			rte_pci_read_config(dev, &hw->notify_off_multiplier,
+					4, pos + sizeof(cap));
+			hw->notify_base = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_DEVICE_CFG:
+			hw->dev_cfg = get_cfg_addr(dev, &cap);
+			break;
+		case VIRTIO_PCI_CAP_ISR_CFG:
+			hw->isr = get_cfg_addr(dev, &cap);
+			break;
+		}
+
+next:
+		pos = cap.cap_next;
+	}
+
+	if (hw->common_cfg == NULL || hw->notify_base == NULL ||
+	    hw->dev_cfg == NULL    || hw->isr == NULL) {
+		RTE_LOG(INFO, VIRTIO_PCI_CONFIG, "no modern virtio pci device found.\n");
+		return -1;
+	}
+
+	RTE_LOG(INFO, VIRTIO_PCI_CONFIG, "found modern virtio pci device.\n");
+
+	RTE_LOG(DEBUG, VIRTIO_PCI_CONFIG, "common cfg mapped at: %p\n", hw->common_cfg);
+	RTE_LOG(DEBUG, VIRTIO_PCI_CONFIG, "device cfg mapped at: %p\n", hw->dev_cfg);
+	RTE_LOG(DEBUG, VIRTIO_PCI_CONFIG, "isr cfg mapped at: %p\n", hw->isr);
+	RTE_LOG(DEBUG, VIRTIO_PCI_CONFIG, "notify base: %p, notify off multiplier: %u\n",
+		hw->notify_base, hw->notify_off_multiplier);
+
+	return 0;
+}
+
+struct virtio_hw_internal virtio_pci_hw_internal[8];
+
+/*
+ * Return -1:
+ *   if there is error mapping with VFIO/UIO.
+ *   if port map error when driver type is KDRV_NONE.
+ *   if whitelisted but driver type is KDRV_UNKNOWN.
+ * Return 1 if kernel driver is managing the device.
+ * Return 0 on success.
+ */
+int
+virtio_pci_init(struct rte_pci_device *dev, struct virtio_hw *hw)
+{
+	static size_t internal_id;
+
+	if (internal_id >=
+	    sizeof(virtio_pci_hw_internal) / sizeof(*virtio_pci_hw_internal)) {
+		RTE_LOG(INFO, VIRTIO_PCI_CONFIG, "too many virtio pci devices.\n");
+		return -1;
+	}
+
+	/*
+	 * Try if we can succeed reading virtio pci caps, which exists
+	 * only on modern pci device.
+	 */
+	if (virtio_read_caps(dev, hw) != 0) {
+		RTE_LOG(INFO, VIRTIO_PCI_CONFIG, "legacy virtio pci is not supported.\n");
+		return -1;
+	}
+
+	RTE_LOG(INFO, VIRTIO_PCI_CONFIG, "modern virtio pci detected.\n");
+	hw->internal_id = internal_id++;
+	virtio_pci_hw_internal[hw->internal_id].vtpci_ops =
+		&virtio_pci_modern_ops;
+	return 0;
+}
+
+enum virtio_msix_status
+virtio_pci_msix_detect(struct rte_pci_device *dev)
+{
+	uint8_t pos;
+	struct virtio_pci_cap cap;
+	int ret;
+
+	ret = rte_pci_read_config(dev, &pos, 1, PCI_CAPABILITY_LIST);
+	if (ret < 0) {
+		RTE_LOG(DEBUG, VIRTIO_PCI_CONFIG, "failed to read pci capability list\n");
+		return VIRTIO_MSIX_NONE;
+	}
+
+	while (pos) {
+		ret = rte_pci_read_config(dev, &cap, sizeof(cap), pos);
+		if (ret < 0) {
+			RTE_LOG(ERR, VIRTIO_PCI_CONFIG,
+				"failed to read pci cap at pos: %x\n", pos);
+			break;
+		}
+
+		if (cap.cap_vndr == PCI_CAP_ID_MSIX) {
+			uint16_t flags = ((uint16_t *)&cap)[1];
+
+			if (flags & PCI_MSIX_ENABLE)
+				return VIRTIO_MSIX_ENABLED;
+			else
+				return VIRTIO_MSIX_DISABLED;
+		}
+
+		pos = cap.cap_next;
+	}
+
+	return VIRTIO_MSIX_NONE;
+}
-- 
2.14.3



More information about the dev mailing list