[dpdk-dev] [PATCH v2 5/7] net/virtio_user: add vhost kernel support
Jason Wang
jasowang at redhat.com
Mon Jan 9 05:39:54 CET 2017
On 2016年12月23日 15:14, Jianfeng Tan wrote:
> This patch add support vhost kernel as the backend for virtio_user.
> Three main hook functions are added:
> - vhost_kernel_setup() to open char device, each vq pair needs one
> vhostfd;
> - vhost_kernel_ioctl() to communicate control messages with vhost
> kernel module;
> - vhost_kernel_enable_queue_pair() to open tap device and set it
> as the backend of corresonding vhost fd (that is to say, vq pair).
>
> Signed-off-by: Jianfeng Tan <jianfeng.tan at intel.com>
> ---
> drivers/net/virtio/Makefile | 1 +
> drivers/net/virtio/virtio_user/vhost.h | 2 +
> drivers/net/virtio/virtio_user/vhost_kernel.c | 364 +++++++++++++++++++++++
> drivers/net/virtio/virtio_user/virtio_user_dev.c | 21 +-
> drivers/net/virtio/virtio_user/virtio_user_dev.h | 4 +
> 5 files changed, 388 insertions(+), 4 deletions(-)
> create mode 100644 drivers/net/virtio/virtio_user/vhost_kernel.c
>
> diff --git a/drivers/net/virtio/Makefile b/drivers/net/virtio/Makefile
> index 97972a6..faeffb2 100644
> --- a/drivers/net/virtio/Makefile
> +++ b/drivers/net/virtio/Makefile
> @@ -60,6 +60,7 @@ endif
>
> ifeq ($(CONFIG_RTE_VIRTIO_USER),y)
> SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_user.c
> +SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/vhost_kernel.c
> SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user/virtio_user_dev.c
> SRCS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio_user_ethdev.c
> endif
> diff --git a/drivers/net/virtio/virtio_user/vhost.h b/drivers/net/virtio/virtio_user/vhost.h
> index bd67133..ffab13a 100644
> --- a/drivers/net/virtio/virtio_user/vhost.h
> +++ b/drivers/net/virtio/virtio_user/vhost.h
> @@ -120,4 +120,6 @@ struct virtio_user_backend_ops {
> };
>
> struct virtio_user_backend_ops ops_user;
> +struct virtio_user_backend_ops ops_kernel;
> +
> #endif
> diff --git a/drivers/net/virtio/virtio_user/vhost_kernel.c b/drivers/net/virtio/virtio_user/vhost_kernel.c
> new file mode 100644
> index 0000000..8984c5c
> --- /dev/null
> +++ b/drivers/net/virtio/virtio_user/vhost_kernel.c
> @@ -0,0 +1,364 @@
> +/*-
> + * BSD LICENSE
> + *
> + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
> + * All rights reserved.
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions
> + * are met:
> + *
> + * * Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * * Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in
> + * the documentation and/or other materials provided with the
> + * distribution.
> + * * Neither the name of Intel Corporation nor the names of its
> + * contributors may be used to endorse or promote products derived
> + * from this software without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +#include <unistd.h>
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <fcntl.h>
> +#include <sys/ioctl.h>
> +#include <net/if.h>
> +#include <string.h>
> +#include <errno.h>
> +
> +#include <rte_memory.h>
> +#include <rte_eal_memconfig.h>
> +
> +#include "vhost.h"
> +#include "virtio_user_dev.h"
> +
> +struct vhost_memory_kernel {
> + uint32_t nregions;
> + uint32_t padding;
> + struct vhost_memory_region regions[0];
> +};
> +
> +/* vhost kernel ioctls */
> +#define VHOST_VIRTIO 0xAF
> +#define VHOST_GET_FEATURES _IOR(VHOST_VIRTIO, 0x00, __u64)
> +#define VHOST_SET_FEATURES _IOW(VHOST_VIRTIO, 0x00, __u64)
> +#define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01)
> +#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02)
> +#define VHOST_SET_MEM_TABLE _IOW(VHOST_VIRTIO, 0x03, struct vhost_memory_kernel)
> +#define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64)
> +#define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int)
> +#define VHOST_SET_VRING_NUM _IOW(VHOST_VIRTIO, 0x10, struct vhost_vring_state)
> +#define VHOST_SET_VRING_ADDR _IOW(VHOST_VIRTIO, 0x11, struct vhost_vring_addr)
> +#define VHOST_SET_VRING_BASE _IOW(VHOST_VIRTIO, 0x12, struct vhost_vring_state)
> +#define VHOST_GET_VRING_BASE _IOWR(VHOST_VIRTIO, 0x12, struct vhost_vring_state)
> +#define VHOST_SET_VRING_KICK _IOW(VHOST_VIRTIO, 0x20, struct vhost_vring_file)
> +#define VHOST_SET_VRING_CALL _IOW(VHOST_VIRTIO, 0x21, struct vhost_vring_file)
> +#define VHOST_SET_VRING_ERR _IOW(VHOST_VIRTIO, 0x22, struct vhost_vring_file)
> +#define VHOST_NET_SET_BACKEND _IOW(VHOST_VIRTIO, 0x30, struct vhost_vring_file)
> +
> +/* TUN ioctls */
> +#define TUNSETIFF _IOW('T', 202, int)
> +#define TUNGETFEATURES _IOR('T', 207, unsigned int)
> +#define TUNSETOFFLOAD _IOW('T', 208, unsigned int)
> +#define TUNGETIFF _IOR('T', 210, unsigned int)
> +#define TUNSETSNDBUF _IOW('T', 212, int)
> +#define TUNGETVNETHDRSZ _IOR('T', 215, int)
> +#define TUNSETVNETHDRSZ _IOW('T', 216, int)
> +#define TUNSETQUEUE _IOW('T', 217, int)
> +#define TUNSETVNETLE _IOW('T', 220, int)
> +#define TUNSETVNETBE _IOW('T', 222, int)
> +
> +/* TUNSETIFF ifr flags */
> +#define IFF_TAP 0x0002
> +#define IFF_NO_PI 0x1000
> +#define IFF_ONE_QUEUE 0x2000
> +#define IFF_VNET_HDR 0x4000
> +#define IFF_MULTI_QUEUE 0x0100
> +#define IFF_ATTACH_QUEUE 0x0200
> +#define IFF_DETACH_QUEUE 0x0400
Do we really want to duplicate those things here, which have already been
exposed by uapi?
> +
> +/* Constants */
> +#define TUN_DEF_SNDBUF (1ull << 20)
> +#define PATH_NET_TUN "/dev/net/tun"
> +#define VHOST_KERNEL_MAX_REGIONS 64
Unfortunately, this is not a constant any more since commit c9ce42f72fd0
("vhost: add max_mem_regions module parameter").
> +
> +static uint64_t vhost_req_user_to_kernel[] = {
> + [VHOST_USER_SET_OWNER] = VHOST_SET_OWNER,
> + [VHOST_USER_RESET_OWNER] = VHOST_RESET_OWNER,
> + [VHOST_USER_SET_FEATURES] = VHOST_SET_FEATURES,
> + [VHOST_USER_GET_FEATURES] = VHOST_GET_FEATURES,
> + [VHOST_USER_SET_VRING_CALL] = VHOST_SET_VRING_CALL,
> + [VHOST_USER_SET_VRING_NUM] = VHOST_SET_VRING_NUM,
> + [VHOST_USER_SET_VRING_BASE] = VHOST_SET_VRING_BASE,
> + [VHOST_USER_GET_VRING_BASE] = VHOST_GET_VRING_BASE,
> + [VHOST_USER_SET_VRING_ADDR] = VHOST_SET_VRING_ADDR,
> + [VHOST_USER_SET_VRING_KICK] = VHOST_SET_VRING_KICK,
> + [VHOST_USER_SET_MEM_TABLE] = VHOST_SET_MEM_TABLE,
> +};
> +
> +/* By default, vhost kernel module allows 64 regions, but DPDK allows
> + * 256 segments. As a relief, below function merges those virtually
> + * adjacent memsegs into one region.
> + */
> +static struct vhost_memory_kernel *
> +prepare_vhost_memory_kernel(void)
> +{
> + uint32_t i, j, k = 0;
> + struct rte_memseg *seg;
> + struct vhost_memory_region *mr;
> + struct vhost_memory_kernel *vm;
> +
> + vm = malloc(sizeof(struct vhost_memory_kernel) +
> + VHOST_KERNEL_MAX_REGIONS *
> + sizeof(struct vhost_memory_region));
> +
> + for (i = 0; i < RTE_MAX_MEMSEG; ++i) {
> + seg = &rte_eal_get_configuration()->mem_config->memseg[i];
> + if (!seg->addr)
> + break;
If we're sure the number of regions is less than 64 (or the value of the
module parameter read from /sys), can we avoid the iteration here?
> +
> + int new_region = 1;
> +
> + for (j = 0; j < k; ++j) {
> + mr = &vm->regions[j];
> +
> + if (mr->userspace_addr + mr->memory_size ==
> + (uint64_t)seg->addr) {
> + mr->memory_size += seg->len;
> + new_region = 0;
> + break;
> + }
> +
> + if ((uint64_t)seg->addr + seg->len ==
> + mr->userspace_addr) {
> + mr->guest_phys_addr = (uint64_t)seg->addr;
> + mr->userspace_addr = (uint64_t)seg->addr;
> + mr->memory_size += seg->len;
> + new_region = 0;
> + break;
> + }
> + }
> +
> + if (new_region == 0)
> + continue;
> +
> + mr = &vm->regions[k++];
> + mr->guest_phys_addr = (uint64_t)seg->addr; /* use vaddr here! */
> + mr->userspace_addr = (uint64_t)seg->addr;
> + mr->memory_size = seg->len;
> + mr->mmap_offset = 0;
> +
> + if (k >= VHOST_KERNEL_MAX_REGIONS) {
> + free(vm);
> + return NULL;
> + }
> + }
> +
> + vm->nregions = k;
> + vm->padding = 0;
> + return vm;
> +}
> +
> +static int
> +vhost_kernel_ioctl(struct virtio_user_dev *dev,
> + enum vhost_user_request req,
> + void *arg)
> +{
> + int i, ret = -1;
> + uint64_t req_kernel;
> + struct vhost_memory_kernel *vm = NULL;
> +
> + req_kernel = vhost_req_user_to_kernel[req];
> +
> + if (req_kernel == VHOST_SET_MEM_TABLE) {
> + vm = prepare_vhost_memory_kernel();
> + if (!vm)
> + return -1;
> + arg = (void *)vm;
> + }
> +
> + /* Does not work when VIRTIO_F_IOMMU_PLATFORM now, why? */
I think the reason is that when VIRTIO_F_IOMMU_PLATFORM is negotiated, all
addresses should be IOVAs instead of GPAs.
> + if (req_kernel == VHOST_SET_FEATURES)
> + *(uint64_t *)arg &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
> +
> + for (i = 0; i < VHOST_KERNEL_MAX_QUEUES; ++i) {
> + if (dev->vhostfds[i] < 0)
> + continue;
> +
> + ret = ioctl(dev->vhostfds[i], req_kernel, arg);
> + if (ret < 0)
> + break;
> + }
> +
> + if (vm)
> + free(vm);
> +
> + return ret;
> +}
> +
> +/**
> + * Set up environment to talk with a vhost kernel backend.
> + *
> + * @return
> + * - (-1) if fail to set up;
> + * - (>=0) if successful.
> + */
> +static int
> +vhost_kernel_setup(struct virtio_user_dev *dev)
> +{
> + int vhostfd;
> + uint32_t q;
> +
> + for (q = 0; q < dev->max_queue_pairs; ++q) {
> + vhostfd = open(dev->path, O_RDWR);
> + if (vhostfd < 0) {
> + PMD_DRV_LOG(ERR, "fail to open %s, %s",
> + dev->path, strerror(errno));
> + return -1;
> + }
> +
> + dev->vhostfds[q] = vhostfd;
> + }
> +
> + return 0;
> +}
> +
> +static int
> +vhost_kernel_set_backend(int vhostfd, int tapfd)
> +{
> + struct vhost_vring_file f;
> +
> + f.fd = tapfd;
> + f.index = 0;
> + if (ioctl(vhostfd, VHOST_NET_SET_BACKEND, &f) < 0) {
> + PMD_DRV_LOG(ERR, "VHOST_NET_SET_BACKEND fails, %s",
> + strerror(errno));
> + return -1;
> + }
> +
> + f.index = 1;
> + if (ioctl(vhostfd, VHOST_NET_SET_BACKEND, &f) < 0) {
> + PMD_DRV_LOG(ERR, "VHOST_NET_SET_BACKEND fails, %s",
> + strerror(errno));
> + return -1;
> + }
> +
> + return 0;
> +}
> +
> +static int
> +vhost_kernel_enable_queue_pair(struct virtio_user_dev *dev,
> + uint16_t pair_idx,
> + int enable)
> +{
> + unsigned int features;
> + int sndbuf = TUN_DEF_SNDBUF;
> + struct ifreq ifr;
> + int hdr_size;
> + int vhostfd;
> + int tapfd;
> +
> + vhostfd = dev->vhostfds[pair_idx];
> +
> + if (!enable) {
> + if (dev->tapfds[pair_idx]) {
> + close(dev->tapfds[pair_idx]);
> + dev->tapfds[pair_idx] = -1;
> + }
> + return vhost_kernel_set_backend(vhostfd, -1);
If this is used for things like ethtool -L in the guest, we should use
TUNSETQUEUE here.
> + } else if (dev->tapfds[pair_idx] >= 0) {
> + return 0;
> + }
> +
> + if ((dev->features & (1ULL << VIRTIO_NET_F_MRG_RXBUF)) ||
> + (dev->features & (1ULL << VIRTIO_F_VERSION_1)))
> + hdr_size = sizeof(struct virtio_net_hdr_mrg_rxbuf);
> + else
> + hdr_size = sizeof(struct virtio_net_hdr);
> +
> + /* TODO:
> + * 1. verify we can get/set vnet_hdr_len, tap_probe_vnet_hdr_len
> + * 2. get number of memory regions from vhost module parameter
> + * max_mem_regions, supported in newer version linux kernel
> + */
> + tapfd = open(PATH_NET_TUN, O_RDWR);
> + if (tapfd < 0) {
> + PMD_DRV_LOG(ERR, "fail to open %s: %s",
> + PATH_NET_TUN, strerror(errno));
> + return -1;
> + }
> +
> + /* Construct ifr */
> + memset(&ifr, 0, sizeof(ifr));
> + ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
> +
> + if (ioctl(tapfd, TUNGETFEATURES, &features) == -1) {
> + PMD_DRV_LOG(ERR, "TUNGETFEATURES failed: %s", strerror(errno));
> + goto error;
> + }
> + if (features & IFF_ONE_QUEUE)
> + ifr.ifr_flags |= IFF_ONE_QUEUE;
> +
> + /* Let tap instead of vhost-net handle vnet header, as the latter does
> + * not support offloading. And in this case, we should not set feature
> + * bit VHOST_NET_F_VIRTIO_NET_HDR.
> + */
> + if (features & IFF_VNET_HDR) {
> + ifr.ifr_flags |= IFF_VNET_HDR;
> + } else {
> + PMD_DRV_LOG(ERR, "TAP does not support IFF_VNET_HDR");
> + goto error;
> + }
> +
> + if (dev->ifname)
> + strncpy(ifr.ifr_name, dev->ifname, IFNAMSIZ);
> + else
> + strncpy(ifr.ifr_name, "tap%d", IFNAMSIZ);
> + if (ioctl(tapfd, TUNSETIFF, (void *)&ifr) == -1) {
> + PMD_DRV_LOG(ERR, "TUNSETIFF failed: %s", strerror(errno));
> + goto error;
> + }
This requires CAP_NET_ADMIN, so we should really consider accepting a
pre-created fd here.
> +
> + fcntl(tapfd, F_SETFL, O_NONBLOCK);
> +
> + if (ioctl(tapfd, TUNSETVNETHDRSZ, &hdr_size) < 0) {
> + PMD_DRV_LOG(ERR, "TUNSETVNETHDRSZ failed: %s", strerror(errno));
> + goto error;
> + }
> +
> + if (ioctl(tapfd, TUNSETSNDBUF, &sndbuf) < 0) {
> + PMD_DRV_LOG(ERR, "TUNSETSNDBUF failed: %s", strerror(errno));
> + goto error;
> + }
Let's use INT_MAX as the default here, to survive an evil consumer.
> +
> + if (vhost_kernel_set_backend(vhostfd, tapfd) < 0)
> + goto error;
> +
> + dev->tapfds[pair_idx] = tapfd;
> + if (!dev->ifname)
> + dev->ifname = strdup(ifr.ifr_name);
> +
> + return 0;
> +error:
> + return -1;
> +}
> +
> +struct virtio_user_backend_ops ops_kernel = {
> + .setup = vhost_kernel_setup,
> + .send_request = vhost_kernel_ioctl,
> + .enable_qp = vhost_kernel_enable_queue_pair
> +};
> diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.c b/drivers/net/virtio/virtio_user/virtio_user_dev.c
> index a818c29..c718b85 100644
> --- a/drivers/net/virtio/virtio_user/virtio_user_dev.c
> +++ b/drivers/net/virtio/virtio_user/virtio_user_dev.c
> @@ -219,7 +219,7 @@ is_vhost_user_by_type(const char *path)
> static int
> virtio_user_dev_setup(struct virtio_user_dev *dev)
> {
> - uint32_t i;
> + uint32_t i, q;
>
> dev->vhostfd = -1;
> for (i = 0; i < VIRTIO_MAX_VIRTQUEUES * 2 + 1; ++i) {
> @@ -227,12 +227,18 @@ virtio_user_dev_setup(struct virtio_user_dev *dev)
> dev->callfds[i] = -1;
> }
>
> + for (q = 0; q < VHOST_KERNEL_MAX_QUEUES; ++q) {
> + dev->vhostfds[q] = -1;
> + dev->tapfds[q] = -1;
> + }
> +
> if (is_vhost_user_by_type(dev->path)) {
> dev->ops = &ops_user;
> - return dev->ops->setup(dev);
> + } else {
> + dev->ops = &ops_kernel;
> }
>
> - return -1;
> + return dev->ops->setup(dev);
> }
>
> int
> @@ -284,7 +290,9 @@ virtio_user_dev_init(struct virtio_user_dev *dev, char *path, int queues,
> void
> virtio_user_dev_uninit(struct virtio_user_dev *dev)
> {
> - uint32_t i;
> + uint32_t i, q;
> +
> + dev->ops->send_request(dev, VHOST_USER_RESET_OWNER, NULL);
>
> for (i = 0; i < dev->max_queue_pairs * 2; ++i) {
> close(dev->callfds[i]);
> @@ -292,6 +300,11 @@ virtio_user_dev_uninit(struct virtio_user_dev *dev)
> }
>
> close(dev->vhostfd);
> +
> + for (q = 0; q < VHOST_KERNEL_MAX_QUEUES; ++q) {
> + close(dev->vhostfds[q]);
> + close(dev->tapfds[q]);
> + }
> }
>
> static uint8_t
> diff --git a/drivers/net/virtio/virtio_user/virtio_user_dev.h b/drivers/net/virtio/virtio_user/virtio_user_dev.h
> index 503a496..148b2e6 100644
> --- a/drivers/net/virtio/virtio_user/virtio_user_dev.h
> +++ b/drivers/net/virtio/virtio_user/virtio_user_dev.h
> @@ -44,6 +44,10 @@ struct virtio_user_dev {
> int vhostfd;
>
> /* for vhost_kernel backend */
> + char *ifname;
> +#define VHOST_KERNEL_MAX_QUEUES 8
> + int vhostfds[VHOST_KERNEL_MAX_QUEUES];
> + int tapfds[VHOST_KERNEL_MAX_QUEUES];
>
> /* for both vhost_user and vhost_kernel */
> int callfds[VIRTIO_MAX_VIRTQUEUES * 2 + 1];
More information about the dev
mailing list