[dpdk-dev] [RFC PATCH 7/7] lib/librte_vhost: Add vhost-user implementation

Xie, Huawei huawei.xie at intel.com
Fri Nov 7 22:25:40 CET 2014


How about using a client/server model and a select/poll event-handling mechanism rather than polling?
The polling could cause periodic jitter.

> -----Original Message-----
> From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Tetsuya Mukawa
> Sent: Thursday, November 06, 2014 4:15 AM
> To: dev at dpdk.org
> Cc: nakajima.yoshihiro at lab.ntt.co.jp; masutani.hitoshi at lab.ntt.co.jp
> Subject: [dpdk-dev] [RFC PATCH 7/7] lib/librte_vhost: Add vhost-user
> implementation
> 
> This patch adds vhost-user implementation to librte_vhost.
> To communicate with vhost-user of QEMU, specify VHOST_DRV_USER as
> a vhost_driver_type_t variable in rte_vhost_driver_register().
> 
> Signed-off-by: Tetsuya Mukawa <mukawa at igel.co.jp>
> ---
>  lib/librte_vhost/rte_virtio_net.h  |  19 +-
>  lib/librte_vhost/vhost-net-user.c  | 541
> +++++++++++++++++++++++++++++++++++++
>  lib/librte_vhost/vhost-net.c       |  39 ++-
>  lib/librte_vhost/vhost-net.h       |   7 +
>  lib/librte_vhost/virtio-net-user.c | 410 ++++++++++++++++++++++++++++
>  lib/librte_vhost/virtio-net.c      |  64 ++++-
>  6 files changed, 1073 insertions(+), 7 deletions(-)
>  create mode 100644 lib/librte_vhost/vhost-net-user.c
>  create mode 100644 lib/librte_vhost/virtio-net-user.c
> 
> diff --git a/lib/librte_vhost/rte_virtio_net.h b/lib/librte_vhost/rte_virtio_net.h
> index a9e20ea..af07900 100644
> --- a/lib/librte_vhost/rte_virtio_net.h
> +++ b/lib/librte_vhost/rte_virtio_net.h
> @@ -75,17 +75,32 @@ struct buf_vector {
>   */
>  typedef enum {
>  	VHOST_DRV_CUSE, /* cuse driver */
> +	VHOST_DRV_USER, /* vhost-user driver */
>  	VHOST_DRV_NUM	/* the number of vhost driver types */
>  } vhost_driver_type_t;
> 
> +
> +/**
> + * Structure contains vhost-user session specific information
> + */
> +struct vhost_user_session {
> +	int		fh;		/**< session identifier */
> +	pthread_t	tid;		/**< thread id of session handler */
> +	int		socketfd;	/**< fd of socket */
> +	int		interval;	/**< reconnection interval of session
> */
> +};
> +
>  /**
>   * Structure contains information relating vhost driver.
>   */
>  struct vhost_driver {
>  	vhost_driver_type_t	type;		/**< driver type. */
>  	const char		*dev_name;	/**< accessing device name. */
> +	void			*priv;		/**< private data. */
>  	union {
>  		struct fuse_session *cuse_session;	/**< fuse session. */
> +		struct vhost_user_session *user_session;
> +						/**< vhost-user session. */
>  	};
>  };
> 
> @@ -199,9 +214,11 @@ struct vhost_driver *rte_vhost_driver_register(
>  		const char *dev_name, vhost_driver_type_t type);
> 
>  /* Register callbacks. */
> -int rte_vhost_driver_callback_register(struct virtio_net_device_ops const *
> const);
> +int rte_vhost_driver_callback_register(struct vhost_driver *drv,
> +			struct virtio_net_device_ops const * const, void *priv);
>  /* Start vhost driver session blocking loop. */
>  int rte_vhost_driver_session_start(struct vhost_driver *drv);
> +void rte_vhost_driver_session_stop(struct vhost_driver *drv);
> 
>  /**
>   * This function adds buffers to the virtio devices RX virtqueue. Buffers can
> diff --git a/lib/librte_vhost/vhost-net-user.c b/lib/librte_vhost/vhost-net-user.c
> new file mode 100644
> index 0000000..434f20f
> --- /dev/null
> +++ b/lib/librte_vhost/vhost-net-user.c
> @@ -0,0 +1,541 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright (c) 2014 IGEL Co.,Ltd.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of IGEL nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
> ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> DAMAGE.
> + */
> +
> +#include <sys/types.h>
> +#include <sys/socket.h>
> +#include <linux/un.h>
> +
> +#define VHOST_USER_MAX_DEVICE		(32)
> +#define VHOST_USER_MAX_FD_NUM		(3)
> +
> +/* start id of vhost user device */
> +rte_atomic16_t vhost_user_device_id;
> +
> +static struct vhost_net_device_ops const *ops;
> +
> +typedef enum VhostUserRequest {
> +	VHOST_USER_NONE = 0,
> +	VHOST_USER_GET_FEATURES = 1,
> +	VHOST_USER_SET_FEATURES = 2,
> +	VHOST_USER_SET_OWNER = 3,
> +	VHOST_USER_RESET_OWNER = 4,
> +	VHOST_USER_SET_MEM_TABLE = 5,
> +	VHOST_USER_SET_LOG_BASE = 6,
> +	VHOST_USER_SET_LOG_FD = 7,
> +	VHOST_USER_SET_VRING_NUM = 8,
> +	VHOST_USER_SET_VRING_ADDR = 9,
> +	VHOST_USER_SET_VRING_BASE = 10,
> +	VHOST_USER_GET_VRING_BASE = 11,
> +	VHOST_USER_SET_VRING_KICK = 12,
> +	VHOST_USER_SET_VRING_CALL = 13,
> +	VHOST_USER_SET_VRING_ERR = 14,
> +	VHOST_USER_MAX
> +} VhostUserRequest;
> +
> +#define VHOST_MEMORY_MAX_NREGIONS	8
> +
> +typedef struct VhostUserMemoryRegion {
> +	uint64_t guest_phys_addr;
> +	uint64_t memory_size;
> +	uint64_t userspace_addr;
> +	uint64_t mmap_offset;
> +} VhostUserMemoryRegion;
> +
> +typedef struct VhostUserMemory {
> +	uint32_t nregions;
> +	uint32_t padding;
> +	VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
> +} VhostUserMemory;
> +
> +typedef struct VhostUserMsg {
> +	VhostUserRequest request;
> +
> +#define VHOST_USER_VERSION_MASK		(0x3)
> +#define VHOST_USER_REPLY_MASK		(0x1<<2)
> +	uint32_t flags;
> +	uint32_t size; /* the following payload size */
> +	union {
> +#define VHOST_USER_VRING_IDX_MASK	(0xff)
> +#define VHOST_USER_VRING_NOFD_MASK	(0x1<<8)
> +		uint64_t u64;
> +		struct vhost_vring_state state;
> +		struct vhost_vring_addr addr;
> +		VhostUserMemory memory;
> +	};
> +} __attribute__((packed)) VhostUserMsg;
> +
> +static VhostUserMsg m __attribute__ ((unused));
> +#define VHOST_USER_HDR_SIZE	(sizeof(m.request) \
> +		+ sizeof(m.flags) + sizeof(m.size))
> +
> +/* The version of the protocol we support */
> +#define VHOST_USER_VERSION		(0x1)
> +
> +static unsigned long int ioctl_to_vhost_user_request[VHOST_USER_MAX] = {
> +	-1,			/* VHOST_USER_NONE */
> +	VHOST_GET_FEATURES,	/* VHOST_USER_GET_FEATURES */
> +	VHOST_SET_FEATURES,	/* VHOST_USER_SET_FEATURES */
> +	VHOST_SET_OWNER,	/* VHOST_USER_SET_OWNER */
> +	VHOST_RESET_OWNER,	/* VHOST_USER_RESET_OWNER */
> +	VHOST_SET_MEM_TABLE,	/* VHOST_USER_SET_MEM_TABLE */
> +	VHOST_SET_LOG_BASE,	/* VHOST_USER_SET_LOG_BASE */
> +	VHOST_SET_LOG_FD,	/* VHOST_USER_SET_LOG_FD */
> +	VHOST_SET_VRING_NUM,	/* VHOST_USER_SET_VRING_NUM */
> +	VHOST_SET_VRING_ADDR,	/* VHOST_USER_SET_VRING_ADDR */
> +	VHOST_SET_VRING_BASE,	/* VHOST_USER_SET_VRING_BASE */
> +	VHOST_GET_VRING_BASE,	/* VHOST_USER_GET_VRING_BASE */
> +	VHOST_SET_VRING_KICK,	/* VHOST_USER_SET_VRING_KICK */
> +	VHOST_SET_VRING_CALL,	/* VHOST_USER_SET_VRING_CALL */
> +	VHOST_SET_VRING_ERR	/* VHOST_USER_SET_VRING_ERR */
> +};
> +
> +/**
> + * Returns vhost_device_ctx from given vhost_driver. The index is populated later
> when
> + * the device is added to the device linked list.
> + */
> +static struct vhost_device_ctx
> +vhost_driver_to_vhost_ctx(struct vhost_driver *drv)
> +{
> +	struct vhost_device_ctx ctx;
> +	int device_id = drv->user_session->fh;
> +
> +	ctx.type = VHOST_DRV_USER;
> +	ctx.fh = device_id;
> +	ctx.user.drv = drv;
> +
> +	return ctx;
> +}
> +
> +/**
> + * When the device is created in QEMU it gets initialised here and added to the
> device linked list.
> + */
> +static int
> +vhost_user_open(struct vhost_driver *drv)
> +{
> +	struct vhost_device_ctx ctx = vhost_driver_to_vhost_ctx(drv);
> +
> +	int ret;
> +
> +	ret = ops->new_device(ctx);
> +	if (ret == -1)
> +		return -1;
> +
> +	RTE_LOG(INFO, VHOST_CONFIG, "(%"PRIu64") Device configuration
> started\n", ctx.fh);
> +
> +	return 0;
> +}
> +
> +/**
> + * When QEMU is shutdown or killed the device gets released.
> + */
> +static void
> +vhost_user_release(struct vhost_driver *drv)
> +{
> +	struct vhost_device_ctx ctx = vhost_driver_to_vhost_ctx(drv);
> +
> +	ops->destroy_device(ctx);
> +	RTE_LOG(INFO, VHOST_CONFIG, "(%"PRIu64") Device released\n",
> ctx.fh);
> +}
> +
> +/**
> + * Send data to vhost-user device on a QEMU.
> + */
> +static int
> +vhost_user_write(struct vhost_driver *drv, VhostUserMsg *msg,
> +		int *fds, size_t fd_num)
> +{
> +	int fd, len;
> +	size_t fd_size = fd_num * sizeof(int);
> +	char control[CMSG_SPACE(fd_size)];
> +	struct msghdr msg_header;
> +	struct iovec iov[1];
> +	struct cmsghdr *cmsg_header;
> +	struct vhost_device_ctx ctx = vhost_driver_to_vhost_ctx(drv);
> +
> +	if ((drv == NULL) || (msg == NULL))
> +		return -EINVAL;
> +
> +	fd = drv->user_session->socketfd;
> +
> +	memset(&msg_header, 0, sizeof(msg_header));
> +	memset(control, 0, sizeof(control));
> +
> +	/* set the payload */
> +	iov[0].iov_base = (void *)msg;
> +	iov[0].iov_len = VHOST_USER_HDR_SIZE + msg->size;
> +
> +	msg_header.msg_iov = iov;
> +	msg_header.msg_iovlen = 1;
> +
> +	if (fd_num) {
> +		msg_header.msg_control = control;
> +		msg_header.msg_controllen = sizeof(control);
> +		cmsg_header = CMSG_FIRSTHDR(&msg_header);
> +		cmsg_header->cmsg_len = CMSG_LEN(fd_size);
> +		cmsg_header->cmsg_level = SOL_SOCKET;
> +		cmsg_header->cmsg_type = SCM_RIGHTS;
> +		memcpy(CMSG_DATA(cmsg_header), fds, fd_size);
> +	} else {
> +		msg_header.msg_control = 0;
> +		msg_header.msg_controllen = 0;
> +	}
> +
> +	do {
> +		len = sendmsg(fd, &msg_header, 0);
> +	} while (len < 0 && errno == EINTR);
> +
> +	if (len < 0)
> +		goto error;
> +
> +	return 0;
> +
> +error:
> +	RTE_LOG(INFO, VHOST_CONFIG, "(%"PRIu64") Device cannot send
> message\n", ctx.fh);
> +	return -EFAULT;
> +}
> +
> +/**
> + * Receive data from vhost-user device on a QEMU.
> + */
> +static int
> +vhost_user_read(struct vhost_driver *drv, VhostUserMsg *msg,
> +		int *fds, size_t *fd_num)
> +{
> +	int fd, len;
> +	size_t fd_size = (*fd_num) * sizeof(int);
> +	char control[CMSG_SPACE(fd_size)];
> +	struct msghdr msg_header;
> +	struct iovec iov[1];
> +	struct cmsghdr *cmsg_header;
> +	struct vhost_device_ctx ctx = vhost_driver_to_vhost_ctx(drv);
> +
> +	if ((drv == NULL) || (msg == NULL))
> +		return -EINVAL;
> +
> +	fd = drv->user_session->socketfd;
> +
> +	memset(&msg_header, 0, sizeof(msg_header));
> +	memset(control, 0, sizeof(control));
> +	*fd_num = 0;
> +
> +	/* set the payload */
> +	iov[0].iov_base = (void *)msg;
> +	iov[0].iov_len = VHOST_USER_HDR_SIZE;
> +
> +	msg_header.msg_iov = iov;
> +	msg_header.msg_iovlen = 1;
> +	msg_header.msg_control = control;
> +	msg_header.msg_controllen = sizeof(control);
> +
> +	if ((len = recvmsg(fd, &msg_header, 0)) <= 0)
> +		goto error;
> +
> +	if (msg_header.msg_flags & (MSG_TRUNC | MSG_CTRUNC))
> +		goto error;
> +
> +	cmsg_header = CMSG_FIRSTHDR(&msg_header);
> +	if (cmsg_header && cmsg_header->cmsg_len > 0 &&
> +			cmsg_header->cmsg_level == SOL_SOCKET &&
> +			cmsg_header->cmsg_type == SCM_RIGHTS) {
> +		if (fd_size >= cmsg_header->cmsg_len - CMSG_LEN(0)) {
> +			fd_size = cmsg_header->cmsg_len - CMSG_LEN(0);
> +			memcpy(fds, CMSG_DATA(cmsg_header), fd_size);
> +			*fd_num = fd_size / sizeof(int);
> +		}
> +	}
> +
> +	if (read(fd, ((char *)msg) + len, msg->size) < 0)
> +		goto error;
> +
> +	return 0;
> +
> +error:
> +	RTE_LOG(INFO, VHOST_CONFIG, "(%"PRIu64") Device cannot receive
> message\n", ctx.fh);
> +	return -EFAULT;
> +}
> +
> +/*
> + * Boilerplate code for vhost-user IOCTL
> + * Implicit arguments: ctx, req, result.
> + */
> +#define VHOST_USER_IOCTL(func) do {	\
> +	result = (func)(ctx);		\
> +} while (0)
> +
> +/*
> + * Boilerplate code for vhost-user Read IOCTL
> + * Implicit arguments: ctx, req, result, in_bufsz, in_buf.
> + */
> +#define VHOST_USER_IOCTL_R(type, var, func) do {\
> +	result = func(ctx, &(var));		\
> +} while (0)
> +
> +/*
> + * Boilerplate code for vhost-user Write IOCTL
> + * Implicit arguments: ctx, req, result, out_bufsz.
> + */
> +#define	VHOST_USER_IOCTL_W(type, var, func) do {\
> +	result = (func)(ctx, &(var));		\
> +	msg->flags |= VHOST_USER_REPLY_MASK;	\
> +	msg->size = sizeof(type);		\
> +	vhost_user_write(drv, msg, NULL, 0);	\
> +} while (0)
> +
> +/*
> + * Boilerplate code for vhost-user Read/Write IOCTL
> + * Implicit arguments: ctx, req, result, in_bufsz, in_buf.
> + */
> +#define VHOST_USER_IOCTL_RW(type1, var1, type2, var2, func) do {\
> +	result = (func)(ctx, (var1), &(var2));			\
> +	msg->flags |= VHOST_USER_REPLY_MASK;			\
> +	msg->size = sizeof(type2);				\
> +	vhost_user_write(drv, msg, NULL, 0);			\
> +} while (0)
> +
> +/**
> + * The IOCTLs are handled using unix domain socket in userspace.
> + */
> +	static int
> +vhost_user_ioctl(struct vhost_driver *drv, VhostUserMsg *msg,
> +		int *fds, int fd_num)
> +{
> +	struct vhost_device_ctx ctx = vhost_driver_to_vhost_ctx(drv);
> +	struct vhost_vring_file file;
> +	int result = 0;
> +
> +	switch (ioctl_to_vhost_user_request[msg->request]) {
> +	case VHOST_GET_FEATURES:
> +		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") IOCTL:
> VHOST_GET_FEATURES\n", ctx.fh);
> +		VHOST_USER_IOCTL_W(uint64_t, msg->u64, ops->get_features);
> +		break;
> +
> +	case VHOST_SET_FEATURES:
> +		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") IOCTL:
> VHOST_SET_FEATURES\n", ctx.fh);
> +		VHOST_USER_IOCTL_R(uint64_t, msg->u64, ops->set_features);
> +		break;
> +
> +	case VHOST_RESET_OWNER:
> +		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") IOCTL:
> VHOST_RESET_OWNER\n", ctx.fh);
> +		VHOST_USER_IOCTL(ops->reset_owner);
> +		break;
> +
> +	case VHOST_SET_OWNER:
> +		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") IOCTL:
> VHOST_SET_OWNER\n", ctx.fh);
> +		VHOST_USER_IOCTL(ops->set_owner);
> +		break;
> +
> +	case VHOST_SET_MEM_TABLE:
> +		/*TODO fix race condition.*/
> +		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") IOCTL:
> VHOST_SET_MEM_TABLE\n", ctx.fh);
> +		/* all fds should be the same, because physical memory consists of
> a single file */
> +		ctx.user.fds = fds;
> +		ctx.user.fd_num = fd_num;
> +		result = ops->set_mem_table(ctx, &msg->memory, msg-
> >memory.nregions);
> +		break;
> +
> +	case VHOST_SET_VRING_NUM:
> +		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") IOCTL:
> VHOST_SET_VRING_NUM\n", ctx.fh);
> +		VHOST_USER_IOCTL_R(struct vhost_vring_state, msg->state,
> ops->set_vring_num);
> +		break;
> +
> +	case VHOST_SET_VRING_BASE:
> +		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") IOCTL:
> VHOST_SET_VRING_BASE\n", ctx.fh);
> +		VHOST_USER_IOCTL_R(struct vhost_vring_state, msg->state,
> ops->set_vring_base);
> +		break;
> +
> +	case VHOST_GET_VRING_BASE:
> +		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") IOCTL:
> VHOST_GET_VRING_BASE\n", ctx.fh);
> +		VHOST_USER_IOCTL_RW(uint32_t, msg->addr.index, struct
> vhost_vring_state, msg->state, ops->get_vring_base);
> +		break;
> +
> +	case VHOST_SET_VRING_ADDR:
> +		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") IOCTL:
> VHOST_SET_VRING_ADDR\n", ctx.fh);
> +		VHOST_USER_IOCTL_R(struct vhost_vring_addr, msg->addr,
> ops->set_vring_addr);
> +		break;
> +
> +	case VHOST_SET_VRING_KICK:
> +		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") IOCTL:
> VHOST_SET_VRING_KICK\n", ctx.fh);
> +		ctx.user.fds = fds;
> +		ctx.user.fd_num = fd_num;
> +		file.index = msg->u64;
> +		VHOST_USER_IOCTL_R(struct vhost_vring_file, file, ops-
> >set_vring_kick);
> +		break;
> +
> +	case VHOST_SET_VRING_CALL:
> +		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") IOCTL:
> VHOST_SET_VRING_CALL\n", ctx.fh);
> +		ctx.user.fds = fds;
> +		ctx.user.fd_num = fd_num;
> +		file.index = msg->u64;
> +		VHOST_USER_IOCTL_R(struct vhost_vring_file, file, ops-
> >set_vring_call);
> +		break;
> +
> +	case VHOST_NET_SET_BACKEND:
> +	default:
> +		RTE_LOG(ERR, VHOST_CONFIG, "(%"PRIu64") IOCTL: DOESN
> NOT EXIST\n", ctx.fh);
> +		result = -1;
> +	}
> +
> +	if (result < 0)
> +		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") IOCTL: FAIL\n",
> ctx.fh);
> +	else
> +		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") IOCTL: SUCCESS\n",
> ctx.fh);
> +
> +	return result;
> +}
> +
> +/**
> + * vhost-user specific registration.
> + */
> +static int
> +vhost_user_driver_register(struct vhost_driver *drv)
> +{
> +	if ((drv == NULL) || (drv->dev_name == NULL) ||
> +			(strlen(drv->dev_name) > UNIX_PATH_MAX - 1))
> +		return -1;
> +
> +	ops = get_virtio_net_callbacks(drv->type);
> +
> +	drv->user_session = rte_malloc(NULL, sizeof(struct vhost_user_session),
> CACHE_LINE_SIZE);
> +	if (drv->user_session == NULL)
> +		return -1;
> +
> +	drv->user_session->fh =
> +		rte_atomic16_add_return(&vhost_user_device_id, 1) - 1; /* fh
> of first device is zero */
> +	drv->user_session->interval = 1;
> +
> +	return 0;
> +}
> +
> +/**
> + * When vhost-user driver starts, the session handler communicates with vhost-
> user
> + * device on a QEMU using a unix domain socket.
> + */
> +static void *
> +vhost_user_session_handler(void *data)
> +{
> +	struct vhost_driver *drv = data;
> +	int ret;
> +	struct sockaddr_un caddr;
> +	VhostUserMsg msg;
> +	int fds[VHOST_USER_MAX_FD_NUM];
> +	int socketfd;
> +	int interval;
> +	size_t fd_num;
> +
> +	if ((drv == NULL) || (drv->dev_name == NULL))
> +		return NULL;
> +
> +	bzero(&caddr, sizeof(caddr));
> +	caddr.sun_family = AF_LOCAL;
> +	strncpy((char *)&caddr.sun_path, drv->dev_name, strlen(drv-
> >dev_name));
> +
> +reconnect:
> +	drv->user_session->socketfd = socket(AF_UNIX, SOCK_STREAM, 0);
> +	if (drv->user_session->socketfd < 0)
> +		return NULL;
> +
> +	socketfd = drv->user_session->socketfd;
> +	interval = drv->user_session->interval;
> +	while (1) {
> +		ret = connect(socketfd, (struct sockaddr *)&caddr, sizeof(caddr));
> +		if (ret == 0)
> +			break; /* success */
> +		sleep(interval);
> +	}
> +
> +	ret = vhost_user_open(drv);
> +	if (ret != 0) {
> +		RTE_LOG(ERR, VHOST_CONFIG, "(Socket %s) open failuer\n",
> drv->dev_name);
> +		return NULL;
> +	}
> +
> +	for (;;) {
> +		fd_num = VHOST_USER_MAX_FD_NUM;
> +		bzero(&msg, sizeof(VhostUserMsg));
> +		ret = vhost_user_read(drv, &msg, fds, &fd_num);
> +		if (ret != 0) {
> +			RTE_LOG(ERR, VHOST_CONFIG, "(Socket %s) read
> failuer\n", drv->dev_name);
> +			vhost_user_release(drv);
> +			goto reconnect;
> +		}
> +
> +		ret = vhost_user_ioctl(drv, &msg, fds, fd_num);
> +		if (ret != 0) {
> +			RTE_LOG(ERR, VHOST_CONFIG, "(Socket %s) request
> failuer\n", drv->dev_name);
> +			vhost_user_release(drv);
> +			goto reconnect;
> +		}
> +	}
> +
> +	return NULL;
> +}
> +
> +/**
> + * Create session handler
> + */
> +static int
> +vhost_user_driver_start(struct vhost_driver *drv)
> +{
> +	if (pthread_create(&drv->user_session->tid, NULL,
> vhost_user_session_handler, drv)) {
> +		RTE_LOG(ERR, VHOST_CONFIG,
> +				"(Socket %s) starting event handler failuer\n",
> drv->dev_name);
> +		return -1;
> +	}
> +
> +	/* TODO: The event handler thread may need to run on a core user
> specified. */
> +
> +	return 0;
> +}
> +
> +/**
> + * Destroy session handler
> + */
> +static void
> +vhost_user_driver_stop(struct vhost_driver *drv)
> +{
> +	pthread_t *tid = &drv->user_session->tid;
> +
> +	if (pthread_create(tid, NULL, vhost_user_session_handler, drv)) {
> +		RTE_LOG(ERR, VHOST_CONFIG,
> +				"(Socket %s) starting event handler failuer\n",
> drv->dev_name);
> +		return;
> +	}
> +
> +	/* stop event thread and wait until connection is closed */
> +	if (*tid) {
> +		pthread_cancel(*tid);
> +		pthread_join(*tid, NULL);
> +	}
> +
> +	vhost_user_release(drv);
> +}
> diff --git a/lib/librte_vhost/vhost-net.c b/lib/librte_vhost/vhost-net.c
> index b0de5fd..10f41e9 100644
> --- a/lib/librte_vhost/vhost-net.c
> +++ b/lib/librte_vhost/vhost-net.c
> @@ -42,6 +42,11 @@
>   */
>  #include "vhost-net-cdev.c"
> 
> +/*
> + * Include vhost-user dependent functions and definitions
> + */
> +#include "vhost-net-user.c"
> +
>  /**
>   * This function abstracts cuse and vhost-user driver registration.
>   */
> @@ -65,10 +70,17 @@ rte_vhost_driver_register(const char *dev_name,
> vhost_driver_type_t type)
>  		if (ret != 0)
>  			goto err;
>  		break;
> +	case VHOST_DRV_USER:
> +		ret = vhost_user_driver_register(drv);
> +		break;
>  	default:
> +		ret = -EINVAL;
>  		break;
>  	}
> 
> +	if (ret != 0)
> +		goto err;
> +
>  	return drv;
>  err:
>  	free(drv);
> @@ -81,17 +93,40 @@ err:
>  int
>  rte_vhost_driver_session_start(struct vhost_driver *drv)
>  {
> +	int ret;
> +
>  	if (drv == NULL)
>  		return -ENODEV;
> 
>  	switch (drv->type) {
>  	case VHOST_DRV_CUSE:
> -		vhost_cuse_driver_session_start(drv);
> +		ret = vhost_cuse_driver_session_start(drv);
> +		break;
> +	case VHOST_DRV_USER:
> +		ret = vhost_user_driver_start(drv);
>  		break;
>  	default:
> +		ret = -EINVAL;
>  		break;
>  	}
> 
> -	return 0;
> +	return ret;
>  }
> 
> +/**
> + * The vhost session is closed, only allow for vhost-user.
> + */
> +void
> +rte_vhost_driver_session_stop(struct vhost_driver *drv)
> +{
> +	if (drv == NULL)
> +		return;
> +
> +	switch (drv->type) {
> +	case VHOST_DRV_USER:
> +		vhost_user_driver_stop(drv);
> +		break;
> +	default:
> +		break;
> +	}
> +}
> diff --git a/lib/librte_vhost/vhost-net.h b/lib/librte_vhost/vhost-net.h
> index ef04832..0e36ba0 100644
> --- a/lib/librte_vhost/vhost-net.h
> +++ b/lib/librte_vhost/vhost-net.h
> @@ -76,6 +76,12 @@ struct vhost_device_cuse_ctx {
>  	pid_t   pid;	/* PID of process calling the IOCTL. */
>  };
> 
> +struct vhost_device_user_ctx {
> +	int			*fds;
> +	int			fd_num;
> +	struct vhost_driver	*drv;
> +};
> +
>  /*
>   * Structure used to identify device context.
>   */
> @@ -83,6 +89,7 @@ struct vhost_device_ctx {
>  	vhost_driver_type_t	type;	/* driver type. */
>  	uint64_t		fh;	/* Populated with fi->fh to track the
> device index. */
>  	union {
> +		struct vhost_device_user_ctx user;
>  		struct vhost_device_cuse_ctx cdev;
>  	};
>  };
> diff --git a/lib/librte_vhost/virtio-net-user.c b/lib/librte_vhost/virtio-net-user.c
> new file mode 100644
> index 0000000..1e78f98
> --- /dev/null
> +++ b/lib/librte_vhost/virtio-net-user.c
> @@ -0,0 +1,410 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright (c) 2014 IGEL Co.,Ltd.
> + *   All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of IGEL nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
> CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
> FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
> COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
> INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
> NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
> OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
> ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
> THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
> DAMAGE.
> + */
> +
> +#include <sys/types.h>
> +#include <sys/stat.h>
> +#include <fcntl.h>
> +#include <unistd.h>
> +#include <sys/mman.h>
> +#include <stdlib.h>
> +
> +/* Functions defined in virtio_net.c */
> +static void init_device(struct vhost_device_ctx ctx, struct virtio_net *dev);
> +static void cleanup_device(struct virtio_net *dev);
> +static void free_device(struct virtio_net_config_ll *ll_dev);
> +static int new_device(struct vhost_device_ctx ctx);
> +static void destroy_device(struct vhost_device_ctx ctx);
> +static int set_owner(struct vhost_device_ctx ctx);
> +static int reset_owner(struct vhost_device_ctx ctx);
> +static int get_features(struct vhost_device_ctx ctx, uint64_t *pu);
> +static int set_features(struct vhost_device_ctx ctx, uint64_t *pu);
> +static int set_vring_num(struct vhost_device_ctx ctx, struct vhost_vring_state
> *state);
> +static int set_vring_addr(struct vhost_device_ctx ctx, struct vhost_vring_addr
> *addr);
> +static int set_vring_base(struct vhost_device_ctx ctx, struct vhost_vring_state
> *state);
> +static int set_backend(struct vhost_device_ctx ctx, struct vhost_vring_file *file);
> +
> +/* Root address of the linked list in the configuration core. */
> +static struct virtio_net_config_ll *user_ll_root;
> +
> +/**
> + * Retrieves an entry from the devices configuration linked list.
> + */
> +static struct virtio_net_config_ll *
> +user_get_config_ll_entry(struct vhost_device_ctx ctx)
> +{
> +	struct virtio_net_config_ll *ll_dev = user_ll_root;
> +
> +	/* Loop through linked list until the device_fh is found. */
> +	while (ll_dev != NULL) {
> +		if (ll_dev->dev.device_fh == ctx.fh)
> +			return ll_dev;
> +		ll_dev = ll_dev->next;
> +	}
> +
> +	return NULL;
> +}
> +
> +/**
> + * Searches the configuration core linked list and retrieves the device if it exists.
> + */
> +static struct virtio_net *
> +user_get_device(struct vhost_device_ctx ctx)
> +{
> +	struct virtio_net_config_ll *ll_dev;
> +
> +	ll_dev = user_get_config_ll_entry(ctx);
> +
> +	/* If a matching entry is found in the linked list, return the device in that
> entry. */
> +	if (ll_dev)
> +		return &ll_dev->dev;
> +
> +	RTE_LOG(ERR, VHOST_CONFIG, "(%"PRIu64") Device not found in linked
> list.\n", ctx.fh);
> +	return NULL;
> +}
> +
> +/**
> + * Add entry containing a device to the device configuration linked list.
> + */
> +static void
> +user_add_config_ll_entry(struct virtio_net_config_ll *new_ll_dev)
> +{
> +	struct virtio_net_config_ll *ll_dev = user_ll_root;
> +
> +	/* If ll_dev == NULL then this is the first device so go to else */
> +	if (ll_dev) {
> +		/* If the 1st device_fh != 0 then we insert our device here. */
> +		if (ll_dev->dev.device_fh != 0)	{
> +			new_ll_dev->dev.device_fh = 0;
> +			new_ll_dev->next = ll_dev;
> +			user_ll_root = new_ll_dev;
> +		} else {
> +			/* Increment through the ll until we find an unused
> device_fh. Insert the device at that entry*/
> +			while ((ll_dev->next != NULL) && (ll_dev->dev.device_fh
> == (ll_dev->next->dev.device_fh - 1)))
> +				ll_dev = ll_dev->next;
> +
> +			new_ll_dev->dev.device_fh = ll_dev->dev.device_fh + 1;
> +			new_ll_dev->next = ll_dev->next;
> +			ll_dev->next = new_ll_dev;
> +		}
> +	} else {
> +		user_ll_root = new_ll_dev;
> +		user_ll_root->dev.device_fh = 0;
> +	}
> +
> +}
> +
> +/**
> + * Remove an entry from the device configuration linked list.
> + */
> +static struct virtio_net_config_ll *
> +user_rm_config_ll_entry(struct virtio_net_config_ll *ll_dev, struct
> virtio_net_config_ll *ll_dev_last)
> +{
> +	/* First remove the device and then clean it up. */
> +	if (ll_dev == user_ll_root) {
> +		user_ll_root = ll_dev->next;
> +		cleanup_device(&ll_dev->dev);
> +		free_device(ll_dev);
> +		return user_ll_root;
> +	} else {
> +		if (likely(ll_dev_last != NULL)) {
> +			ll_dev_last->next = ll_dev->next;
> +			cleanup_device(&ll_dev->dev);
> +			free_device(ll_dev);
> +			return ll_dev_last->next;
> +		} else {
> +			cleanup_device(&ll_dev->dev);
> +			free_device(ll_dev);
> +			RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from
> config_ll failed\n");
> +			return NULL;
> +		}
> +	}
> +}
> +
> +/**
> + * Returns the root entry of linked list
> + */
> +static struct virtio_net_config_ll *
> +user_get_config_ll_root(void)
> +{
> +	return user_ll_root;
> +}
> +
> +/**
> + * vhost-user specific device initialization.
> + */
> +static void
> +user_init_device(struct vhost_device_ctx ctx, struct virtio_net *dev)
> +{
> +	dev->priv = ctx.user.drv->priv;
> +}
> +
> +/**
> + * Locate the file containing QEMU's memory space and map it to our address
> space.
> + */
> +static int
> +user_host_memory_map(struct virtio_net *dev, struct virtio_memory *mem,
> int fd, size_t size)
> +{
> +	void *map;
> +
> +	map = mmap(0, size, PROT_READ|PROT_WRITE ,
> MAP_POPULATE|MAP_SHARED, fd, 0);
> +	close(fd);
> +
> +	if (map == MAP_FAILED) {
> +		RTE_LOG(ERR, VHOST_CONFIG, "(%"PRIu64") Error mapping the
> file fd %d\n",  dev->device_fh, fd);
> +		return -1;
> +	}
> +
> +	/* Store the memory address and size in the device data structure */
> +	mem->mapped_address = (uint64_t)(uintptr_t)map;
> +	mem->mapped_size = size;
> +
> +	LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") Mem File: fd: %d - Size: %llu
> - VA: %p\n", dev->device_fh,
> +			fd, (long long unsigned)mem->mapped_size, map);
> +
> +	return 0;
> +}
> +
> +/*
> + * Called from IOCTL: VHOST_SET_MEM_TABLE
> + * This function creates and populates the memory structure for the device.
> This includes
> + * storing offsets used to translate buffer addresses.
> + */
> +static int
> +user_set_mem_table(struct vhost_device_ctx ctx, const void
> *mem_regions_addr, uint32_t nregions)
> +{
> +	struct virtio_net *dev;
> +	struct vhost_memory_region *mem_regions;
> +	struct virtio_memory *mem;
> +	uint64_t size = offsetof(struct vhost_memory, regions);
> +	uint32_t regionidx, valid_regions;
> +	size_t guest_memory_size = 0;
> +
> +	dev = user_get_device(ctx);
> +	if (dev == NULL)
> +		return -1;
> +
> +	if (dev->mem) {
> +		munmap((void *)(uintptr_t)dev->mem->mapped_address,
> (size_t)dev->mem->mapped_size);
> +		free(dev->mem);
> +	}
> +
> +	/* Malloc the memory structure depending on the number of regions. */
> +	mem = calloc(1, sizeof(struct virtio_memory) + (sizeof(struct
> virtio_memory_regions) * nregions));
> +	if (mem == NULL) {
> +		RTE_LOG(ERR, VHOST_CONFIG, "(%"PRIu64") Failed to allocate
> memory for dev->mem.\n", dev->device_fh);
> +		return -1;
> +	}
> +
> +	mem->nregions = nregions;
> +
> +	mem_regions =
> (void*)(uintptr_t)((uint64_t)(uintptr_t)mem_regions_addr + size);
> +
> +	for (regionidx = 0; regionidx < mem->nregions; regionidx++) {
> +		/* Populate the region structure for each region. */
> +		mem->regions[regionidx].guest_phys_address =
> mem_regions[regionidx].guest_phys_addr;
> +		mem->regions[regionidx].guest_phys_address_end = mem-
> >regions[regionidx].guest_phys_address +
> +			mem_regions[regionidx].memory_size;
> +		mem->regions[regionidx].memory_size =
> mem_regions[regionidx].memory_size;
> +		mem->regions[regionidx].userspace_address =
> mem_regions[regionidx].userspace_addr;
> +		guest_memory_size += mem_regions[regionidx].memory_size;
> +
> +		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") REGION: %u -
> GPA: %p - QEMU VA: %p - SIZE (%"PRIu64")\n", dev->device_fh,
> +				regionidx, (void*)(uintptr_t)mem-
> >regions[regionidx].guest_phys_address,
> +				(void*)(uintptr_t)mem-
> >regions[regionidx].userspace_address,
> +				mem->regions[regionidx].memory_size);
> +	}
> +
> +	for (regionidx = 0; regionidx < mem->nregions; regionidx++) {
> +		/*set the base address mapping*/
> +		if (mem->regions[regionidx].guest_phys_address == 0x0) {
> +			mem->base_address = mem-
> >regions[regionidx].userspace_address;
> +			/* Map VM memory file */
> +			if (user_host_memory_map(dev, mem,
> ctx.user.fds[regionidx], guest_memory_size) != 0) {
> +				free(mem);
> +				return -1;
> +			}
> +		} else
> +			close(ctx.user.fds[regionidx]);
> +	}
> +
> +	/* Check that we have a valid base address. */
> +	if (mem->base_address == 0) {
> +		RTE_LOG(ERR, VHOST_CONFIG, "(%"PRIu64") Failed to find base
> address of qemu memory file.\n", dev->device_fh);
> +		free(mem);
> +		return -1;
> +	}
> +
> +	/* Check if all of our regions have valid mappings. Usually one does not
> exist in the QEMU memory file. */
> +	valid_regions = mem->nregions;
> +	for (regionidx = 0; regionidx < mem->nregions; regionidx++) {
> +		if ((mem->regions[regionidx].userspace_address < mem-
> >base_address) ||
> +				(mem->regions[regionidx].userspace_address >
> (mem->base_address + mem->mapped_size)))
> +			valid_regions--;
> +	}
> +
> +	/* If a region does not have a valid mapping we rebuild our memory
> struct to contain only valid entries. */
> +	if (valid_regions != mem->nregions) {
> +		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") Not all memory
> regions exist in the QEMU mem file. Re-populating mem structure\n",
> +				dev->device_fh);
> +
> +		/* Re-populate the memory structure with only valid regions.
> Invalid regions are over-written with memmove. */
> +		valid_regions = 0;
> +
> +		for (regionidx = mem->nregions; 0 != regionidx--;) {
> +			if ((mem->regions[regionidx].userspace_address <
> mem->base_address) ||
> +					(mem-
> >regions[regionidx].userspace_address > (mem->base_address + mem-
> >mapped_size))) {
> +				memmove(&mem->regions[regionidx], &mem-
> >regions[regionidx + 1],
> +						sizeof(struct
> virtio_memory_regions) * valid_regions);
> +			} else {
> +				valid_regions++;
> +			}
> +		}
> +	}
> +	mem->nregions = valid_regions;
> +	dev->mem = mem;
> +
> +	/*
> +	 * Calculate the address offset for each region. This offset is used to
> identify the vhost virtual address
> +	 * corresponding to a QEMU guest physical address.
> +	 */
> +	for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++)
> +		dev->mem->regions[regionidx].address_offset = dev->mem-
> >regions[regionidx].userspace_address - dev->mem->base_address
> +			+ dev->mem->mapped_address - dev->mem-
> >regions[regionidx].guest_phys_address;
> +
> +	return 0;
> +}
> +
> +/**
> + * Called from IOCTL: VHOST_GET_VRING_BASE
> + * We send the virtio device our available ring last used index.
> + */
> +static int
> +user_get_vring_base(struct vhost_device_ctx ctx, uint32_t index, struct
> vhost_vring_state *state)
> +{
> +	struct virtio_net *dev;
> +
> +	dev = user_get_device(ctx);
> +	if (dev == NULL)
> +		return -1;
> +
> +	state->index = index;
> +	/* State->index refers to the queue index. The TX queue is 1, RX queue is
> 0. */
> +	state->num = dev->virtqueue[state->index]->last_used_idx;
> +
> +	return 0;
> +}
> +
> +/**
> + * Called from IOCTL: VHOST_SET_VRING_CALL
> + * The virtio device sends an eventfd to interrupt the guest. This fd gets copied
> in
> + * to our process space.
> + * Also this message is sent when virtio-net device is reset by device driver on
> QEMU.
> + */
> +static int
> +user_set_vring_call(struct vhost_device_ctx ctx, struct vhost_vring_file *file)
> +{
> +	struct virtio_net *dev;
> +	struct vhost_virtqueue *vq;
> +
> +	dev = user_get_device(ctx);
> +	if (dev == NULL)
> +		return -1;
> +
> +	/* file->index refers to the queue index. The TX queue is 1, RX queue is 0.
> */
> +	vq = dev->virtqueue[file->index];
> +
> +	if (vq->kickfd)
> +		close((int)vq->kickfd);
> +
> +	/* Populate the eventfd_copy structure and call eventfd_copy. */
> +	vq->kickfd = ctx.user.fds[0];
> +
> +	return 0;
> +}
> +
> +/**
> + * Called from IOCTL: VHOST_SET_VRING_KICK
> + * The virtio device sends an eventfd that it can use to notify us. This fd gets
> copied in
> + * to our process space.
> + */
> +static int
> +user_set_vring_kick(struct vhost_device_ctx ctx, struct vhost_vring_file *file)
> +{
> +	struct virtio_net *dev;
> +	struct vhost_virtqueue *vq;
> +
> +	dev = user_get_device(ctx);
> +	if (dev == NULL)
> +		return -1;
> +
> +	/* file->index refers to the queue index. The TX queue is 1, RX queue is 0.
> */
> +	vq = dev->virtqueue[file->index];
> +
> +	if (vq->callfd)
> +		close((int)vq->callfd);
> +
> +	/* Populate the eventfd_copy structure and call eventfd_copy. */
> +	vq->callfd = ctx.user.fds[0];
> +
> +	if ((dev->virtqueue[VIRTIO_RXQ] != NULL) && (dev-
> >virtqueue[VIRTIO_TXQ]) != NULL)
> +		return set_backend(ctx, file);
> +
> +	return 0;
> +}
> +
> +/**
> + * Function pointers are set for the device operations so that the matching
> + * handler is called when an IOCTL, device_add or device_release is received.
> + */
> +static const struct vhost_net_device_ops vhost_user_device_ops = {
> +	.new_device = new_device,
> +	.destroy_device = destroy_device,
> +
> +	.get_features = get_features,
> +	.set_features = set_features,
> +
> +	.set_mem_table = user_set_mem_table,
> +
> +	.set_vring_num = set_vring_num,
> +	.set_vring_addr = set_vring_addr,
> +	.set_vring_base = set_vring_base,
> +	.get_vring_base = user_get_vring_base,
> +
> +	.set_vring_kick = user_set_vring_kick,
> +	.set_vring_call = user_set_vring_call,
> +
> +	.set_backend = set_backend,
> +
> +	.set_owner = set_owner,
> +	.reset_owner = reset_owner,
> +};
> diff --git a/lib/librte_vhost/virtio-net.c b/lib/librte_vhost/virtio-net.c
> index 13fbb6f..db810e7 100644
> --- a/lib/librte_vhost/virtio-net.c
> +++ b/lib/librte_vhost/virtio-net.c
> @@ -96,6 +96,12 @@ qva_to_vva(struct virtio_net *dev, uint64_t qemu_va)
>   */
>  #include "virtio-net-cdev.c"
> 
> +/**
> + * Include vhost-user dependent functions and definitions.
> + */
> +#include "virtio-net-user.c"
> +
> +
>  /*
>   * Retrieves an entry from the devices configuration linked list.
>   */
> @@ -105,6 +111,8 @@ get_config_ll_entry(struct vhost_device_ctx ctx)
>  	switch (ctx.type) {
>  	case VHOST_DRV_CUSE:
>  		return cdev_get_config_ll_entry(ctx);
> +	case VHOST_DRV_USER:
> +		return user_get_config_ll_entry(ctx);
>  	default:
>  		break;
>  	}
> @@ -120,6 +128,8 @@ get_device(struct vhost_device_ctx ctx)
>  	switch (ctx.type) {
>  	case VHOST_DRV_CUSE:
>  		return cdev_get_device(ctx);
> +	case VHOST_DRV_USER:
> +		return user_get_device(ctx);
>  	default:
>  		break;
>  	}
> @@ -136,6 +146,8 @@ add_config_ll_entry(vhost_driver_type_t type,
>  	switch (type) {
>  	case VHOST_DRV_CUSE:
>  		return cdev_add_config_ll_entry(new_ll_dev);
> +	case VHOST_DRV_USER:
> +		return user_add_config_ll_entry(new_ll_dev);
>  	default:
>  		break;
>  	}
> @@ -149,8 +161,39 @@ cleanup_device(struct virtio_net *dev)
>  {
>  	/* Unmap QEMU memory file if mapped. */
>  	if (dev->mem) {
> -		munmap((void *)(uintptr_t)dev->mem->mapped_address,
> -			(size_t)dev->mem->mapped_size);
> +		{
> +			/*
> +			 * 'munmap()' fails when mapped_size isn't
> +			 * aligned to the hugepage size.
> +			 * Usually the file size of QEMU physical memory is
> +			 * aligned to the hugepage size, so in the CUSE case
> +			 * there is no problem. But with vhost-user, there is
> +			 * no way to get the physical memory size.
> +			 *
> +			 * Let's assume the hugepage size is 2MB or 1GB here.
> +			 * BTW, 'mmap()' automatically adjusts its size parameter
> +			 * to be aligned. Why doesn't 'munmap()' do the same?
> +			 */
> +			int ret = 0;
> +			size_t hugepagesize, size = dev->mem->mapped_size;
> +
> +			/* assume hugepage size is 2MB */
> +			hugepagesize = 2 * 1024 * 1024;
> +			size = (size + hugepagesize - 1) /
> +						hugepagesize * hugepagesize;
> +			ret = munmap((void *)(uintptr_t)
> +						dev->mem->mapped_address,
> +						size);
> +			if (ret) {
> +				/* assume hugepage size is 1GB, try again */
> +				hugepagesize = 1024 * 1024 * 1024;
> +				size = (size + hugepagesize - 1) /
> +						hugepagesize * hugepagesize;
> +				munmap((void *)(uintptr_t)
> +						dev->mem->mapped_address,
> +						size);
> +			}
> +		}
>  		free(dev->mem);
>  	}
> 
> @@ -187,6 +230,8 @@ rm_config_ll_entry(vhost_driver_type_t type,
>  	switch (type) {
>  	case VHOST_DRV_CUSE:
>  		return cdev_rm_config_ll_entry(ll_dev, ll_dev_last);
> +	case VHOST_DRV_USER:
> +		return user_rm_config_ll_entry(ll_dev, ll_dev_last);
>  	default:
>  		break;
>  	}
> @@ -201,7 +246,9 @@ get_config_ll_root(struct vhost_device_ctx ctx)
>  {
>  	switch (ctx.type) {
>  	case VHOST_DRV_CUSE:
> -		return cdev_get_config_ll_root(ctx);
> +		return cdev_get_config_ll_root();
> +	case VHOST_DRV_USER:
> +		return user_get_config_ll_root();
>  	default:
>  		break;
>  	}
> @@ -232,6 +279,8 @@ init_device(struct vhost_device_ctx ctx, struct virtio_net
> *dev)
>  	switch (ctx.type) {
>  	case VHOST_DRV_CUSE:
>  		return cdev_init_device(ctx, dev);
> +	case VHOST_DRV_USER:
> +		return user_init_device(ctx, dev);
>  	default:
>  		break;
>  	}
> @@ -527,6 +576,8 @@ get_virtio_net_callbacks(vhost_driver_type_t type)
>  	switch (type) {
>  	case VHOST_DRV_CUSE:
>  		return &vhost_cuse_device_ops;
> +	case VHOST_DRV_USER:
> +		return &vhost_user_device_ops;
>  	default:
>  		break;
>  	}
> @@ -570,9 +621,14 @@ int rte_vhost_feature_enable(uint64_t feature_mask)
>   * Register ops so that we can add/remove device to data core.
>   */
>  int
> -rte_vhost_driver_callback_register(struct virtio_net_device_ops const * const
> ops)
> +rte_vhost_driver_callback_register(struct vhost_driver *drv,
> +		struct virtio_net_device_ops const * const ops, void *priv)
>  {
> +	if (drv == NULL || ops == NULL)
> +		return -1;
> +
>  	notify_ops = ops;
> +	drv->priv = priv;
> 
>  	return 0;
>  }
> --
> 1.9.1



More information about the dev mailing list