[dpdk-dev] [PATCH] vhost: fix segfault as handle set_mem_table message

Wodkowski, PawelX pawelx.wodkowski at intel.com
Thu Mar 29 14:57:30 CEST 2018


> >>>>> DPDK vhost-user handles this message rudely by unmap all existing
> >>>>> regions and map new ones. This might lead to segfault if there
> >>>>> is pmd thread just trying to touch those unmapped memory regions.
> >>>>>
> >>>>> But for most cases, except VM memory hotplug,
> >>
> >> FYI, Victor is working on implementing a lock-less protection mechanism
> >> to prevent crashes in such cases. It is intended first to protect
> >> log_base in case of multiqueue + live-migration, but would solve thi
> >> issue too.
> >
> > Bring this issue back for discussion.
> >
> > Reported by SPDK guys, even with per-queue lock, they could still run into
> crash as of memory hot plug or unplug.
> > In detail, you add the lock in an internal struct, vhost_queue which is,
> unfortunately, not visible to the external datapath, like vhost-scsi in SPDK.
> 
> Yes, I agree current solution is not enough
> 
> > The memory hot plug and unplug would be the main issue from SPDK side so
> far. For this specific issue, I think we can continue this patch to filter out the
> changed regions, and keep unchanged regions not remapped.
> >
> > But I know that the per-vq lock is not only to resolve the memory table issue,
> some other vhost-user messages could also lead to problems? If yes, shall we
> take a step back, to think about how to solve this kind of issues for backends,
> like vhost-scsi.
> 
> Right, any message that can change the device or virtqueue states can be
> problematic.
> 
> > Thoughts?
> 
> In another thread, SPDK people proposed to destroy and re-create the
> device for every message. I think this is not acceptable.

Backend must know which part of device is outdated (memory table, ring
position etc) so it can take right action here. I  don't insist on destroy/create
scheme but this solve most of those problems (if not all). if we will have another
working solution this is perfectly fine for me. 

> 
> I proposed an alternative that I think would work:
> - external backends & applications implements the .vring_state_changed()
>    callback. On disable it stops processing the rings and only return
>    once all descriptor buffers are processed. On enable, they resume the
>    rings processing.
> - In vhost lib, we implement vhost_vring_pause and vhost_vring_resume
>    functions. In pause function, we save device state (enable or
>    disable) in a variable, and if the ring is enabled at device level it
>    calls .vring_state_changed() with disabled state. In resume, it checks
>    the vring state in the device and call .vring_state_changed() with
>    enable state if it was enabled in device state.

This will be not enough. We need to know what exactly changed. As for ring
state it is straight forward to save/fetch new ring state but eg. for set mem
table we need to finish all IO, remove currently registered RDMA memory.
Then, when new memory table is available we need to register it again for
RDMA then resume IO.

> 
> So, I think that would work but I hadn't had a clear reply from SPDK
> people proving it wouldn't.
> 
> They asked we revert Victor's patch, but I don't see the need as it does
> not hurt SPDK (but doesn't protect anything for them I agree), while it
> really fixes real issues with internal Net backend.
> 
> What do you think of my proposal? Do you see other alternative?
> 

As Victor is already working on the solution, can you post some info about how
you plan to solve it? I was thinking about something like code bellow (sorry for
how this code look like but this is my work-in-progress  to see if this make any
sense here). This code allow to:
1.  not introducing changes like http://dpdk.org/ml/archives/dev/2018-March/093922.html
     because backend will handle this by its own.
2. virtio-net specific messages can be moved out of generic vhost_user.c file
3. virtqueue locking stuff can be moved to virito-net specific backend.

Pls let me know what you think.

---
 lib/librte_vhost/Makefile         |   2 +-
 lib/librte_vhost/rte_vhost.h      |  60 ++++++++++++++++++-
 lib/librte_vhost/rte_vhost_user.h | 120 ++++++++++++++++++++++++++++++++++++++
 lib/librte_vhost/vhost.h          |  14 -----
 lib/librte_vhost/vhost_user.c     |  30 ++++++++++
 lib/librte_vhost/vhost_user.h     |  88 ----------------------------
 6 files changed, 209 insertions(+), 105 deletions(-)
 create mode 100644 lib/librte_vhost/rte_vhost_user.h

diff --git a/lib/librte_vhost/Makefile b/lib/librte_vhost/Makefile
index 5d6c6abaed51..07439a186d91 100644
--- a/lib/librte_vhost/Makefile
+++ b/lib/librte_vhost/Makefile
@@ -25,6 +25,6 @@ SRCS-$(CONFIG_RTE_LIBRTE_VHOST) := fd_man.c iotlb.c socket.c vhost.c \
 					vhost_user.c virtio_net.c
 
 # install includes
-SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h
+SYMLINK-$(CONFIG_RTE_LIBRTE_VHOST)-include += rte_vhost.h rte_vhost_user.h
 
 include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_vhost/rte_vhost.h b/lib/librte_vhost/rte_vhost.h
index d33206997453..7b76952638dc 100644
--- a/lib/librte_vhost/rte_vhost.h
+++ b/lib/librte_vhost/rte_vhost.h
@@ -16,12 +16,13 @@
 #include <rte_memory.h>
 #include <rte_mempool.h>
 
+#include <rte_vhost_user.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 /* These are not C++-aware. */
-#include <linux/vhost.h>
 #include <linux/virtio_ring.h>
 
 #define RTE_VHOST_USER_CLIENT		(1ULL << 0)
@@ -65,6 +66,51 @@ struct rte_vhost_vring {
 };
 
 /**
+ * Vhost library started processing given vhost user message.
+ *
+ * This state should be used eg. to stop rings processing in case of
+ * SET_MEM_TABLE message.
+ *
+ * Backend is allowed to return any result of RTE_VHOST_USER_MESSAGE_RESULT_*.
+ */
+#define RTE_VHOST_USER_MSG_START 0
+
+/**
+ * Vhost library is finishing processing given vhost user message.
+ * If backend have handled the message produced response is passed as message
+ * parameter. If response is needed it will be send after returning.
+ *
+ * This state might be used to resume ring processing in case of SET_MEM_TABLE
+ * message.
+ *
+ * Returning RTE_VHOST_USER_MSG_RESULT_FAILED will trigger failure action in
+ * vhost library.
+ */
+#define RTE_VHOST_USER_MSG_END 1
+
+/**
+ * Backend understood the message but processing it failed for some reason.
+ * vhost library will take the failure action - chance closing existing
+ * connection.
+ */
+#define RTE_VHOST_USER_MSG_RESULT_FAILED -1
+
+/**
+ * Backend understood the message and handled it entirly. Backend is responsible
+ * for filling message object with right response data.
+ */
+#define RTE_VHOST_USER_MSG_RESULT_HANDLED 0
+
+/**
+ * Backend ignored the message or understood and took some action. In either
+ * case the message need to be further processed by vhost library.
+ *
+ * Backend is not allowed to change passed message.
+ */
+#define RTE_VHOST_USER_MSG_RESULT_OK 1
+
+
+/**
  * Device and vring operations.
  */
 struct vhost_device_ops {
@@ -84,7 +130,17 @@ struct vhost_device_ops {
 	int (*new_connection)(int vid);
 	void (*destroy_connection)(int vid);
 
-	void *reserved[2]; /**< Reserved for future extension */
+	/**
+	 * Backend callback for user message.
+	 *
+	 * @param vid id of vhost device
+	 * @param msg message object.
+	 * @param phase RTE_VHOST_USER_MSG_START or RTE_VHOST_USER_MSG_END
+	 * @return one of RTE_VHOST_USER_MESSAGE_RESULT_*
+	 */
+	int (*user_message_handler)(int vid, struct VhostUserMsg *msg, int phase);
+
+	void *reserved[1]; /**< Reserved for future extension */
 };
 
 /**
diff --git a/lib/librte_vhost/rte_vhost_user.h b/lib/librte_vhost/rte_vhost_user.h
new file mode 100644
index 000000000000..f7678d33acc3
--- /dev/null
+++ b/lib/librte_vhost/rte_vhost_user.h
@@ -0,0 +1,120 @@
+#ifndef _VHOST_RTE_VHOST_USER_H_
+#define _VHOST_RTE_VHOST_USER_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* These are not C++-aware. */
+#include <linux/vhost.h>
+
+/* refer to hw/virtio/vhost-user.c */
+
+struct vhost_iotlb_msg {
+	__u64 iova;
+	__u64 size;
+	__u64 uaddr;
+#define VHOST_ACCESS_RO      0x1
+#define VHOST_ACCESS_WO      0x2
+#define VHOST_ACCESS_RW      0x3
+	__u8 perm;
+#define VHOST_IOTLB_MISS           1
+#define VHOST_IOTLB_UPDATE         2
+#define VHOST_IOTLB_INVALIDATE     3
+#define VHOST_IOTLB_ACCESS_FAIL    4
+	__u8 type;
+};
+
+#define VHOST_MEMORY_MAX_NREGIONS 8
+
+#define VHOST_USER_PROTOCOL_F_MQ	0
+#define VHOST_USER_PROTOCOL_F_LOG_SHMFD	1
+#define VHOST_USER_PROTOCOL_F_RARP	2
+#define VHOST_USER_PROTOCOL_F_REPLY_ACK	3
+#define VHOST_USER_PROTOCOL_F_NET_MTU 4
+#define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5
+
+typedef enum VhostUserRequest {
+	VHOST_USER_NONE = 0,
+	VHOST_USER_GET_FEATURES = 1,
+	VHOST_USER_SET_FEATURES = 2,
+	VHOST_USER_SET_OWNER = 3,
+	VHOST_USER_RESET_OWNER = 4,
+	VHOST_USER_SET_MEM_TABLE = 5,
+	VHOST_USER_SET_LOG_BASE = 6,
+	VHOST_USER_SET_LOG_FD = 7,
+	VHOST_USER_SET_VRING_NUM = 8,
+	VHOST_USER_SET_VRING_ADDR = 9,
+	VHOST_USER_SET_VRING_BASE = 10,
+	VHOST_USER_GET_VRING_BASE = 11,
+	VHOST_USER_SET_VRING_KICK = 12,
+	VHOST_USER_SET_VRING_CALL = 13,
+	VHOST_USER_SET_VRING_ERR = 14,
+	VHOST_USER_GET_PROTOCOL_FEATURES = 15,
+	VHOST_USER_SET_PROTOCOL_FEATURES = 16,
+	VHOST_USER_GET_QUEUE_NUM = 17,
+	VHOST_USER_SET_VRING_ENABLE = 18,
+	VHOST_USER_SEND_RARP = 19,
+	VHOST_USER_NET_SET_MTU = 20,
+	VHOST_USER_SET_SLAVE_REQ_FD = 21,
+	VHOST_USER_IOTLB_MSG = 22,
+	VHOST_USER_MAX
+} VhostUserRequest;
+
+typedef enum VhostUserSlaveRequest {
+	VHOST_USER_SLAVE_NONE = 0,
+	VHOST_USER_SLAVE_IOTLB_MSG = 1,
+	VHOST_USER_SLAVE_MAX
+} VhostUserSlaveRequest;
+
+typedef struct VhostUserMemoryRegion {
+	uint64_t guest_phys_addr;
+	uint64_t memory_size;
+	uint64_t userspace_addr;
+	uint64_t mmap_offset;
+} VhostUserMemoryRegion;
+
+typedef struct VhostUserMemory {
+	uint32_t nregions;
+	uint32_t padding;
+	VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
+} VhostUserMemory;
+
+typedef struct VhostUserLog {
+	uint64_t mmap_size;
+	uint64_t mmap_offset;
+} VhostUserLog;
+
+typedef struct VhostUserMsg {
+	union {
+		VhostUserRequest master;
+		VhostUserSlaveRequest slave;
+	} request;
+
+#define VHOST_USER_VERSION_MASK     0x3
+#define VHOST_USER_REPLY_MASK       (0x1 << 2)
+#define VHOST_USER_NEED_REPLY		(0x1 << 3)
+	uint32_t flags;
+	uint32_t size; /* the following payload size */
+	union {
+#define VHOST_USER_VRING_IDX_MASK   0xff
+#define VHOST_USER_VRING_NOFD_MASK  (0x1<<8)
+		uint64_t u64;
+		struct vhost_vring_state state;
+		struct vhost_vring_addr addr;
+		VhostUserMemory memory;
+		VhostUserLog    log;
+		struct vhost_iotlb_msg iotlb;
+	} payload;
+	int fds[VHOST_MEMORY_MAX_NREGIONS];
+} __attribute((packed)) VhostUserMsg;
+
+#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VHOST_RTE_VHOST_USER_H_ */
diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index d947bc9e3b3f..42a1474095a3 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -141,20 +141,6 @@ struct vhost_virtqueue {
 
 #define VIRTIO_F_IOMMU_PLATFORM 33
 
-struct vhost_iotlb_msg {
-	__u64 iova;
-	__u64 size;
-	__u64 uaddr;
-#define VHOST_ACCESS_RO      0x1
-#define VHOST_ACCESS_WO      0x2
-#define VHOST_ACCESS_RW      0x3
-	__u8 perm;
-#define VHOST_IOTLB_MISS           1
-#define VHOST_IOTLB_UPDATE         2
-#define VHOST_IOTLB_INVALIDATE     3
-#define VHOST_IOTLB_ACCESS_FAIL    4
-	__u8 type;
-};
 
 #define VHOST_IOTLB_MSG 0x1
 
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index 90ed2112e0af..15532e182b58 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -1301,6 +1301,7 @@ vhost_user_msg_handler(int vid, int fd)
 	struct virtio_net *dev;
 	struct VhostUserMsg msg;
 	int ret;
+	int user_handler_result;
 	int unlock_required = 0;
 
 	dev = get_device(vid);
@@ -1347,6 +1348,26 @@ vhost_user_msg_handler(int vid, int fd)
 		return -1;
 	}
 
+
+	if (dev->notify_ops->user_message_handler) {
+		user_handler_result = dev->notify_ops->user_message_handler(
+				dev->vid, &msg, RTE_VHOST_USER_MSG_START);
+
+		switch (user_handler_result) {
+		case RTE_VHOST_USER_MSG_RESULT_FAILED:
+			RTE_LOG(ERR, VHOST_CONFIG,
+				"User message handler failed\n");
+			return -1;
+		case RTE_VHOST_USER_MSG_RESULT_HANDLED:
+			RTE_LOG(DEBUG, VHOST_CONFIG,
+				"User message handled by backend\n");
+			goto msg_handled;
+		case RTE_VHOST_USER_MSG_RESULT_OK:
+			break;
+		}
+	}
+
+
 	/*
 	 * Note: we don't lock all queues on VHOST_USER_GET_VRING_BASE
 	 * and VHOST_USER_RESET_OWNER, since it is sent when virtio stops
@@ -1485,6 +1506,15 @@ vhost_user_msg_handler(int vid, int fd)
 	if (unlock_required)
 		vhost_user_unlock_all_queue_pairs(dev);
 
+msg_handled:
+	if (dev->notify_ops->user_message_handler) {
+		user_handler_result = dev->notify_ops->user_message_handler(
+				dev->vid, &msg, RTE_VHOST_USER_MSG_END);
+
+		if (user_handler_result == RTE_VHOST_USER_MSG_RESULT_FAILED)
+			return -1;
+	}
+
 	if (msg.flags & VHOST_USER_NEED_REPLY) {
 		msg.payload.u64 = !!ret;
 		msg.size = sizeof(msg.payload.u64);
diff --git a/lib/librte_vhost/vhost_user.h b/lib/librte_vhost/vhost_user.h
index d4bd604b9d6b..cf3f0da0ec41 100644
--- a/lib/librte_vhost/vhost_user.h
+++ b/lib/librte_vhost/vhost_user.h
@@ -10,17 +10,6 @@
 
 #include "rte_vhost.h"
 
-/* refer to hw/virtio/vhost-user.c */
-
-#define VHOST_MEMORY_MAX_NREGIONS 8
-
-#define VHOST_USER_PROTOCOL_F_MQ	0
-#define VHOST_USER_PROTOCOL_F_LOG_SHMFD	1
-#define VHOST_USER_PROTOCOL_F_RARP	2
-#define VHOST_USER_PROTOCOL_F_REPLY_ACK	3
-#define VHOST_USER_PROTOCOL_F_NET_MTU 4
-#define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5
-
 #define VHOST_USER_PROTOCOL_FEATURES	((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
 					 (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) |\
 					 (1ULL << VHOST_USER_PROTOCOL_F_RARP) | \
@@ -28,83 +17,6 @@
 					 (1ULL << VHOST_USER_PROTOCOL_F_NET_MTU) | \
 					 (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ))
 
-typedef enum VhostUserRequest {
-	VHOST_USER_NONE = 0,
-	VHOST_USER_GET_FEATURES = 1,
-	VHOST_USER_SET_FEATURES = 2,
-	VHOST_USER_SET_OWNER = 3,
-	VHOST_USER_RESET_OWNER = 4,
-	VHOST_USER_SET_MEM_TABLE = 5,
-	VHOST_USER_SET_LOG_BASE = 6,
-	VHOST_USER_SET_LOG_FD = 7,
-	VHOST_USER_SET_VRING_NUM = 8,
-	VHOST_USER_SET_VRING_ADDR = 9,
-	VHOST_USER_SET_VRING_BASE = 10,
-	VHOST_USER_GET_VRING_BASE = 11,
-	VHOST_USER_SET_VRING_KICK = 12,
-	VHOST_USER_SET_VRING_CALL = 13,
-	VHOST_USER_SET_VRING_ERR = 14,
-	VHOST_USER_GET_PROTOCOL_FEATURES = 15,
-	VHOST_USER_SET_PROTOCOL_FEATURES = 16,
-	VHOST_USER_GET_QUEUE_NUM = 17,
-	VHOST_USER_SET_VRING_ENABLE = 18,
-	VHOST_USER_SEND_RARP = 19,
-	VHOST_USER_NET_SET_MTU = 20,
-	VHOST_USER_SET_SLAVE_REQ_FD = 21,
-	VHOST_USER_IOTLB_MSG = 22,
-	VHOST_USER_MAX
-} VhostUserRequest;
-
-typedef enum VhostUserSlaveRequest {
-	VHOST_USER_SLAVE_NONE = 0,
-	VHOST_USER_SLAVE_IOTLB_MSG = 1,
-	VHOST_USER_SLAVE_MAX
-} VhostUserSlaveRequest;
-
-typedef struct VhostUserMemoryRegion {
-	uint64_t guest_phys_addr;
-	uint64_t memory_size;
-	uint64_t userspace_addr;
-	uint64_t mmap_offset;
-} VhostUserMemoryRegion;
-
-typedef struct VhostUserMemory {
-	uint32_t nregions;
-	uint32_t padding;
-	VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
-} VhostUserMemory;
-
-typedef struct VhostUserLog {
-	uint64_t mmap_size;
-	uint64_t mmap_offset;
-} VhostUserLog;
-
-typedef struct VhostUserMsg {
-	union {
-		VhostUserRequest master;
-		VhostUserSlaveRequest slave;
-	} request;
-
-#define VHOST_USER_VERSION_MASK     0x3
-#define VHOST_USER_REPLY_MASK       (0x1 << 2)
-#define VHOST_USER_NEED_REPLY		(0x1 << 3)
-	uint32_t flags;
-	uint32_t size; /* the following payload size */
-	union {
-#define VHOST_USER_VRING_IDX_MASK   0xff
-#define VHOST_USER_VRING_NOFD_MASK  (0x1<<8)
-		uint64_t u64;
-		struct vhost_vring_state state;
-		struct vhost_vring_addr addr;
-		VhostUserMemory memory;
-		VhostUserLog    log;
-		struct vhost_iotlb_msg iotlb;
-	} payload;
-	int fds[VHOST_MEMORY_MAX_NREGIONS];
-} __attribute((packed)) VhostUserMsg;
-
-#define VHOST_USER_HDR_SIZE offsetof(VhostUserMsg, payload.u64)
-
 /* The version of the protocol we support */
 #define VHOST_USER_VERSION    0x1
 
-- 
2.7.4



More information about the dev mailing list