[dpdk-dev,v3] vhost_user: protect active rings from async ring changes

Message ID 20171214133531-mutt-send-email-victork@redhat.com (mailing list archive)
State Superseded, archived
Delegated to: Yuanhan Liu
Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Victor Kaplansky Dec. 14, 2017, 11:35 a.m. UTC
  When performing live migration or memory hot-plugging,
the changes to the device and vrings made by the message handler
are done independently from the vring usage by the PMD threads.

This causes, for example, segfaults during live migration
with MQ enabled, but in general virtually any request
sent by QEMU that changes the state of the device can cause
problems.

This patch fixes all of the above issues by adding a spinlock
to every vring and requiring the message handler to start an
operation only after ensuring that all PMD threads related to
the device are out of the critical sections accessing the vring
data.

Each vring has its own lock in order to avoid contention
between PMD threads of different vrings and to prevent
performance degradation when scaling the number of queue pairs.

See https://bugzilla.redhat.com/show_bug.cgi?id=1450680

Signed-off-by: Victor Kaplansky <victork@redhat.com>
Tested-by: Maxime Coquelin <maxime.coquelin@redhat.com>
---

v3:
   o Added locking to enqueue flow.
   o Enqueue path guarded as well as dequeue path.
   o Changed name of active_lock.
   o Added initialization of guarding spinlock.
   o Reworked functions skimming over all virt-queues.
   o Performance measurements done by Maxime Coquelin show
     no degradation in bandwidth or throughput.
   o Spelling.
   o Taking lock only on set operations.
   o IOMMU messages are not guarded by access lock.

v2:
   o Fixed checkpatch complaints.
   o Added Signed-off-by.
   o Refined placement of guard to exclude IOMMU messages.
   o TODO: performance degradation measurement.


 lib/librte_vhost/vhost.h      | 15 ++++++++++
 lib/librte_vhost/vhost.c      |  1 +
 lib/librte_vhost/vhost_user.c | 65 +++++++++++++++++++++++++++++++++++++++++++
 lib/librte_vhost/virtio_net.c | 20 +++++++++++--
 4 files changed, 99 insertions(+), 2 deletions(-)
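
As a reading aid for the scheme described in the commit message, here is a
minimal sketch of the locking pattern being introduced. The types below are
stand-ins (not the real struct vhost_virtqueue / struct virtio_net), the
virtqueue array is assumed to be dense, and only the rte_spinlock API is
used; it is an illustration, not part of the patch.

#include <rte_spinlock.h>

/* Stand-ins for struct vhost_virtqueue / struct virtio_net. */
struct vq {
	rte_spinlock_t access_lock;
	int enabled;
};

struct dev {
	struct vq *vq[8];
	unsigned int nr_vring;
};

/* Data path (PMD thread): serialize only against changes to the vring
 * currently in use. */
static int
burst(struct vq *q)
{
	int done = 0;

	rte_spinlock_lock(&q->access_lock);
	if (q->enabled)
		done = 1; /* real enqueue/dequeue work goes here */
	rte_spinlock_unlock(&q->access_lock);

	return done;
}

/* Control path (message handler): take every vring lock so the request
 * is applied only once no PMD thread is inside a critical section.
 * The real patch skips unallocated virtqueue entries; this assumes a
 * dense array for brevity. */
static void
handle_request(struct dev *d)
{
	unsigned int i;

	for (i = 0; i < d->nr_vring; i++)
		rte_spinlock_lock(&d->vq[i]->access_lock);

	/* apply the QEMU request here */

	for (i = 0; i < d->nr_vring; i++)
		rte_spinlock_unlock(&d->vq[i]->access_lock);
}

The key point is that the message handler can only make progress once it
holds every per-vring lock, while PMD threads of different vrings never
contend with each other.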
  

Comments

Maxime Coquelin Dec. 14, 2017, 12:16 p.m. UTC | #1
Hi Victor,

On 12/14/2017 12:35 PM, Victor Kaplansky wrote:
> When performing live migration or memory hot-plugging,
> the changes to the device and vrings made by message handler
> done independently from vring usage by PMD threads.
> 
> This causes for example segfaults during live-migration
> with MQ enable, but in general virtually any request
> sent by qemu changing the state of device can cause
> problems.
> 
> These patches fixes all above issues by adding a spinlock
> to every vring and requiring message handler to start operation
> only after ensuring that all PMD threads related to the device
> are out of critical section accessing the vring data.
> 
> Each vring has its own lock in order to not create contention
> between PMD threads of different vrings and to prevent
> performance degradation by scaling queue pair number.
> 
> See https://bugzilla.redhat.com/show_bug.cgi?id=1450680
> 
> Signed-off-by: Victor Kaplansky <victork@redhat.com>
> Tested-by: Maxime Coquelin <maxime.coquelin@redhat.com>

Sorry, but I didn't test this patch. I just benchmarked the impact of
adding a lock in the hot paths.

> ---
> 
> v3:
>     o Added locking to enqueue flow.
>     o Enqueue path guarded as well as dequeue path.
>     o Changed name of active_lock.
>     o Added initialization of guarding spinlock.
>     o Reworked functions skimming over all virt-queues.
>     o Performance measurements done by Maxime Coquelin shows
>       no degradation in bandwidth and throughput.
>     o Spelling.
>     o Taking lock only on set operations.
>     o IOMMU messages are not guarded by access lock.
> 
> v2:
>     o Fixed checkpatch complains.
>     o Added Signed-off-by.
>     o Refined placement of guard to exclude IOMMU messages.
>     o TODO: performance degradation measurement.
> 
> 
>   lib/librte_vhost/vhost.h      | 15 ++++++++++
>   lib/librte_vhost/vhost.c      |  1 +
>   lib/librte_vhost/vhost_user.c | 65 +++++++++++++++++++++++++++++++++++++++++++
>   lib/librte_vhost/virtio_net.c | 20 +++++++++++--
>   4 files changed, 99 insertions(+), 2 deletions(-)
> 
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index 1cc81c17..26e2c571 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -137,6 +137,8 @@ struct vhost_virtqueue {
>   	TAILQ_HEAD(, vhost_iotlb_entry) iotlb_list;
>   	int				iotlb_cache_nr;
>   	TAILQ_HEAD(, vhost_iotlb_entry) iotlb_pending_list;
> +
> +	rte_spinlock_t	access_lock;

On a previous revision of the patch, Jianfeng mentioned that taking the
lock has an impact even when there is nothing to enqueue/dequeue (80
cycles vs. 50 cycles, IIRC). I wonder whether moving the lock closer to
the enabled field would put it in the same cache line and so might
improve performance.
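
A sketch of what that could look like; the field names match the patch, but
the ordering here is illustrative only and the real struct vhost_virtqueue
layout would need to be checked:

#include <rte_memory.h>
#include <rte_spinlock.h>

/* Illustrative only: the real struct has many more fields.  The point
 * is to keep the lock and 'enabled' on the same cache line, since the
 * data path touches both on every burst. */
struct vq_hot_fields {
	rte_spinlock_t	access_lock;	/* taken first in the data path */
	int		enabled;	/* checked right after locking */
} __rte_cache_aligned;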

>   } __rte_cache_aligned;
>   
>   /* Old kernels have no such macros defined */
> @@ -302,6 +304,19 @@ vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
>   	vhost_log_write(dev, vq->log_guest_addr + offset, len);
>   }
>   
> +static __rte_always_inline void
> +vhost_user_access_lock(struct vhost_virtqueue *vq)
> +{
> +	rte_spinlock_lock(&vq->access_lock);
> +}
> +
> +static __rte_always_inline void
> +vhost_user_access_unlock(struct vhost_virtqueue *vq)
> +{
> +	rte_spinlock_unlock(&vq->access_lock);
> +}
> +
> +
>   /* Macros for printing using RTE_LOG */
>   #define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1
>   #define RTE_LOGTYPE_VHOST_DATA   RTE_LOGTYPE_USER1
> diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
> index 4f8b73a0..dcc42fc7 100644
> --- a/lib/librte_vhost/vhost.c
> +++ b/lib/librte_vhost/vhost.c
> @@ -259,6 +259,7 @@ alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx)
>   
>   	dev->virtqueue[vring_idx] = vq;
>   	init_vring_queue(dev, vring_idx);
> +	rte_spinlock_init(&vq->access_lock);
>   
>   	dev->nr_vring += 1;
>   
> diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
> index f4c7ce46..a5b5e4be 100644
> --- a/lib/librte_vhost/vhost_user.c
> +++ b/lib/librte_vhost/vhost_user.c
> @@ -1190,12 +1190,48 @@ vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, VhostUserMsg *msg)
>   	return alloc_vring_queue(dev, vring_idx);
>   }
>   
> +static void
> +vhost_user_lock_all_queue_pairs(struct virtio_net *dev)
> +{
> +	unsigned int i = 0;
> +	unsigned int vq_num = 0;
> +
> +	while (vq_num < dev->nr_vring) {
> +		struct vhost_virtqueue *vq = dev->virtqueue[i];
> +
> +		if (vq) {
> +			vhost_user_access_lock(vq);
> +			vq_num++;
> +		}
> +		i++;
> +	}
> +}
> +
> +static void
> +vhost_user_unlock_all_queue_pairs(struct virtio_net *dev)
> +{
> +	unsigned int i = 0;
> +	unsigned int vq_num = 0;
> +
> +	while (vq_num < dev->nr_vring) {
> +		struct vhost_virtqueue *vq = dev->virtqueue[i];
> +
> +		if (vq) {
> +			vhost_user_access_unlock(vq);
> +			vq_num++;
> +		}
> +		i++;
> +	}
> +}
> +
>   int
>   vhost_user_msg_handler(int vid, int fd)
>   {
>   	struct virtio_net *dev;
>   	struct VhostUserMsg msg;
>   	int ret;
> +	int unlock_required = 0;
> +
>   
>   	dev = get_device(vid);
>   	if (dev == NULL)
> @@ -1241,6 +1277,32 @@ vhost_user_msg_handler(int vid, int fd)
>   		return -1;
>   	}
>   
> +	switch (msg.request.master) {
> +	case VHOST_USER_SET_FEATURES:
> +	case VHOST_USER_SET_PROTOCOL_FEATURES:
> +	case VHOST_USER_SET_OWNER:
> +	case VHOST_USER_RESET_OWNER:
> +	case VHOST_USER_SET_MEM_TABLE:
> +	case VHOST_USER_SET_LOG_BASE:
> +	case VHOST_USER_SET_LOG_FD:
> +	case VHOST_USER_SET_VRING_NUM:
> +	case VHOST_USER_SET_VRING_ADDR:
> +	case VHOST_USER_SET_VRING_BASE:
> +	case VHOST_USER_SET_VRING_KICK:
> +	case VHOST_USER_SET_VRING_CALL:
> +	case VHOST_USER_SET_VRING_ERR:
> +	case VHOST_USER_SET_VRING_ENABLE:

For the VRING-specific requests, I think we could take only the
corresponding VQ lock instead of blocking all queues.

There might be some exceptions, like GET_VRING_BASE.

Maybe only protecting critical sections would be better?
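
A rough sketch of that per-vring alternative, assuming a request whose
payload carries the ring index in payload.state (as SET_VRING_NUM/BASE/ENABLE
do); KICK/CALL/ERR carry the index in the low bits of payload.u64, so a real
version would need per-request handling. The helper name is hypothetical and
it reuses the lock helper added by this patch:

#include "vhost.h"
#include "vhost_user.h"

/* Sketch only: lock just the ring a state-carrying request refers to,
 * instead of all queue pairs. */
static void
vhost_user_lock_one_vring(struct virtio_net *dev, const VhostUserMsg *msg)
{
	uint32_t idx = msg->payload.state.index;

	if (idx < dev->nr_vring && dev->virtqueue[idx])
		vhost_user_access_lock(dev->virtqueue[idx]);
}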

> +	case VHOST_USER_SEND_RARP:
> +	case VHOST_USER_NET_SET_MTU:
> +	case VHOST_USER_SET_SLAVE_REQ_FD:
> +		vhost_user_lock_all_queue_pairs(dev);
> +		unlock_required = 1;
> +		break;
> +	default:
> +		break;
> +
> +	}
> +
>   	switch (msg.request.master) {
>   	case VHOST_USER_GET_FEATURES:
>   		msg.payload.u64 = vhost_user_get_features(dev);
> @@ -1342,6 +1404,9 @@ vhost_user_msg_handler(int vid, int fd)
>   
>   	}
>   
> +	if (unlock_required)
> +		vhost_user_unlock_all_queue_pairs(dev);
> +
>   	if (msg.flags & VHOST_USER_NEED_REPLY) {
>   		msg.payload.u64 = !!ret;
>   		msg.size = sizeof(msg.payload.u64);
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 6fee16e5..b584e380 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -44,6 +44,7 @@
>   #include <rte_udp.h>
>   #include <rte_sctp.h>
>   #include <rte_arp.h>
> +#include <rte_spinlock.h>
>   
>   #include "iotlb.h"
>   #include "vhost.h"
> @@ -326,8 +327,11 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
>   	}
>   
>   	vq = dev->virtqueue[queue_id];
> +
> +	vhost_user_access_lock(vq);
I suggested using rte_spinlock_trylock() when reviewing the previous
revision. It would have the advantage of not blocking other devices
handled on the same CPU while the message is being processed.

Any thoughts?
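
A minimal sketch of the trylock idea (stand-in queue type, not the real
struct vhost_virtqueue): the enqueue path would report zero packets when the
control path holds the lock, instead of spinning on it.

#include <rte_spinlock.h>

struct q {				/* stand-in for struct vhost_virtqueue */
	rte_spinlock_t access_lock;
	int enabled;
};

static int
try_burst(struct q *vq)
{
	int done = 0;

	/* Don't spin if the message handler is reconfiguring the ring;
	 * rte_spinlock_trylock() returns 0 when the lock is busy. */
	if (rte_spinlock_trylock(&vq->access_lock) == 0)
		return 0;

	if (vq->enabled)
		done = 1;	/* real enqueue work goes here */

	rte_spinlock_unlock(&vq->access_lock);
	return done;
}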

> +
>   	if (unlikely(vq->enabled == 0))
> -		return 0;
> +		goto out;
>   
>   	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
>   		vhost_user_iotlb_rd_lock(vq);
> @@ -416,6 +420,7 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
>   			&& (vq->callfd >= 0))
>   		eventfd_write(vq->callfd, (eventfd_t)1);
>   out:
> +	vhost_user_access_unlock(vq);

I think it should be moved after the iotlb lock unlock:
LOCK A
LOCK B
UNLOCK B
UNLOCK A

>   	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
>   		vhost_user_iotlb_rd_unlock(vq);
>   
> @@ -651,8 +656,11 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
>   	}
>   
>   	vq = dev->virtqueue[queue_id];
> +
> +	vhost_user_access_lock(vq);
> +
>   	if (unlikely(vq->enabled == 0))
> -		return 0;
> +		goto out;
>   
>   	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
>   		vhost_user_iotlb_rd_lock(vq);
> @@ -712,6 +720,7 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
>   	}
>   
>   out:
> +	vhost_user_access_unlock(vq);

Ditto

>   	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
>   		vhost_user_iotlb_rd_unlock(vq);
>   
> @@ -1180,9 +1189,12 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
>   	}
>   
>   	vq = dev->virtqueue[queue_id];
> +
>   	if (unlikely(vq->enabled == 0))
>   		return 0;
>   
> +	vhost_user_access_lock(vq);
> +

To be consistent with the other paths, I guess you should take the
lock before checking whether the vq is enabled.

>   	vq->batch_copy_nb_elems = 0;
>   
>   	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
> @@ -1240,6 +1252,7 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
>   		if (rarp_mbuf == NULL) {
>   			RTE_LOG(ERR, VHOST_DATA,
>   				"Failed to allocate memory for mbuf.\n");
> +			vhost_user_access_unlock(vq);
>   			return 0;
>   		}
>   
> @@ -1356,6 +1369,8 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
>   	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
>   		vhost_user_iotlb_rd_unlock(vq);
>   
> +	vhost_user_access_unlock(vq);
> +
>   	if (unlikely(rarp_mbuf != NULL)) {
>   		/*
>   		 * Inject it to the head of "pkts" array, so that switch's mac
> @@ -1366,5 +1381,6 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
>   		i += 1;
>   	}
>   
> +

Remove trailing line.

>   	return i;
>   }
> 

Thanks,
Maxime
  
Victor Kaplansky Dec. 14, 2017, 2:21 p.m. UTC | #2
----- Original Message -----
> From: "Maxime Coquelin" <maxime.coquelin@redhat.com>
> To: "Victor Kaplansky" <victork@redhat.com>, dev@dpdk.org
> Cc: stable@dpdk.org, "Jens Freimann" <jfreiman@redhat.com>, "Yuanhan Liu" <yliu@fridaylinux.org>, "Tiwei Bie"
> <tiwei.bie@intel.com>, "Jianfeng Tan" <jianfeng.tan@intel.com>
> Sent: Thursday, December 14, 2017 2:16:23 PM
> Subject: Re: [PATCH v3] vhost_user: protect active rings from async ring changes
> 
> Hi Victor,
> 
> On 12/14/2017 12:35 PM, Victor Kaplansky wrote:
> > When performing live migration or memory hot-plugging,
> > the changes to the device and vrings made by message handler
> > done independently from vring usage by PMD threads.
> > 
> > This causes for example segfaults during live-migration
> > with MQ enable, but in general virtually any request
> > sent by qemu changing the state of device can cause
> > problems.
> > 
> > These patches fixes all above issues by adding a spinlock
> > to every vring and requiring message handler to start operation
> > only after ensuring that all PMD threads related to the device
> > are out of critical section accessing the vring data.
> > 
> > Each vring has its own lock in order to not create contention
> > between PMD threads of different vrings and to prevent
> > performance degradation by scaling queue pair number.
> > 
> > See https://bugzilla.redhat.com/show_bug.cgi?id=1450680
> > 
> > Signed-off-by: Victor Kaplansky <victork@redhat.com>
> > Tested-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> 
> Sorry, but I didn't tested this patch. I just benchmarked the fact to
> add a lock in the hot paths.
> 
> > ---
> > 
> > v3:
> >     o Added locking to enqueue flow.
> >     o Enqueue path guarded as well as dequeue path.
> >     o Changed name of active_lock.
> >     o Added initialization of guarding spinlock.
> >     o Reworked functions skimming over all virt-queues.
> >     o Performance measurements done by Maxime Coquelin shows
> >       no degradation in bandwidth and throughput.
> >     o Spelling.
> >     o Taking lock only on set operations.
> >     o IOMMU messages are not guarded by access lock.
> > 
> > v2:
> >     o Fixed checkpatch complains.
> >     o Added Signed-off-by.
> >     o Refined placement of guard to exclude IOMMU messages.
> >     o TODO: performance degradation measurement.
> > 
> > 
> >   lib/librte_vhost/vhost.h      | 15 ++++++++++
> >   lib/librte_vhost/vhost.c      |  1 +
> >   lib/librte_vhost/vhost_user.c | 65
> >   +++++++++++++++++++++++++++++++++++++++++++
> >   lib/librte_vhost/virtio_net.c | 20 +++++++++++--
> >   4 files changed, 99 insertions(+), 2 deletions(-)
> > 
> > diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> > index 1cc81c17..26e2c571 100644
> > --- a/lib/librte_vhost/vhost.h
> > +++ b/lib/librte_vhost/vhost.h
> > @@ -137,6 +137,8 @@ struct vhost_virtqueue {
> >   	TAILQ_HEAD(, vhost_iotlb_entry) iotlb_list;
> >   	int				iotlb_cache_nr;
> >   	TAILQ_HEAD(, vhost_iotlb_entry) iotlb_pending_list;
> > +
> > +	rte_spinlock_t	access_lock;
> 
> On previous revision of the patch, Jianfeng mentioned taking the lock
> has an impact when nothing to enqueue/dequeue (80 cycles vs. 50 cycles
> IIRC). I wonder whether moving the lock closer to enabled field would
> make it be in same cache line and so might improve performance.

Good point, I'll move it closer to enabled.


> > +	switch (msg.request.master) {
> > +	case VHOST_USER_SET_FEATURES:
> > +	case VHOST_USER_SET_PROTOCOL_FEATURES:
> > +	case VHOST_USER_SET_OWNER:
> > +	case VHOST_USER_RESET_OWNER:
> > +	case VHOST_USER_SET_MEM_TABLE:
> > +	case VHOST_USER_SET_LOG_BASE:
> > +	case VHOST_USER_SET_LOG_FD:
> > +	case VHOST_USER_SET_VRING_NUM:
> > +	case VHOST_USER_SET_VRING_ADDR:
> > +	case VHOST_USER_SET_VRING_BASE:
> > +	case VHOST_USER_SET_VRING_KICK:
> > +	case VHOST_USER_SET_VRING_CALL:
> > +	case VHOST_USER_SET_VRING_ERR:
> > +	case VHOST_USER_SET_VRING_ENABLE:
> 
> For the VRING specific requests, I think we could only take the
> corresponding VQ lock instead of blocking all queues.
> 
> There might be some exceptions, like GET_VRING_BASE.
> 
> Maybe only protecting critical sections would be better?

I agree in general, but these requests are so rare that I prefer to
have a robust solution instead of dealing with questionable
optimizations prone to bugs. Anyway, we can add more fine-tuning
later on a case-by-case basis, as we become aware of more performance
bottlenecks.

> >   
> >   #include "iotlb.h"
> >   #include "vhost.h"
> > @@ -326,8 +327,11 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t
> > queue_id,
> >   	}
> >   
> >   	vq = dev->virtqueue[queue_id];
> > +
> > +	vhost_user_access_lock(vq);
> I suggested to use rte_spinlock_trylock() when reviewing previous
> revision. It would have the advantage to not block other devices being
> handled on the same CPU whlie handling the message.
> 
> Any thoughts?

If we want trylock, we have two options in this case.  One is to just
return zero if trylock fails and let the caller decide what to do:
retry, issue an error, or yield the CPU to another thread, etc.
I don't think any caller of rte_eth_tx_burst in the current DPDK source
does this. Most frequently it is just treated as an error.

The second option is to invoke sched_yield(), yielding the CPU to
other threads.  I'm in favor of the second option, but looking into
the current DPDK code base I don't see any instances of
sched_yield() being used. Maybe that is for a reason?
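
A minimal sketch of that second option (again with a stand-in type): each
failed trylock gives up the CPU instead of busy-waiting.

#include <sched.h>
#include <rte_spinlock.h>

struct q {				/* stand-in for struct vhost_virtqueue */
	rte_spinlock_t access_lock;
};

static void
lock_or_yield(struct q *vq)
{
	/* Yield to other runnable threads while the control path holds
	 * the lock, instead of spinning on it. */
	while (rte_spinlock_trylock(&vq->access_lock) == 0)
		sched_yield();
}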

> 
> > +
> >   	if (unlikely(vq->enabled == 0))
> > -		return 0;
> > +		goto out;
> >   
> >   	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
> >   		vhost_user_iotlb_rd_lock(vq);
> > @@ -416,6 +420,7 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t
> > queue_id,
> >   			&& (vq->callfd >= 0))
> >   		eventfd_write(vq->callfd, (eventfd_t)1);
> >   out:
> > +	vhost_user_access_unlock(vq);
> 
> I think it should be moved after the iotlb lock unlock:
> LOCK A
> LOCK B
> UNLOCK B
> UNLOCK A

Yep, good catch.

> 
> >   	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
> >   		vhost_user_iotlb_rd_unlock(vq);
> >   
> > @@ -651,8 +656,11 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t
> > queue_id,
> >   	}
> >   
> >   	vq = dev->virtqueue[queue_id];
> > +
> > +	vhost_user_access_lock(vq);
> > +
> >   	if (unlikely(vq->enabled == 0))
> > -		return 0;
> > +		goto out;
> >   
> >   	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
> >   		vhost_user_iotlb_rd_lock(vq);
> > @@ -712,6 +720,7 @@ virtio_dev_merge_rx(struct virtio_net *dev, uint16_t
> > queue_id,
> >   	}
> >   
> >   out:
> > +	vhost_user_access_unlock(vq);
> 
> Ditto
> 
> >   	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
> >   		vhost_user_iotlb_rd_unlock(vq);
> >   
> > @@ -1180,9 +1189,12 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
> >   	}
> >   
> >   	vq = dev->virtqueue[queue_id];
> > +
> >   	if (unlikely(vq->enabled == 0))
> >   		return 0;
> >   
> > +	vhost_user_access_lock(vq);
> > +
> 
> To be consistent with other paths, I guess you should lock before
> checking the vq is enabled.

Oops. Missed this one, thanks.

> 
> >   	vq->batch_copy_nb_elems = 0;
> >   
> >   	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
> > @@ -1240,6 +1252,7 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
> >   		if (rarp_mbuf == NULL) {
> >   			RTE_LOG(ERR, VHOST_DATA,
> >   				"Failed to allocate memory for mbuf.\n");
> > +			vhost_user_access_unlock(vq);
> >   			return 0;
> >   		}
> >   
> > @@ -1356,6 +1369,8 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
> >   	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
> >   		vhost_user_iotlb_rd_unlock(vq);
> >   
> > +	vhost_user_access_unlock(vq);
> > +
> >   	if (unlikely(rarp_mbuf != NULL)) {
> >   		/*
> >   		 * Inject it to the head of "pkts" array, so that switch's mac
> > @@ -1366,5 +1381,6 @@ rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
> >   		i += 1;
> >   	}
> >   
> > +
> 
> Remove trailing line.
> 
> >   	return i;
> >   }
> > 
> 
> Thanks,
> Maxime
>
  
Maxime Coquelin Dec. 14, 2017, 5:28 p.m. UTC | #3
On 12/14/2017 03:21 PM, Victor Kaplansky wrote:
> 
> 
> ----- Original Message -----
>> From: "Maxime Coquelin" <maxime.coquelin@redhat.com>
>> To: "Victor Kaplansky" <victork@redhat.com>, dev@dpdk.org
>> Cc: stable@dpdk.org, "Jens Freimann" <jfreiman@redhat.com>, "Yuanhan Liu" <yliu@fridaylinux.org>, "Tiwei Bie"
>> <tiwei.bie@intel.com>, "Jianfeng Tan" <jianfeng.tan@intel.com>
>> Sent: Thursday, December 14, 2017 2:16:23 PM
>> Subject: Re: [PATCH v3] vhost_user: protect active rings from async ring changes
>>
>> Hi Victor,
>>
>> On 12/14/2017 12:35 PM, Victor Kaplansky wrote:
>>> When performing live migration or memory hot-plugging,
>>> the changes to the device and vrings made by message handler
>>> done independently from vring usage by PMD threads.
>>>
>>> This causes for example segfaults during live-migration
>>> with MQ enable, but in general virtually any request
>>> sent by qemu changing the state of device can cause
>>> problems.
>>>
>>> These patches fixes all above issues by adding a spinlock
>>> to every vring and requiring message handler to start operation
>>> only after ensuring that all PMD threads related to the device
>>> are out of critical section accessing the vring data.
>>>
>>> Each vring has its own lock in order to not create contention
>>> between PMD threads of different vrings and to prevent
>>> performance degradation by scaling queue pair number.
>>>
>>> See https://bugzilla.redhat.com/show_bug.cgi?id=1450680
>>>
>>> Signed-off-by: Victor Kaplansky <victork@redhat.com>
>>> Tested-by: Maxime Coquelin <maxime.coquelin@redhat.com>
>>
>> Sorry, but I didn't tested this patch. I just benchmarked the fact to
>> add a lock in the hot paths.
>>
>>> ---
>>>
>>> v3:
>>>      o Added locking to enqueue flow.
>>>      o Enqueue path guarded as well as dequeue path.
>>>      o Changed name of active_lock.
>>>      o Added initialization of guarding spinlock.
>>>      o Reworked functions skimming over all virt-queues.
>>>      o Performance measurements done by Maxime Coquelin shows
>>>        no degradation in bandwidth and throughput.
>>>      o Spelling.
>>>      o Taking lock only on set operations.
>>>      o IOMMU messages are not guarded by access lock.
>>>
>>> v2:
>>>      o Fixed checkpatch complains.
>>>      o Added Signed-off-by.
>>>      o Refined placement of guard to exclude IOMMU messages.
>>>      o TODO: performance degradation measurement.
>>>
>>>
>>>    lib/librte_vhost/vhost.h      | 15 ++++++++++
>>>    lib/librte_vhost/vhost.c      |  1 +
>>>    lib/librte_vhost/vhost_user.c | 65
>>>    +++++++++++++++++++++++++++++++++++++++++++
>>>    lib/librte_vhost/virtio_net.c | 20 +++++++++++--
>>>    4 files changed, 99 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
>>> index 1cc81c17..26e2c571 100644
>>> --- a/lib/librte_vhost/vhost.h
>>> +++ b/lib/librte_vhost/vhost.h
>>> @@ -137,6 +137,8 @@ struct vhost_virtqueue {
>>>    	TAILQ_HEAD(, vhost_iotlb_entry) iotlb_list;
>>>    	int				iotlb_cache_nr;
>>>    	TAILQ_HEAD(, vhost_iotlb_entry) iotlb_pending_list;
>>> +
>>> +	rte_spinlock_t	access_lock;
>>
>> On previous revision of the patch, Jianfeng mentioned taking the lock
>> has an impact when nothing to enqueue/dequeue (80 cycles vs. 50 cycles
>> IIRC). I wonder whether moving the lock closer to enabled field would
>> make it be in same cache line and so might improve performance.
> 
> Good point, I'll move it closer to enable.
> 
> 
>>> +	switch (msg.request.master) {
>>> +	case VHOST_USER_SET_FEATURES:
>>> +	case VHOST_USER_SET_PROTOCOL_FEATURES:
>>> +	case VHOST_USER_SET_OWNER:
>>> +	case VHOST_USER_RESET_OWNER:
>>> +	case VHOST_USER_SET_MEM_TABLE:
>>> +	case VHOST_USER_SET_LOG_BASE:
>>> +	case VHOST_USER_SET_LOG_FD:
>>> +	case VHOST_USER_SET_VRING_NUM:
>>> +	case VHOST_USER_SET_VRING_ADDR:
>>> +	case VHOST_USER_SET_VRING_BASE:
>>> +	case VHOST_USER_SET_VRING_KICK:
>>> +	case VHOST_USER_SET_VRING_CALL:
>>> +	case VHOST_USER_SET_VRING_ERR:
>>> +	case VHOST_USER_SET_VRING_ENABLE:
>>
>> For the VRING specific requests, I think we could only take the
>> corresponding VQ lock instead of blocking all queues.
>>
>> There might be some exceptions, like GET_VRING_BASE.
>>
>> Maybe only protecting critical sections would be better?
> 
> I agree in general, but requests are so rare, that I prefer to
> have a robust solution instead of dealing with questionable
> optimizations prone to bugs. Anyway, we can add more fine tuning
> later on case by case basis, as we become aware of more performance
> bottlenecks.

Ok, but take care to avoid the possible issue with GET_VRING_BASE, as
we discussed on IRC.

>>>    
>>>    #include "iotlb.h"
>>>    #include "vhost.h"
>>> @@ -326,8 +327,11 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t
>>> queue_id,
>>>    	}
>>>    
>>>    	vq = dev->virtqueue[queue_id];
>>> +
>>> +	vhost_user_access_lock(vq);
>> I suggested to use rte_spinlock_trylock() when reviewing previous
>> revision. It would have the advantage to not block other devices being
>> handled on the same CPU whlie handling the message.
>>
>> Any thoughts?
> 
> If we want trylock, we have two options in this case.  One is just
> return zero if trylock fails and give the caller to decide what to
> do: retry, issue an error, or yield the cpu to another thread, etc.
> I don't think any caller of rte_eth_tx_burst in current dpdk source
> doing this. Most frequently it is just considered as an error.

Good point. It could be used for the dequeue path though.
But maybe better not to do it for consistency.

> The second option is to invoke sched_yield() yielding the cpu to
> other threads.  I'm in favor of second option, but looking into
> current dpdk code base I don't see any instances of using
> sched_yield(). Maybe it is for a reason?

Yes, because the CPUs running PMD threads are likely to be isolated
from kernel scheduling.

Maxime
  

Patch

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 1cc81c17..26e2c571 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -137,6 +137,8 @@  struct vhost_virtqueue {
 	TAILQ_HEAD(, vhost_iotlb_entry) iotlb_list;
 	int				iotlb_cache_nr;
 	TAILQ_HEAD(, vhost_iotlb_entry) iotlb_pending_list;
+
+	rte_spinlock_t	access_lock;
 } __rte_cache_aligned;
 
 /* Old kernels have no such macros defined */
@@ -302,6 +304,19 @@  vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	vhost_log_write(dev, vq->log_guest_addr + offset, len);
 }
 
+static __rte_always_inline void
+vhost_user_access_lock(struct vhost_virtqueue *vq)
+{
+	rte_spinlock_lock(&vq->access_lock);
+}
+
+static __rte_always_inline void
+vhost_user_access_unlock(struct vhost_virtqueue *vq)
+{
+	rte_spinlock_unlock(&vq->access_lock);
+}
+
+
 /* Macros for printing using RTE_LOG */
 #define RTE_LOGTYPE_VHOST_CONFIG RTE_LOGTYPE_USER1
 #define RTE_LOGTYPE_VHOST_DATA   RTE_LOGTYPE_USER1
diff --git a/lib/librte_vhost/vhost.c b/lib/librte_vhost/vhost.c
index 4f8b73a0..dcc42fc7 100644
--- a/lib/librte_vhost/vhost.c
+++ b/lib/librte_vhost/vhost.c
@@ -259,6 +259,7 @@  alloc_vring_queue(struct virtio_net *dev, uint32_t vring_idx)
 
 	dev->virtqueue[vring_idx] = vq;
 	init_vring_queue(dev, vring_idx);
+	rte_spinlock_init(&vq->access_lock);
 
 	dev->nr_vring += 1;
 
diff --git a/lib/librte_vhost/vhost_user.c b/lib/librte_vhost/vhost_user.c
index f4c7ce46..a5b5e4be 100644
--- a/lib/librte_vhost/vhost_user.c
+++ b/lib/librte_vhost/vhost_user.c
@@ -1190,12 +1190,48 @@  vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev, VhostUserMsg *msg)
 	return alloc_vring_queue(dev, vring_idx);
 }
 
+static void
+vhost_user_lock_all_queue_pairs(struct virtio_net *dev)
+{
+	unsigned int i = 0;
+	unsigned int vq_num = 0;
+
+	while (vq_num < dev->nr_vring) {
+		struct vhost_virtqueue *vq = dev->virtqueue[i];
+
+		if (vq) {
+			vhost_user_access_lock(vq);
+			vq_num++;
+		}
+		i++;
+	}
+}
+
+static void
+vhost_user_unlock_all_queue_pairs(struct virtio_net *dev)
+{
+	unsigned int i = 0;
+	unsigned int vq_num = 0;
+
+	while (vq_num < dev->nr_vring) {
+		struct vhost_virtqueue *vq = dev->virtqueue[i];
+
+		if (vq) {
+			vhost_user_access_unlock(vq);
+			vq_num++;
+		}
+		i++;
+	}
+}
+
 int
 vhost_user_msg_handler(int vid, int fd)
 {
 	struct virtio_net *dev;
 	struct VhostUserMsg msg;
 	int ret;
+	int unlock_required = 0;
+
 
 	dev = get_device(vid);
 	if (dev == NULL)
@@ -1241,6 +1277,32 @@  vhost_user_msg_handler(int vid, int fd)
 		return -1;
 	}
 
+	switch (msg.request.master) {
+	case VHOST_USER_SET_FEATURES:
+	case VHOST_USER_SET_PROTOCOL_FEATURES:
+	case VHOST_USER_SET_OWNER:
+	case VHOST_USER_RESET_OWNER:
+	case VHOST_USER_SET_MEM_TABLE:
+	case VHOST_USER_SET_LOG_BASE:
+	case VHOST_USER_SET_LOG_FD:
+	case VHOST_USER_SET_VRING_NUM:
+	case VHOST_USER_SET_VRING_ADDR:
+	case VHOST_USER_SET_VRING_BASE:
+	case VHOST_USER_SET_VRING_KICK:
+	case VHOST_USER_SET_VRING_CALL:
+	case VHOST_USER_SET_VRING_ERR:
+	case VHOST_USER_SET_VRING_ENABLE:
+	case VHOST_USER_SEND_RARP:
+	case VHOST_USER_NET_SET_MTU:
+	case VHOST_USER_SET_SLAVE_REQ_FD:
+		vhost_user_lock_all_queue_pairs(dev);
+		unlock_required = 1;
+		break;
+	default:
+		break;
+
+	}
+
 	switch (msg.request.master) {
 	case VHOST_USER_GET_FEATURES:
 		msg.payload.u64 = vhost_user_get_features(dev);
@@ -1342,6 +1404,9 @@  vhost_user_msg_handler(int vid, int fd)
 
 	}
 
+	if (unlock_required)
+		vhost_user_unlock_all_queue_pairs(dev);
+
 	if (msg.flags & VHOST_USER_NEED_REPLY) {
 		msg.payload.u64 = !!ret;
 		msg.size = sizeof(msg.payload.u64);
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 6fee16e5..b584e380 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -44,6 +44,7 @@ 
 #include <rte_udp.h>
 #include <rte_sctp.h>
 #include <rte_arp.h>
+#include <rte_spinlock.h>
 
 #include "iotlb.h"
 #include "vhost.h"
@@ -326,8 +327,11 @@  virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 	}
 
 	vq = dev->virtqueue[queue_id];
+
+	vhost_user_access_lock(vq);
+
 	if (unlikely(vq->enabled == 0))
-		return 0;
+		goto out;
 
 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
 		vhost_user_iotlb_rd_lock(vq);
@@ -416,6 +420,7 @@  virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
 			&& (vq->callfd >= 0))
 		eventfd_write(vq->callfd, (eventfd_t)1);
 out:
+	vhost_user_access_unlock(vq);
 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
 		vhost_user_iotlb_rd_unlock(vq);
 
@@ -651,8 +656,11 @@  virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
 	}
 
 	vq = dev->virtqueue[queue_id];
+
+	vhost_user_access_lock(vq);
+
 	if (unlikely(vq->enabled == 0))
-		return 0;
+		goto out;
 
 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
 		vhost_user_iotlb_rd_lock(vq);
@@ -712,6 +720,7 @@  virtio_dev_merge_rx(struct virtio_net *dev, uint16_t queue_id,
 	}
 
 out:
+	vhost_user_access_unlock(vq);
 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
 		vhost_user_iotlb_rd_unlock(vq);
 
@@ -1180,9 +1189,12 @@  rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 	}
 
 	vq = dev->virtqueue[queue_id];
+
 	if (unlikely(vq->enabled == 0))
 		return 0;
 
+	vhost_user_access_lock(vq);
+
 	vq->batch_copy_nb_elems = 0;
 
 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
@@ -1240,6 +1252,7 @@  rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 		if (rarp_mbuf == NULL) {
 			RTE_LOG(ERR, VHOST_DATA,
 				"Failed to allocate memory for mbuf.\n");
+			vhost_user_access_unlock(vq);
 			return 0;
 		}
 
@@ -1356,6 +1369,8 @@  rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
 		vhost_user_iotlb_rd_unlock(vq);
 
+	vhost_user_access_unlock(vq);
+
 	if (unlikely(rarp_mbuf != NULL)) {
 		/*
 		 * Inject it to the head of "pkts" array, so that switch's mac
@@ -1366,5 +1381,6 @@  rte_vhost_dequeue_burst(int vid, uint16_t queue_id,
 		i += 1;
 	}
 
+
 	return i;
 }