[dpdk-dev] [PATCH v2] vhost: improve dirty pages logging performance

Maxime Coquelin maxime.coquelin at redhat.com
Wed May 16 09:49:45 CEST 2018



On 05/16/2018 08:21 AM, Tiwei Bie wrote:
> On Tue, May 15, 2018 at 07:30:21PM +0200, Maxime Coquelin wrote:
>> This patch caches all dirty pages logging until the used ring index
>> is updated. These dirty pages won't be accessed by the guest as
>> long as the host doesn't give them back to it by updating the
>> index.
>>
>> The goal of this optimization is to fix a performance regression
>> introduced when the vhost library started to use atomic operations
>> to set bits in the shared dirty log map. While the fix was valid
>> as previous implementation wasn't safe against concurent accesses,
> 
> Typo: concurent

Right.

>> contention was induced.
>>
>> With this patch, during migration, we have:
>> 1. Less atomic operations as only a single atomic OR operation
>> per 32 or 64 (depending on CPU)  pages.
>> 2. Less atomic operations as during a burst, the same page will
>> be marked dirty only once.
>> 3. Less write memory barriers.
>>
>> Fixes: 897f13a1f726 ("vhost: make page logging atomic")
>>
>> Cc: stable at dpdk.org
>>
>> Cc: Tiwei Bie <tiwei.bie at intel.com>
>> Suggested-by: Michael S. Tsirkin <mst at redhat.com>
>> Signed-off-by: Maxime Coquelin <maxime.coquelin at redhat.com>
> 
> This is a nice approach! Thanks for the work!

Thanks for the review, it is really appreciated.

>> ---
>>   lib/librte_vhost/vhost.h      | 113 ++++++++++++++++++++++++++++++++++++++++++
>>   lib/librte_vhost/virtio_net.c |  29 +++++++----
>>   2 files changed, 132 insertions(+), 10 deletions(-)
>>
>> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
>> index 891978131..8f6a41d7e 100644
>> --- a/lib/librte_vhost/vhost.h
>> +++ b/lib/librte_vhost/vhost.h
>> @@ -36,6 +36,8 @@
>>   
>>   #define BUF_VECTOR_MAX 256
>>   
>> +#define VHOST_LOG_CACHE_NR 32
>> +
>>   /**
>>    * Structure contains buffer address, length and descriptor index
>>    * from vring to do scatter RX.
>> @@ -69,6 +71,14 @@ struct batch_copy_elem {
>>   	uint64_t log_addr;
>>   };
>>   
>> +/*
>> + * Structure that contains the info for batched dirty logging.
>> + */
>> +struct log_cache_entry {
>> +	uint32_t offset;
>> +	unsigned long val;
>> +};
>> +
>>   /**
>>    * Structure contains variables relevant to RX/TX virtqueues.
>>    */
>> @@ -112,6 +122,9 @@ struct vhost_virtqueue {
>>   	struct batch_copy_elem	*batch_copy_elems;
>>   	uint16_t		batch_copy_nb_elems;
>>   
>> +	struct log_cache_entry log_cache[VHOST_LOG_CACHE_NR];
>> +	uint16_t log_cache_nb_elem;
>> +
>>   	rte_rwlock_t	iotlb_lock;
>>   	rte_rwlock_t	iotlb_pending_lock;
>>   	struct rte_mempool *iotlb_pool;
>> @@ -309,7 +322,15 @@ struct virtio_net {
>>   static __rte_always_inline void
>>   vhost_set_bit(unsigned int nr, volatile uint8_t *addr)
>>   {
>> +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100)
> 
> Just curious, is there any reference about why
> this version was chosen? Thanks!

I googled Michael's reference to the LWN article [0], and they mention GCC
7.1.

I haven't checked myself, though, whether the generated code is different 
in GCC >= 7.1.

[0]: https://lwn.net/Articles/691128/
> 
>> +		/*
>> +		 * __sync_ built-ins are deprecated, but __atomic_ ones
>> +		 * are sub-optimized in older GCC versions.
>> +		 */
> 
> The indent isn't right (just need one tab here).
Right, will fix.

> 
>>   	__sync_fetch_and_or_8(addr, (1U << nr));

This is unrelated to this patch set, but from the GCC documentation [1],
shouldn't we use __sync_fetch_and_or_1, as the size is in bytes?

[1]: https://gcc.gnu.org/onlinedocs/gcc/_005f_005fsync-Builtins.html

>> +#else
>> +	__atomic_fetch_or(addr, (1U << nr), __ATOMIC_RELAXED);
>> +#endif
>>   }
>>   
>>   static __rte_always_inline void
>> @@ -340,6 +361,98 @@ vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
>>   	}
>>   }
>>   
>> +static __rte_always_inline void
>> +vhost_log_cache_sync(struct virtio_net *dev, struct vhost_virtqueue *vq)
>> +{
>> +	unsigned long *log_base;
>> +	int i;
>> +
>> +	if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
>> +		   !dev->log_base))
>> +		return;
>> +
>> +	log_base = (unsigned long *)(uintptr_t)dev->log_base;
>> +
>> +	/* To make sure guest memory updates are committed before logging */
>> +	rte_smp_wmb();
>> +
>> +	for (i = 0; i < vq->log_cache_nb_elem; i++) {
>> +		struct log_cache_entry *elem = vq->log_cache + i;
>> +
>> +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100)
>> +		/*
>> +		 * '__sync' builtins are deprecated, but '__atomic' ones
>> +		 * are sub-optimized in older GCC versions.
>> +		 */
>> +		__sync_fetch_and_or(log_base + elem->offset, elem->val);
>> +#else
>> +		__atomic_fetch_or(log_base + elem->offset, elem->val,
>> +				__ATOMIC_RELAXED);
>> +#endif
>> +	}
>> +
>> +	vq->log_cache_nb_elem = 0;
>> +}
>> +
>> +static __rte_always_inline void
>> +vhost_log_cache_page(struct virtio_net *dev, struct vhost_virtqueue *vq,
>> +					 uint64_t page)
>> +{
>> +	uint32_t bit_nr = page % (sizeof(unsigned long) << 3);
>> +	uint32_t offset = page / (sizeof(unsigned long) << 3);
>> +	int i;
>> +
>> +	for (i = 0; i < vq->log_cache_nb_elem; i++) {
>> +		struct log_cache_entry *elem = vq->log_cache + i;
>> +
>> +		if (elem->offset == offset) {
>> +			elem->val |= (1U << bit_nr);
> 
> The val is unsigned long now, we need to use 1UL.

Good catch!

>> +			return;
>> +		}
>> +	}
>> +
>> +	if (unlikely(i >= VHOST_LOG_CACHE_NR)) {
>> +		/*
>> +		 * No more room for a new log cache entry,
>> +		 * so write the dirty log map directly.
>> +		 */
>> +		rte_smp_wmb();
>> +		vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
>> +
>> +		return;
>> +	}
>> +
>> +	vq->log_cache[i].offset = offset;
>> +	vq->log_cache[i].val = (1U << bit_nr);
> 
> Ditto.
> 
>> +}
>> +
>> +static __rte_always_inline void
>> +vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq,
>> +					  uint64_t addr, uint64_t len)
> 
> The 8 spaces width tabs are more widely used in DPDK.
> And in below coding style document,
> 
> https://github.com/DPDK/dpdk/blob/master/doc/guides/contributing/coding_style.rst
> 
> The width of each level indent in most examples is 8
> spaces. So maybe it's better to keep using 8 spaces
> width tabs.

Right, will fix this in v3.

>> +{
>> +	uint64_t page;
>> +
>> +	if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
>> +		   !dev->log_base || !len))
>> +		return;
>> +
>> +	if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
>> +		return;
>> +
>> +	page = addr / VHOST_LOG_PAGE;
>> +	while (page * VHOST_LOG_PAGE < addr + len) {
>> +		vhost_log_cache_page(dev, vq, page);
>> +		page += 1;
>> +	}
>> +}
>> +
>> +static __rte_always_inline void
>> +vhost_log_cache_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
>> +		     uint64_t offset, uint64_t len)
>> +{
>> +	vhost_log_cache_write(dev, vq, vq->log_guest_addr + offset, len);
>> +}
>> +
>>   static __rte_always_inline void
>>   vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>   		     uint64_t offset, uint64_t len)
>> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
>> index eed6b0227..76ec5f089 100644
>> --- a/lib/librte_vhost/virtio_net.c
>> +++ b/lib/librte_vhost/virtio_net.c
>> @@ -78,7 +78,7 @@ do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
>>   	rte_memcpy(&vq->used->ring[to],
>>   			&vq->shadow_used_ring[from],
>>   			size * sizeof(struct vring_used_elem));
>> -	vhost_log_used_vring(dev, vq,
>> +	vhost_log_cache_used_vring(dev, vq,
>>   			offsetof(struct vring_used, ring[to]),
>>   			size * sizeof(struct vring_used_elem));
>>   }
>> @@ -106,6 +106,8 @@ flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq)
>>   
>>   	rte_smp_wmb();
>>   
>> +	vhost_log_cache_sync(dev, vq);
> 
> Each time we call vhost_log_cache_sync(), there
> is already a rte_smp_wmb() which is to protect
> the used->idx update. So maybe there is no need
> to call rte_smp_wmb() in vhost_log_cache_sync().

Right, I can remove it in vhost_log_cache_sync(), and
maybe add a comment there stating that a write barrier
before calling the function is expected.

Thanks,
Maxime
> Best regards,
> Tiwei Bie
> 


More information about the dev mailing list