[dpdk-dev] [PATCH v3] vhost: improve dirty pages logging performance

Michael S. Tsirkin mst at redhat.com
Thu May 17 05:06:34 CEST 2018


On Wed, May 16, 2018 at 06:54:23PM +0200, Maxime Coquelin wrote:
> This patch caches all dirty page logging until the used ring index
> is updated.
> 
> The goal of this optimization is to fix a performance regression
> introduced when the vhost library started to use atomic operations
> to set bits in the shared dirty log map. While that fix was valid,
> as the previous implementation wasn't safe against concurrent
> accesses, it induced contention.
> 
> With this patch, during migration, we have:
> 1. Fewer atomic operations, as a single atomic OR operation covers
> 32 or 64 pages (depending on the CPU's word size).
> 2. Fewer atomic operations, as during a burst the same page is
> marked dirty only once.
> 3. Fewer write memory barriers.
> 
> Fixes: 897f13a1f726 ("vhost: make page logging atomic")
> 
> Cc: stable at dpdk.org
> 
> Cc: Tiwei Bie <tiwei.bie at intel.com>
> Suggested-by: Michael S. Tsirkin <mst at redhat.com>
> Signed-off-by: Maxime Coquelin <maxime.coquelin at redhat.com>


Deferring updates until GET_BASE would also be possible,
but it would increase the chance that a disconnect causes
the ring to become inconsistent.

I'm not sure whether there is a chance of that with this
patch (in case of a crash after the used idx is updated but
before the dirty log update for the used idx), but at least
the window is no bigger than before this patch.
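For readers skimming the archive, here is a minimal standalone sketch of
the caching scheme the commit message describes. All names here are
hypothetical and the memory barriers are omitted for brevity; the real
per-virtqueue implementation is in the diff below.

#include <stdint.h>

#define LOG_CACHE_NR	32
/* 32 or 64 dirty-page bits per cached word, depending on the CPU */
#define LOG_WORD_BITS	(sizeof(unsigned long) * 8)

struct log_cache_entry {
	uint32_t offset;	/* index of the word in the shared log map */
	unsigned long val;	/* dirty bits accumulated for that word */
};

static struct log_cache_entry log_cache[LOG_CACHE_NR];
static uint16_t log_cache_nb_elem;

/* Mark one page dirty in the private cache: no atomics, no contention. */
static void
log_cache_page(unsigned long *log_base, uint64_t page)
{
	uint32_t offset = page / LOG_WORD_BITS;
	uint32_t bit_nr = page % LOG_WORD_BITS;
	uint16_t i;

	for (i = 0; i < log_cache_nb_elem; i++) {
		if (log_cache[i].offset == offset) {
			/* Page's word hit again during the burst: plain OR. */
			log_cache[i].val |= 1UL << bit_nr;
			return;
		}
	}

	if (i == LOG_CACHE_NR) {
		/* Cache full: log this page in the shared map directly. */
		__atomic_fetch_or(&log_base[offset], 1UL << bit_nr,
				__ATOMIC_RELAXED);
		return;
	}

	log_cache[i].offset = offset;
	log_cache[i].val = 1UL << bit_nr;
	log_cache_nb_elem++;
}

/* Flush before publishing used->idx: one atomic OR per cached word,
 * i.e. one atomic operation per 32/64 pages instead of one per page. */
static void
log_cache_sync(unsigned long *log_base)
{
	uint16_t i;

	for (i = 0; i < log_cache_nb_elem; i++)
		__atomic_fetch_or(&log_base[log_cache[i].offset],
				log_cache[i].val, __ATOMIC_RELAXED);
	log_cache_nb_elem = 0;
}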

> ---
> Changes since v2:
> - Fix indentation (Tiwei)
> - Use 1UL for bit shifting, as the value is an unsigned long (Tiwei)
> - Remove the additional rte_smp_wmb() in vhost_log_cache_sync() (Tiwei)
> - Change __sync_fetch_and_or_8 to __sync_fetch_and_or_1, as the log map
>   entries are one byte wide.
> 
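A note on the last changelog point, for anyone puzzled by the suffix: the
number on the type-specific __sync builtins is the operand size in bytes,
and the log map is addressed as uint8_t, so _1 is the correct width. A
quick illustration (not from the patch):

	uint8_t map[2] = { 0, 0 };

	/* 1-byte atomic OR: only map[0] is read and written. */
	__sync_fetch_and_or_1(&map[0], 1U << 3);
	/* __sync_fetch_and_or_8() would instead perform an 8-byte
	 * read-modify-write starting at &map[0]. */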
>  lib/librte_vhost/vhost.h      | 117 +++++++++++++++++++++++++++++++++++++++++-
>  lib/librte_vhost/virtio_net.c |  29 +++++++----
>  2 files changed, 135 insertions(+), 11 deletions(-)
> 
> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
> index 891978131..0e4e433e0 100644
> --- a/lib/librte_vhost/vhost.h
> +++ b/lib/librte_vhost/vhost.h
> @@ -36,6 +36,8 @@
>  
>  #define BUF_VECTOR_MAX 256
>  
> +#define VHOST_LOG_CACHE_NR 32
> +
>  /**
>   * Structure contains buffer address, length and descriptor index
>   * from vring to do scatter RX.
> @@ -69,6 +71,14 @@ struct batch_copy_elem {
>  	uint64_t log_addr;
>  };
>  
> +/*
> + * Structure that contains the info for batched dirty logging.
> + */
> +struct log_cache_entry {
> +	uint32_t offset;
> +	unsigned long val;
> +};
> +
>  /**
>   * Structure contains variables relevant to RX/TX virtqueues.
>   */
> @@ -112,6 +122,9 @@ struct vhost_virtqueue {
>  	struct batch_copy_elem	*batch_copy_elems;
>  	uint16_t		batch_copy_nb_elems;
>  
> +	struct log_cache_entry log_cache[VHOST_LOG_CACHE_NR];
> +	uint16_t log_cache_nb_elem;
> +
>  	rte_rwlock_t	iotlb_lock;
>  	rte_rwlock_t	iotlb_pending_lock;
>  	struct rte_mempool *iotlb_pool;
> @@ -309,7 +322,15 @@ struct virtio_net {
>  static __rte_always_inline void
>  vhost_set_bit(unsigned int nr, volatile uint8_t *addr)
>  {
> -	__sync_fetch_and_or_8(addr, (1U << nr));
> +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100)
> +	/*
> +	 * __sync_ built-ins are deprecated, but __atomic_ ones
> +	 * are sub-optimized in older GCC versions.
> +	 */
> +	__sync_fetch_and_or_1(addr, (1U << nr));
> +#else
> +	__atomic_fetch_or(addr, (1U << nr), __ATOMIC_RELAXED);
> +#endif
>  }
>  
>  static __rte_always_inline void
> @@ -340,6 +361,100 @@ vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
>  	}
>  }
>  
> +static __rte_always_inline void
> +vhost_log_cache_sync(struct virtio_net *dev, struct vhost_virtqueue *vq)
> +{
> +	unsigned long *log_base;
> +	int i;
> +
> +	if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
> +		   !dev->log_base))
> +		return;
> +
> +	log_base = (unsigned long *)(uintptr_t)dev->log_base;
> +
> +	/*
> +	 * It is expected a write memory barrier has been issued
> +	 * before this function is called.
> +	 */
> +
> +	for (i = 0; i < vq->log_cache_nb_elem; i++) {
> +		struct log_cache_entry *elem = vq->log_cache + i;
> +
> +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100)
> +		/*
> +		 * '__sync' builtins are deprecated, but '__atomic' ones
> +		 * are sub-optimized in older GCC versions.
> +		 */
> +		__sync_fetch_and_or(log_base + elem->offset, elem->val);
> +#else
> +		__atomic_fetch_or(log_base + elem->offset, elem->val,
> +				__ATOMIC_RELAXED);
> +#endif
> +	}
> +
> +	vq->log_cache_nb_elem = 0;
> +}
> +
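The barrier expectation in the comment above maps onto the call sequence
this patch uses at sync points (see flush_shadow_used_ring() below),
roughly:

	/*
	 *	vhost_log_cache_used_vring(dev, vq, ...);  ring writes cached
	 *	rte_smp_wmb();            orders ring/data writes before log
	 *	vhost_log_cache_sync(dev, vq);    batched atomic ORs, no wmb
	 *	vq->used->idx += n;               publish to the guest
	 *	vhost_log_used_vring(dev, vq, ...);  log the used idx page
	 */

which also makes visible the crash window mentioned earlier: between the
used idx update and the log write for the used idx page.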
> +static __rte_always_inline void
> +vhost_log_cache_page(struct virtio_net *dev, struct vhost_virtqueue *vq,
> +			uint64_t page)
> +{
> +	uint32_t bit_nr = page % (sizeof(unsigned long) << 3);
> +	uint32_t offset = page / (sizeof(unsigned long) << 3);
> +	int i;
> +
> +	for (i = 0; i < vq->log_cache_nb_elem; i++) {
> +		struct log_cache_entry *elem = vq->log_cache + i;
> +
> +		if (elem->offset == offset) {
> +			elem->val |= (1UL << bit_nr);
> +			return;
> +		}
> +	}
> +
> +	if (unlikely(i >= VHOST_LOG_CACHE_NR)) {
> +		/*
> +		 * No more room for a new log cache entry,
> +		 * so write the dirty log map directly.
> +		 */
> +		rte_smp_wmb();
> +		vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
> +
> +		return;
> +	}
> +
> +	vq->log_cache[i].offset = offset;
> +	vq->log_cache[i].val = (1UL << bit_nr);
> +	vq->log_cache_nb_elem++;
> +}
> +
> +static __rte_always_inline void
> +vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq,
> +			uint64_t addr, uint64_t len)
> +{
> +	uint64_t page;
> +
> +	if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
> +		   !dev->log_base || !len))
> +		return;
> +
> +	if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
> +		return;
> +
> +	page = addr / VHOST_LOG_PAGE;
> +	while (page * VHOST_LOG_PAGE < addr + len) {
> +		vhost_log_cache_page(dev, vq, page);
> +		page += 1;
> +	}
> +}
> +
> +static __rte_always_inline void
> +vhost_log_cache_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
> +			uint64_t offset, uint64_t len)
> +{
> +	vhost_log_cache_write(dev, vq, vq->log_guest_addr + offset, len);
> +}
> +
>  static __rte_always_inline void
>  vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  		     uint64_t offset, uint64_t len)
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index eed6b0227..76ec5f089 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -78,7 +78,7 @@ do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  	rte_memcpy(&vq->used->ring[to],
>  			&vq->shadow_used_ring[from],
>  			size * sizeof(struct vring_used_elem));
> -	vhost_log_used_vring(dev, vq,
> +	vhost_log_cache_used_vring(dev, vq,
>  			offsetof(struct vring_used, ring[to]),
>  			size * sizeof(struct vring_used_elem));
>  }
> @@ -106,6 +106,8 @@ flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq)
>  
>  	rte_smp_wmb();
>  
> +	vhost_log_cache_sync(dev, vq);
> +
>  	*(volatile uint16_t *)&vq->used->idx += vq->shadow_used_idx;
>  	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
>  		sizeof(vq->used->idx));
> @@ -130,7 +132,7 @@ do_data_copy_enqueue(struct virtio_net *dev, struct vhost_virtqueue *vq)
>  
>  	for (i = 0; i < count; i++) {
>  		rte_memcpy(elem[i].dst, elem[i].src, elem[i].len);
> -		vhost_log_write(dev, elem[i].log_addr, elem[i].len);
> +		vhost_log_cache_write(dev, vq, elem[i].log_addr, elem[i].len);
>  		PRINT_PACKET(dev, (uintptr_t)elem[i].dst, elem[i].len, 0);
>  	}
>  }
> @@ -251,7 +253,7 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  		virtio_enqueue_offload(m,
>  				(struct virtio_net_hdr *)(uintptr_t)desc_addr);
>  		PRINT_PACKET(dev, (uintptr_t)desc_addr, dev->vhost_hlen, 0);
> -		vhost_log_write(dev, desc_gaddr, dev->vhost_hlen);
> +		vhost_log_cache_write(dev, vq, desc_gaddr, dev->vhost_hlen);
>  	} else {
>  		struct virtio_net_hdr vnet_hdr;
>  		uint64_t remain = dev->vhost_hlen;
> @@ -274,7 +276,7 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  					(void *)(uintptr_t)src, len);
>  
>  			PRINT_PACKET(dev, (uintptr_t)dst, (uint32_t)len, 0);
> -			vhost_log_write(dev, guest_addr, len);
> +			vhost_log_cache_write(dev, vq, guest_addr, len);
>  			remain -= len;
>  			guest_addr += len;
>  			src += len;
> @@ -355,7 +357,8 @@ copy_mbuf_to_desc(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  							desc_offset)),
>  				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
>  				cpy_len);
> -			vhost_log_write(dev, desc_gaddr + desc_offset, cpy_len);
> +			vhost_log_cache_write(dev, vq, desc_gaddr + desc_offset,
> +					cpy_len);
>  			PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
>  				     cpy_len, 0);
>  		} else {
> @@ -444,7 +447,7 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
>  		vq->used->ring[used_idx].id = desc_indexes[i];
>  		vq->used->ring[used_idx].len = pkts[i]->pkt_len +
>  					       dev->vhost_hlen;
> -		vhost_log_used_vring(dev, vq,
> +		vhost_log_cache_used_vring(dev, vq,
>  			offsetof(struct vring_used, ring[used_idx]),
>  			sizeof(vq->used->ring[used_idx]));
>  	}
> @@ -504,6 +507,8 @@ virtio_dev_rx(struct virtio_net *dev, uint16_t queue_id,
>  
>  	rte_smp_wmb();
>  
> +	vhost_log_cache_sync(dev, vq);
> +
>  	*(volatile uint16_t *)&vq->used->idx += count;
>  	vq->last_used_idx += count;
>  	vhost_log_used_vring(dev, vq,
> @@ -767,7 +772,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  
>  					PRINT_PACKET(dev, (uintptr_t)dst,
>  							(uint32_t)len, 0);
> -					vhost_log_write(dev, guest_addr, len);
> +					vhost_log_cache_write(dev, vq,
> +							guest_addr, len);
>  
>  					remain -= len;
>  					guest_addr += len;
> @@ -776,7 +782,7 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  			} else {
>  				PRINT_PACKET(dev, (uintptr_t)hdr_addr,
>  						dev->vhost_hlen, 0);
> -				vhost_log_write(dev, hdr_phys_addr,
> +				vhost_log_cache_write(dev, vq, hdr_phys_addr,
>  						dev->vhost_hlen);
>  			}
>  
> @@ -790,7 +796,8 @@ copy_mbuf_to_desc_mergeable(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  							desc_offset)),
>  				rte_pktmbuf_mtod_offset(m, void *, mbuf_offset),
>  				cpy_len);
> -			vhost_log_write(dev, desc_gaddr + desc_offset, cpy_len);
> +			vhost_log_cache_write(dev, vq, desc_gaddr + desc_offset,
> +					cpy_len);
>  			PRINT_PACKET(dev, (uintptr_t)(desc_addr + desc_offset),
>  				cpy_len, 0);
>  		} else {
> @@ -1320,7 +1327,7 @@ update_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  {
>  	vq->used->ring[used_idx].id  = desc_idx;
>  	vq->used->ring[used_idx].len = 0;
> -	vhost_log_used_vring(dev, vq,
> +	vhost_log_cache_used_vring(dev, vq,
>  			offsetof(struct vring_used, ring[used_idx]),
>  			sizeof(vq->used->ring[used_idx]));
>  }
> @@ -1335,6 +1342,8 @@ update_used_idx(struct virtio_net *dev, struct vhost_virtqueue *vq,
>  	rte_smp_wmb();
>  	rte_smp_rmb();
>  
> +	vhost_log_cache_sync(dev, vq);
> +
>  	vq->used->idx += count;
>  	vhost_log_used_vring(dev, vq, offsetof(struct vring_used, idx),
>  			sizeof(vq->used->idx));
> -- 
> 2.14.3
