[dpdk-dev] [PATCH v2] vhost: improve dirty pages logging performance
Maxime Coquelin
maxime.coquelin at redhat.com
Wed May 16 09:49:45 CEST 2018
On 05/16/2018 08:21 AM, Tiwei Bie wrote:
> On Tue, May 15, 2018 at 07:30:21PM +0200, Maxime Coquelin wrote:
>> This patch caches all dirty pages logging until the used ring index
>> is updated. These dirty pages won't be accessed by the guest as
>> long as the host doesn't give them back to it by updating the
>> index.
>>
>> The goal of this optimization is to fix a performance regression
>> introduced when the vhost library started to use atomic operations
>> to set bits in the shared dirty log map. While the fix was valid
>> as previous implementation wasn't safe against concurent accesses,
>
> Typo: concurent
Right.
>> contention was induced.
>>
>> With this patch, during migration, we have:
>> 1. Less atomic operations as only a single atomic OR operation
>> per 32 or 64 (depending on CPU) pages.
>> 2. Less atomic operations as during a burst, the same page will
>> be marked dirty only once.
>> 3. Less write memory barriers.
>>
>> Fixes: 897f13a1f726 ("vhost: make page logging atomic")
>>
>> Cc: stable at dpdk.org
>>
>> Cc: Tiwei Bie <tiwei.bie at intel.com>
>> Suggested-by: Michael S. Tsirkin <mst at redhat.com>
>> Signed-off-by: Maxime Coquelin <maxime.coquelin at redhat.com>
>
> This is a nice approach! Thanks for the work!
Thanks for the review, it is really appreciated.
>> ---
>> lib/librte_vhost/vhost.h | 113 ++++++++++++++++++++++++++++++++++++++++++
>> lib/librte_vhost/virtio_net.c | 29 +++++++----
>> 2 files changed, 132 insertions(+), 10 deletions(-)
>>
>> diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
>> index 891978131..8f6a41d7e 100644
>> --- a/lib/librte_vhost/vhost.h
>> +++ b/lib/librte_vhost/vhost.h
>> @@ -36,6 +36,8 @@
>>
>> #define BUF_VECTOR_MAX 256
>>
>> +#define VHOST_LOG_CACHE_NR 32
>> +
>> /**
>> * Structure contains buffer address, length and descriptor index
>> * from vring to do scatter RX.
>> @@ -69,6 +71,14 @@ struct batch_copy_elem {
>> uint64_t log_addr;
>> };
>>
>> +/*
>> + * Structure that contains the info for batched dirty logging.
>> + */
>> +struct log_cache_entry {
>> + uint32_t offset;
>> + unsigned long val;
>> +};
>> +
>> /**
>> * Structure contains variables relevant to RX/TX virtqueues.
>> */
>> @@ -112,6 +122,9 @@ struct vhost_virtqueue {
>> struct batch_copy_elem *batch_copy_elems;
>> uint16_t batch_copy_nb_elems;
>>
>> + struct log_cache_entry log_cache[VHOST_LOG_CACHE_NR];
>> + uint16_t log_cache_nb_elem;
>> +
>> rte_rwlock_t iotlb_lock;
>> rte_rwlock_t iotlb_pending_lock;
>> struct rte_mempool *iotlb_pool;
>> @@ -309,7 +322,15 @@ struct virtio_net {
>> static __rte_always_inline void
>> vhost_set_bit(unsigned int nr, volatile uint8_t *addr)
>> {
>> +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100)
>
> Just curious, is there any reference about why
> this version was chosen? Thanks!
I googled Michael's reference to the LWN article [0], and they mention GCC
7.1.
I haven't checked myself, though, whether the generated code is different
in GCC >= 7.1.
[0]: https://lwn.net/Articles/691128/
>
>> + /*
>> + * __sync_ built-ins are deprecated, but __atomic_ ones
>> + * are sub-optimized in older GCC versions.
>> + */
>
> The indent isn't right (just need one tab here).
Right, will fix.
>
>> __sync_fetch_and_or_8(addr, (1U << nr));
This is unrelated to this patch set, but per the GCC doc [1], shouldn't
we use __sync_fetch_and_or_1, since the suffix gives the size in bytes?
[1]: https://gcc.gnu.org/onlinedocs/gcc/_005f_005fsync-Builtins.html
>> +#else
>> + __atomic_fetch_or(addr, (1U << nr), __ATOMIC_RELAXED);
>> +#endif
>> }
>>
>> static __rte_always_inline void
>> @@ -340,6 +361,98 @@ vhost_log_write(struct virtio_net *dev, uint64_t addr, uint64_t len)
>> }
>> }
>>
>> +static __rte_always_inline void
>> +vhost_log_cache_sync(struct virtio_net *dev, struct vhost_virtqueue *vq)
>> +{
>> + unsigned long *log_base;
>> + int i;
>> +
>> + if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
>> + !dev->log_base))
>> + return;
>> +
>> + log_base = (unsigned long *)(uintptr_t)dev->log_base;
>> +
>> + /* To make sure guest memory updates are committed before logging */
>> + rte_smp_wmb();
>> +
>> + for (i = 0; i < vq->log_cache_nb_elem; i++) {
>> + struct log_cache_entry *elem = vq->log_cache + i;
>> +
>> +#if defined(RTE_TOOLCHAIN_GCC) && (GCC_VERSION < 70100)
>> + /*
>> + * '__sync' builtins are deprecated, but '__atomic' ones
>> + * are sub-optimized in older GCC versions.
>> + */
>> + __sync_fetch_and_or(log_base + elem->offset, elem->val);
>> +#else
>> + __atomic_fetch_or(log_base + elem->offset, elem->val,
>> + __ATOMIC_RELAXED);
>> +#endif
>> + }
>> +
>> + vq->log_cache_nb_elem = 0;
>> +}
>> +
>> +static __rte_always_inline void
>> +vhost_log_cache_page(struct virtio_net *dev, struct vhost_virtqueue *vq,
>> + uint64_t page)
>> +{
>> + uint32_t bit_nr = page % (sizeof(unsigned long) << 3);
>> + uint32_t offset = page / (sizeof(unsigned long) << 3);
>> + int i;
>> +
>> + for (i = 0; i < vq->log_cache_nb_elem; i++) {
>> + struct log_cache_entry *elem = vq->log_cache + i;
>> +
>> + if (elem->offset == offset) {
>> + elem->val |= (1U << bit_nr);
>
> The val is unsigned long now, we need to use 1UL.
Good catch!
>> + return;
>> + }
>> + }
>> +
>> + if (unlikely(i >= VHOST_LOG_CACHE_NR)) {
>> + /*
>> + * No more room for a new log cache entry,
>> + * so write the dirty log map directly.
>> + */
>> + rte_smp_wmb();
>> + vhost_log_page((uint8_t *)(uintptr_t)dev->log_base, page);
>> +
>> + return;
>> + }
>> +
>> + vq->log_cache[i].offset = offset;
>> + vq->log_cache[i].val = (1U << bit_nr);
>
> Ditto.
>
>> +}
>> +
>> +static __rte_always_inline void
>> +vhost_log_cache_write(struct virtio_net *dev, struct vhost_virtqueue *vq,
>> + uint64_t addr, uint64_t len)
>
> The 8 spaces width tabs are more widely used in DPDK.
> And in below coding style document,
>
> https://github.com/DPDK/dpdk/blob/master/doc/guides/contributing/coding_style.rst
>
> The width of each level indent in most examples is 8
> spaces. So maybe it's better to keep using 8 spaces
> width tabs.
Right, will fix this in v3.
>> +{
>> + uint64_t page;
>> +
>> + if (likely(((dev->features & (1ULL << VHOST_F_LOG_ALL)) == 0) ||
>> + !dev->log_base || !len))
>> + return;
>> +
>> + if (unlikely(dev->log_size <= ((addr + len - 1) / VHOST_LOG_PAGE / 8)))
>> + return;
>> +
>> + page = addr / VHOST_LOG_PAGE;
>> + while (page * VHOST_LOG_PAGE < addr + len) {
>> + vhost_log_cache_page(dev, vq, page);
>> + page += 1;
>> + }
>> +}
>> +
>> +static __rte_always_inline void
>> +vhost_log_cache_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
>> + uint64_t offset, uint64_t len)
>> +{
>> + vhost_log_cache_write(dev, vq, vq->log_guest_addr + offset, len);
>> +}
>> +
>> static __rte_always_inline void
>> vhost_log_used_vring(struct virtio_net *dev, struct vhost_virtqueue *vq,
>> uint64_t offset, uint64_t len)
>> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
>> index eed6b0227..76ec5f089 100644
>> --- a/lib/librte_vhost/virtio_net.c
>> +++ b/lib/librte_vhost/virtio_net.c
>> @@ -78,7 +78,7 @@ do_flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq,
>> rte_memcpy(&vq->used->ring[to],
>> &vq->shadow_used_ring[from],
>> size * sizeof(struct vring_used_elem));
>> - vhost_log_used_vring(dev, vq,
>> + vhost_log_cache_used_vring(dev, vq,
>> offsetof(struct vring_used, ring[to]),
>> size * sizeof(struct vring_used_elem));
>> }
>> @@ -106,6 +106,8 @@ flush_shadow_used_ring(struct virtio_net *dev, struct vhost_virtqueue *vq)
>>
>> rte_smp_wmb();
>>
>> + vhost_log_cache_sync(dev, vq);
>
> Each time we call vhost_log_cache_sync(), there
> is already a rte_smp_wmb() which is to protect
> the used->idx update. So maybe there is no need
> to call rte_smp_wmb() in vhost_log_cache_sync().
Right, I can remove it in vhost_log_cache_sync(), and
maybe add a comment there stating that a write barrier
before calling the function is expected.
Thanks,
Maxime
> Best regards,
> Tiwei Bie
>
More information about the dev
mailing list