[v2] vhost: batch used descs chains write-back with packed ring
Checks
Commit Message
Instead of writing back descriptors chains in order, let's
write the first chain flags last in order to improve batching.
With Kernel's pktgen benchmark, ~3% performance gain is measured.
Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
---
V2:
Revert back to initial implementation to have a write
barrier before every descs flags store, but still
store first desc flags last. (Missing barrier reported
by Ilya)
lib/librte_vhost/virtio_net.c | 19 ++++++++++++++++---
1 file changed, 16 insertions(+), 3 deletions(-)
Comments
On Wed, Dec 19, 2018 at 10:29:52AM +0100, Maxime Coquelin wrote:
> Instead of writing back descriptors chains in order, let's
> write the first chain flags last in order to improve batching.
>
> With Kernel's pktgen benchmark, ~3% performance gain is measured.
>
> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
Acked-by: Michael S. Tsirkin <mst@redhat.com>
> ---
>
> V2:
> Revert back to initial implementation to have a write
> barrier before every descs flags store, but still
> store first desc flags last. (Missing barrier reported
> by Ilya)
>
>
> lib/librte_vhost/virtio_net.c | 19 ++++++++++++++++---
> 1 file changed, 16 insertions(+), 3 deletions(-)
>
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 8c657a101..de436af79 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -97,6 +97,8 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
> {
> int i;
> uint16_t used_idx = vq->last_used_idx;
> + uint16_t head_idx = vq->last_used_idx;
> + uint16_t head_flags = 0;
>
> /* Split loop in two to save memory barriers */
> for (i = 0; i < vq->shadow_used_idx; i++) {
> @@ -126,12 +128,17 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
> flags &= ~VRING_DESC_F_AVAIL;
> }
>
> - vq->desc_packed[vq->last_used_idx].flags = flags;
> + if (i > 0) {
> + vq->desc_packed[vq->last_used_idx].flags = flags;
>
> - vhost_log_cache_used_vring(dev, vq,
> + vhost_log_cache_used_vring(dev, vq,
> vq->last_used_idx *
> sizeof(struct vring_packed_desc),
> sizeof(struct vring_packed_desc));
> + } else {
> + head_idx = vq->last_used_idx;
> + head_flags = flags;
> + }
>
> vq->last_used_idx += vq->shadow_used_packed[i].count;
> if (vq->last_used_idx >= vq->size) {
> @@ -140,7 +147,13 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
> }
> }
>
> - rte_smp_wmb();
> + vq->desc_packed[head_idx].flags = head_flags;
> +
> + vhost_log_cache_used_vring(dev, vq,
> + vq->last_used_idx *
> + sizeof(struct vring_packed_desc),
> + sizeof(struct vring_packed_desc));
> +
> vq->shadow_used_idx = 0;
> vhost_log_cache_sync(dev, vq);
> }
> --
> 2.17.2
On Wed, Dec 19, 2018 at 10:29:52AM +0100, Maxime Coquelin wrote:
> Instead of writing back descriptors chains in order, let's
> write the first chain flags last in order to improve batching.
>
> With Kernel's pktgen benchmark, ~3% performance gain is measured.
>
> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> ---
>
> V2:
> Revert back to initial implementation to have a write
> barrier before every descs flags store, but still
> store first desc flags last. (Missing barrier reported
> by Ilya)
>
>
> lib/librte_vhost/virtio_net.c | 19 ++++++++++++++++---
> 1 file changed, 16 insertions(+), 3 deletions(-)
>
> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> index 8c657a101..de436af79 100644
> --- a/lib/librte_vhost/virtio_net.c
> +++ b/lib/librte_vhost/virtio_net.c
> @@ -97,6 +97,8 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
> {
> int i;
> uint16_t used_idx = vq->last_used_idx;
> + uint16_t head_idx = vq->last_used_idx;
> + uint16_t head_flags = 0;
>
> /* Split loop in two to save memory barriers */
> for (i = 0; i < vq->shadow_used_idx; i++) {
> @@ -126,12 +128,17 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
> flags &= ~VRING_DESC_F_AVAIL;
> }
>
> - vq->desc_packed[vq->last_used_idx].flags = flags;
> + if (i > 0) {
> + vq->desc_packed[vq->last_used_idx].flags = flags;
>
> - vhost_log_cache_used_vring(dev, vq,
> + vhost_log_cache_used_vring(dev, vq,
> vq->last_used_idx *
> sizeof(struct vring_packed_desc),
> sizeof(struct vring_packed_desc));
> + } else {
> + head_idx = vq->last_used_idx;
> + head_flags = flags;
> + }
>
> vq->last_used_idx += vq->shadow_used_packed[i].count;
> if (vq->last_used_idx >= vq->size) {
> @@ -140,7 +147,13 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
> }
> }
>
> - rte_smp_wmb();
> + vq->desc_packed[head_idx].flags = head_flags;
> +
> + vhost_log_cache_used_vring(dev, vq,
> + vq->last_used_idx *
Should be head_idx.
> + sizeof(struct vring_packed_desc),
> + sizeof(struct vring_packed_desc));
> +
> vq->shadow_used_idx = 0;
A wmb() is needed before log_cache_sync?
> vhost_log_cache_sync(dev, vq);
> }
> --
> 2.17.2
>
On 12/20/18 5:44 AM, Tiwei Bie wrote:
> On Wed, Dec 19, 2018 at 10:29:52AM +0100, Maxime Coquelin wrote:
>> Instead of writing back descriptors chains in order, let's
>> write the first chain flags last in order to improve batching.
>>
>> With Kernel's pktgen benchmark, ~3% performance gain is measured.
>>
>> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
>> ---
>>
>> V2:
>> Revert back to initial implementation to have a write
>> barrier before every descs flags store, but still
>> store first desc flags last. (Missing barrier reported
>> by Ilya)
>>
>>
>> lib/librte_vhost/virtio_net.c | 19 ++++++++++++++++---
>> 1 file changed, 16 insertions(+), 3 deletions(-)
>>
>> diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
>> index 8c657a101..de436af79 100644
>> --- a/lib/librte_vhost/virtio_net.c
>> +++ b/lib/librte_vhost/virtio_net.c
>> @@ -97,6 +97,8 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
>> {
>> int i;
>> uint16_t used_idx = vq->last_used_idx;
>> + uint16_t head_idx = vq->last_used_idx;
>> + uint16_t head_flags = 0;
>>
>> /* Split loop in two to save memory barriers */
>> for (i = 0; i < vq->shadow_used_idx; i++) {
>> @@ -126,12 +128,17 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
>> flags &= ~VRING_DESC_F_AVAIL;
>> }
>>
>> - vq->desc_packed[vq->last_used_idx].flags = flags;
>> + if (i > 0) {
>> + vq->desc_packed[vq->last_used_idx].flags = flags;
>>
>> - vhost_log_cache_used_vring(dev, vq,
>> + vhost_log_cache_used_vring(dev, vq,
>> vq->last_used_idx *
>> sizeof(struct vring_packed_desc),
>> sizeof(struct vring_packed_desc));
>> + } else {
>> + head_idx = vq->last_used_idx;
>> + head_flags = flags;
>> + }
>>
>> vq->last_used_idx += vq->shadow_used_packed[i].count;
>> if (vq->last_used_idx >= vq->size) {
>> @@ -140,7 +147,13 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
>> }
>> }
>>
>> - rte_smp_wmb();
>> + vq->desc_packed[head_idx].flags = head_flags;
>> +
>> + vhost_log_cache_used_vring(dev, vq,
>> + vq->last_used_idx *
>
> Should be head_idx.
Oh yes, thanks for spotting this.
>
>> + sizeof(struct vring_packed_desc),
>> + sizeof(struct vring_packed_desc));
>> +
>> vq->shadow_used_idx = 0;
>
> A wmb() is needed before log_cache_sync?
I think you're right; I was wrong — I thought we had a barrier in the
cache sync function.
That's not very important for x86, but I think it would be preferable
to do it in vhost_log_cache_sync(), if logging is enabled.
What do you think?
>> vhost_log_cache_sync(dev, vq);
>> }
>> --
>> 2.17.2
>>
On 12/20/18 9:49 AM, Maxime Coquelin wrote:
>
>
> On 12/20/18 5:44 AM, Tiwei Bie wrote:
>> On Wed, Dec 19, 2018 at 10:29:52AM +0100, Maxime Coquelin wrote:
>>> Instead of writing back descriptors chains in order, let's
>>> write the first chain flags last in order to improve batching.
>>>
>>> With Kernel's pktgen benchmark, ~3% performance gain is measured.
>>>
>>> Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
>>> ---
>>>
>>> V2:
>>> Revert back to initial implementation to have a write
>>> barrier before every descs flags store, but still
>>> store first desc flags last. (Missing barrier reported
>>> by Ilya)
>>>
>>>
>>> lib/librte_vhost/virtio_net.c | 19 ++++++++++++++++---
>>> 1 file changed, 16 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/lib/librte_vhost/virtio_net.c
>>> b/lib/librte_vhost/virtio_net.c
>>> index 8c657a101..de436af79 100644
>>> --- a/lib/librte_vhost/virtio_net.c
>>> +++ b/lib/librte_vhost/virtio_net.c
>>> @@ -97,6 +97,8 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
>>> {
>>> int i;
>>> uint16_t used_idx = vq->last_used_idx;
>>> + uint16_t head_idx = vq->last_used_idx;
>>> + uint16_t head_flags = 0;
>>> /* Split loop in two to save memory barriers */
>>> for (i = 0; i < vq->shadow_used_idx; i++) {
>>> @@ -126,12 +128,17 @@ flush_shadow_used_ring_packed(struct virtio_net
>>> *dev,
>>> flags &= ~VRING_DESC_F_AVAIL;
>>> }
>>> - vq->desc_packed[vq->last_used_idx].flags = flags;
>>> + if (i > 0) {
>>> + vq->desc_packed[vq->last_used_idx].flags = flags;
>>> - vhost_log_cache_used_vring(dev, vq,
>>> + vhost_log_cache_used_vring(dev, vq,
>>> vq->last_used_idx *
>>> sizeof(struct vring_packed_desc),
>>> sizeof(struct vring_packed_desc));
>>> + } else {
>>> + head_idx = vq->last_used_idx;
>>> + head_flags = flags;
>>> + }
>>> vq->last_used_idx += vq->shadow_used_packed[i].count;
>>> if (vq->last_used_idx >= vq->size) {
>>> @@ -140,7 +147,13 @@ flush_shadow_used_ring_packed(struct virtio_net
>>> *dev,
>>> }
>>> }
>>> - rte_smp_wmb();
>>> + vq->desc_packed[head_idx].flags = head_flags;
>>> +
>>> + vhost_log_cache_used_vring(dev, vq,
>>> + vq->last_used_idx *
>>
>> Should be head_idx.
>
> Oh yes, thanks for spotting this.
>
>>
>>> + sizeof(struct vring_packed_desc),
>>> + sizeof(struct vring_packed_desc));
>>> +
>>> vq->shadow_used_idx = 0;
>>
>> A wmb() is needed before log_cache_sync?
>
> I think you're right; I was wrong — I thought we had a barrier in the
> cache sync function.
> That's not very important for x86, but I think it would be preferable
> to do it in vhost_log_cache_sync(), if logging is enabled.
>
> What do you think?
I'll keep it in this function for now, as I think we cannot remove the
one in the split variant so it would mean having two barriers in that
case.
>>> vhost_log_cache_sync(dev, vq);
>>> }
>>> --
>>> 2.17.2
>>>
On Thu, Dec 20, 2018 at 12:44:46PM +0800, Tiwei Bie wrote:
> On Wed, Dec 19, 2018 at 10:29:52AM +0100, Maxime Coquelin wrote:
> > Instead of writing back descriptors chains in order, let's
> > write the first chain flags last in order to improve batching.
> >
> > With Kernel's pktgen benchmark, ~3% performance gain is measured.
> >
> > Signed-off-by: Maxime Coquelin <maxime.coquelin@redhat.com>
> > ---
> >
> > V2:
> > Revert back to initial implementation to have a write
> > barrier before every descs flags store, but still
> > store first desc flags last. (Missing barrier reported
> > by Ilya)
> >
> >
> > lib/librte_vhost/virtio_net.c | 19 ++++++++++++++++---
> > 1 file changed, 16 insertions(+), 3 deletions(-)
> >
> > diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
> > index 8c657a101..de436af79 100644
> > --- a/lib/librte_vhost/virtio_net.c
> > +++ b/lib/librte_vhost/virtio_net.c
> > @@ -97,6 +97,8 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
> > {
> > int i;
> > uint16_t used_idx = vq->last_used_idx;
> > + uint16_t head_idx = vq->last_used_idx;
> > + uint16_t head_flags = 0;
> >
> > /* Split loop in two to save memory barriers */
> > for (i = 0; i < vq->shadow_used_idx; i++) {
> > @@ -126,12 +128,17 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
> > flags &= ~VRING_DESC_F_AVAIL;
> > }
> >
> > - vq->desc_packed[vq->last_used_idx].flags = flags;
> > + if (i > 0) {
> > + vq->desc_packed[vq->last_used_idx].flags = flags;
> >
> > - vhost_log_cache_used_vring(dev, vq,
> > + vhost_log_cache_used_vring(dev, vq,
> > vq->last_used_idx *
> > sizeof(struct vring_packed_desc),
> > sizeof(struct vring_packed_desc));
> > + } else {
> > + head_idx = vq->last_used_idx;
> > + head_flags = flags;
> > + }
> >
> > vq->last_used_idx += vq->shadow_used_packed[i].count;
> > if (vq->last_used_idx >= vq->size) {
> > @@ -140,7 +147,13 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
> > }
> > }
> >
> > - rte_smp_wmb();
> > + vq->desc_packed[head_idx].flags = head_flags;
> > +
> > + vhost_log_cache_used_vring(dev, vq,
> > + vq->last_used_idx *
>
> Should be head_idx.
>
> > + sizeof(struct vring_packed_desc),
> > + sizeof(struct vring_packed_desc));
> > +
> > vq->shadow_used_idx = 0;
>
> A wmb() is needed before log_cache_sync?
>
> > vhost_log_cache_sync(dev, vq);
Probably smarter to have the wmb in there.
This way we can skip it if not logging.
> > }
> > --
> > 2.17.2
> >
@@ -97,6 +97,8 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
{
int i;
uint16_t used_idx = vq->last_used_idx;
+ uint16_t head_idx = vq->last_used_idx;
+ uint16_t head_flags = 0;
/* Split loop in two to save memory barriers */
for (i = 0; i < vq->shadow_used_idx; i++) {
@@ -126,12 +128,17 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
flags &= ~VRING_DESC_F_AVAIL;
}
- vq->desc_packed[vq->last_used_idx].flags = flags;
+ if (i > 0) {
+ vq->desc_packed[vq->last_used_idx].flags = flags;
- vhost_log_cache_used_vring(dev, vq,
+ vhost_log_cache_used_vring(dev, vq,
vq->last_used_idx *
sizeof(struct vring_packed_desc),
sizeof(struct vring_packed_desc));
+ } else {
+ head_idx = vq->last_used_idx;
+ head_flags = flags;
+ }
vq->last_used_idx += vq->shadow_used_packed[i].count;
if (vq->last_used_idx >= vq->size) {
@@ -140,7 +147,13 @@ flush_shadow_used_ring_packed(struct virtio_net *dev,
}
}
- rte_smp_wmb();
+ vq->desc_packed[head_idx].flags = head_flags;
+
+ vhost_log_cache_used_vring(dev, vq,
+ vq->last_used_idx *
+ sizeof(struct vring_packed_desc),
+ sizeof(struct vring_packed_desc));
+
vq->shadow_used_idx = 0;
vhost_log_cache_sync(dev, vq);
}