[v1,02/14] vhost: add burst enqueue function for packed ring

Message ID 20190905161421.55981-3-yong.liu@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Maxime Coquelin
Series vhost packed ring performance optimization

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Marvin Liu Sept. 5, 2019, 4:14 p.m. UTC
  Burst enqueue function will first check whether descriptors are cache
aligned. It will also check prerequisites up front. The burst enqueue
function does not support chained mbufs; the single packet enqueue
function will handle those.

Signed-off-by: Marvin Liu <yong.liu@intel.com>
  

Comments

Ilya Maximets Sept. 5, 2019, 10:31 a.m. UTC | #1
On 05.09.2019 19:14, Marvin Liu wrote:
> Burst enqueue function will first check whether descriptors are cache
> aligned. It will also check prerequisites up front. The burst enqueue
> function does not support chained mbufs; the single packet enqueue
> function will handle those.
> 
> Signed-off-by: Marvin Liu <yong.liu@intel.com>

Hi.

Can we rely on loop unrolling by the compiler instead of repeating each
command 4 times?

For example:

    uint64_t len[PACKED_DESCS_BURST];

    for (i = 0; i < PACKED_DESCS_BURST; i++)
        len[i] = descs[avail_idx + i].len;


For 'if's:

    res = false;
    for (i = 0; i < PACKED_DESCS_BURST; i++)
        res |= pkts[i]->next != NULL;
    if (unlikely(res))
        return -1;

or just

    for (i = 0; i < PACKED_DESCS_BURST; i++)
        if (unlikely(pkts[i]->next != NULL))
            return -1;

Since PACKED_DESCS_BURST is a fairly small constant, the loops should be
unrolled by the compiler, producing almost the same code.

This will significantly reduce the code size and will also allow playing
with the PACKED_DESCS_BURST value without massive code changes.

Same is applicable to other patches in the series.

What do you think?

Best regards, Ilya Maximets.

> [snip: full patch quoted; see the Patch section at the bottom of this page]
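
For illustration, a sketch of Ilya's suggestion applied to a few of the
patch's repeated checks; this assumes the variable names from
virtio_dev_rx_burst_packed in the Patch section below and is not code
posted in the thread:

    uint16_t i;
    uint64_t lens[PACKED_DESCS_BURST];

    /* reject chained mbufs up front */
    for (i = 0; i < PACKED_DESCS_BURST; i++)
        if (unlikely(pkts[i]->next != NULL))
            return -1;

    /* read the descriptor lengths */
    for (i = 0; i < PACKED_DESCS_BURST; i++)
        lens[i] = descs[avail_idx + i].len;

    /* ensure each packet fits into its descriptor */
    for (i = 0; i < PACKED_DESCS_BURST; i++)
        if (unlikely(pkts[i]->pkt_len > (lens[i] - buf_offset)))
            return -1;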
  
Marvin Liu Sept. 6, 2019, 1:42 a.m. UTC | #2
> -----Original Message-----
> From: Ilya Maximets [mailto:i.maximets@samsung.com]
> Sent: Thursday, September 05, 2019 6:31 PM
> To: Liu, Yong <yong.liu@intel.com>; Bie, Tiwei <tiwei.bie@intel.com>;
> maxime.coquelin@redhat.com; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v1 02/14] vhost: add burst enqueue function
> for packed ring
> 
> [snip]
> What do you think?
> 

Hi Ilya,
I did some tests with the unrolling capabilities of various compilers before.
All of the listed compilers caused a loopback performance drop with the loop
version compared to the repeated-code version, especially GCC 7.4 and ICC.
Newer compilers have much less impact (around 3%) on throughput.
If we can accept that, the repeated code can be replaced with a small loop.

|----------------|---------------|-------------|------|
| Compiler       | Auto unrolled | Fixed batch | Gap  |
|----------------|---------------|-------------|------|
| Clang6.0.0     | 13.1M         | 13.5M       | 0.4M |
|----------------|---------------|-------------|------|
| GCC 8.3.0      | 13.9M         | 14.4M       | 0.5M |
|----------------|---------------|-------------|------|
| GCC 7.4.0      | 12.6M         | 13.5M       | 0.9M |
|----------------|---------------|-------------|------|
| ICC 19.0.4.243 | 11.0M         | 12.3M       | 1.3M |
|----------------|---------------|-------------|------|

Thanks,
Marvin

> Best regards, Ilya Maximets.
> 
> >
  
Marvin Liu Sept. 6, 2019, 9 a.m. UTC | #3
Hi,
After checking gcc 9.0.2, a pragma is needed to tell the compiler to unroll the loop.
The code will look like below after adding a compile-time macro for the pragma.
If this format is fine, I will send out the patch set with the update later.

#ifdef SUPPORT_GCC_UNROLL_PRAGMA
#define UNROLL_PRAGMA _Pragma("GCC unroll 4")
#endif

#ifdef SUPPORT_CLANG_UNROLL_PRAGMA
#define UNROLL_PRAGMA _Pragma("unroll 4")
#endif

#ifdef SUPPORT_ICC_UNROLL_PRAGMA
#define UNROLL_PRAGMA _Pragma("unroll (4)")
#endif

#ifndef UNROLL_PRAGMA
#define UNROLL_PRAGMA /* no-op: the compiler has no supported unroll pragma */
#endif

UNROLL_PRAGMA
for (i = 0; i < PACKED_DESCS_BURST; i++) {
	if (unlikely(pkts[i]->next != NULL))
		return -1;
}
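
For comparison, the compiler could also be detected via built-in macros
instead of build-system defines; a minimal sketch, assuming "GCC unroll"
requires gcc 8 or newer (the version check is an assumption, not a tested
value):

#if defined(__clang__)
#define UNROLL_PRAGMA _Pragma("unroll 4")
#elif defined(__INTEL_COMPILER) /* before __GNUC__: ICC defines both */
#define UNROLL_PRAGMA _Pragma("unroll (4)")
#elif defined(__GNUC__) && (__GNUC__ >= 8)
#define UNROLL_PRAGMA _Pragma("GCC unroll 4")
#else
#define UNROLL_PRAGMA /* no-op fallback */
#endif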

I also checked clang 6.0.0; with the pragma, the performance of the small loop is the same as the repeated code.

Regards,
Marvin

> -----Original Message-----
> From: Liu, Yong
> Sent: Friday, September 06, 2019 9:43 AM
> To: Ilya Maximets <i.maximets@samsung.com>; Bie, Tiwei
> <tiwei.bie@intel.com>; maxime.coquelin@redhat.com
> Cc: dev@dpdk.org
> Subject: RE: [dpdk-dev] [PATCH v1 02/14] vhost: add burst enqueue function
> for packed ring
> 
> [snip]
  
Bruce Richardson Sept. 6, 2019, 9:11 a.m. UTC | #4
On Fri, Sep 06, 2019 at 01:42:44AM +0000, Liu, Yong wrote:
> 
> 
> > [snip]
> 
> Hi Ilya,
> I did some tests with the unrolling capabilities of various compilers before.
> All of the listed compilers caused a loopback performance drop with the loop
> version compared to the repeated-code version, especially GCC 7.4 and ICC.
> Newer compilers have much less impact (around 3%) on throughput.
> If we can accept that, the repeated code can be replaced with a small loop.
> 
> |----------------|---------------|-------------|------|
> | Compiler       | Auto unrolled | Fixed batch | Gap  |
> |----------------|---------------|-------------|------|
> | Clang6.0.0     | 13.1M         | 13.5M       | 0.4M |
> |----------------|---------------|-------------|------|
> | GCC 8.3.0      | 13.9M         | 14.4M       | 0.5M |
> |----------------|---------------|-------------|------|
> | GCC 7.4.0      | 12.6M         | 13.5M       | 0.9M |
> |----------------|---------------|-------------|------|
> | ICC 19.0.4.243 | 11.0M         | 12.3M       | 1.3M |
> |----------------|---------------|-------------|------|
> 
> Thanks,
> Marvin
> 
Did you verify that the compiler was actually unrolling the loops? You may
need to put __attribute__((optimize("unroll-loops"))) in the function
definition.

/Bruce
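
A minimal sketch of that suggestion, borrowing names from the patch; the
wrapper function is illustrative and only the attribute itself is Bruce's
point (it is GCC-specific):

__attribute__((optimize("unroll-loops")))
static int
burst_check_mbufs(struct rte_mbuf **pkts)
{
	uint16_t i;

	/* the attribute asks GCC to unroll this small, constant-bound loop */
	for (i = 0; i < PACKED_DESCS_BURST; i++)
		if (unlikely(pkts[i]->next != NULL))
			return -1;

	return 0;
}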
  
Marvin Liu Sept. 6, 2019, 9:23 a.m. UTC | #5
> -----Original Message-----
> From: Bruce Richardson [mailto:bruce.richardson@intel.com]
> Sent: Friday, September 06, 2019 5:12 PM
> To: Liu, Yong <yong.liu@intel.com>
> Cc: Ilya Maximets <i.maximets@samsung.com>; Bie, Tiwei
> <tiwei.bie@intel.com>; maxime.coquelin@redhat.com; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH v1 02/14] vhost: add burst enqueue function
> for packed ring
> 
> [snip]
> Did you verify that the compiler was actually unrolling the loops? You may
> need to put __attribute__((optimize("unroll-loops"))) in the function
> definition.

Thanks for the note, Bruce. I only checked the GCC-compiled binaries; the loops have been unrolled there.
I will double-check the clang and ICC compilation results.

Regards,
Marvin

> 
> /Bruce
  

Patch

diff --git a/lib/librte_vhost/vhost.h b/lib/librte_vhost/vhost.h
index 884befa85..ed8b4aabf 100644
--- a/lib/librte_vhost/vhost.h
+++ b/lib/librte_vhost/vhost.h
@@ -39,6 +39,8 @@ 
 
 #define VHOST_LOG_CACHE_NR 32
 
+#define PACKED_DESCS_BURST 4
+#define PACKED_BURST_MASK (PACKED_DESCS_BURST - 1)
 /**
  * Structure contains buffer address, length and descriptor index
  * from vring to do scatter RX.
diff --git a/lib/librte_vhost/virtio_net.c b/lib/librte_vhost/virtio_net.c
index 5ad0a8175..51ed20543 100644
--- a/lib/librte_vhost/virtio_net.c
+++ b/lib/librte_vhost/virtio_net.c
@@ -896,6 +896,106 @@  virtio_dev_rx_split(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	return pkt_idx;
 }
 
+static __rte_unused uint16_t
+virtio_dev_rx_burst_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
+	 struct rte_mbuf **pkts)
+{
+	bool wrap_counter = vq->avail_wrap_counter;
+	struct vring_packed_desc *descs = vq->desc_packed;
+	uint16_t avail_idx = vq->last_avail_idx;
+	uint64_t desc_addr, desc_addr1, desc_addr2, desc_addr3;
+	uint64_t len, len1, len2, len3;
+	struct virtio_net_hdr_mrg_rxbuf *hdr, *hdr1, *hdr2, *hdr3;
+	uint32_t buf_offset = dev->vhost_hlen;
+
+	if (unlikely(avail_idx & PACKED_BURST_MASK))
+		return -1;
+
+	if (unlikely((pkts[0]->next != NULL) |
+		(pkts[1]->next != NULL) |
+		(pkts[2]->next != NULL) |
+		(pkts[3]->next != NULL)))
+		return -1;
+
+	if (unlikely(!desc_is_avail(&descs[avail_idx], wrap_counter)) |
+		unlikely(!desc_is_avail(&descs[avail_idx + 1], wrap_counter)) |
+		unlikely(!desc_is_avail(&descs[avail_idx + 2], wrap_counter)) |
+		unlikely(!desc_is_avail(&descs[avail_idx + 3], wrap_counter)))
+		return 1;
+
+	rte_smp_rmb();
+
+	len = descs[avail_idx].len;
+	len1 = descs[avail_idx + 1].len;
+	len2 = descs[avail_idx + 2].len;
+	len3 = descs[avail_idx + 3].len;
+
+	if (unlikely((pkts[0]->pkt_len > (len - buf_offset)) |
+		     (pkts[1]->pkt_len > (len1 - buf_offset)) |
+		     (pkts[2]->pkt_len > (len2 - buf_offset)) |
+		     (pkts[3]->pkt_len > (len3 - buf_offset))))
+		return -1;
+
+	desc_addr = vhost_iova_to_vva(dev, vq, descs[avail_idx].addr, &len,
+				      VHOST_ACCESS_RW);
+
+	desc_addr1 = vhost_iova_to_vva(dev, vq, descs[avail_idx + 1].addr,
+				       &len1, VHOST_ACCESS_RW);
+
+	desc_addr2 = vhost_iova_to_vva(dev, vq, descs[avail_idx + 2].addr,
+				       &len2, VHOST_ACCESS_RW);
+
+	desc_addr3 = vhost_iova_to_vva(dev, vq, descs[avail_idx + 3].addr,
+				       &len3, VHOST_ACCESS_RW);
+
+	if (unlikely((len != descs[avail_idx].len) |
+		(len1 != descs[avail_idx + 1].len) |
+		(len2 != descs[avail_idx + 2].len) |
+		(len3 != descs[avail_idx + 3].len)))
+		return -1;
+
+	rte_prefetch0((void *)(uintptr_t)desc_addr);
+	rte_prefetch0((void *)(uintptr_t)desc_addr1);
+	rte_prefetch0((void *)(uintptr_t)desc_addr2);
+	rte_prefetch0((void *)(uintptr_t)desc_addr3);
+
+	hdr = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr;
+	hdr1 = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr1;
+	hdr2 = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr2;
+	hdr3 = (struct virtio_net_hdr_mrg_rxbuf *)(uintptr_t)desc_addr3;
+
+	virtio_enqueue_offload(pkts[0], &hdr->hdr);
+	virtio_enqueue_offload(pkts[1], &hdr1->hdr);
+	virtio_enqueue_offload(pkts[2], &hdr2->hdr);
+	virtio_enqueue_offload(pkts[3], &hdr3->hdr);
+
+	len = pkts[0]->pkt_len + dev->vhost_hlen;
+	len1 = pkts[1]->pkt_len + dev->vhost_hlen;
+	len2 = pkts[2]->pkt_len + dev->vhost_hlen;
+	len3 = pkts[3]->pkt_len + dev->vhost_hlen;
+
+	vq->last_avail_idx += PACKED_DESCS_BURST;
+	if (vq->last_avail_idx >= vq->size) {
+		vq->last_avail_idx -= vq->size;
+		vq->avail_wrap_counter ^= 1;
+	}
+
+	rte_memcpy((void *)(uintptr_t)(desc_addr + buf_offset),
+		   rte_pktmbuf_mtod_offset(pkts[0], void *, 0),
+		   pkts[0]->pkt_len);
+	rte_memcpy((void *)(uintptr_t)(desc_addr1 + buf_offset),
+		    rte_pktmbuf_mtod_offset(pkts[1], void *, 0),
+		    pkts[1]->pkt_len);
+	rte_memcpy((void *)(uintptr_t)(desc_addr2 + buf_offset),
+		    rte_pktmbuf_mtod_offset(pkts[2], void *, 0),
+		    pkts[2]->pkt_len);
+	rte_memcpy((void *)(uintptr_t)(desc_addr3 + buf_offset),
+		    rte_pktmbuf_mtod_offset(pkts[3], void *, 0),
+		    pkts[3]->pkt_len);
+
+	return 0;
+}
+
 static __rte_unused int16_t
 virtio_dev_rx_single_packed(struct virtio_net *dev, struct vhost_virtqueue *vq,
 	struct rte_mbuf *pkt)
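
The commit message says chained mbufs fall back to the single packet path;
a hypothetical caller sketch of that split (the dispatch loop and its name
are assumptions, not part of this series; only the two enqueue functions
come from the patches):

static uint16_t
rx_packed_dispatch(struct virtio_net *dev, struct vhost_virtqueue *vq,
	struct rte_mbuf **pkts, uint32_t count)
{
	uint32_t pkt_idx = 0;

	while (pkt_idx < count) {
		/* try the burst path while at least a full burst remains */
		if (count - pkt_idx >= PACKED_DESCS_BURST &&
		    virtio_dev_rx_burst_packed(dev, vq, &pkts[pkt_idx]) == 0) {
			pkt_idx += PACKED_DESCS_BURST;
			continue;
		}
		/* chained mbufs and leftovers go through the single path */
		if (virtio_dev_rx_single_packed(dev, vq, pkts[pkt_idx]) < 0)
			break;
		pkt_idx++;
	}

	return pkt_idx;
}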