app/testpmd: use mempool bulk get for txonly mode

Message ID 20190228194128.14236-1-pbhagavatula@marvell.com (mailing list archive)
State Superseded, archived
Delegated to: Ferruh Yigit
Series: app/testpmd: use mempool bulk get for txonly mode

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK
ci/intel-Performance-Testing success Performance Testing PASS
ci/mellanox-Performance-Testing success Performance Testing PASS

Commit Message

Pavan Nikhilesh Bhagavatula Feb. 28, 2019, 7:42 p.m. UTC
  From: Pavan Nikhilesh <pbhagavatula@marvell.com>

Use mempool bulk get ops to allocate a burst of packets and process them
instead of allocating each mbuf individually.

Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
---
 app/test-pmd/txonly.c | 139 +++++++++++++++++++++---------------------
 1 file changed, 71 insertions(+), 68 deletions(-)
  

Comments

Andrew Rybchenko March 1, 2019, 7:38 a.m. UTC | #1
On 2/28/19 10:42 PM, Pavan Nikhilesh Bhagavatula wrote:
> From: Pavan Nikhilesh <pbhagavatula@marvell.com>
>
> Use mempool bulk get ops to allocate a burst of packets and process them
> instead of allocating each mbuf individually.
>
> Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> ---
>   app/test-pmd/txonly.c | 139 +++++++++++++++++++++---------------------
>   1 file changed, 71 insertions(+), 68 deletions(-)
>
> diff --git a/app/test-pmd/txonly.c b/app/test-pmd/txonly.c
> index 1f08b6ed3..eef8b3a45 100644
> --- a/app/test-pmd/txonly.c
> +++ b/app/test-pmd/txonly.c
> @@ -147,6 +147,61 @@ setup_pkt_udp_ip_headers(struct ipv4_hdr *ip_hdr,
>   	ip_hdr->hdr_checksum = (uint16_t) ip_cksum;
>   }
>   
> +static inline bool
> +pkt_burst_prepare(struct rte_mbuf *pkt, struct rte_mempool *mbp,
> +		struct ether_hdr *eth_hdr, const uint16_t vlan_tci,
> +		const uint16_t vlan_tci_outer, const uint64_t ol_flags)
> +{
> +	uint32_t nb_segs, pkt_len = 0;
> +	struct rte_mbuf *pkt_seg;
> +	uint8_t i;
> +
> +	if (unlikely(tx_pkt_split == TX_PKT_SPLIT_RND))
> +		nb_segs = random() % tx_pkt_nb_segs + 1;
> +	else
> +		nb_segs = tx_pkt_nb_segs;
> +
> +	rte_pktmbuf_reset_headroom(pkt);
> +	pkt->data_len = tx_pkt_seg_lengths[0];
> +	pkt->ol_flags = ol_flags;
> +	pkt->vlan_tci = vlan_tci;
> +	pkt->vlan_tci_outer = vlan_tci_outer;
> +	pkt->l2_len = sizeof(struct ether_hdr);
> +	pkt->l3_len = sizeof(struct ipv4_hdr);
> +
> +	pkt_seg = pkt;
> +	for (i = 1; i < nb_segs; i++) {
> +		pkt_seg->next = rte_mbuf_raw_alloc(mbp);

Why is bulk allocation not used here?

> +		if (pkt_seg->next == NULL) {
> +			pkt->nb_segs = i;
> +			rte_pktmbuf_free(pkt);
> +			return false;
> +		}
> +		pkt_seg = pkt_seg->next;
> +		pkt_seg->data_len = tx_pkt_seg_lengths[i];
> +		pkt_len += pkt_seg->data_len;
> +	}
> +	pkt_seg->next = NULL; /* Last segment of packet. */
> +	/*
> +	 * Copy headers in first packet segment(s).
> +	 */
> +	copy_buf_to_pkt(eth_hdr, sizeof(eth_hdr), pkt, 0);
> +	copy_buf_to_pkt(&pkt_ip_hdr, sizeof(pkt_ip_hdr), pkt,
> +			sizeof(struct ether_hdr));
> +	copy_buf_to_pkt(&pkt_udp_hdr, sizeof(pkt_udp_hdr), pkt,
> +			sizeof(struct ether_hdr) +
> +			sizeof(struct ipv4_hdr));
> +
> +	/*
> +	 * Complete first mbuf of packet and append it to the
> +	 * burst of packets to be transmitted.
> +	 */
> +	pkt->nb_segs = nb_segs;
> +	pkt->pkt_len += pkt_len;
> +
> +	return true;
> +}
> +
>   /*
>    * Transmit a burst of multi-segments packets.
>    */
> @@ -155,8 +210,6 @@ pkt_burst_transmit(struct fwd_stream *fs)
>   {
>   	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
>   	struct rte_port *txp;
> -	struct rte_mbuf *pkt;
> -	struct rte_mbuf *pkt_seg;
>   	struct rte_mempool *mbp;
>   	struct ether_hdr eth_hdr;
>   	uint16_t nb_tx;
> @@ -164,14 +217,12 @@ pkt_burst_transmit(struct fwd_stream *fs)
>   	uint16_t vlan_tci, vlan_tci_outer;
>   	uint32_t retry;
>   	uint64_t ol_flags = 0;
> -	uint8_t  i;
>   	uint64_t tx_offloads;
>   #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
>   	uint64_t start_tsc;
>   	uint64_t end_tsc;
>   	uint64_t core_cycles;
>   #endif
> -	uint32_t nb_segs, pkt_len;
>   
>   #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
>   	start_tsc = rte_rdtsc();
> @@ -188,72 +239,24 @@ pkt_burst_transmit(struct fwd_stream *fs)
>   		ol_flags |= PKT_TX_QINQ_PKT;
>   	if (tx_offloads & DEV_TX_OFFLOAD_MACSEC_INSERT)
>   		ol_flags |= PKT_TX_MACSEC;
> -	for (nb_pkt = 0; nb_pkt < nb_pkt_per_burst; nb_pkt++) {
> -		pkt = rte_mbuf_raw_alloc(mbp);
> -		if (pkt == NULL) {
> -		nomore_mbuf:
> -			if (nb_pkt == 0)
> -				return;
> -			break;
> -		}
>   
> -		/*
> -		 * Using raw alloc is good to improve performance,
> -		 * but some consumers may use the headroom and so
> -		 * decrement data_off. We need to make sure it is
> -		 * reset to default value.
> -		 */
> -		rte_pktmbuf_reset_headroom(pkt);
> -		pkt->data_len = tx_pkt_seg_lengths[0];
> -		pkt_seg = pkt;
> -		if (tx_pkt_split == TX_PKT_SPLIT_RND)
> -			nb_segs = random() % tx_pkt_nb_segs + 1;
> -		else
> -			nb_segs = tx_pkt_nb_segs;
> -		pkt_len = pkt->data_len;
> -		for (i = 1; i < nb_segs; i++) {
> -			pkt_seg->next = rte_mbuf_raw_alloc(mbp);
> -			if (pkt_seg->next == NULL) {
> -				pkt->nb_segs = i;
> -				rte_pktmbuf_free(pkt);
> -				goto nomore_mbuf;
> -			}
> -			pkt_seg = pkt_seg->next;
> -			pkt_seg->data_len = tx_pkt_seg_lengths[i];
> -			pkt_len += pkt_seg->data_len;
> -		}
> -		pkt_seg->next = NULL; /* Last segment of packet. */
> -
> -		/*
> -		 * Initialize Ethernet header.
> -		 */
> -		ether_addr_copy(&peer_eth_addrs[fs->peer_addr],&eth_hdr.d_addr);
> -		ether_addr_copy(&ports[fs->tx_port].eth_addr, &eth_hdr.s_addr);
> -		eth_hdr.ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);
> -
> -		/*
> -		 * Copy headers in first packet segment(s).
> -		 */
> -		copy_buf_to_pkt(&eth_hdr, sizeof(eth_hdr), pkt, 0);
> -		copy_buf_to_pkt(&pkt_ip_hdr, sizeof(pkt_ip_hdr), pkt,
> -				sizeof(struct ether_hdr));
> -		copy_buf_to_pkt(&pkt_udp_hdr, sizeof(pkt_udp_hdr), pkt,
> -				sizeof(struct ether_hdr) +
> -				sizeof(struct ipv4_hdr));
> -
> -		/*
> -		 * Complete first mbuf of packet and append it to the
> -		 * burst of packets to be transmitted.
> -		 */
> -		pkt->nb_segs = nb_segs;
> -		pkt->pkt_len = pkt_len;
> -		pkt->ol_flags = ol_flags;
> -		pkt->vlan_tci = vlan_tci;
> -		pkt->vlan_tci_outer = vlan_tci_outer;
> -		pkt->l2_len = sizeof(struct ether_hdr);
> -		pkt->l3_len = sizeof(struct ipv4_hdr);
> -		pkts_burst[nb_pkt] = pkt;
> +	/*
> +	 * Initialize Ethernet header.
> +	 */
> +	ether_addr_copy(&peer_eth_addrs[fs->peer_addr], &eth_hdr.d_addr);
> +	ether_addr_copy(&ports[fs->tx_port].eth_addr, &eth_hdr.s_addr);
> +	eth_hdr.ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);
> +
> +	if (rte_mempool_get_bulk(mbp, (void **)pkts_burst, nb_pkt_per_burst))
> +		return;

Before the patch the code survived an insufficient-mbufs condition and
sent as much as it could allocate. Now it does not. I can't say for sure
whether the new behaviour is acceptable (I'd say no), but even if it is
acceptable it should be highlighted in the changeset description.

Taking segment allocation into account, may I suggest considering a
slightly more sophisticated implementation which allocates packets in
bulk, falls back to individual mbuf allocation when the bulk get fails,
and uses the same mechanism for all segments (i.e. allocate a bulk, use
it, allocate the next one, use it, etc.).
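
Something along these lines, as a rough sketch for txonly.c (the helper
name is made up; mbp, pkts_burst and nb_pkt_per_burst are the existing
testpmd symbols):

static inline uint16_t
pkt_burst_alloc(struct rte_mempool *mbp, struct rte_mbuf **pkts_burst,
		uint16_t nb_pkt_per_burst)
{
	uint16_t nb_alloc;

	/* Fast path: take the whole burst from the pool in one call. */
	if (rte_mempool_get_bulk(mbp, (void **)pkts_burst,
				 nb_pkt_per_burst) == 0)
		return nb_pkt_per_burst;

	/* Bulk get is all-or-nothing; fall back to per-mbuf allocation
	 * so that a partially filled burst can still be transmitted.
	 */
	for (nb_alloc = 0; nb_alloc < nb_pkt_per_burst; nb_alloc++) {
		pkts_burst[nb_alloc] = rte_mbuf_raw_alloc(mbp);
		if (pkts_burst[nb_alloc] == NULL)
			break;
	}
	return nb_alloc;
}

The caller would then loop over the nb_alloc mbufs it actually got
instead of assuming a full burst.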

> +
> +	for (nb_pkt = 0; nb_pkt < nb_pkt_per_burst; nb_pkt++) {
> +		if (unlikely(!pkt_burst_prepare(pkts_burst[nb_pkt], mbp,
> +				&eth_hdr, vlan_tci, vlan_tci_outer, ol_flags)))
> +			goto tx_pkts;

If segment allocation fails, who frees remaining packets from the bulk?
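
One possible way to handle it, sketched on top of this patch
(pkt_burst_prepare() already frees the mbuf it was preparing, so only
the untouched tail of the bulk has to go back to the pool):

	for (nb_pkt = 0; nb_pkt < nb_pkt_per_burst; nb_pkt++) {
		if (unlikely(!pkt_burst_prepare(pkts_burst[nb_pkt], mbp,
				&eth_hdr, vlan_tci, vlan_tci_outer,
				ol_flags))) {
			/* pkts_burst[nb_pkt] was freed inside
			 * pkt_burst_prepare(); return the rest of the
			 * bulk to the pool and transmit what is ready.
			 */
			if (nb_pkt + 1 < nb_pkt_per_burst)
				rte_mempool_put_bulk(mbp,
					(void **)&pkts_burst[nb_pkt + 1],
					nb_pkt_per_burst - nb_pkt - 1);
			break;
		}
	}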

>   	}
> +tx_pkts:
> +
>   	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_pkt);
>   	/*
>   	 * Retry if necessary
  
Pavan Nikhilesh Bhagavatula March 1, 2019, 8:45 a.m. UTC | #2
On Fri, 2019-03-01 at 10:38 +0300, Andrew Rybchenko wrote:
> On 2/28/19 10:42 PM, Pavan Nikhilesh Bhagavatula wrote:
> > From: Pavan Nikhilesh <pbhagavatula@marvell.com>
> > 
> > Use mempool bulk get ops to allocate a burst of packets and process
> > them instead of allocating each mbuf individually.
> > 
> > Signed-off-by: Pavan Nikhilesh <pbhagavatula@marvell.com>
> > ---
> >  app/test-pmd/txonly.c | 139 +++++++++++++++++++++-----------------
> > ----
> >  1 file changed, 71 insertions(+), 68 deletions(-)
> > 
> > diff --git a/app/test-pmd/txonly.c b/app/test-pmd/txonly.c
> > index 1f08b6ed3..eef8b3a45 100644
> > --- a/app/test-pmd/txonly.c
> > +++ b/app/test-pmd/txonly.c
> > @@ -147,6 +147,61 @@ setup_pkt_udp_ip_headers(struct ipv4_hdr
> > *ip_hdr,
> >  	ip_hdr->hdr_checksum = (uint16_t) ip_cksum;
> >  }
> >  
> > +static inline bool
> > +pkt_burst_prepare(struct rte_mbuf *pkt, struct rte_mempool *mbp,
> > +		struct ether_hdr *eth_hdr, const uint16_t vlan_tci,
> > +		const uint16_t vlan_tci_outer, const uint64_t ol_flags)
> > +{
> > +	uint32_t nb_segs, pkt_len = 0;
> > +	struct rte_mbuf *pkt_seg;
> > +	uint8_t i;
> > +
> > +	if (unlikely(tx_pkt_split == TX_PKT_SPLIT_RND))
> > +		nb_segs = random() % tx_pkt_nb_segs + 1;
> > +	else
> > +		nb_segs = tx_pkt_nb_segs;
> > +
> > +	rte_pktmbuf_reset_headroom(pkt);
> > +	pkt->data_len = tx_pkt_seg_lengths[0];
> > +	pkt->ol_flags = ol_flags;
> > +	pkt->vlan_tci = vlan_tci;
> > +	pkt->vlan_tci_outer = vlan_tci_outer;
> > +	pkt->l2_len = sizeof(struct ether_hdr);
> > +	pkt->l3_len = sizeof(struct ipv4_hdr);
> > +
> > +	pkt_seg = pkt;
> > +	for (i = 1; i < nb_segs; i++) {
> > +		pkt_seg->next = rte_mbuf_raw_alloc(mbp);
>  
> Why is bulk allocation not used here?

Will update in v2.

> 
> > +		if (pkt_seg->next == NULL) {
> > +			pkt->nb_segs = i;
> > +			rte_pktmbuf_free(pkt);
> > +			return false;
> > +		}
> > +		pkt_seg = pkt_seg->next;
> > +		pkt_seg->data_len = tx_pkt_seg_lengths[i];
> > +		pkt_len += pkt_seg->data_len;
> > +	}
> > +	pkt_seg->next = NULL; /* Last segment of packet. */
> > +	/*
> > +	 * Copy headers in first packet segment(s).
> > +	 */
> > +	copy_buf_to_pkt(eth_hdr, sizeof(eth_hdr), pkt, 0);
> > +	copy_buf_to_pkt(&pkt_ip_hdr, sizeof(pkt_ip_hdr), pkt,
> > +			sizeof(struct ether_hdr));
> > +	copy_buf_to_pkt(&pkt_udp_hdr, sizeof(pkt_udp_hdr), pkt,
> > +			sizeof(struct ether_hdr) +
> > +			sizeof(struct ipv4_hdr));
> > +
> > +	/*
> > +	 * Complete first mbuf of packet and append it to the
> > +	 * burst of packets to be transmitted.
> > +	 */
> > +	pkt->nb_segs = nb_segs;
> > +	pkt->pkt_len += pkt_len;
> > +
> > +	return true;
> > +}
> > +
> >  /*
> >   * Transmit a burst of multi-segments packets.
> >   */
> > @@ -155,8 +210,6 @@ pkt_burst_transmit(struct fwd_stream *fs)
> >  {
> >  	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
> >  	struct rte_port *txp;
> > -	struct rte_mbuf *pkt;
> > -	struct rte_mbuf *pkt_seg;
> >  	struct rte_mempool *mbp;
> >  	struct ether_hdr eth_hdr;
> >  	uint16_t nb_tx;
> > @@ -164,14 +217,12 @@ pkt_burst_transmit(struct fwd_stream *fs)
> >  	uint16_t vlan_tci, vlan_tci_outer;
> >  	uint32_t retry;
> >  	uint64_t ol_flags = 0;
> > -	uint8_t  i;
> >  	uint64_t tx_offloads;
> >  #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> >  	uint64_t start_tsc;
> >  	uint64_t end_tsc;
> >  	uint64_t core_cycles;
> >  #endif
> > -	uint32_t nb_segs, pkt_len;
> >  
> >  #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
> >  	start_tsc = rte_rdtsc();
> > @@ -188,72 +239,24 @@ pkt_burst_transmit(struct fwd_stream *fs)
> >  		ol_flags |= PKT_TX_QINQ_PKT;
> >  	if (tx_offloads & DEV_TX_OFFLOAD_MACSEC_INSERT)
> >  		ol_flags |= PKT_TX_MACSEC;
> > -	for (nb_pkt = 0; nb_pkt < nb_pkt_per_burst; nb_pkt++) {
> > -		pkt = rte_mbuf_raw_alloc(mbp);
> > -		if (pkt == NULL) {
> > -		nomore_mbuf:
> > -			if (nb_pkt == 0)
> > -				return;
> > -			break;
> > -		}
> >  
> > -		/*
> > -		 * Using raw alloc is good to improve performance,
> > -		 * but some consumers may use the headroom and so
> > -		 * decrement data_off. We need to make sure it is
> > -		 * reset to default value.
> > -		 */
> > -		rte_pktmbuf_reset_headroom(pkt);
> > -		pkt->data_len = tx_pkt_seg_lengths[0];
> > -		pkt_seg = pkt;
> > -		if (tx_pkt_split == TX_PKT_SPLIT_RND)
> > -			nb_segs = random() % tx_pkt_nb_segs + 1;
> > -		else
> > -			nb_segs = tx_pkt_nb_segs;
> > -		pkt_len = pkt->data_len;
> > -		for (i = 1; i < nb_segs; i++) {
> > -			pkt_seg->next = rte_mbuf_raw_alloc(mbp);
> > -			if (pkt_seg->next == NULL) {
> > -				pkt->nb_segs = i;
> > -				rte_pktmbuf_free(pkt);
> > -				goto nomore_mbuf;
> > -			}
> > -			pkt_seg = pkt_seg->next;
> > -			pkt_seg->data_len = tx_pkt_seg_lengths[i];
> > -			pkt_len += pkt_seg->data_len;
> > -		}
> > -		pkt_seg->next = NULL; /* Last segment of packet. */
> > -
> > -		/*
> > -		 * Initialize Ethernet header.
> > -		 */
> > -		ether_addr_copy(&peer_eth_addrs[fs-
> > >peer_addr],&eth_hdr.d_addr);
> > -		ether_addr_copy(&ports[fs->tx_port].eth_addr,
> > &eth_hdr.s_addr);
> > -		eth_hdr.ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);
> > -
> > -		/*
> > -		 * Copy headers in first packet segment(s).
> > -		 */
> > -		copy_buf_to_pkt(&eth_hdr, sizeof(eth_hdr), pkt, 0);
> > -		copy_buf_to_pkt(&pkt_ip_hdr, sizeof(pkt_ip_hdr), pkt,
> > -				sizeof(struct ether_hdr));
> > -		copy_buf_to_pkt(&pkt_udp_hdr, sizeof(pkt_udp_hdr), pkt,
> > -				sizeof(struct ether_hdr) +
> > -				sizeof(struct ipv4_hdr));
> > -
> > -		/*
> > -		 * Complete first mbuf of packet and append it to the
> > -		 * burst of packets to be transmitted.
> > -		 */
> > -		pkt->nb_segs = nb_segs;
> > -		pkt->pkt_len = pkt_len;
> > -		pkt->ol_flags = ol_flags;
> > -		pkt->vlan_tci = vlan_tci;
> > -		pkt->vlan_tci_outer = vlan_tci_outer;
> > -		pkt->l2_len = sizeof(struct ether_hdr);
> > -		pkt->l3_len = sizeof(struct ipv4_hdr);
> > -		pkts_burst[nb_pkt] = pkt;
> > +	/*
> > +	 * Initialize Ethernet header.
> > +	 */
> > +	ether_addr_copy(&peer_eth_addrs[fs->peer_addr],
> > &eth_hdr.d_addr);
> > +	ether_addr_copy(&ports[fs->tx_port].eth_addr, &eth_hdr.s_addr);
> > +	eth_hdr.ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);
> > +
> > +	if (rte_mempool_get_bulk(mbp, (void **)pkts_burst,
> > nb_pkt_per_burst))
> > +		return;
>  
> Before the patch the code survived an insufficient-mbufs condition and
> sent as much as it could allocate. Now it does not. I can't say for sure
> whether the new behaviour is acceptable (I'd say no), but even if it is
> acceptable it should be highlighted in the changeset description.
> 

Ack.

> Taking segment allocation into account, may I suggest considering a
> slightly more sophisticated implementation which allocates packets in
> bulk, falls back to individual mbuf allocation when the bulk get fails,
> and uses the same mechanism for all segments (i.e. allocate a bulk,
> use it, allocate the next one, use it, etc.).

I think segment allocation can always use a bulk get, since on failure
the entire packet chain should be freed anyway.
I will send v2 with a fallback to individual mbuf allocations.
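
For reference, a rough sketch of how the segment side of
pkt_burst_prepare() might look in v2 (RTE_MAX_SEGS_PER_PKT and the
tx_pkt_* globals are the existing testpmd definitions; the actual v2
code may differ):

	struct rte_mbuf *pkt_segs[RTE_MAX_SEGS_PER_PKT];
	uint32_t nb_segs, pkt_len = 0;
	struct rte_mbuf *pkt_seg;
	uint8_t i;

	if (unlikely(tx_pkt_split == TX_PKT_SPLIT_RND))
		nb_segs = random() % tx_pkt_nb_segs + 1;
	else
		nb_segs = tx_pkt_nb_segs;

	/* Grab all extra segments in one bulk get; on failure drop the
	 * whole packet so the caller sees a clean per-packet failure.
	 */
	if (nb_segs > 1 &&
	    rte_mempool_get_bulk(mbp, (void **)pkt_segs, nb_segs - 1)) {
		rte_pktmbuf_free(pkt);
		return false;
	}

	/* ... headroom reset and per-packet fields as in this patch ... */

	pkt_seg = pkt;
	for (i = 1; i < nb_segs; i++) {
		pkt_seg->next = pkt_segs[i - 1];
		pkt_seg = pkt_seg->next;
		pkt_seg->data_len = tx_pkt_seg_lengths[i];
		pkt_len += pkt_seg->data_len;
	}
	pkt_seg->next = NULL; /* Last segment of packet. */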

> 
> > +
> > +	for (nb_pkt = 0; nb_pkt < nb_pkt_per_burst; nb_pkt++) {
> > +		if (unlikely(!pkt_burst_prepare(pkts_burst[nb_pkt],
> > mbp,
> > +				&eth_hdr, vlan_tci, vlan_tci_outer,
> > ol_flags)))
> > +			goto tx_pkts;
>  
> If segment allocation fails, who frees remaining packets from the
> bulk?

My bad. Thanks for the heads up.

> 
> >  	}
> > +tx_pkts:
> > +
> >  	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst,
> > nb_pkt);
> >  	/*
> >  	 * Retry if necessary
>
  

Patch

diff --git a/app/test-pmd/txonly.c b/app/test-pmd/txonly.c
index 1f08b6ed3..eef8b3a45 100644
--- a/app/test-pmd/txonly.c
+++ b/app/test-pmd/txonly.c
@@ -147,6 +147,61 @@  setup_pkt_udp_ip_headers(struct ipv4_hdr *ip_hdr,
 	ip_hdr->hdr_checksum = (uint16_t) ip_cksum;
 }
 
+static inline bool
+pkt_burst_prepare(struct rte_mbuf *pkt, struct rte_mempool *mbp,
+		struct ether_hdr *eth_hdr, const uint16_t vlan_tci,
+		const uint16_t vlan_tci_outer, const uint64_t ol_flags)
+{
+	uint32_t nb_segs, pkt_len = 0;
+	struct rte_mbuf *pkt_seg;
+	uint8_t i;
+
+	if (unlikely(tx_pkt_split == TX_PKT_SPLIT_RND))
+		nb_segs = random() % tx_pkt_nb_segs + 1;
+	else
+		nb_segs = tx_pkt_nb_segs;
+
+	rte_pktmbuf_reset_headroom(pkt);
+	pkt->data_len = tx_pkt_seg_lengths[0];
+	pkt->ol_flags = ol_flags;
+	pkt->vlan_tci = vlan_tci;
+	pkt->vlan_tci_outer = vlan_tci_outer;
+	pkt->l2_len = sizeof(struct ether_hdr);
+	pkt->l3_len = sizeof(struct ipv4_hdr);
+
+	pkt_seg = pkt;
+	for (i = 1; i < nb_segs; i++) {
+		pkt_seg->next = rte_mbuf_raw_alloc(mbp);
+		if (pkt_seg->next == NULL) {
+			pkt->nb_segs = i;
+			rte_pktmbuf_free(pkt);
+			return false;
+		}
+		pkt_seg = pkt_seg->next;
+		pkt_seg->data_len = tx_pkt_seg_lengths[i];
+		pkt_len += pkt_seg->data_len;
+	}
+	pkt_seg->next = NULL; /* Last segment of packet. */
+	/*
+	 * Copy headers in first packet segment(s).
+	 */
+	copy_buf_to_pkt(eth_hdr, sizeof(eth_hdr), pkt, 0);
+	copy_buf_to_pkt(&pkt_ip_hdr, sizeof(pkt_ip_hdr), pkt,
+			sizeof(struct ether_hdr));
+	copy_buf_to_pkt(&pkt_udp_hdr, sizeof(pkt_udp_hdr), pkt,
+			sizeof(struct ether_hdr) +
+			sizeof(struct ipv4_hdr));
+
+	/*
+	 * Complete first mbuf of packet and append it to the
+	 * burst of packets to be transmitted.
+	 */
+	pkt->nb_segs = nb_segs;
+	pkt->pkt_len += pkt_len;
+
+	return true;
+}
+
 /*
  * Transmit a burst of multi-segments packets.
  */
@@ -155,8 +210,6 @@  pkt_burst_transmit(struct fwd_stream *fs)
 {
 	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
 	struct rte_port *txp;
-	struct rte_mbuf *pkt;
-	struct rte_mbuf *pkt_seg;
 	struct rte_mempool *mbp;
 	struct ether_hdr eth_hdr;
 	uint16_t nb_tx;
@@ -164,14 +217,12 @@  pkt_burst_transmit(struct fwd_stream *fs)
 	uint16_t vlan_tci, vlan_tci_outer;
 	uint32_t retry;
 	uint64_t ol_flags = 0;
-	uint8_t  i;
 	uint64_t tx_offloads;
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
 	uint64_t start_tsc;
 	uint64_t end_tsc;
 	uint64_t core_cycles;
 #endif
-	uint32_t nb_segs, pkt_len;
 
 #ifdef RTE_TEST_PMD_RECORD_CORE_CYCLES
 	start_tsc = rte_rdtsc();
@@ -188,72 +239,24 @@  pkt_burst_transmit(struct fwd_stream *fs)
 		ol_flags |= PKT_TX_QINQ_PKT;
 	if (tx_offloads & DEV_TX_OFFLOAD_MACSEC_INSERT)
 		ol_flags |= PKT_TX_MACSEC;
-	for (nb_pkt = 0; nb_pkt < nb_pkt_per_burst; nb_pkt++) {
-		pkt = rte_mbuf_raw_alloc(mbp);
-		if (pkt == NULL) {
-		nomore_mbuf:
-			if (nb_pkt == 0)
-				return;
-			break;
-		}
 
-		/*
-		 * Using raw alloc is good to improve performance,
-		 * but some consumers may use the headroom and so
-		 * decrement data_off. We need to make sure it is
-		 * reset to default value.
-		 */
-		rte_pktmbuf_reset_headroom(pkt);
-		pkt->data_len = tx_pkt_seg_lengths[0];
-		pkt_seg = pkt;
-		if (tx_pkt_split == TX_PKT_SPLIT_RND)
-			nb_segs = random() % tx_pkt_nb_segs + 1;
-		else
-			nb_segs = tx_pkt_nb_segs;
-		pkt_len = pkt->data_len;
-		for (i = 1; i < nb_segs; i++) {
-			pkt_seg->next = rte_mbuf_raw_alloc(mbp);
-			if (pkt_seg->next == NULL) {
-				pkt->nb_segs = i;
-				rte_pktmbuf_free(pkt);
-				goto nomore_mbuf;
-			}
-			pkt_seg = pkt_seg->next;
-			pkt_seg->data_len = tx_pkt_seg_lengths[i];
-			pkt_len += pkt_seg->data_len;
-		}
-		pkt_seg->next = NULL; /* Last segment of packet. */
-
-		/*
-		 * Initialize Ethernet header.
-		 */
-		ether_addr_copy(&peer_eth_addrs[fs->peer_addr],&eth_hdr.d_addr);
-		ether_addr_copy(&ports[fs->tx_port].eth_addr, &eth_hdr.s_addr);
-		eth_hdr.ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);
-
-		/*
-		 * Copy headers in first packet segment(s).
-		 */
-		copy_buf_to_pkt(&eth_hdr, sizeof(eth_hdr), pkt, 0);
-		copy_buf_to_pkt(&pkt_ip_hdr, sizeof(pkt_ip_hdr), pkt,
-				sizeof(struct ether_hdr));
-		copy_buf_to_pkt(&pkt_udp_hdr, sizeof(pkt_udp_hdr), pkt,
-				sizeof(struct ether_hdr) +
-				sizeof(struct ipv4_hdr));
-
-		/*
-		 * Complete first mbuf of packet and append it to the
-		 * burst of packets to be transmitted.
-		 */
-		pkt->nb_segs = nb_segs;
-		pkt->pkt_len = pkt_len;
-		pkt->ol_flags = ol_flags;
-		pkt->vlan_tci = vlan_tci;
-		pkt->vlan_tci_outer = vlan_tci_outer;
-		pkt->l2_len = sizeof(struct ether_hdr);
-		pkt->l3_len = sizeof(struct ipv4_hdr);
-		pkts_burst[nb_pkt] = pkt;
+	/*
+	 * Initialize Ethernet header.
+	 */
+	ether_addr_copy(&peer_eth_addrs[fs->peer_addr], &eth_hdr.d_addr);
+	ether_addr_copy(&ports[fs->tx_port].eth_addr, &eth_hdr.s_addr);
+	eth_hdr.ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4);
+
+	if (rte_mempool_get_bulk(mbp, (void **)pkts_burst, nb_pkt_per_burst))
+		return;
+
+	for (nb_pkt = 0; nb_pkt < nb_pkt_per_burst; nb_pkt++) {
+		if (unlikely(!pkt_burst_prepare(pkts_burst[nb_pkt], mbp,
+				&eth_hdr, vlan_tci, vlan_tci_outer, ol_flags)))
+			goto tx_pkts;
 	}
+tx_pkts:
+
 	nb_tx = rte_eth_tx_burst(fs->tx_port, fs->tx_queue, pkts_burst, nb_pkt);
 	/*
 	 * Retry if necessary