[dpdk-stable] [EXT] patch 'net/qede: fix performance bottleneck in Rx path' has been queued to LTS release 17.11.6

Shahed Shaikh shshaikh at marvell.com
Tue Mar 12 18:04:26 CET 2019


> -----Original Message-----
> From: Yongseok Koh <yskoh at mellanox.com>
> Sent: Friday, March 8, 2019 11:18 PM
> To: Shahed Shaikh <shshaikh at marvell.com>
> Cc: Rasesh Mody <rmody at marvell.com>; dpdk stable <stable at dpdk.org>
> Subject: [EXT] patch 'net/qede: fix performance bottleneck in Rx path' has been
> queued to LTS release 17.11.6
> Hi,
> 
> FYI, your patch has been queued to LTS release 17.11.6
> 
> Note it hasn't been pushed to http://dpdk.org/browse/dpdk-stable yet.
> It will be pushed if I get no objection by 03/13/19. So please shout if anyone has
> objection.

Hi,

We recently found that this patch introduces a regression for which I have just sent out a fix.
http://patchwork.dpdk.org/patch/51134/   ("net/qede: fix receive packet drop")

How do we handle such a situation? Will you also pull the above fix once it gets accepted, or ignore the current patch for 17.11.6 and consider it for the next release?
Please advise.

Thanks,
Shahed
 

> 
> Also note that after the patch there's a diff of the upstream commit vs the patch
> applied to the branch. If the code is different (ie: not only metadata diffs), due
> for example to a change in context or macro names, please double check it.
> 
> Thanks.
> 
> Yongseok
> 
> ---
> From f4f2aff537e1ff13ed85a9d4e52038ca34e7e005 Mon Sep 17 00:00:00 2001
> From: Shahed Shaikh <shshaikh at marvell.com>
> Date: Fri, 18 Jan 2019 02:29:29 -0800
> Subject: [PATCH] net/qede: fix performance bottleneck in Rx path
> 
> [ upstream commit 8f2312474529ad7ff0e4b65b82efc8530e7484ce ]
> 
> Allocating replacement buffer per received packet is expensive.
> Instead, process received packets first and allocate replacement buffers in bulk
> later.
> 
> This improves performance by ~25% in terms of PPS on AMD platforms.
> 
> Fixes: 2ea6f76aff40 ("qede: add core driver")
> 
> Signed-off-by: Shahed Shaikh <shshaikh at marvell.com>
> Acked-by: Rasesh Mody <rmody at marvell.com>
> ---
>  drivers/net/qede/qede_rxtx.c | 97 +++++++++++++++++++++++++++++++++-----
> ------
>  drivers/net/qede/qede_rxtx.h |  2 +
>  2 files changed, 75 insertions(+), 24 deletions(-)
> 
> diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c index
> cdb85c218..b525075ca 100644
> --- a/drivers/net/qede/qede_rxtx.c
> +++ b/drivers/net/qede/qede_rxtx.c
> @@ -37,6 +37,52 @@ static inline int qede_alloc_rx_buffer(struct
> qede_rx_queue *rxq)
>  	return 0;
>  }
> 
> +#define QEDE_MAX_BULK_ALLOC_COUNT 512
> +
> +static inline int qede_alloc_rx_bulk_mbufs(struct qede_rx_queue *rxq,
> +int count) {
> +	void *obj_p[QEDE_MAX_BULK_ALLOC_COUNT] __rte_cache_aligned;
> +	struct rte_mbuf *mbuf = NULL;
> +	struct eth_rx_bd *rx_bd;
> +	dma_addr_t mapping;
> +	int i, ret = 0;
> +	uint16_t idx;
> +
> +	idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
> +
> +	if (count > QEDE_MAX_BULK_ALLOC_COUNT)
> +		count = QEDE_MAX_BULK_ALLOC_COUNT;
> +
> +	ret = rte_mempool_get_bulk(rxq->mb_pool, obj_p, count);
> +	if (unlikely(ret)) {
> +		PMD_RX_LOG(ERR, rxq,
> +			   "Failed to allocate %d rx buffers "
> +			    "sw_rx_prod %u sw_rx_cons %u mp entries %u free
> %u",
> +			    count, idx, rxq->sw_rx_cons & NUM_RX_BDS(rxq),
> +			    rte_mempool_avail_count(rxq->mb_pool),
> +			    rte_mempool_in_use_count(rxq->mb_pool));
> +		return -ENOMEM;
> +	}
> +
> +	for (i = 0; i < count; i++) {
> +		mbuf = obj_p[i];
> +		if (likely(i < count - 1))
> +			rte_prefetch0(obj_p[i + 1]);
> +
> +		idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
> +		rxq->sw_rx_ring[idx].mbuf = mbuf;
> +		rxq->sw_rx_ring[idx].page_offset = 0;
> +		mapping = rte_mbuf_data_iova_default(mbuf);
> +		rx_bd = (struct eth_rx_bd *)
> +			ecore_chain_produce(&rxq->rx_bd_ring);
> +		rx_bd->addr.hi = rte_cpu_to_le_32(U64_HI(mapping));
> +		rx_bd->addr.lo = rte_cpu_to_le_32(U64_LO(mapping));
> +		rxq->sw_rx_prod++;
> +	}
> +
> +	return 0;
> +}
> +
>  /* Criterias for calculating Rx buffer size -
>   * 1) rx_buf_size should not exceed the size of mbuf
>   * 2) In scattered_rx mode - minimum rx_buf_size should be @@ -1134,7
> +1180,7 @@ qede_reuse_page(__rte_unused struct qede_dev *qdev,
>  		struct qede_rx_queue *rxq, struct qede_rx_entry *curr_cons)  {
>  	struct eth_rx_bd *rx_bd_prod = ecore_chain_produce(&rxq-
> >rx_bd_ring);
> -	uint16_t idx = rxq->sw_rx_cons & NUM_RX_BDS(rxq);
> +	uint16_t idx = rxq->sw_rx_prod & NUM_RX_BDS(rxq);
>  	struct qede_rx_entry *curr_prod;
>  	dma_addr_t new_mapping;
> 
> @@ -1367,7 +1413,6 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf
> **rx_pkts, uint16_t nb_pkts)
>  	uint8_t bitfield_val;
>  #endif
>  	uint8_t tunn_parse_flag;
> -	uint8_t j;
>  	struct eth_fast_path_rx_tpa_start_cqe *cqe_start_tpa;
>  	uint64_t ol_flags;
>  	uint32_t packet_type;
> @@ -1376,6 +1421,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf
> **rx_pkts, uint16_t nb_pkts)
>  	uint8_t offset, tpa_agg_idx, flags;
>  	struct qede_agg_info *tpa_info = NULL;
>  	uint32_t rss_hash;
> +	int rx_alloc_count = 0;
> 
>  	hw_comp_cons = rte_le_to_cpu_16(*rxq->hw_cons_ptr);
>  	sw_comp_cons = ecore_chain_get_cons_idx(&rxq->rx_comp_ring);
> @@ -1385,6 +1431,25 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf
> **rx_pkts, uint16_t nb_pkts)
>  	if (hw_comp_cons == sw_comp_cons)
>  		return 0;
> 
> +	/* Allocate buffers that we used in previous loop */
> +	if (rxq->rx_alloc_count) {
> +		if (unlikely(qede_alloc_rx_bulk_mbufs(rxq,
> +			     rxq->rx_alloc_count))) {
> +			struct rte_eth_dev *dev;
> +
> +			PMD_RX_LOG(ERR, rxq,
> +				   "New buffer allocation failed,"
> +				   "dropping incoming packet\n");
> +			dev = &rte_eth_devices[rxq->port_id];
> +			dev->data->rx_mbuf_alloc_failed +=
> +							rxq->rx_alloc_count;
> +			rxq->rx_alloc_errors += rxq->rx_alloc_count;
> +			return 0;
> +		}
> +		qede_update_rx_prod(qdev, rxq);
> +		rxq->rx_alloc_count = 0;
> +	}
> +
>  	while (sw_comp_cons != hw_comp_cons) {
>  		ol_flags = 0;
>  		packet_type = RTE_PTYPE_UNKNOWN;
> @@ -1556,16 +1621,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf
> **rx_pkts, uint16_t nb_pkts)
>  			rx_mb->hash.rss = rss_hash;
>  		}
> 
> -		if (unlikely(qede_alloc_rx_buffer(rxq) != 0)) {
> -			PMD_RX_LOG(ERR, rxq,
> -				   "New buffer allocation failed,"
> -				   "dropping incoming packet\n");
> -			qede_recycle_rx_bd_ring(rxq, qdev, fp_cqe->bd_num);
> -			rte_eth_devices[rxq->port_id].
> -			    data->rx_mbuf_alloc_failed++;
> -			rxq->rx_alloc_errors++;
> -			break;
> -		}
> +		rx_alloc_count++;
>  		qede_rx_bd_ring_consume(rxq);
> 
>  		if (!tpa_start_flg && fp_cqe->bd_num > 1) { @@ -1577,17
> +1633,9 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t
> nb_pkts)
>  			if (qede_process_sg_pkts(p_rxq, seg1, num_segs,
>  						 pkt_len - len))
>  				goto next_cqe;
> -			for (j = 0; j < num_segs; j++) {
> -				if (qede_alloc_rx_buffer(rxq)) {
> -					PMD_RX_LOG(ERR, rxq,
> -						"Buffer allocation failed");
> -					rte_eth_devices[rxq->port_id].
> -						data->rx_mbuf_alloc_failed++;
> -					rxq->rx_alloc_errors++;
> -					break;
> -				}
> -				rxq->rx_segs++;
> -			}
> +
> +			rx_alloc_count += num_segs;
> +			rxq->rx_segs += num_segs;
>  		}
>  		rxq->rx_segs++; /* for the first segment */
> 
> @@ -1629,7 +1677,8 @@ next_cqe:
>  		}
>  	}
> 
> -	qede_update_rx_prod(qdev, rxq);
> +	/* Request number of bufferes to be allocated in next loop */
> +	rxq->rx_alloc_count = rx_alloc_count;
> 
>  	rxq->rcv_pkts += rx_pkt;
> 
> diff --git a/drivers/net/qede/qede_rxtx.h b/drivers/net/qede/qede_rxtx.h index
> fe80237d5..574831e61 100644
> --- a/drivers/net/qede/qede_rxtx.h
> +++ b/drivers/net/qede/qede_rxtx.h
> @@ -196,6 +196,8 @@ struct qede_rx_queue {
>  	uint16_t queue_id;
>  	uint16_t port_id;
>  	uint16_t rx_buf_size;
> +	uint16_t rx_alloc_count;
> +	uint16_t unused;
>  	uint64_t rcv_pkts;
>  	uint64_t rx_segs;
>  	uint64_t rx_hw_errors;
> --
> 2.11.0
> 
> ---
>   Diff of the applied patch vs upstream commit (please double-check if non-
> empty:
> ---
> --- -	2019-03-08 09:46:43.105876155 -0800
> +++ 0059-net-qede-fix-performance-bottleneck-in-Rx-path.patch	2019-03-08
> 09:46:40.311403000 -0800
> @@ -1,8 +1,10 @@
> -From 8f2312474529ad7ff0e4b65b82efc8530e7484ce Mon Sep 17 00:00:00
> 2001
> +From f4f2aff537e1ff13ed85a9d4e52038ca34e7e005 Mon Sep 17 00:00:00
> 2001
>  From: Shahed Shaikh <shshaikh at marvell.com>
>  Date: Fri, 18 Jan 2019 02:29:29 -0800
>  Subject: [PATCH] net/qede: fix performance bottleneck in Rx path
> 
> +[ upstream commit 8f2312474529ad7ff0e4b65b82efc8530e7484ce ]
> +
>  Allocating replacement buffer per received packet is expensive.
>  Instead, process received packets first and allocate  replacement buffers in bulk
> later.
> @@ -11,7 +13,6 @@
>  platforms.
> 
>  Fixes: 2ea6f76aff40 ("qede: add core driver")
> -Cc: stable at dpdk.org
> 
>  Signed-off-by: Shahed Shaikh <shshaikh at marvell.com>
>  Acked-by: Rasesh Mody <rmody at marvell.com> @@ -21,10 +22,10 @@
>   2 files changed, 75 insertions(+), 24 deletions(-)
> 
>  diff --git a/drivers/net/qede/qede_rxtx.c b/drivers/net/qede/qede_rxtx.c -index
> 0e33be1a3..684c4aeef 100644
> +index cdb85c218..b525075ca 100644
>  --- a/drivers/net/qede/qede_rxtx.c
>  +++ b/drivers/net/qede/qede_rxtx.c
> -@@ -35,6 +35,52 @@ static inline int qede_alloc_rx_buffer(struct
> qede_rx_queue *rxq)
> +@@ -37,6 +37,52 @@ static inline int qede_alloc_rx_buffer(struct
> +qede_rx_queue *rxq)
>   	return 0;
>   }
> 
> @@ -77,7 +78,7 @@
>   /* Criterias for calculating Rx buffer size -
>    * 1) rx_buf_size should not exceed the size of mbuf
>    * 2) In scattered_rx mode - minimum rx_buf_size should be -@@ -1131,7
> +1177,7 @@ qede_reuse_page(__rte_unused struct qede_dev *qdev,
> +@@ -1134,7 +1180,7 @@ qede_reuse_page(__rte_unused struct qede_dev
> +*qdev,
>   		struct qede_rx_queue *rxq, struct qede_rx_entry *curr_cons)
>   {
>   	struct eth_rx_bd *rx_bd_prod = ecore_chain_produce(&rxq-
> >rx_bd_ring);
> @@ -86,7 +87,7 @@
>   	struct qede_rx_entry *curr_prod;
>   	dma_addr_t new_mapping;
> 
> -@@ -1364,7 +1410,6 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf
> **rx_pkts, uint16_t nb_pkts)
> +@@ -1367,7 +1413,6 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf
> +**rx_pkts, uint16_t nb_pkts)
>   	uint8_t bitfield_val;
>   #endif
>   	uint8_t tunn_parse_flag;
> @@ -94,7 +95,7 @@
>   	struct eth_fast_path_rx_tpa_start_cqe *cqe_start_tpa;
>   	uint64_t ol_flags;
>   	uint32_t packet_type;
> -@@ -1373,6 +1418,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf
> **rx_pkts, uint16_t nb_pkts)
> +@@ -1376,6 +1421,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf
> +**rx_pkts, uint16_t nb_pkts)
>   	uint8_t offset, tpa_agg_idx, flags;
>   	struct qede_agg_info *tpa_info = NULL;
>   	uint32_t rss_hash;
> @@ -102,7 +103,7 @@
> 
>   	hw_comp_cons = rte_le_to_cpu_16(*rxq->hw_cons_ptr);
>   	sw_comp_cons = ecore_chain_get_cons_idx(&rxq->rx_comp_ring);
> -@@ -1382,6 +1428,25 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf
> **rx_pkts, uint16_t nb_pkts)
> +@@ -1385,6 +1431,25 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf
> +**rx_pkts, uint16_t nb_pkts)
>   	if (hw_comp_cons == sw_comp_cons)
>   		return 0;
> 
> @@ -128,7 +129,7 @@
>   	while (sw_comp_cons != hw_comp_cons) {
>   		ol_flags = 0;
>   		packet_type = RTE_PTYPE_UNKNOWN;
> -@@ -1553,16 +1618,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf
> **rx_pkts, uint16_t nb_pkts)
> +@@ -1556,16 +1621,7 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf
> +**rx_pkts, uint16_t nb_pkts)
>   			rx_mb->hash.rss = rss_hash;
>   		}
> 
> @@ -146,7 +147,7 @@
>   		qede_rx_bd_ring_consume(rxq);
> 
>   		if (!tpa_start_flg && fp_cqe->bd_num > 1) { -@@ -1574,17
> +1630,9 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf **rx_pkts, uint16_t
> nb_pkts)
> +@@ -1577,17 +1633,9 @@ qede_recv_pkts(void *p_rxq, struct rte_mbuf
> +**rx_pkts, uint16_t nb_pkts)
>   			if (qede_process_sg_pkts(p_rxq, seg1, num_segs,
>   						 pkt_len - len))
>   				goto next_cqe;
> @@ -167,7 +168,7 @@
>   		}
>   		rxq->rx_segs++; /* for the first segment */
> 
> -@@ -1626,7 +1674,8 @@ next_cqe:
> +@@ -1629,7 +1677,8 @@ next_cqe:
>   		}
>   	}
> 
> @@ -178,10 +179,10 @@
>   	rxq->rcv_pkts += rx_pkt;
> 
>  diff --git a/drivers/net/qede/qede_rxtx.h b/drivers/net/qede/qede_rxtx.h -index
> 454daa07b..5b249cbb2 100644
> +index fe80237d5..574831e61 100644
>  --- a/drivers/net/qede/qede_rxtx.h
>  +++ b/drivers/net/qede/qede_rxtx.h
> -@@ -192,6 +192,8 @@ struct qede_rx_queue {
> +@@ -196,6 +196,8 @@ struct qede_rx_queue {
>   	uint16_t queue_id;
>   	uint16_t port_id;
>   	uint16_t rx_buf_size;


More information about the stable mailing list