[dpdk-dev] [PATCH v2] mbuf: add helpers to prefetch mbuf

Jerin Jacob jerin.jacob at caviumnetworks.com
Thu May 19 08:46:55 CEST 2016


On Wed, May 18, 2016 at 06:02:08PM +0200, Olivier Matz wrote:
> Some architectures (ex: Power8) have a cache line size of 128 bytes,
> so the drivers should not expect that prefetching the second part of
> the mbuf with rte_prefetch0(&m->cacheline1) is valid.
> 
> This commit adds helpers that can be used by drivers to prefetch the
> rx or tx part of the mbuf, whatever the cache line size.
> 
> Signed-off-by: Olivier Matz <olivier.matz at 6wind.com>

Reviewed-by: Jerin Jacob <jerin.jacob at caviumnetworks.com>

> ---
> 
> v1 -> v2:
> - rename part0 as part1 and part1 as part2, as suggested by Thomas
> 
> 
>  drivers/net/fm10k/fm10k_rxtx_vec.c |  8 ++++----
>  drivers/net/i40e/i40e_rxtx_vec.c   |  8 ++++----
>  drivers/net/ixgbe/ixgbe_rxtx_vec.c |  8 ++++----
>  drivers/net/mlx4/mlx4.c            |  4 ++--
>  drivers/net/mlx5/mlx5_rxtx.c       |  4 ++--
>  examples/ipsec-secgw/ipsec-secgw.c |  2 +-
>  lib/librte_mbuf/rte_mbuf.h         | 38 ++++++++++++++++++++++++++++++++++++++
>  7 files changed, 55 insertions(+), 17 deletions(-)
> 
> diff --git a/drivers/net/fm10k/fm10k_rxtx_vec.c b/drivers/net/fm10k/fm10k_rxtx_vec.c
> index 03e4a5c..ef256a5 100644
> --- a/drivers/net/fm10k/fm10k_rxtx_vec.c
> +++ b/drivers/net/fm10k/fm10k_rxtx_vec.c
> @@ -487,10 +487,10 @@ fm10k_recv_raw_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
>  		rte_compiler_barrier();
>  
>  		if (split_packet) {
> -			rte_prefetch0(&rx_pkts[pos]->cacheline1);
> -			rte_prefetch0(&rx_pkts[pos + 1]->cacheline1);
> -			rte_prefetch0(&rx_pkts[pos + 2]->cacheline1);
> -			rte_prefetch0(&rx_pkts[pos + 3]->cacheline1);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos]);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
>  		}
>  
>  		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
> diff --git a/drivers/net/i40e/i40e_rxtx_vec.c b/drivers/net/i40e/i40e_rxtx_vec.c
> index f7a62a8..eef80d9 100644
> --- a/drivers/net/i40e/i40e_rxtx_vec.c
> +++ b/drivers/net/i40e/i40e_rxtx_vec.c
> @@ -297,10 +297,10 @@ _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
>  		_mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2);
>  
>  		if (split_packet) {
> -			rte_prefetch0(&rx_pkts[pos]->cacheline1);
> -			rte_prefetch0(&rx_pkts[pos + 1]->cacheline1);
> -			rte_prefetch0(&rx_pkts[pos + 2]->cacheline1);
> -			rte_prefetch0(&rx_pkts[pos + 3]->cacheline1);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos]);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
>  		}
>  
>  		/* avoid compiler reorder optimization */
> diff --git a/drivers/net/ixgbe/ixgbe_rxtx_vec.c b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
> index c4d709b..e97ea82 100644
> --- a/drivers/net/ixgbe/ixgbe_rxtx_vec.c
> +++ b/drivers/net/ixgbe/ixgbe_rxtx_vec.c
> @@ -307,10 +307,10 @@ _recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
>  		_mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2);
>  
>  		if (split_packet) {
> -			rte_prefetch0(&rx_pkts[pos]->cacheline1);
> -			rte_prefetch0(&rx_pkts[pos + 1]->cacheline1);
> -			rte_prefetch0(&rx_pkts[pos + 2]->cacheline1);
> -			rte_prefetch0(&rx_pkts[pos + 3]->cacheline1);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos]);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
> +			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
>  		}
>  
>  		/* avoid compiler reorder optimization */
> diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
> index c5d8535..733d192 100644
> --- a/drivers/net/mlx4/mlx4.c
> +++ b/drivers/net/mlx4/mlx4.c
> @@ -3235,8 +3235,8 @@ mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
>  		 * Fetch initial bytes of packet descriptor into a
>  		 * cacheline while allocating rep.
>  		 */
> -		rte_prefetch0(seg);
> -		rte_prefetch0(&seg->cacheline1);
> +		rte_mbuf_prefetch_part1(seg);
> +		rte_mbuf_prefetch_part2(seg);
>  		ret = rxq->if_cq->poll_length_flags(rxq->cq, NULL, NULL,
>  						    &flags);
>  		if (unlikely(ret < 0)) {
> diff --git a/drivers/net/mlx5/mlx5_rxtx.c b/drivers/net/mlx5/mlx5_rxtx.c
> index 1832a21..5be8c62 100644
> --- a/drivers/net/mlx5/mlx5_rxtx.c
> +++ b/drivers/net/mlx5/mlx5_rxtx.c
> @@ -1086,8 +1086,8 @@ mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
>  		 * Fetch initial bytes of packet descriptor into a
>  		 * cacheline while allocating rep.
>  		 */
> -		rte_prefetch0(seg);
> -		rte_prefetch0(&seg->cacheline1);
> +		rte_mbuf_prefetch_part1(seg);
> +		rte_mbuf_prefetch_part2(seg);
>  		ret = rxq->poll(rxq->cq, NULL, NULL, &flags, &vlan_tci);
>  		if (unlikely(ret < 0)) {
>  			struct ibv_wc wc;
> diff --git a/examples/ipsec-secgw/ipsec-secgw.c b/examples/ipsec-secgw/ipsec-secgw.c
> index 1dc505c..ebd7c23 100644
> --- a/examples/ipsec-secgw/ipsec-secgw.c
> +++ b/examples/ipsec-secgw/ipsec-secgw.c
> @@ -298,7 +298,7 @@ prepare_tx_burst(struct rte_mbuf *pkts[], uint16_t nb_pkts, uint8_t port)
>  	const int32_t prefetch_offset = 2;
>  
>  	for (i = 0; i < (nb_pkts - prefetch_offset); i++) {
> -		rte_prefetch0(pkts[i + prefetch_offset]->cacheline1);
> +		rte_mbuf_prefetch_part2(pkts[i + prefetch_offset]);
>  		prepare_tx_pkt(pkts[i], port);
>  	}
>  	/* Process left packets */
> diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> index 7b92b88..3ee8d66 100644
> --- a/lib/librte_mbuf/rte_mbuf.h
> +++ b/lib/librte_mbuf/rte_mbuf.h
> @@ -842,6 +842,44 @@ struct rte_mbuf {
>  	uint16_t timesync;
>  } __rte_cache_aligned;
>  
> +/**
> + * Prefetch the first part of the mbuf
> + *
> + * The first 64 bytes of the mbuf correspond to fields that are used early
> + * in the receive path. If the cache line size of the architecture is larger
> + * than 64B, the second part will also be prefetched.
> + *
> + * @param m
> + *   The pointer to the mbuf.
> + */
> +static inline void
> +rte_mbuf_prefetch_part1(struct rte_mbuf *m)
> +{
> +	rte_prefetch0(&m->cacheline0);
> +}
> +
> +/**
> + * Prefetch the second part of the mbuf
> + *
> + * The next 64 bytes of the mbuf correspond to fields that are used in the
> + * transmit path. If the cache line size of the architecture is larger than
> + * 64B, this function does nothing as it is expected that the full mbuf is
> + * already in cache.
> + *
> + * @param m
> + *   The pointer to the mbuf.
> + */
> +static inline void
> +rte_mbuf_prefetch_part2(struct rte_mbuf *m)
> +{
> +#if RTE_CACHE_LINE_SIZE == 64
> +	rte_prefetch0(&m->cacheline1);
> +#else
> +	RTE_SET_USED(m);
> +#endif
> +}
> +
> +
>  static inline uint16_t rte_pktmbuf_priv_size(struct rte_mempool *mp);
>  
>  /**
> -- 
> 2.8.0.rc3
> 


More information about the dev mailing list