[dpdk-dev,v2,2/7] net/mlx4: inline more Tx functions

Message ID 1508768520-4810-3-git-send-email-ophirmu@mellanox.com (mailing list archive)
State Superseded, archived
Delegated to: Ferruh Yigit
Headers

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Ophir Munk Oct. 23, 2017, 2:21 p.m. UTC
  Change functions to inline on Tx fast path to improve performance

Inside the inline function call other functions to handle "unlikely"
cases such that the inline function code footprint is small.

Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
---
 drivers/net/mlx4/mlx4_rxtx.c | 43 ++++++------------------------------
 drivers/net/mlx4/mlx4_rxtx.h | 52 +++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 58 insertions(+), 37 deletions(-)
  

Comments

Adrien Mazarguil Oct. 25, 2017, 4:49 p.m. UTC | #1
Hi Ophir,

On Mon, Oct 23, 2017 at 02:21:55PM +0000, Ophir Munk wrote:
> Change functions to inline on Tx fast path to improve performance
> 
> Inside the inline function call other functions to handle "unlikely"
> cases such that the inline function code footprint is small.
> 
> Signed-off-by: Ophir Munk <ophirmu@mellanox.com>

Reading this, it sounds as if adding __rte_always_inline improves performance
by itself, which I doubt unless you can show proof through performance results.

When in doubt, leave it to the compiler, the static keyword is usually
enough of a hint. Too much forced inlining may actually be harmful.

What this patch really does is splitting the heavy lookup/registration
function in two halves with one small static inline function for the lookup
part that calls the separate registration part in the unlikely event MR is
not already registered.

Thankfully the compiler doesn't inline the large registration function back,
which results in the perceived performance improvement for the time being,
however there is no guarantee it won't happen in the future (you didn't use
the noinline keyword on the registration function for that).

Therefore I have a bunch of comments and suggestions, see below.

> ---
>  drivers/net/mlx4/mlx4_rxtx.c | 43 ++++++------------------------------
>  drivers/net/mlx4/mlx4_rxtx.h | 52 +++++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 58 insertions(+), 37 deletions(-)
> 
> diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
> index 011ea79..ae37f9b 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.c
> +++ b/drivers/net/mlx4/mlx4_rxtx.c
> @@ -220,54 +220,25 @@ mlx4_txq_complete(struct txq *txq)
>  	return 0;
>  }
>  
> -/**
> - * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from which
> - * the cloned mbuf is allocated is returned instead.
> - *
> - * @param buf
> - *   Pointer to mbuf.
> - *
> - * @return
> - *   Memory pool where data is located for given mbuf.
> - */
> -static struct rte_mempool *
> -mlx4_txq_mb2mp(struct rte_mbuf *buf)
> -{
> -	if (unlikely(RTE_MBUF_INDIRECT(buf)))
> -		return rte_mbuf_from_indirect(buf)->pool;
> -	return buf->pool;
> -}
>  
>  /**
> - * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
> - * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
> - * remove an entry first.
> + * Add memory region (MR) <-> memory pool (MP) association to txq->mp2mr[].
> + * If mp2mr[] is full, remove an entry first.
>   *
>   * @param txq
>   *   Pointer to Tx queue structure.
>   * @param[in] mp
> - *   Memory pool for which a memory region lkey must be returned.
> + *   Memory pool for which a memory region lkey must be added
> + * @param[in] i
> + *   Index in memory pool (MP) where to add memory region (MR)
>   *
>   * @return
> - *   mr->lkey on success, (uint32_t)-1 on failure.
> + *   Added mr->lkey on success, (uint32_t)-1 on failure.
>   */
> -uint32_t
> -mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
> +uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
>  {
> -	unsigned int i;
>  	struct ibv_mr *mr;
>  
> -	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
> -		if (unlikely(txq->mp2mr[i].mp == NULL)) {
> -			/* Unknown MP, add a new MR for it. */
> -			break;
> -		}
> -		if (txq->mp2mr[i].mp == mp) {
> -			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
> -			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
> -			return txq->mp2mr[i].lkey;
> -		}
> -	}
>  	/* Add a new entry, register MR first. */
>  	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
>  	      (void *)txq, mp->name, (void *)mp);
> diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
> index e10bbca..719ef45 100644
> --- a/drivers/net/mlx4/mlx4_rxtx.h
> +++ b/drivers/net/mlx4/mlx4_rxtx.h
> @@ -53,6 +53,7 @@
>  
>  #include "mlx4.h"
>  #include "mlx4_prm.h"
> +#include "mlx4_utils.h"

Why?

>  
>  /** Rx queue counters. */
>  struct mlx4_rxq_stats {
> @@ -160,7 +161,6 @@ void mlx4_rx_queue_release(void *dpdk_rxq);
>  
>  /* mlx4_rxtx.c */
>  
> -uint32_t mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp);
>  uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
>  		       uint16_t pkts_n);
>  uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts,
> @@ -169,6 +169,8 @@ uint16_t mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts,
>  			       uint16_t pkts_n);
>  uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
>  			       uint16_t pkts_n);
> +uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
> +				unsigned int i);
>  
>  /* mlx4_txq.c */
>  
> @@ -177,4 +179,52 @@ int mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
>  			const struct rte_eth_txconf *conf);
>  void mlx4_tx_queue_release(void *dpdk_txq);
>  
> +/**
> + * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from which
> + * the cloned mbuf is allocated is returned instead.
> + *
> + * @param buf
> + *   Pointer to mbuf.
> + *
> + * @return
> + *   Memory pool where data is located for given mbuf.
> + */
> +static __rte_always_inline struct rte_mempool *
> +mlx4_txq_mb2mp(struct rte_mbuf *buf)
> +{
> +	if (unlikely(RTE_MBUF_INDIRECT(buf)))
> +		return rte_mbuf_from_indirect(buf)->pool;
> +	return buf->pool;
> +}
> +
> +/**
> + * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
> + * Call mlx4_txq_add_mr() if MP is not registered yet.
> + *
> + * @param txq
> + *   Pointer to Tx queue structure.
> + * @param[in] mp
> + *   Memory pool for which a memory region lkey must be returned.
> + *
> + * @return
> + *   mr->lkey on success, (uint32_t)-1 on failure.
> + */
> +static __rte_always_inline uint32_t

Note __rte_always_inline is defined in rte_common.h and should be explicitly
included (however don't do that, see below).

> +mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
> +{
> +	unsigned int i;
> +
> +	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
> +		if (unlikely(txq->mp2mr[i].mp == NULL)) {
> +			/* Unknown MP, add a new MR for it. */
> +			break;
> +		}
> +		if (txq->mp2mr[i].mp == mp) {
> +			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
> +			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);

assert() requires assert.h (but don't include it, see subsequent
suggestion).

> +			return txq->mp2mr[i].lkey;
> +		}
> +	}
> +	return mlx4_txq_add_mr(txq, mp, i);
> +}
>  #endif /* MLX4_RXTX_H_ */

So as described above, these functions do not need the __rte_always_inline,
please remove it. They also do not need to be located in a header file; the
reason it's the case for their mlx5 counterparts is that they have to be
shared between vectorized/non-vectorized code. No such requirement here, you
should move them back to their original spot.

My suggestion for this performance improvement is to move mlx4_txq_add_mr()
to a different file, mlx4_mr.c looks like a good candidate. This fact will
ensure it's never inlined and far away from the data path.
  
Ophir Munk Oct. 25, 2017, 9:42 p.m. UTC | #2
Hi Adrien,

On Wednesday, October 25, 2017 7:50 PM, Adrien Mazarguil wrote:
> 
> Hi Ophir,
> 
> On Mon, Oct 23, 2017 at 02:21:55PM +0000, Ophir Munk wrote:
> > Change functions to inline on Tx fast path to improve performance
> >
> > Inside the inline function call other functions to handle "unlikely"
> > cases such that the inline function code footprint is small.
> >
> > Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
> 
> Reading this, it's like adding __rte_always_inline improves performance at
> all, which I doubt unless you can show proof through performance results.
> 
> When in doubt, leave it to the compiler, the static keyword is usually enough
> of a hint. Too much forced inlining may actually be harmful.
> 
> What this patch really does is splitting the heavy lookup/registration function
> in two halves with one small static inline function for the lookup part that
> calls the separate registration part in the unlikely event MR is not already
> registered.
> 
> Thankfully the compiler doesn't inline the large registration function back,
> which results in the perceived performance improvement for the time being,
> however there is no guarantee it won't happen in the future (you didn't use
> the noinline keyword on the registration function for that).
> 
> Therefore I have a bunch of comments and suggestions, see below.
> 
> > ---
> >  drivers/net/mlx4/mlx4_rxtx.c | 43
> > ++++++------------------------------
> >  drivers/net/mlx4/mlx4_rxtx.h | 52
> > +++++++++++++++++++++++++++++++++++++++++++-
> >  2 files changed, 58 insertions(+), 37 deletions(-)
> >
> > diff --git a/drivers/net/mlx4/mlx4_rxtx.c
> > b/drivers/net/mlx4/mlx4_rxtx.c index 011ea79..ae37f9b 100644
> > --- a/drivers/net/mlx4/mlx4_rxtx.c
> > +++ b/drivers/net/mlx4/mlx4_rxtx.c
> > @@ -220,54 +220,25 @@ mlx4_txq_complete(struct txq *txq)
> >  	return 0;
> >  }
> >
> > -/**
> > - * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from
> > which
> > - * the cloned mbuf is allocated is returned instead.
> > - *
> > - * @param buf
> > - *   Pointer to mbuf.
> > - *
> > - * @return
> > - *   Memory pool where data is located for given mbuf.
> > - */
> > -static struct rte_mempool *
> > -mlx4_txq_mb2mp(struct rte_mbuf *buf)
> > -{
> > -	if (unlikely(RTE_MBUF_INDIRECT(buf)))
> > -		return rte_mbuf_from_indirect(buf)->pool;
> > -	return buf->pool;
> > -}
> >
> >  /**
> > - * Get memory region (MR) <-> memory pool (MP) association from txq-
> >mp2mr[].
> > - * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is
> > full,
> > - * remove an entry first.
> > + * Add memory region (MR) <-> memory pool (MP) association to txq-
> >mp2mr[].
> > + * If mp2mr[] is full, remove an entry first.
> >   *
> >   * @param txq
> >   *   Pointer to Tx queue structure.
> >   * @param[in] mp
> > - *   Memory pool for which a memory region lkey must be returned.
> > + *   Memory pool for which a memory region lkey must be added
> > + * @param[in] i
> > + *   Index in memory pool (MP) where to add memory region (MR)
> >   *
> >   * @return
> > - *   mr->lkey on success, (uint32_t)-1 on failure.
> > + *   Added mr->lkey on success, (uint32_t)-1 on failure.
> >   */
> > -uint32_t
> > -mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
> > +uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
> > +uint32_t i)
> >  {
> > -	unsigned int i;
> >  	struct ibv_mr *mr;
> >
> > -	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
> > -		if (unlikely(txq->mp2mr[i].mp == NULL)) {
> > -			/* Unknown MP, add a new MR for it. */
> > -			break;
> > -		}
> > -		if (txq->mp2mr[i].mp == mp) {
> > -			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
> > -			assert(txq->mp2mr[i].mr->lkey == txq-
> >mp2mr[i].lkey);
> > -			return txq->mp2mr[i].lkey;
> > -		}
> > -	}
> >  	/* Add a new entry, register MR first. */
> >  	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
> >  	      (void *)txq, mp->name, (void *)mp); diff --git
> > a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h index
> > e10bbca..719ef45 100644
> > --- a/drivers/net/mlx4/mlx4_rxtx.h
> > +++ b/drivers/net/mlx4/mlx4_rxtx.h
> > @@ -53,6 +53,7 @@
> >
> >  #include "mlx4.h"
> >  #include "mlx4_prm.h"
> > +#include "mlx4_utils.h"
> 
> Why?
> 
> >
> >  /** Rx queue counters. */
> >  struct mlx4_rxq_stats {
> > @@ -160,7 +161,6 @@ void mlx4_rx_queue_release(void *dpdk_rxq);
> >
> >  /* mlx4_rxtx.c */
> >
> > -uint32_t mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp);
> > uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
> >  		       uint16_t pkts_n);
> >  uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, @@
> > -169,6 +169,8 @@ uint16_t mlx4_tx_burst_removed(void *dpdk_txq,
> struct rte_mbuf **pkts,
> >  			       uint16_t pkts_n);
> >  uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
> >  			       uint16_t pkts_n);
> > +uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
> > +				unsigned int i);
> >
> >  /* mlx4_txq.c */
> >
> > @@ -177,4 +179,52 @@ int mlx4_tx_queue_setup(struct rte_eth_dev
> *dev, uint16_t idx,
> >  			const struct rte_eth_txconf *conf);  void
> > mlx4_tx_queue_release(void *dpdk_txq);
> >
> > +/**
> > + * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from
> > +which
> > + * the cloned mbuf is allocated is returned instead.
> > + *
> > + * @param buf
> > + *   Pointer to mbuf.
> > + *
> > + * @return
> > + *   Memory pool where data is located for given mbuf.
> > + */
> > +static __rte_always_inline struct rte_mempool * mlx4_txq_mb2mp(struct
> > +rte_mbuf *buf) {
> > +	if (unlikely(RTE_MBUF_INDIRECT(buf)))
> > +		return rte_mbuf_from_indirect(buf)->pool;
> > +	return buf->pool;
> > +}
> > +
> > +/**
> > + * Get memory region (MR) <-> memory pool (MP) association from txq-
> >mp2mr[].
> > + * Call mlx4_txq_add_mr() if MP is not registered yet.
> > + *
> > + * @param txq
> > + *   Pointer to Tx queue structure.
> > + * @param[in] mp
> > + *   Memory pool for which a memory region lkey must be returned.
> > + *
> > + * @return
> > + *   mr->lkey on success, (uint32_t)-1 on failure.
> > + */
> > +static __rte_always_inline uint32_t
> 
> Note __rte_always_inline is defined in rte_common.h and should be
> explicitly included (however don't do that, see below).
> 
> > +mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp) {
> > +	unsigned int i;
> > +
> > +	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
> > +		if (unlikely(txq->mp2mr[i].mp == NULL)) {
> > +			/* Unknown MP, add a new MR for it. */
> > +			break;
> > +		}
> > +		if (txq->mp2mr[i].mp == mp) {
> > +			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
> > +			assert(txq->mp2mr[i].mr->lkey == txq-
> >mp2mr[i].lkey);
> 
> assert() requires assert.h (but don't include it, see subsequent suggestion).
> 
> > +			return txq->mp2mr[i].lkey;
> > +		}
> > +	}
> > +	return mlx4_txq_add_mr(txq, mp, i);
> > +}
> >  #endif /* MLX4_RXTX_H_ */
> 
> So as described above, these functions do not need the __rte_always_inline,
> please remove it. They also do not need to be located in a header file; the
> reason it's the case for their mlx5 counterparts is that they have to be shared
> between vectorized/non-vectorized code. No such requirement here, you
> should move them back to their original spot.
> 

Static function mlx4_txq_mp2mr() must be in a header file because it is shared by 2 files: mlx4_txq.c and mlx4_rxtx.c.
It is not related to vectorized/non-vectorized code in mlx5.
Having said that, __rte_always_inline is required as well; otherwise compilation fails with
drivers/net/mlx4/mlx4_rxtx.h:200:1: error: 'mlx4_txq_mp2mr' defined but not used [-Werror=unused-function]
for files which include mlx4_rxtx.h

> My suggestion for this performance improvement is to move
> mlx4_txq_add_mr() to a different file, mlx4_mr.c looks like a good
> candidate. This fact will ensure it's never inlined and far away from the data
> path.
> 

Function mlx4_txq_add_mr() is relatively small. 
What do you say about preceding it with __attribute((noinline)) instead of creating a new file?

> --
> Adrien Mazarguil
> 6WIND
  
Adrien Mazarguil Oct. 26, 2017, 7:48 a.m. UTC | #3
Hi Ophir,

Please see below.

On Wed, Oct 25, 2017 at 09:42:46PM +0000, Ophir Munk wrote:
> Hi Adrien,
> 
> On Wednesday, October 25, 2017 7:50 PM, Adrien Mazarguil wrote:
> > 
> > Hi Ophir,
> > 
> > On Mon, Oct 23, 2017 at 02:21:55PM +0000, Ophir Munk wrote:
> > > Change functions to inline on Tx fast path to improve performance
> > >
> > > Inside the inline function call other functions to handle "unlikely"
> > > cases such that the inline function code footprint is small.
> > >
> > > Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
> > 
> > Reading this, it's like adding __rte_always_inline improves performance at
> > all, which I doubt unless you can show proof through performance results.
> > 
> > When in doubt, leave it to the compiler, the static keyword is usually enough
> > of a hint. Too much forced inlining may actually be harmful.
> > 
> > What this patch really does is splitting the heavy lookup/registration function
> > in two halves with one small static inline function for the lookup part that
> > calls the separate registration part in the unlikely event MR is not already
> > registered.
> > 
> > Thankfully the compiler doesn't inline the large registration function back,
> > which results in the perceived performance improvement for the time being,
> > however there is no guarantee it won't happen in the future (you didn't use
> > the noinline keyword on the registration function for that).
> > 
> > Therefore I have a bunch of comments and suggestions, see below.
> > 
> > > ---
> > >  drivers/net/mlx4/mlx4_rxtx.c | 43
> > > ++++++------------------------------
> > >  drivers/net/mlx4/mlx4_rxtx.h | 52
> > > +++++++++++++++++++++++++++++++++++++++++++-
> > >  2 files changed, 58 insertions(+), 37 deletions(-)
> > >
> > > diff --git a/drivers/net/mlx4/mlx4_rxtx.c
> > > b/drivers/net/mlx4/mlx4_rxtx.c index 011ea79..ae37f9b 100644
> > > --- a/drivers/net/mlx4/mlx4_rxtx.c
> > > +++ b/drivers/net/mlx4/mlx4_rxtx.c
> > > @@ -220,54 +220,25 @@ mlx4_txq_complete(struct txq *txq)
> > >  	return 0;
> > >  }
> > >
> > > -/**
> > > - * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from
> > > which
> > > - * the cloned mbuf is allocated is returned instead.
> > > - *
> > > - * @param buf
> > > - *   Pointer to mbuf.
> > > - *
> > > - * @return
> > > - *   Memory pool where data is located for given mbuf.
> > > - */
> > > -static struct rte_mempool *
> > > -mlx4_txq_mb2mp(struct rte_mbuf *buf)
> > > -{
> > > -	if (unlikely(RTE_MBUF_INDIRECT(buf)))
> > > -		return rte_mbuf_from_indirect(buf)->pool;
> > > -	return buf->pool;
> > > -}
> > >
> > >  /**
> > > - * Get memory region (MR) <-> memory pool (MP) association from txq-
> > >mp2mr[].
> > > - * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is
> > > full,
> > > - * remove an entry first.
> > > + * Add memory region (MR) <-> memory pool (MP) association to txq-
> > >mp2mr[].
> > > + * If mp2mr[] is full, remove an entry first.
> > >   *
> > >   * @param txq
> > >   *   Pointer to Tx queue structure.
> > >   * @param[in] mp
> > > - *   Memory pool for which a memory region lkey must be returned.
> > > + *   Memory pool for which a memory region lkey must be added
> > > + * @param[in] i
> > > + *   Index in memory pool (MP) where to add memory region (MR)
> > >   *
> > >   * @return
> > > - *   mr->lkey on success, (uint32_t)-1 on failure.
> > > + *   Added mr->lkey on success, (uint32_t)-1 on failure.
> > >   */
> > > -uint32_t
> > > -mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
> > > +uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
> > > +uint32_t i)
> > >  {
> > > -	unsigned int i;
> > >  	struct ibv_mr *mr;
> > >
> > > -	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
> > > -		if (unlikely(txq->mp2mr[i].mp == NULL)) {
> > > -			/* Unknown MP, add a new MR for it. */
> > > -			break;
> > > -		}
> > > -		if (txq->mp2mr[i].mp == mp) {
> > > -			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
> > > -			assert(txq->mp2mr[i].mr->lkey == txq-
> > >mp2mr[i].lkey);
> > > -			return txq->mp2mr[i].lkey;
> > > -		}
> > > -	}
> > >  	/* Add a new entry, register MR first. */
> > >  	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
> > >  	      (void *)txq, mp->name, (void *)mp); diff --git
> > > a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h index
> > > e10bbca..719ef45 100644
> > > --- a/drivers/net/mlx4/mlx4_rxtx.h
> > > +++ b/drivers/net/mlx4/mlx4_rxtx.h
> > > @@ -53,6 +53,7 @@
> > >
> > >  #include "mlx4.h"
> > >  #include "mlx4_prm.h"
> > > +#include "mlx4_utils.h"
> > 
> > Why?
> > 
> > >
> > >  /** Rx queue counters. */
> > >  struct mlx4_rxq_stats {
> > > @@ -160,7 +161,6 @@ void mlx4_rx_queue_release(void *dpdk_rxq);
> > >
> > >  /* mlx4_rxtx.c */
> > >
> > > -uint32_t mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp);
> > > uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
> > >  		       uint16_t pkts_n);
> > >  uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, @@
> > > -169,6 +169,8 @@ uint16_t mlx4_tx_burst_removed(void *dpdk_txq,
> > struct rte_mbuf **pkts,
> > >  			       uint16_t pkts_n);
> > >  uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
> > >  			       uint16_t pkts_n);
> > > +uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
> > > +				unsigned int i);
> > >
> > >  /* mlx4_txq.c */
> > >
> > > @@ -177,4 +179,52 @@ int mlx4_tx_queue_setup(struct rte_eth_dev
> > *dev, uint16_t idx,
> > >  			const struct rte_eth_txconf *conf);  void
> > > mlx4_tx_queue_release(void *dpdk_txq);
> > >
> > > +/**
> > > + * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from
> > > +which
> > > + * the cloned mbuf is allocated is returned instead.
> > > + *
> > > + * @param buf
> > > + *   Pointer to mbuf.
> > > + *
> > > + * @return
> > > + *   Memory pool where data is located for given mbuf.
> > > + */
> > > +static __rte_always_inline struct rte_mempool * mlx4_txq_mb2mp(struct
> > > +rte_mbuf *buf) {
> > > +	if (unlikely(RTE_MBUF_INDIRECT(buf)))
> > > +		return rte_mbuf_from_indirect(buf)->pool;
> > > +	return buf->pool;
> > > +}
> > > +
> > > +/**
> > > + * Get memory region (MR) <-> memory pool (MP) association from txq-
> > >mp2mr[].
> > > + * Call mlx4_txq_add_mr() if MP is not registered yet.
> > > + *
> > > + * @param txq
> > > + *   Pointer to Tx queue structure.
> > > + * @param[in] mp
> > > + *   Memory pool for which a memory region lkey must be returned.
> > > + *
> > > + * @return
> > > + *   mr->lkey on success, (uint32_t)-1 on failure.
> > > + */
> > > +static __rte_always_inline uint32_t
> > 
> > Note __rte_always_inline is defined in rte_common.h and should be
> > explicitly included (however don't do that, see below).
> > 
> > > +mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp) {
> > > +	unsigned int i;
> > > +
> > > +	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
> > > +		if (unlikely(txq->mp2mr[i].mp == NULL)) {
> > > +			/* Unknown MP, add a new MR for it. */
> > > +			break;
> > > +		}
> > > +		if (txq->mp2mr[i].mp == mp) {
> > > +			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
> > > +			assert(txq->mp2mr[i].mr->lkey == txq-
> > >mp2mr[i].lkey);
> > 
> > assert() requires assert.h (but don't include it, see subsequent suggestion).
> > 
> > > +			return txq->mp2mr[i].lkey;
> > > +		}
> > > +	}
> > > +	return mlx4_txq_add_mr(txq, mp, i);
> > > +}
> > >  #endif /* MLX4_RXTX_H_ */
> > 
> > So as described above, these functions do not need the __rte_always_inline,
> > please remove it. They also do not need to be located in a header file; the
> > reason it's the case for their mlx5 counterparts is that they have to be shared
> > between vectorized/non-vectorized code. No such requirement here, you
> > should move them back to their original spot.
> > 
> 
> Static function mlx4_txq_mp2mr() must be in a header file because it is shared by 2 files: mlx4_txq.c and mlx4_rxtx.c.
> It is not related to vectorized/non-vectorized code in mlx5.
> Having said that -__rte_always_inline is required as well otherwise compilation fails with 
> drivers/net/mlx4/mlx4_rxtx.h:200:1: error: 'mlx4_txq_mp2mr' defined but not used [-Werror=unused-function]
> for files which include mlx4_rxtx.h

All right, then what you were looking for was static inline, not *force*
inline. The former is a hint, the latter doesn't leave much of a choice to
the compiler, it means you're sure this way brings the most performance,
however for this patch I really think inlining plays a really minor part
(if it even changes anything at all) compared to dividing this function,
which is the real performance improvement.

> > My suggestion for this performance improvement is to move
> > mlx4_txq_add_mr() to a different file, mlx4_mr.c looks like a good
> > candidate. This fact will ensure it's never inlined and far away from the data
> > path.
> > 
> 
> Function mlx4_txq_add_mr() is relatively small. 
> What do you say about preceding it with __attribute((noinline)) instead of creating a new file?

What I mean is you should define mlx4_txq_add_mr(), which does the heavy
lifting, inside mlx4_mr.c and provide its declaration in mlx4.h instead of
mlx4_rxtx.h.

Then, mlx4_txq_mp2mr() can remain defined in mlx4_rxtx.c in its original
spot as a non-static function with its public declaration remaining in
mlx4_rxtx.h for users outside of this file.

The fact mlx4_txq_mp2mr() remains defined in that file *before*
mlx4_post_send()'s definition where it's needed allows the compiler to
optimize it away as if it was static inline thanks to -O3, that is, unless
it thinks doing so would hurt performance, but as a (now) small function
this shouldn't be an issue.

Other reasons include that doing so would make a smaller diff that focuses
on the performance improvement itself. The extra performance brought by a
statically inlined version of mlx4_txq_mp2mr() is not needed in mlx4_txq.c,
whose only purpose is to set up queues.
  
Ophir Munk Oct. 26, 2017, 2:27 p.m. UTC | #4
Hi,
Please see inside

On Thursday, October 26, 2017 10:49 AM Adrien Mazarguil wrote:
> To: Ophir Munk <ophirmu@mellanox.com>
> Cc: dev@dpdk.org; Thomas Monjalon <thomas@monjalon.net>; Olga Shern
> <olgas@mellanox.com>; Matan Azrad <matan@mellanox.com>
> Subject: Re: [PATCH v2 2/7] net/mlx4: inline more Tx functions
> 
> Hi Ophir,
> 
> Please see below.
> 
> On Wed, Oct 25, 2017 at 09:42:46PM +0000, Ophir Munk wrote:
> > Hi Adrien,
> >
> > On Wednesday, October 25, 2017 7:50 PM, Adrien Mazarguil wrote:
> > >
> > > Hi Ophir,
> > >
> > > On Mon, Oct 23, 2017 at 02:21:55PM +0000, Ophir Munk wrote:
> > > > Change functions to inline on Tx fast path to improve performance
> > > >
> > > > Inside the inline function call other functions to handle "unlikely"
> > > > cases such that the inline function code footprint is small.
> > > >
> > > > Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
> > >
> > > Reading this, it's like adding __rte_always_inline improves
> > > performance at all, which I doubt unless you can show proof through
> performance results.
> > >
> > > When in doubt, leave it to the compiler, the static keyword is
> > > usually enough of a hint. Too much forced inlining may actually be
> harmful.
> > >
> > > What this patch really does is splitting the heavy
> > > lookup/registration function in two halves with one small static
> > > inline function for the lookup part that calls the separate
> > > registration part in the unlikely event MR is not already registered.
> > >
> > > Thankfully the compiler doesn't inline the large registration
> > > function back, which results in the perceived performance
> > > improvement for the time being, however there is no guarantee it
> > > won't happen in the future (you didn't use the noinline keyword on the
> registration function for that).
> > >
> > > Therefore I have a bunch of comments and suggestions, see below.
> > >
> > > > ---
> > > >  drivers/net/mlx4/mlx4_rxtx.c | 43
> > > > ++++++------------------------------
> > > >  drivers/net/mlx4/mlx4_rxtx.h | 52
> > > > +++++++++++++++++++++++++++++++++++++++++++-
> > > >  2 files changed, 58 insertions(+), 37 deletions(-)
> > > >
> > > > diff --git a/drivers/net/mlx4/mlx4_rxtx.c
> > > > b/drivers/net/mlx4/mlx4_rxtx.c index 011ea79..ae37f9b 100644
> > > > --- a/drivers/net/mlx4/mlx4_rxtx.c
> > > > +++ b/drivers/net/mlx4/mlx4_rxtx.c
> > > > @@ -220,54 +220,25 @@ mlx4_txq_complete(struct txq *txq)
> > > >  	return 0;
> > > >  }
> > > >
> > > > -/**
> > > > - * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool
> > > > from which
> > > > - * the cloned mbuf is allocated is returned instead.
> > > > - *
> > > > - * @param buf
> > > > - *   Pointer to mbuf.
> > > > - *
> > > > - * @return
> > > > - *   Memory pool where data is located for given mbuf.
> > > > - */
> > > > -static struct rte_mempool *
> > > > -mlx4_txq_mb2mp(struct rte_mbuf *buf) -{
> > > > -	if (unlikely(RTE_MBUF_INDIRECT(buf)))
> > > > -		return rte_mbuf_from_indirect(buf)->pool;
> > > > -	return buf->pool;
> > > > -}
> > > >
> > > >  /**
> > > > - * Get memory region (MR) <-> memory pool (MP) association from
> > > >txq- mp2mr[].
> > > > - * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[]
> > > >is  full,
> > > > - * remove an entry first.
> > > > + * Add memory region (MR) <-> memory pool (MP) association to
> > > > + txq-
> > > >mp2mr[].
> > > > + * If mp2mr[] is full, remove an entry first.
> > > >   *
> > > >   * @param txq
> > > >   *   Pointer to Tx queue structure.
> > > >   * @param[in] mp
> > > > - *   Memory pool for which a memory region lkey must be returned.
> > > > + *   Memory pool for which a memory region lkey must be added
> > > > + * @param[in] i
> > > > + *   Index in memory pool (MP) where to add memory region (MR)
> > > >   *
> > > >   * @return
> > > > - *   mr->lkey on success, (uint32_t)-1 on failure.
> > > > + *   Added mr->lkey on success, (uint32_t)-1 on failure.
> > > >   */
> > > > -uint32_t
> > > > -mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
> > > > +uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
> > > > +uint32_t i)
> > > >  {
> > > > -	unsigned int i;
> > > >  	struct ibv_mr *mr;
> > > >
> > > > -	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
> > > > -		if (unlikely(txq->mp2mr[i].mp == NULL)) {
> > > > -			/* Unknown MP, add a new MR for it. */
> > > > -			break;
> > > > -		}
> > > > -		if (txq->mp2mr[i].mp == mp) {
> > > > -			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
> > > > -			assert(txq->mp2mr[i].mr->lkey == txq-
> > > >mp2mr[i].lkey);
> > > > -			return txq->mp2mr[i].lkey;
> > > > -		}
> > > > -	}
> > > >  	/* Add a new entry, register MR first. */
> > > >  	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
> > > >  	      (void *)txq, mp->name, (void *)mp); diff --git
> > > >a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h index
> > > > e10bbca..719ef45 100644
> > > > --- a/drivers/net/mlx4/mlx4_rxtx.h
> > > > +++ b/drivers/net/mlx4/mlx4_rxtx.h
> > > > @@ -53,6 +53,7 @@
> > > >
> > > >  #include "mlx4.h"
> > > >  #include "mlx4_prm.h"
> > > > +#include "mlx4_utils.h"
> > >
> > > Why?
> > >
> > > >
> > > >  /** Rx queue counters. */
> > > >  struct mlx4_rxq_stats {
> > > > @@ -160,7 +161,6 @@ void mlx4_rx_queue_release(void *dpdk_rxq);
> > > >
> > > >  /* mlx4_rxtx.c */
> > > >
> > > > -uint32_t mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp);
> > > > uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
> > > >  		       uint16_t pkts_n);
> > > >  uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, @@
> > > > -169,6 +169,8 @@ uint16_t mlx4_tx_burst_removed(void *dpdk_txq,
> > > struct rte_mbuf **pkts,
> > > >  			       uint16_t pkts_n);
> > > >  uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf
> **pkts,
> > > >  			       uint16_t pkts_n);
> > > > +uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
> > > > +				unsigned int i);
> > > >
> > > >  /* mlx4_txq.c */
> > > >
> > > > @@ -177,4 +179,52 @@ int mlx4_tx_queue_setup(struct rte_eth_dev
> > > *dev, uint16_t idx,
> > > >  			const struct rte_eth_txconf *conf);  void
> > > > mlx4_tx_queue_release(void *dpdk_txq);
> > > >
> > > > +/**
> > > > + * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool
> > > > +from which
> > > > + * the cloned mbuf is allocated is returned instead.
> > > > + *
> > > > + * @param buf
> > > > + *   Pointer to mbuf.
> > > > + *
> > > > + * @return
> > > > + *   Memory pool where data is located for given mbuf.
> > > > + */
> > > > +static __rte_always_inline struct rte_mempool *
> > > > +mlx4_txq_mb2mp(struct rte_mbuf *buf) {
> > > > +	if (unlikely(RTE_MBUF_INDIRECT(buf)))
> > > > +		return rte_mbuf_from_indirect(buf)->pool;
> > > > +	return buf->pool;
> > > > +}
> > > > +
> > > > +/**
> > > > + * Get memory region (MR) <-> memory pool (MP) association from
> > > > +txq-
> > > >mp2mr[].
> > > > + * Call mlx4_txq_add_mr() if MP is not registered yet.
> > > > + *
> > > > + * @param txq
> > > > + *   Pointer to Tx queue structure.
> > > > + * @param[in] mp
> > > > + *   Memory pool for which a memory region lkey must be returned.
> > > > + *
> > > > + * @return
> > > > + *   mr->lkey on success, (uint32_t)-1 on failure.
> > > > + */
> > > > +static __rte_always_inline uint32_t
> > >
> > > Note __rte_always_inline is defined in rte_common.h and should be
> > > explicitly included (however don't do that, see below).
> > >
> > > > +mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp) {
> > > > +	unsigned int i;
> > > > +
> > > > +	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
> > > > +		if (unlikely(txq->mp2mr[i].mp == NULL)) {
> > > > +			/* Unknown MP, add a new MR for it. */
> > > > +			break;
> > > > +		}
> > > > +		if (txq->mp2mr[i].mp == mp) {
> > > > +			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
> > > > +			assert(txq->mp2mr[i].mr->lkey == txq-
> > > >mp2mr[i].lkey);
> > >
> > > assert() requires assert.h (but don't include it, see subsequent
> suggestion).
> > >
> > > > +			return txq->mp2mr[i].lkey;
> > > > +		}
> > > > +	}
> > > > +	return mlx4_txq_add_mr(txq, mp, i); }
> > > >  #endif /* MLX4_RXTX_H_ */
> > >
> > > So as described above, these functions do not need the
> > > __rte_always_inline, please remove it. They also do not need to be
> > > located in a header file; the reason it's the case for their mlx5
> > > counterparts is that they have to be shared between
> > > vectorized/non-vectorized code. No such requirement here, you should
> move them back to their original spot.
> > >
> >
> > Static function mlx4_txq_mp2mr() must be in a header file because it is
> shared by 2 files: mlx4_txq.c and mlx4_rxtx.c.
> > It is not related to vectorized/non-vectorized code in mlx5.
> > Having said that -__rte_always_inline is required as well otherwise
> > compilation fails with
> > drivers/net/mlx4/mlx4_rxtx.h:200:1: error: 'mlx4_txq_mp2mr' defined
> > but not used [-Werror=unused-function] for files which include
> > mlx4_rxtx.h
> 
> All right, then what you were looking for was static inline, not *force* inline.
> The former is a hint, the latter doesn't leave much of a choice to the
> compiler, it means you're sure this way brings the most performance,
> however for this patch I really think inlining plays a really minor part (if it
> changes anything at all) compared to dividing this function, which is the real
> performance improvement.

Without inline I get ~0.2Mpps degradation on my setup, therefore I suggest keeping inline.
The final call is yours. Please let me know whether to keep inline or remove it.

> 
> > > My suggestion for this performance improvement is to move
> > > mlx4_txq_add_mr() to a different file, mlx4_mr.c looks like a good
> > > candidate. This fact will ensure it's never inlined and far away
> > > from the data path.
> > >
> >
> > Function mlx4_txq_add_mr() is relatively small.
> > What do you say about preceding it with __attribute((noinline)) instead of
> creating a new file?
> 
> What I mean is you should define mlx4_txq_add_mr() which does the heavy
> lifting inside mlx4_mr.c and provide its declaration in mlx4.h instead of
> mlx4_rxtx.h.
> 
> Then, mlx4_txq_mp2mr() can remain defined in mlx4_rxtx.c in its original
> spot as a non-static function with its public declaration remaining in
> mlx4_rxtx.h for users outside of this file.
> 

That is how it was done before: mlx4_txq_mp2mr() is defined in mlx4_rxtx.c as described.

> The fact mlx4_txq_mp2mr() remains defined in that file *before*
> mlx4_post_send()'s definition where it's needed allows the compiler to
> optimize it away as if it was static inline thanks to -O3, that is, unless it thinks
> doing so would hurt performance, but as a (now) small function this
> shouldn't be an issue.
> 
> Other reasons includes that doing so would make a smaller diff that focuses
> on the performance improvement itself. The extra performance brought by a
> statically inlined version of mlx4_txq_mp2mr() is not needed in mlx4_txq.c,
> whose only purpose is to set up queues.

I have moved mlx4_txq_add_mr() to mlx4_mr.c. One of the disadvantages of moving it is that it now
requires adding #include "mlx4_rxtx.h" in the C file in order to know the details of the txq struct.
I liked mlx4_mr.c for being encapsulated from any Tx/Rx specific structures and just handling MR issues.

> 
> --
> Adrien Mazarguil
> 6WIND
  
Ophir Munk Oct. 29, 2017, 7:30 p.m. UTC | #5
On Thursday, October 26, 2017 5:28 PM, Ophir Munk wrote:

> 
> Hi,
> Please see inside
> 
> On Thursday, October 26, 2017 10:49 AM Adrien Mazarguil wrote:
> > To: Ophir Munk <ophirmu@mellanox.com>
> > Cc: dev@dpdk.org; Thomas Monjalon <thomas@monjalon.net>; Olga
> Shern
> > <olgas@mellanox.com>; Matan Azrad <matan@mellanox.com>
> > Subject: Re: [PATCH v2 2/7] net/mlx4: inline more Tx functions
> >
> > Hi Ophir,
> >
> > Please see below.
> >
> > On Wed, Oct 25, 2017 at 09:42:46PM +0000, Ophir Munk wrote:
> > > Hi Adrien,
> > >
> > > On Wednesday, October 25, 2017 7:50 PM, Adrien Mazarguil wrote:
> > > >
> > > > Hi Ophir,
> > > >
> > > > On Mon, Oct 23, 2017 at 02:21:55PM +0000, Ophir Munk wrote:
> > > > > Change functions to inline on Tx fast path to improve
> > > > > performance
> > > > >
> > > > > Inside the inline function call other functions to handle "unlikely"
> > > > > cases such that the inline function code footprint is small.
> > > > >
> > > > > Signed-off-by: Ophir Munk <ophirmu@mellanox.com>
> > > >
> > > > Reading this, it's like adding __rte_always_inline improves
> > > > performance at all, which I doubt unless you can show proof
> > > > through
> > performance results.
> > > >
> > > > When in doubt, leave it to the compiler, the static keyword is
> > > > usually enough of a hint. Too much forced inlining may actually be
> > harmful.
> > > >
> > > > What this patch really does is splitting the heavy
> > > > lookup/registration function in two halves with one small static
> > > > inline function for the lookup part that calls the separate
> > > > registration part in the unlikely event MR is not already registered.
> > > >
> > > > Thankfully the compiler doesn't inline the large registration
> > > > function back, which results in the perceived performance
> > > > improvement for the time being, however there is no guarantee it
> > > > won't happen in the future (you didn't use the noinline keyword on
> > > > the
> > registration function for that).
> > > >
> > > > Therefore I have a bunch of comments and suggestions, see below.
> > > >
> > > > > ---
> > > > >  drivers/net/mlx4/mlx4_rxtx.c | 43
> > > > > ++++++------------------------------
> > > > >  drivers/net/mlx4/mlx4_rxtx.h | 52
> > > > > +++++++++++++++++++++++++++++++++++++++++++-
> > > > >  2 files changed, 58 insertions(+), 37 deletions(-)
> > > > >
> > > > > diff --git a/drivers/net/mlx4/mlx4_rxtx.c
> > > > > b/drivers/net/mlx4/mlx4_rxtx.c index 011ea79..ae37f9b 100644
> > > > > --- a/drivers/net/mlx4/mlx4_rxtx.c
> > > > > +++ b/drivers/net/mlx4/mlx4_rxtx.c
> > > > > @@ -220,54 +220,25 @@ mlx4_txq_complete(struct txq *txq)
> > > > >  	return 0;
> > > > >  }
> > > > >
> > > > > -/**
> > > > > - * Get memory pool (MP) from mbuf. If mbuf is indirect, the
> > > > > pool from which
> > > > > - * the cloned mbuf is allocated is returned instead.
> > > > > - *
> > > > > - * @param buf
> > > > > - *   Pointer to mbuf.
> > > > > - *
> > > > > - * @return
> > > > > - *   Memory pool where data is located for given mbuf.
> > > > > - */
> > > > > -static struct rte_mempool *
> > > > > -mlx4_txq_mb2mp(struct rte_mbuf *buf) -{
> > > > > -	if (unlikely(RTE_MBUF_INDIRECT(buf)))
> > > > > -		return rte_mbuf_from_indirect(buf)->pool;
> > > > > -	return buf->pool;
> > > > > -}
> > > > >
> > > > >  /**
> > > > > - * Get memory region (MR) <-> memory pool (MP) association from
> > > > >txq- mp2mr[].
> > > > > - * Add MP to txq->mp2mr[] if it's not registered yet. If
> > > > >mp2mr[] is  full,
> > > > > - * remove an entry first.
> > > > > + * Add memory region (MR) <-> memory pool (MP) association to
> > > > > + txq-
> > > > >mp2mr[].
> > > > > + * If mp2mr[] is full, remove an entry first.
> > > > >   *
> > > > >   * @param txq
> > > > >   *   Pointer to Tx queue structure.
> > > > >   * @param[in] mp
> > > > > - *   Memory pool for which a memory region lkey must be returned.
> > > > > + *   Memory pool for which a memory region lkey must be added
> > > > > + * @param[in] i
> > > > > + *   Index in memory pool (MP) where to add memory region (MR)
> > > > >   *
> > > > >   * @return
> > > > > - *   mr->lkey on success, (uint32_t)-1 on failure.
> > > > > + *   Added mr->lkey on success, (uint32_t)-1 on failure.
> > > > >   */
> > > > > -uint32_t
> > > > > -mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
> > > > > +uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool
> > > > > +*mp, uint32_t i)
> > > > >  {
> > > > > -	unsigned int i;
> > > > >  	struct ibv_mr *mr;
> > > > >
> > > > > -	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
> > > > > -		if (unlikely(txq->mp2mr[i].mp == NULL)) {
> > > > > -			/* Unknown MP, add a new MR for it. */
> > > > > -			break;
> > > > > -		}
> > > > > -		if (txq->mp2mr[i].mp == mp) {
> > > > > -			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
> > > > > -			assert(txq->mp2mr[i].mr->lkey == txq-
> > > > >mp2mr[i].lkey);
> > > > > -			return txq->mp2mr[i].lkey;
> > > > > -		}
> > > > > -	}
> > > > >  	/* Add a new entry, register MR first. */
> > > > >  	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
> > > > >  	      (void *)txq, mp->name, (void *)mp); diff --git
> > > > >a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
> > > > >index
> > > > > e10bbca..719ef45 100644
> > > > > --- a/drivers/net/mlx4/mlx4_rxtx.h
> > > > > +++ b/drivers/net/mlx4/mlx4_rxtx.h
> > > > > @@ -53,6 +53,7 @@
> > > > >
> > > > >  #include "mlx4.h"
> > > > >  #include "mlx4_prm.h"
> > > > > +#include "mlx4_utils.h"
> > > >
> > > > Why?
> > > >
> > > > >
> > > > >  /** Rx queue counters. */
> > > > >  struct mlx4_rxq_stats {
> > > > > @@ -160,7 +161,6 @@ void mlx4_rx_queue_release(void
> *dpdk_rxq);
> > > > >
> > > > >  /* mlx4_rxtx.c */
> > > > >
> > > > > -uint32_t mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool
> > > > > *mp); uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
> > > > >  		       uint16_t pkts_n);
> > > > >  uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts,
> > > > > @@
> > > > > -169,6 +169,8 @@ uint16_t mlx4_tx_burst_removed(void *dpdk_txq,
> > > > struct rte_mbuf **pkts,
> > > > >  			       uint16_t pkts_n);
> > > > >  uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf
> > **pkts,
> > > > >  			       uint16_t pkts_n);
> > > > > +uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool
> *mp,
> > > > > +				unsigned int i);
> > > > >
> > > > >  /* mlx4_txq.c */
> > > > >
> > > > > @@ -177,4 +179,52 @@ int mlx4_tx_queue_setup(struct rte_eth_dev
> > > > *dev, uint16_t idx,
> > > > >  			const struct rte_eth_txconf *conf);  void
> > > > > mlx4_tx_queue_release(void *dpdk_txq);
> > > > >
> > > > > +/**
> > > > > + * Get memory pool (MP) from mbuf. If mbuf is indirect, the
> > > > > +pool from which
> > > > > + * the cloned mbuf is allocated is returned instead.
> > > > > + *
> > > > > + * @param buf
> > > > > + *   Pointer to mbuf.
> > > > > + *
> > > > > + * @return
> > > > > + *   Memory pool where data is located for given mbuf.
> > > > > + */
> > > > > +static __rte_always_inline struct rte_mempool *
> > > > > +mlx4_txq_mb2mp(struct rte_mbuf *buf) {
> > > > > +	if (unlikely(RTE_MBUF_INDIRECT(buf)))
> > > > > +		return rte_mbuf_from_indirect(buf)->pool;
> > > > > +	return buf->pool;
> > > > > +}
> > > > > +
> > > > > +/**
> > > > > + * Get memory region (MR) <-> memory pool (MP) association from
> > > > > +txq-
> > > > >mp2mr[].
> > > > > + * Call mlx4_txq_add_mr() if MP is not registered yet.
> > > > > + *
> > > > > + * @param txq
> > > > > + *   Pointer to Tx queue structure.
> > > > > + * @param[in] mp
> > > > > + *   Memory pool for which a memory region lkey must be returned.
> > > > > + *
> > > > > + * @return
> > > > > + *   mr->lkey on success, (uint32_t)-1 on failure.
> > > > > + */
> > > > > +static __rte_always_inline uint32_t
> > > >
> > > > Note __rte_always_inline is defined in rte_common.h and should be
> > > > explicitly included (however don't do that, see below).
> > > >
> > > > > +mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp) {
> > > > > +	unsigned int i;
> > > > > +
> > > > > +	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
> > > > > +		if (unlikely(txq->mp2mr[i].mp == NULL)) {
> > > > > +			/* Unknown MP, add a new MR for it. */
> > > > > +			break;
> > > > > +		}
> > > > > +		if (txq->mp2mr[i].mp == mp) {
> > > > > +			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
> > > > > +			assert(txq->mp2mr[i].mr->lkey == txq-
> > > > >mp2mr[i].lkey);
> > > >
> > > > assert() requires assert.h (but don't include it, see subsequent
> > suggestion).
> > > >
> > > > > +			return txq->mp2mr[i].lkey;
> > > > > +		}
> > > > > +	}
> > > > > +	return mlx4_txq_add_mr(txq, mp, i); }
> > > > >  #endif /* MLX4_RXTX_H_ */
> > > >
> > > > So as described above, these functions do not need the
> > > > __rte_always_inline, please remove it. They also do not need to be
> > > > located in a header file; the reason it's the case for their mlx5
> > > > counterparts is that they have to be shared between
> > > > vectorized/non-vectorized code. No such requirement here, you
> > > > should
> > move them back to their original spot.
> > > >
> > >
> > > Static function mlx4_txq_mp2mr() must be in a header file because it
> > > is
> > shared by 2 files: mlx4_txq.c and mlx4_rxtx.c.
> > > It is not related to vectorized/non-vectorized code in mlx5.
> > > Having said that -__rte_always_inline is required as well otherwise
> > > compilation fails with
> > > drivers/net/mlx4/mlx4_rxtx.h:200:1: error: 'mlx4_txq_mp2mr' defined
> > > but not used [-Werror=unused-function] for files which include
> > > mlx4_rxtx.h
> >
> > All right, then what you were looking for was static inline, not *force*
> inline.
> > The former is a hint, the latter doesn't leave much of a choice to the
> > compiler, it means you're sure this way brings the most performance,
> > however for this patch I really think inlining plays a really minor
> > part (even changes anything at all) compared to dividing this
> > function, which is the real performance improvement.
> 
> Without inline I get ~0.2Mpps degradation on my setup, therefore I suggest
> keeping inline.
> The final call is yours. Please let me know if to leave inline or remove.
> 

Until further notice, function mlx4_txq_mp2mr() is left static in the upcoming v3
of this series.

> >
> > > > My suggestion for this performance improvement is to move
> > > > mlx4_txq_add_mr() to a different file, mlx4_mr.c looks like a good
> > > > candidate. This fact will ensure it's never inlined and far away
> > > > from the data path.
> > > >
> > >
> > > Function mlx4_txq_add_mr() is relatively small.
> > > What do you say about preceding it with __attribute((noinline))
> > > instead of
> > creating a new file?
> >
> > What I mean is you should define mlx4_txq_add_mr() which does the
> > heavy lifting inside mlx4_mr.c and provide its declaration in mlx4.h
> > instead of mlx4_rxtx.h.
> >
> > Then, mlx4_txq_mp2mr() can remain defined in mlx4_rxtx.c in its
> > original spot as a non-static function with its public declaration
> > remaining in mlx4_rxtx.h for users outside of this file.
> >
> 
> That's done before. mlx4_txq_mp2mr() is defined in mlx4_rxtx.c as
> described.
> 
> > The fact mlx4_txq_mp2mr() remains defined in that file *before*
> > mlx4_post_send()'s definition where it's needed allows the compiler to
> > optimize it away as if it was static inline thanks to -O3, that is,
> > unless it thinks doing so would hurt performance, but as a (now) small
> > function this shouldn't be an issue.
> >
> > Other reasons includes that doing so would make a smaller diff that
> > focuses on the performance improvement itself. The extra performance
> > brought by a statically inlined version of mlx4_txq_mp2mr() is not
> > needed in mlx4_txq.c, whose only purpose is to set up queues.
> 
> I have moved mlx4_txq_add_mr() to mlx4_mr.c. One of the disadvantages of
> moving it is that now it requires adding #include "mlx4_rxtx.h" in the C file in
> order to know the details of txq struct.
> I liked mlx4_mr.c for being encapsulated from any Tx/Rx specific structures
> and just handling MR issues.
> 
> >
> > --
> > Adrien Mazarguil
> > 6WIND
  

Patch

diff --git a/drivers/net/mlx4/mlx4_rxtx.c b/drivers/net/mlx4/mlx4_rxtx.c
index 011ea79..ae37f9b 100644
--- a/drivers/net/mlx4/mlx4_rxtx.c
+++ b/drivers/net/mlx4/mlx4_rxtx.c
@@ -220,54 +220,25 @@  mlx4_txq_complete(struct txq *txq)
 	return 0;
 }
 
-/**
- * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from which
- * the cloned mbuf is allocated is returned instead.
- *
- * @param buf
- *   Pointer to mbuf.
- *
- * @return
- *   Memory pool where data is located for given mbuf.
- */
-static struct rte_mempool *
-mlx4_txq_mb2mp(struct rte_mbuf *buf)
-{
-	if (unlikely(RTE_MBUF_INDIRECT(buf)))
-		return rte_mbuf_from_indirect(buf)->pool;
-	return buf->pool;
-}
 
 /**
- * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
- * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
- * remove an entry first.
+ * Add memory region (MR) <-> memory pool (MP) association to txq->mp2mr[].
+ * If mp2mr[] is full, remove an entry first.
  *
  * @param txq
  *   Pointer to Tx queue structure.
  * @param[in] mp
- *   Memory pool for which a memory region lkey must be returned.
+ *   Memory pool for which a memory region lkey must be added
+ * @param[in] i
+ *   Index in memory pool (MP) where to add memory region (MR)
  *
  * @return
- *   mr->lkey on success, (uint32_t)-1 on failure.
+ *   Added mr->lkey on success, (uint32_t)-1 on failure.
  */
-uint32_t
-mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
+uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp, uint32_t i)
 {
-	unsigned int i;
 	struct ibv_mr *mr;
 
-	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
-		if (unlikely(txq->mp2mr[i].mp == NULL)) {
-			/* Unknown MP, add a new MR for it. */
-			break;
-		}
-		if (txq->mp2mr[i].mp == mp) {
-			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
-			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
-			return txq->mp2mr[i].lkey;
-		}
-	}
 	/* Add a new entry, register MR first. */
 	DEBUG("%p: discovered new memory pool \"%s\" (%p)",
 	      (void *)txq, mp->name, (void *)mp);
diff --git a/drivers/net/mlx4/mlx4_rxtx.h b/drivers/net/mlx4/mlx4_rxtx.h
index e10bbca..719ef45 100644
--- a/drivers/net/mlx4/mlx4_rxtx.h
+++ b/drivers/net/mlx4/mlx4_rxtx.h
@@ -53,6 +53,7 @@ 
 
 #include "mlx4.h"
 #include "mlx4_prm.h"
+#include "mlx4_utils.h"
 
 /** Rx queue counters. */
 struct mlx4_rxq_stats {
@@ -160,7 +161,6 @@  void mlx4_rx_queue_release(void *dpdk_rxq);
 
 /* mlx4_rxtx.c */
 
-uint32_t mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp);
 uint16_t mlx4_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
 		       uint16_t pkts_n);
 uint16_t mlx4_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts,
@@ -169,6 +169,8 @@  uint16_t mlx4_tx_burst_removed(void *dpdk_txq, struct rte_mbuf **pkts,
 			       uint16_t pkts_n);
 uint16_t mlx4_rx_burst_removed(void *dpdk_rxq, struct rte_mbuf **pkts,
 			       uint16_t pkts_n);
+uint32_t mlx4_txq_add_mr(struct txq *txq, struct rte_mempool *mp,
+				unsigned int i);
 
 /* mlx4_txq.c */
 
@@ -177,4 +179,52 @@  int mlx4_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx,
 			const struct rte_eth_txconf *conf);
 void mlx4_tx_queue_release(void *dpdk_txq);
 
+/**
+ * Get memory pool (MP) from mbuf. If mbuf is indirect, the pool from which
+ * the cloned mbuf is allocated is returned instead.
+ *
+ * @param buf
+ *   Pointer to mbuf.
+ *
+ * @return
+ *   Memory pool where data is located for given mbuf.
+ */
+static __rte_always_inline struct rte_mempool *
+mlx4_txq_mb2mp(struct rte_mbuf *buf)
+{
+	if (unlikely(RTE_MBUF_INDIRECT(buf)))
+		return rte_mbuf_from_indirect(buf)->pool;
+	return buf->pool;
+}
+
+/**
+ * Get memory region (MR) <-> memory pool (MP) association from txq->mp2mr[].
+ * Call mlx4_txq_add_mr() if MP is not registered yet.
+ *
+ * @param txq
+ *   Pointer to Tx queue structure.
+ * @param[in] mp
+ *   Memory pool for which a memory region lkey must be returned.
+ *
+ * @return
+ *   mr->lkey on success, (uint32_t)-1 on failure.
+ */
+static __rte_always_inline uint32_t
+mlx4_txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
+{
+	unsigned int i;
+
+	for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
+		if (unlikely(txq->mp2mr[i].mp == NULL)) {
+			/* Unknown MP, add a new MR for it. */
+			break;
+		}
+		if (txq->mp2mr[i].mp == mp) {
+			assert(txq->mp2mr[i].lkey != (uint32_t)-1);
+			assert(txq->mp2mr[i].mr->lkey == txq->mp2mr[i].lkey);
+			return txq->mp2mr[i].lkey;
+		}
+	}
+	return mlx4_txq_add_mr(txq, mp, i);
+}
 #endif /* MLX4_RXTX_H_ */