[dpdk-dev,v5,1/2] mbuf: support attaching external buffer to mbuf

Message ID 20180425025341.10590-1-yskoh@mellanox.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation fail apply patch file failure

Commit Message

Yongseok Koh April 25, 2018, 2:53 a.m. UTC
  This patch introduces a new way of attaching an external buffer to a mbuf.

Attaching an external buffer is quite similar to mbuf indirection in
replacing buffer addresses and length of a mbuf, but a few differences:
  - When an indirect mbuf is attached, refcnt of the direct mbuf would be
    2 as long as the direct mbuf itself isn't freed after the attachment.
    In such cases, the buffer area of a direct mbuf must be read-only. But
    external buffer has its own refcnt and it starts from 1. Unless
    multiple mbufs are attached to a mbuf having an external buffer, the
    external buffer is writable.
  - There's no need to allocate buffer from a mempool. Any buffer can be
    attached with appropriate free callback.
  - Smaller metadata is required to maintain shared data such as refcnt.

Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
---

** This patch can pass the mbuf_autotest. **

Submitting only non-mlx5 patches to meet deadline for RC1. mlx5 patches
will be submitted separately rebased on a differnet patchset which
accommodates new memory hotplug design to mlx PMDs.

v5:
* rte_pktmbuf_attach_extbuf() sets headroom to 0.
* if shinfo is provided when attaching, user should initialize it.
* minor changes from review.

v4:
* rte_pktmbuf_attach_extbuf() takes new arguments - buf_iova and shinfo.
  user can pass memory for shared data via shinfo argument.
* minor changes from review.

v3:
* implement external buffer attachment instead of introducing buf_off for
  mbuf indirection.

 lib/librte_mbuf/rte_mbuf.h | 303 ++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 274 insertions(+), 29 deletions(-)
  

Comments

Ananyev, Konstantin April 25, 2018, 1:31 p.m. UTC | #1
> 
> This patch introduces a new way of attaching an external buffer to a mbuf.
> 
> Attaching an external buffer is quite similar to mbuf indirection in
> replacing buffer addresses and length of a mbuf, but a few differences:
>   - When an indirect mbuf is attached, refcnt of the direct mbuf would be
>     2 as long as the direct mbuf itself isn't freed after the attachment.
>     In such cases, the buffer area of a direct mbuf must be read-only. But
>     external buffer has its own refcnt and it starts from 1. Unless
>     multiple mbufs are attached to a mbuf having an external buffer, the
>     external buffer is writable.
>   - There's no need to allocate buffer from a mempool. Any buffer can be
>     attached with appropriate free callback.
>   - Smaller metadata is required to maintain shared data such as refcnt.
> 
> Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
> ---
> 
> ** This patch can pass the mbuf_autotest. **
> 
> Submitting only non-mlx5 patches to meet deadline for RC1. mlx5 patches
> will be submitted separately rebased on a differnet patchset which
> accommodates new memory hotplug design to mlx PMDs.
> 
> v5:
> * rte_pktmbuf_attach_extbuf() sets headroom to 0.
> * if shinfo is provided when attaching, user should initialize it.
> * minor changes from review.
> 
> v4:
> * rte_pktmbuf_attach_extbuf() takes new arguments - buf_iova and shinfo.
>   user can pass memory for shared data via shinfo argument.
> * minor changes from review.
> 
> v3:
> * implement external buffer attachment instead of introducing buf_off for
>   mbuf indirection.
> 
>  lib/librte_mbuf/rte_mbuf.h | 303 ++++++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 274 insertions(+), 29 deletions(-)
> 
> diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> index 43aaa9c5f..e2c12874a 100644
> --- a/lib/librte_mbuf/rte_mbuf.h
> +++ b/lib/librte_mbuf/rte_mbuf.h
> @@ -344,7 +344,10 @@ extern "C" {
>  		PKT_TX_MACSEC |		 \
>  		PKT_TX_SEC_OFFLOAD)
> 
> -#define __RESERVED           (1ULL << 61) /**< reserved for future mbuf use */
> +/**
> + * Mbuf having an external buffer attached. shinfo in mbuf must be filled.
> + */
> +#define EXT_ATTACHED_MBUF    (1ULL << 61)
> 
>  #define IND_ATTACHED_MBUF    (1ULL << 62) /**< Indirect attached mbuf */
> 
> @@ -584,8 +587,27 @@ struct rte_mbuf {
>  	/** Sequence number. See also rte_reorder_insert(). */
>  	uint32_t seqn;
> 
> +	/** Shared data for external buffer attached to mbuf. See
> +	 * rte_pktmbuf_attach_extbuf().
> +	 */
> +	struct rte_mbuf_ext_shared_info *shinfo;
> +
>  } __rte_cache_aligned;
> 
> +/**
> + * Function typedef of callback to free externally attached buffer.
> + */
> +typedef void (*rte_mbuf_extbuf_free_callback_t)(void *addr, void *opaque);
> +
> +/**
> + * Shared data at the end of an external buffer.
> + */
> +struct rte_mbuf_ext_shared_info {
> +	rte_mbuf_extbuf_free_callback_t free_cb; /**< Free callback function */
> +	void *fcb_opaque;                        /**< Free callback argument */
> +	rte_atomic16_t refcnt_atomic;        /**< Atomically accessed refcnt */
> +};
> +
>  /**< Maximum number of nb_segs allowed. */
>  #define RTE_MBUF_MAX_NB_SEGS	UINT16_MAX
> 
> @@ -706,14 +728,33 @@ rte_mbuf_to_baddr(struct rte_mbuf *md)
>  }
> 
>  /**
> + * Returns TRUE if given mbuf is cloned by mbuf indirection, or FALSE
> + * otherwise.
> + *
> + * If a mbuf has its data in another mbuf and references it by mbuf
> + * indirection, this mbuf can be defined as a cloned mbuf.
> + */
> +#define RTE_MBUF_CLONED(mb)     ((mb)->ol_flags & IND_ATTACHED_MBUF)
> +
> +/**
>   * Returns TRUE if given mbuf is indirect, or FALSE otherwise.
>   */
> -#define RTE_MBUF_INDIRECT(mb)   ((mb)->ol_flags & IND_ATTACHED_MBUF)
> +#define RTE_MBUF_INDIRECT(mb)   RTE_MBUF_CLONED(mb)
> +
> +/**
> + * Returns TRUE if given mbuf has an external buffer, or FALSE otherwise.
> + *
> + * External buffer is a user-provided anonymous buffer.
> + */
> +#define RTE_MBUF_HAS_EXTBUF(mb) ((mb)->ol_flags & EXT_ATTACHED_MBUF)
> 
>  /**
>   * Returns TRUE if given mbuf is direct, or FALSE otherwise.
> + *
> + * If a mbuf embeds its own data after the rte_mbuf structure, this mbuf
> + * can be defined as a direct mbuf.
>   */
> -#define RTE_MBUF_DIRECT(mb)     (!RTE_MBUF_INDIRECT(mb))
> +#define RTE_MBUF_DIRECT(mb) (!(RTE_MBUF_CLONED(mb) || RTE_MBUF_HAS_EXTBUF(mb)))
> 
>  /**
>   * Private data in case of pktmbuf pool.
> @@ -839,6 +880,58 @@ rte_mbuf_refcnt_set(struct rte_mbuf *m, uint16_t new_value)
> 
>  #endif /* RTE_MBUF_REFCNT_ATOMIC */
> 
> +/**
> + * Reads the refcnt of an external buffer.
> + *
> + * @param shinfo
> + *   Shared data of the external buffer.
> + * @return
> + *   Reference count number.
> + */
> +static inline uint16_t
> +rte_mbuf_ext_refcnt_read(const struct rte_mbuf_ext_shared_info *shinfo)
> +{
> +	return (uint16_t)(rte_atomic16_read(&shinfo->refcnt_atomic));
> +}
> +
> +/**
> + * Set refcnt of an external buffer.
> + *
> + * @param shinfo
> + *   Shared data of the external buffer.
> + * @param new_value
> + *   Value set
> + */
> +static inline void
> +rte_mbuf_ext_refcnt_set(struct rte_mbuf_ext_shared_info *shinfo,
> +	uint16_t new_value)
> +{
> +	rte_atomic16_set(&shinfo->refcnt_atomic, new_value);
> +}
> +
> +/**
> + * Add given value to refcnt of an external buffer and return its new
> + * value.
> + *
> + * @param shinfo
> + *   Shared data of the external buffer.
> + * @param value
> + *   Value to add/subtract
> + * @return
> + *   Updated value
> + */
> +static inline uint16_t
> +rte_mbuf_ext_refcnt_update(struct rte_mbuf_ext_shared_info *shinfo,
> +	int16_t value)
> +{
> +	if (likely(rte_mbuf_ext_refcnt_read(shinfo) == 1)) {
> +		rte_mbuf_ext_refcnt_set(shinfo, 1 + value);
> +		return 1 + value;
> +	}
> +
> +	return (uint16_t)rte_atomic16_add_return(&shinfo->refcnt_atomic, value);
> +}
> +
>  /** Mbuf prefetch */
>  #define RTE_MBUF_PREFETCH_TO_FREE(m) do {       \
>  	if ((m) != NULL)                        \
> @@ -1213,11 +1306,127 @@ static inline int rte_pktmbuf_alloc_bulk(struct rte_mempool *pool,
>  }
> 
>  /**
> + * Attach an external buffer to a mbuf.
> + *
> + * User-managed anonymous buffer can be attached to an mbuf. When attaching
> + * it, corresponding free callback function and its argument should be
> + * provided. This callback function will be called once all the mbufs are
> + * detached from the buffer.
> + *
> + * The headroom for the attaching mbuf will be set to zero and this can be
> + * properly adjusted after attachment. For example, ``rte_pktmbuf_adj()``
> + * or ``rte_pktmbuf_reset_headroom()`` can be used.
> + *
> + * More mbufs can be attached to the same external buffer by
> + * ``rte_pktmbuf_attach()`` once the external buffer has been attached by
> + * this API.
> + *
> + * Detachment can be done by either ``rte_pktmbuf_detach_extbuf()`` or
> + * ``rte_pktmbuf_detach()``.
> + *
> + * Attaching an external buffer is quite similar to mbuf indirection in
> + * replacing buffer addresses and length of a mbuf, but a few differences:
> + * - When an indirect mbuf is attached, refcnt of the direct mbuf would be
> + *   2 as long as the direct mbuf itself isn't freed after the attachment.
> + *   In such cases, the buffer area of a direct mbuf must be read-only. But
> + *   external buffer has its own refcnt and it starts from 1. Unless
> + *   multiple mbufs are attached to a mbuf having an external buffer, the
> + *   external buffer is writable.
> + * - There's no need to allocate buffer from a mempool. Any buffer can be
> + *   attached with appropriate free callback and its IO address.
> + * - Smaller metadata is required to maintain shared data such as refcnt.
> + *
> + * @warning
> + * @b EXPERIMENTAL: This API may change without prior notice.
> + * Once external buffer is enabled by allowing experimental API,
> + * ``RTE_MBUF_DIRECT()`` and ``RTE_MBUF_INDIRECT()`` are no longer
> + * exclusive. A mbuf can be considered direct if it is neither indirect nor
> + * having external buffer.
> + *
> + * @param m
> + *   The pointer to the mbuf.
> + * @param buf_addr
> + *   The pointer to the external buffer we're attaching to.
> + * @param buf_iova
> + *   IO address of the external buffer we're attaching to.
> + * @param buf_len
> + *   The size of the external buffer we're attaching to. If memory for
> + *   shared data is not provided, buf_len must be larger than the size of
> + *   ``struct rte_mbuf_ext_shared_info`` and padding for alignment. If not
> + *   enough, this function will return NULL.
> + * @param shinfo
> + *   User-provided memory for shared data. If NULL, a few bytes in the
> + *   trailer of the provided buffer will be dedicated for shared data and
> + *   the shared data will be properly initialized. Otherwise, user must
> + *   initialize the content except for free callback and its argument. The
> + *   pointer of shared data will be stored in m->shinfo.
> + * @param free_cb
> + *   Free callback function to call when the external buffer needs to be
> + *   freed.
> + * @param fcb_opaque
> + *   Argument for the free callback function.
> + *
> + * @return
> + *   A pointer to the new start of the data on success, return NULL
> + *   otherwise.
> + */
> +static inline char * __rte_experimental
> +rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void *buf_addr,
> +	rte_iova_t buf_iova, uint16_t buf_len,
> +	struct rte_mbuf_ext_shared_info *shinfo,
> +	rte_mbuf_extbuf_free_callback_t free_cb, void *fcb_opaque)
> +{
> +	/* Additional attachment should be done by rte_pktmbuf_attach() */
> +	RTE_ASSERT(!RTE_MBUF_HAS_EXTBUF(m));

Shouldn't we have here something like:
RTE_ASSERT(RTE_MBUF_DIRECT(m) && rte_mbuf_refcnt_read(m) == 1);
?

> +
> +	m->buf_addr = buf_addr;
> +	m->buf_iova = buf_iova;
> +
> +	if (shinfo == NULL) {

Instead of allocating shinfo ourselves - wound's it be better to rely
on caller always allocating afeeling it for us (he can do that at the end/start of buffer,
or whenever he likes to.
Again in that case - caller can provide one shinfo to several mbufs (with different buf_addrs)
and would know for sure that free_cb wouldn't be overwritten by mistake.
I.E. mbuf code will only update refcnt inside shinfo.

> +		void *buf_end = RTE_PTR_ADD(buf_addr, buf_len);
> +
> +		shinfo = RTE_PTR_ALIGN_FLOOR(RTE_PTR_SUB(buf_end,
> +					sizeof(*shinfo)), sizeof(uintptr_t));
> +		if ((void *)shinfo <= buf_addr)
> +			return NULL;
> +
> +		m->buf_len = RTE_PTR_DIFF(shinfo, buf_addr);
> +		rte_mbuf_ext_refcnt_set(shinfo, 1);
> +	} else {
> +		m->buf_len = buf_len;

I think you need to update shinfo>refcnt here too.

> +	}
> +
> +	m->data_len = 0;
> +	m->data_off = 0;
> +
> +	m->ol_flags |= EXT_ATTACHED_MBUF;
> +	m->shinfo = shinfo;
> +
> +	shinfo->free_cb = free_cb;
> +	shinfo->fcb_opaque = fcb_opaque;
> +
> +	return (char *)m->buf_addr + m->data_off;
> +}
> +
> +/**
> + * Detach the external buffer attached to a mbuf, same as
> + * ``rte_pktmbuf_detach()``
> + *
> + * @param m
> + *   The mbuf having external buffer.
> + */
> +#define rte_pktmbuf_detach_extbuf(m) rte_pktmbuf_detach(m)
> +
> +/**
>   * Attach packet mbuf to another packet mbuf.
>   *
> - * After attachment we refer the mbuf we attached as 'indirect',
> - * while mbuf we attached to as 'direct'.
> - * The direct mbuf's reference counter is incremented.
> + * If the mbuf we are attaching to isn't a direct buffer and is attached to
> + * an external buffer, the mbuf being attached will be attached to the
> + * external buffer instead of mbuf indirection.
> + *
> + * Otherwise, the mbuf will be indirectly attached. After attachment we
> + * refer the mbuf we attached as 'indirect', while mbuf we attached to as
> + * 'direct'.  The direct mbuf's reference counter is incremented.
>   *
>   * Right now, not supported:
>   *  - attachment for already indirect mbuf (e.g. - mi has to be direct).
> @@ -1231,19 +1440,19 @@ static inline int rte_pktmbuf_alloc_bulk(struct rte_mempool *pool,
>   */
>  static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m)
>  {
> -	struct rte_mbuf *md;
> -
>  	RTE_ASSERT(RTE_MBUF_DIRECT(mi) &&
>  	    rte_mbuf_refcnt_read(mi) == 1);
> 
> -	/* if m is not direct, get the mbuf that embeds the data */
> -	if (RTE_MBUF_DIRECT(m))
> -		md = m;
> -	else
> -		md = rte_mbuf_from_indirect(m);
> +	if (RTE_MBUF_HAS_EXTBUF(m)) {
> +		rte_mbuf_ext_refcnt_update(m->shinfo, 1);
> +		mi->ol_flags = m->ol_flags;
> +	} else {
> +		/* if m is not direct, get the mbuf that embeds the data */
> +		rte_mbuf_refcnt_update(rte_mbuf_from_indirect(m), 1);
> +		mi->priv_size = m->priv_size;
> +		mi->ol_flags = m->ol_flags | IND_ATTACHED_MBUF;
> +	}
> 
> -	rte_mbuf_refcnt_update(md, 1);
> -	mi->priv_size = m->priv_size;
>  	mi->buf_iova = m->buf_iova;
>  	mi->buf_addr = m->buf_addr;
>  	mi->buf_len = m->buf_len;
> @@ -1259,7 +1468,6 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m)
>  	mi->next = NULL;
>  	mi->pkt_len = mi->data_len;
>  	mi->nb_segs = 1;
> -	mi->ol_flags = m->ol_flags | IND_ATTACHED_MBUF;
>  	mi->packet_type = m->packet_type;
>  	mi->timestamp = m->timestamp;
> 
> @@ -1268,12 +1476,52 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m)
>  }
> 
>  /**
> - * Detach an indirect packet mbuf.
> + * @internal used by rte_pktmbuf_detach().
>   *
> + * Decrement the reference counter of the external buffer. When the
> + * reference counter becomes 0, the buffer is freed by pre-registered
> + * callback.
> + */
> +static inline void
> +__rte_pktmbuf_free_extbuf(struct rte_mbuf *m)
> +{
> +	RTE_ASSERT(RTE_MBUF_HAS_EXTBUF(m));
> +	RTE_ASSERT(m->shinfo != NULL);
> +
> +	if (rte_mbuf_ext_refcnt_update(m->shinfo, -1) == 0)
> +		m->shinfo->free_cb(m->buf_addr, m->shinfo->fcb_opaque);
> +}
> +
> +/**
> + * @internal used by rte_pktmbuf_detach().
> + *
> + * Decrement the direct mbuf's reference counter. When the reference
> + * counter becomes 0, the direct mbuf is freed.
> + */
> +static inline void
> +__rte_pktmbuf_free_direct(struct rte_mbuf *m)
> +{
> +	struct rte_mbuf *md;
> +
> +	RTE_ASSERT(RTE_MBUF_INDIRECT(m));
> +
> +	md = rte_mbuf_from_indirect(m);
> +
> +	if (rte_mbuf_refcnt_update(md, -1) == 0) {
> +		md->next = NULL;
> +		md->nb_segs = 1;
> +		rte_mbuf_refcnt_set(md, 1);
> +		rte_mbuf_raw_free(md);
> +	}
> +}
> +
> +/**
> + * Detach a packet mbuf from external buffer or direct buffer.
> + *
> + *  - decrement refcnt and free the external/direct buffer if refcnt
> + *    becomes zero.
>   *  - restore original mbuf address and length values.
>   *  - reset pktmbuf data and data_len to their default values.
> - *  - decrement the direct mbuf's reference counter. When the
> - *  reference counter becomes 0, the direct mbuf is freed.
>   *
>   * All other fields of the given packet mbuf will be left intact.
>   *
> @@ -1282,10 +1530,14 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m)
>   */
>  static inline void rte_pktmbuf_detach(struct rte_mbuf *m)
>  {
> -	struct rte_mbuf *md = rte_mbuf_from_indirect(m);
>  	struct rte_mempool *mp = m->pool;
>  	uint32_t mbuf_size, buf_len, priv_size;
> 
> +	if (RTE_MBUF_HAS_EXTBUF(m))
> +		__rte_pktmbuf_free_extbuf(m);
> +	else
> +		__rte_pktmbuf_free_direct(m);
> +
>  	priv_size = rte_pktmbuf_priv_size(mp);
>  	mbuf_size = sizeof(struct rte_mbuf) + priv_size;
>  	buf_len = rte_pktmbuf_data_room_size(mp);
> @@ -1297,13 +1549,6 @@ static inline void rte_pktmbuf_detach(struct rte_mbuf *m)
>  	rte_pktmbuf_reset_headroom(m);
>  	m->data_len = 0;
>  	m->ol_flags = 0;
> -
> -	if (rte_mbuf_refcnt_update(md, -1) == 0) {
> -		md->next = NULL;
> -		md->nb_segs = 1;
> -		rte_mbuf_refcnt_set(md, 1);
> -		rte_mbuf_raw_free(md);
> -	}
>  }
> 
>  /**
> @@ -1327,7 +1572,7 @@ rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
> 
>  	if (likely(rte_mbuf_refcnt_read(m) == 1)) {
> 
> -		if (RTE_MBUF_INDIRECT(m))
> +		if (!RTE_MBUF_DIRECT(m))
>  			rte_pktmbuf_detach(m);
> 
>  		if (m->next != NULL) {
> @@ -1339,7 +1584,7 @@ rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
> 
>  	} else if (__rte_mbuf_refcnt_update(m, -1) == 0) {
> 
> -		if (RTE_MBUF_INDIRECT(m))
> +		if (!RTE_MBUF_DIRECT(m))
>  			rte_pktmbuf_detach(m);
> 
>  		if (m->next != NULL) {
> --
> 2.11.0
  
Yongseok Koh April 25, 2018, 5:06 p.m. UTC | #2
On Wed, Apr 25, 2018 at 01:31:42PM +0000, Ananyev, Konstantin wrote:
[...]
> >  /** Mbuf prefetch */
> >  #define RTE_MBUF_PREFETCH_TO_FREE(m) do {       \
> >  	if ((m) != NULL)                        \
> > @@ -1213,11 +1306,127 @@ static inline int rte_pktmbuf_alloc_bulk(struct rte_mempool *pool,
> >  }
> > 
> >  /**
> > + * Attach an external buffer to a mbuf.
> > + *
> > + * User-managed anonymous buffer can be attached to an mbuf. When attaching
> > + * it, corresponding free callback function and its argument should be
> > + * provided. This callback function will be called once all the mbufs are
> > + * detached from the buffer.
> > + *
> > + * The headroom for the attaching mbuf will be set to zero and this can be
> > + * properly adjusted after attachment. For example, ``rte_pktmbuf_adj()``
> > + * or ``rte_pktmbuf_reset_headroom()`` can be used.
> > + *
> > + * More mbufs can be attached to the same external buffer by
> > + * ``rte_pktmbuf_attach()`` once the external buffer has been attached by
> > + * this API.
> > + *
> > + * Detachment can be done by either ``rte_pktmbuf_detach_extbuf()`` or
> > + * ``rte_pktmbuf_detach()``.
> > + *
> > + * Attaching an external buffer is quite similar to mbuf indirection in
> > + * replacing buffer addresses and length of a mbuf, but a few differences:
> > + * - When an indirect mbuf is attached, refcnt of the direct mbuf would be
> > + *   2 as long as the direct mbuf itself isn't freed after the attachment.
> > + *   In such cases, the buffer area of a direct mbuf must be read-only. But
> > + *   external buffer has its own refcnt and it starts from 1. Unless
> > + *   multiple mbufs are attached to a mbuf having an external buffer, the
> > + *   external buffer is writable.
> > + * - There's no need to allocate buffer from a mempool. Any buffer can be
> > + *   attached with appropriate free callback and its IO address.
> > + * - Smaller metadata is required to maintain shared data such as refcnt.
> > + *
> > + * @warning
> > + * @b EXPERIMENTAL: This API may change without prior notice.
> > + * Once external buffer is enabled by allowing experimental API,
> > + * ``RTE_MBUF_DIRECT()`` and ``RTE_MBUF_INDIRECT()`` are no longer
> > + * exclusive. A mbuf can be considered direct if it is neither indirect nor
> > + * having external buffer.
> > + *
> > + * @param m
> > + *   The pointer to the mbuf.
> > + * @param buf_addr
> > + *   The pointer to the external buffer we're attaching to.
> > + * @param buf_iova
> > + *   IO address of the external buffer we're attaching to.
> > + * @param buf_len
> > + *   The size of the external buffer we're attaching to. If memory for
> > + *   shared data is not provided, buf_len must be larger than the size of
> > + *   ``struct rte_mbuf_ext_shared_info`` and padding for alignment. If not
> > + *   enough, this function will return NULL.
> > + * @param shinfo
> > + *   User-provided memory for shared data. If NULL, a few bytes in the
> > + *   trailer of the provided buffer will be dedicated for shared data and
> > + *   the shared data will be properly initialized. Otherwise, user must
> > + *   initialize the content except for free callback and its argument. The
> > + *   pointer of shared data will be stored in m->shinfo.
> > + * @param free_cb
> > + *   Free callback function to call when the external buffer needs to be
> > + *   freed.
> > + * @param fcb_opaque
> > + *   Argument for the free callback function.
> > + *
> > + * @return
> > + *   A pointer to the new start of the data on success, return NULL
> > + *   otherwise.
> > + */
> > +static inline char * __rte_experimental
> > +rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void *buf_addr,
> > +	rte_iova_t buf_iova, uint16_t buf_len,
> > +	struct rte_mbuf_ext_shared_info *shinfo,
> > +	rte_mbuf_extbuf_free_callback_t free_cb, void *fcb_opaque)
> > +{
> > +	/* Additional attachment should be done by rte_pktmbuf_attach() */
> > +	RTE_ASSERT(!RTE_MBUF_HAS_EXTBUF(m));
> 
> Shouldn't we have here something like:
> RTE_ASSERT(RTE_MBUF_DIRECT(m) && rte_mbuf_refcnt_read(m) == 1);
> ?

Right. That's better. Attaching mbuf should be direct and writable.

> > +
> > +	m->buf_addr = buf_addr;
> > +	m->buf_iova = buf_iova;
> > +
> > +	if (shinfo == NULL) {
> 
> Instead of allocating shinfo ourselves - wound's it be better to rely
> on caller always allocating afeeling it for us (he can do that at the end/start of buffer,
> or whenever he likes to.

It is just for convenience. For some users, external attachment could be
occasional and casual, e.g. punt control traffic from kernel/hv. For such
non-serious cases, it is good to provide this small utility.

> Again in that case - caller can provide one shinfo to several mbufs (with different buf_addrs)
> and would know for sure that free_cb wouldn't be overwritten by mistake.
> I.E. mbuf code will only update refcnt inside shinfo.

I think you missed the discussion with other people yesterday. This change is
exactly for that purpose. Like I documented above, if this API is called with
shinfo being provided, it will use the user-provided shinfo instead of sparing a
few byte in the trailer and won't touch the shinfo. This code block happens only
if user doesn't provide memory for shared data (shinfo is NULL).

> > +		void *buf_end = RTE_PTR_ADD(buf_addr, buf_len);
> > +
> > +		shinfo = RTE_PTR_ALIGN_FLOOR(RTE_PTR_SUB(buf_end,
> > +					sizeof(*shinfo)), sizeof(uintptr_t));
> > +		if ((void *)shinfo <= buf_addr)
> > +			return NULL;
> > +
> > +		m->buf_len = RTE_PTR_DIFF(shinfo, buf_addr);
> > +		rte_mbuf_ext_refcnt_set(shinfo, 1);
> > +	} else {
> > +		m->buf_len = buf_len;
> 
> I think you need to update shinfo>refcnt here too.

Like explained above, if shinfo is provided, it doesn't alter anything except
for callbacks and its arg.


Thanks,
Yongseok
  
Ananyev, Konstantin April 25, 2018, 5:23 p.m. UTC | #3
> -----Original Message-----
> From: Yongseok Koh [mailto:yskoh@mellanox.com]
> Sent: Wednesday, April 25, 2018 6:07 PM
> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Cc: Lu, Wenzhuo <wenzhuo.lu@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>; olivier.matz@6wind.com; dev@dpdk.org;
> arybchenko@solarflare.com; stephen@networkplumber.org; thomas@monjalon.net; adrien.mazarguil@6wind.com;
> nelio.laranjeiro@6wind.com
> Subject: Re: [PATCH v5 1/2] mbuf: support attaching external buffer to mbuf
> 
> On Wed, Apr 25, 2018 at 01:31:42PM +0000, Ananyev, Konstantin wrote:
> [...]
> > >  /** Mbuf prefetch */
> > >  #define RTE_MBUF_PREFETCH_TO_FREE(m) do {       \
> > >  	if ((m) != NULL)                        \
> > > @@ -1213,11 +1306,127 @@ static inline int rte_pktmbuf_alloc_bulk(struct rte_mempool *pool,
> > >  }
> > >
> > >  /**
> > > + * Attach an external buffer to a mbuf.
> > > + *
> > > + * User-managed anonymous buffer can be attached to an mbuf. When attaching
> > > + * it, corresponding free callback function and its argument should be
> > > + * provided. This callback function will be called once all the mbufs are
> > > + * detached from the buffer.
> > > + *
> > > + * The headroom for the attaching mbuf will be set to zero and this can be
> > > + * properly adjusted after attachment. For example, ``rte_pktmbuf_adj()``
> > > + * or ``rte_pktmbuf_reset_headroom()`` can be used.
> > > + *
> > > + * More mbufs can be attached to the same external buffer by
> > > + * ``rte_pktmbuf_attach()`` once the external buffer has been attached by
> > > + * this API.
> > > + *
> > > + * Detachment can be done by either ``rte_pktmbuf_detach_extbuf()`` or
> > > + * ``rte_pktmbuf_detach()``.
> > > + *
> > > + * Attaching an external buffer is quite similar to mbuf indirection in
> > > + * replacing buffer addresses and length of a mbuf, but a few differences:
> > > + * - When an indirect mbuf is attached, refcnt of the direct mbuf would be
> > > + *   2 as long as the direct mbuf itself isn't freed after the attachment.
> > > + *   In such cases, the buffer area of a direct mbuf must be read-only. But
> > > + *   external buffer has its own refcnt and it starts from 1. Unless
> > > + *   multiple mbufs are attached to a mbuf having an external buffer, the
> > > + *   external buffer is writable.
> > > + * - There's no need to allocate buffer from a mempool. Any buffer can be
> > > + *   attached with appropriate free callback and its IO address.
> > > + * - Smaller metadata is required to maintain shared data such as refcnt.
> > > + *
> > > + * @warning
> > > + * @b EXPERIMENTAL: This API may change without prior notice.
> > > + * Once external buffer is enabled by allowing experimental API,
> > > + * ``RTE_MBUF_DIRECT()`` and ``RTE_MBUF_INDIRECT()`` are no longer
> > > + * exclusive. A mbuf can be considered direct if it is neither indirect nor
> > > + * having external buffer.
> > > + *
> > > + * @param m
> > > + *   The pointer to the mbuf.
> > > + * @param buf_addr
> > > + *   The pointer to the external buffer we're attaching to.
> > > + * @param buf_iova
> > > + *   IO address of the external buffer we're attaching to.
> > > + * @param buf_len
> > > + *   The size of the external buffer we're attaching to. If memory for
> > > + *   shared data is not provided, buf_len must be larger than the size of
> > > + *   ``struct rte_mbuf_ext_shared_info`` and padding for alignment. If not
> > > + *   enough, this function will return NULL.
> > > + * @param shinfo
> > > + *   User-provided memory for shared data. If NULL, a few bytes in the
> > > + *   trailer of the provided buffer will be dedicated for shared data and
> > > + *   the shared data will be properly initialized. Otherwise, user must
> > > + *   initialize the content except for free callback and its argument. The
> > > + *   pointer of shared data will be stored in m->shinfo.
> > > + * @param free_cb
> > > + *   Free callback function to call when the external buffer needs to be
> > > + *   freed.
> > > + * @param fcb_opaque
> > > + *   Argument for the free callback function.
> > > + *
> > > + * @return
> > > + *   A pointer to the new start of the data on success, return NULL
> > > + *   otherwise.
> > > + */
> > > +static inline char * __rte_experimental
> > > +rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void *buf_addr,
> > > +	rte_iova_t buf_iova, uint16_t buf_len,
> > > +	struct rte_mbuf_ext_shared_info *shinfo,
> > > +	rte_mbuf_extbuf_free_callback_t free_cb, void *fcb_opaque)
> > > +{
> > > +	/* Additional attachment should be done by rte_pktmbuf_attach() */
> > > +	RTE_ASSERT(!RTE_MBUF_HAS_EXTBUF(m));
> >
> > Shouldn't we have here something like:
> > RTE_ASSERT(RTE_MBUF_DIRECT(m) && rte_mbuf_refcnt_read(m) == 1);
> > ?
> 
> Right. That's better. Attaching mbuf should be direct and writable.
> 
> > > +
> > > +	m->buf_addr = buf_addr;
> > > +	m->buf_iova = buf_iova;
> > > +
> > > +	if (shinfo == NULL) {
> >
> > Instead of allocating shinfo ourselves - wound's it be better to rely
> > on caller always allocating afeeling it for us (he can do that at the end/start of buffer,
> > or whenever he likes to.
> 
> It is just for convenience. For some users, external attachment could be
> occasional and casual, e.g. punt control traffic from kernel/hv. For such
> non-serious cases, it is good to provide this small utility.

For such users that small utility could be a separate function then:
shinfo_inside_buf() or so.

> 
> > Again in that case - caller can provide one shinfo to several mbufs (with different buf_addrs)
> > and would know for sure that free_cb wouldn't be overwritten by mistake.
> > I.E. mbuf code will only update refcnt inside shinfo.
> 
> I think you missed the discussion with other people yesterday. This change is
> exactly for that purpose. Like I documented above, if this API is called with
> shinfo being provided, it will use the user-provided shinfo instead of sparing a
> few byte in the trailer and won't touch the shinfo.

As I can see your current code always update free_cb and fcb_opaque.
Which is kind of strange these fields shold be the same for all instances of the shinfo.

> This code block happens only
> if user doesn't provide memory for shared data (shinfo is NULL).
> 
> > > +		void *buf_end = RTE_PTR_ADD(buf_addr, buf_len);
> > > +
> > > +		shinfo = RTE_PTR_ALIGN_FLOOR(RTE_PTR_SUB(buf_end,
> > > +					sizeof(*shinfo)), sizeof(uintptr_t));
> > > +		if ((void *)shinfo <= buf_addr)
> > > +			return NULL;
> > > +
> > > +		m->buf_len = RTE_PTR_DIFF(shinfo, buf_addr);
> > > +		rte_mbuf_ext_refcnt_set(shinfo, 1);
> > > +	} else {
> > > +		m->buf_len = buf_len;
> >
> > I think you need to update shinfo>refcnt here too.
> 
> Like explained above, if shinfo is provided, it doesn't alter anything except
> for callbacks and its arg.

Hm, but I have 2mbufs attached to the same external buffer via  same shinfo,
shouldn't  shinfo.refcnt == 2?


> 
> 
> Thanks,
> Yongseok
  
Yongseok Koh April 25, 2018, 6:02 p.m. UTC | #4
On Wed, Apr 25, 2018 at 05:23:20PM +0000, Ananyev, Konstantin wrote:
> 
> 
> > -----Original Message-----
> > From: Yongseok Koh [mailto:yskoh@mellanox.com]
> > Sent: Wednesday, April 25, 2018 6:07 PM
> > To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > Cc: Lu, Wenzhuo <wenzhuo.lu@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>; olivier.matz@6wind.com; dev@dpdk.org;
> > arybchenko@solarflare.com; stephen@networkplumber.org; thomas@monjalon.net; adrien.mazarguil@6wind.com;
> > nelio.laranjeiro@6wind.com
> > Subject: Re: [PATCH v5 1/2] mbuf: support attaching external buffer to mbuf
> > 
> > On Wed, Apr 25, 2018 at 01:31:42PM +0000, Ananyev, Konstantin wrote:
> > [...]
> > > >  /** Mbuf prefetch */
> > > >  #define RTE_MBUF_PREFETCH_TO_FREE(m) do {       \
> > > >  	if ((m) != NULL)                        \
> > > > @@ -1213,11 +1306,127 @@ static inline int rte_pktmbuf_alloc_bulk(struct rte_mempool *pool,
> > > >  }
> > > >
> > > >  /**
> > > > + * Attach an external buffer to a mbuf.
> > > > + *
> > > > + * User-managed anonymous buffer can be attached to an mbuf. When attaching
> > > > + * it, corresponding free callback function and its argument should be
> > > > + * provided. This callback function will be called once all the mbufs are
> > > > + * detached from the buffer.
> > > > + *
> > > > + * The headroom for the attaching mbuf will be set to zero and this can be
> > > > + * properly adjusted after attachment. For example, ``rte_pktmbuf_adj()``
> > > > + * or ``rte_pktmbuf_reset_headroom()`` can be used.
> > > > + *
> > > > + * More mbufs can be attached to the same external buffer by
> > > > + * ``rte_pktmbuf_attach()`` once the external buffer has been attached by
> > > > + * this API.
> > > > + *
> > > > + * Detachment can be done by either ``rte_pktmbuf_detach_extbuf()`` or
> > > > + * ``rte_pktmbuf_detach()``.
> > > > + *
> > > > + * Attaching an external buffer is quite similar to mbuf indirection in
> > > > + * replacing buffer addresses and length of a mbuf, but a few differences:
> > > > + * - When an indirect mbuf is attached, refcnt of the direct mbuf would be
> > > > + *   2 as long as the direct mbuf itself isn't freed after the attachment.
> > > > + *   In such cases, the buffer area of a direct mbuf must be read-only. But
> > > > + *   external buffer has its own refcnt and it starts from 1. Unless
> > > > + *   multiple mbufs are attached to a mbuf having an external buffer, the
> > > > + *   external buffer is writable.
> > > > + * - There's no need to allocate buffer from a mempool. Any buffer can be
> > > > + *   attached with appropriate free callback and its IO address.
> > > > + * - Smaller metadata is required to maintain shared data such as refcnt.
> > > > + *
> > > > + * @warning
> > > > + * @b EXPERIMENTAL: This API may change without prior notice.
> > > > + * Once external buffer is enabled by allowing experimental API,
> > > > + * ``RTE_MBUF_DIRECT()`` and ``RTE_MBUF_INDIRECT()`` are no longer
> > > > + * exclusive. A mbuf can be considered direct if it is neither indirect nor
> > > > + * having external buffer.
> > > > + *
> > > > + * @param m
> > > > + *   The pointer to the mbuf.
> > > > + * @param buf_addr
> > > > + *   The pointer to the external buffer we're attaching to.
> > > > + * @param buf_iova
> > > > + *   IO address of the external buffer we're attaching to.
> > > > + * @param buf_len
> > > > + *   The size of the external buffer we're attaching to. If memory for
> > > > + *   shared data is not provided, buf_len must be larger than the size of
> > > > + *   ``struct rte_mbuf_ext_shared_info`` and padding for alignment. If not
> > > > + *   enough, this function will return NULL.
> > > > + * @param shinfo
> > > > + *   User-provided memory for shared data. If NULL, a few bytes in the
> > > > + *   trailer of the provided buffer will be dedicated for shared data and
> > > > + *   the shared data will be properly initialized. Otherwise, user must
> > > > + *   initialize the content except for free callback and its argument. The
> > > > + *   pointer of shared data will be stored in m->shinfo.
> > > > + * @param free_cb
> > > > + *   Free callback function to call when the external buffer needs to be
> > > > + *   freed.
> > > > + * @param fcb_opaque
> > > > + *   Argument for the free callback function.
> > > > + *
> > > > + * @return
> > > > + *   A pointer to the new start of the data on success, return NULL
> > > > + *   otherwise.
> > > > + */
> > > > +static inline char * __rte_experimental
> > > > +rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void *buf_addr,
> > > > +	rte_iova_t buf_iova, uint16_t buf_len,
> > > > +	struct rte_mbuf_ext_shared_info *shinfo,
> > > > +	rte_mbuf_extbuf_free_callback_t free_cb, void *fcb_opaque)
> > > > +{
> > > > +	/* Additional attachment should be done by rte_pktmbuf_attach() */
> > > > +	RTE_ASSERT(!RTE_MBUF_HAS_EXTBUF(m));
> > >
> > > Shouldn't we have here something like:
> > > RTE_ASSERT(RTE_MBUF_DIRECT(m) && rte_mbuf_refcnt_read(m) == 1);
> > > ?
> > 
> > Right. That's better. Attaching mbuf should be direct and writable.
> > 
> > > > +
> > > > +	m->buf_addr = buf_addr;
> > > > +	m->buf_iova = buf_iova;
> > > > +
> > > > +	if (shinfo == NULL) {
> > >
> > > Instead of allocating shinfo ourselves - wound's it be better to rely
> > > on caller always allocating afeeling it for us (he can do that at the end/start of buffer,
> > > or whenever he likes to.
> > 
> > It is just for convenience. For some users, external attachment could be
> > occasional and casual, e.g. punt control traffic from kernel/hv. For such
> > non-serious cases, it is good to provide this small utility.
> 
> For such users that small utility could be a separate function then:
> shinfo_inside_buf() or so.

I like this idea! As this is an inline function and can be called in a datapath,
shorter code is better if it isn't expected to be used frequently.

Will take this idea for the new version. Thanks.

> > > Again in that case - caller can provide one shinfo to several mbufs (with different buf_addrs)
> > > and would know for sure that free_cb wouldn't be overwritten by mistake.
> > > I.E. mbuf code will only update refcnt inside shinfo.
> > 
> > I think you missed the discussion with other people yesterday. This change is
> > exactly for that purpose. Like I documented above, if this API is called with
> > shinfo being provided, it will use the user-provided shinfo instead of sparing a
> > few byte in the trailer and won't touch the shinfo.
> 
> As I can see your current code always update free_cb and fcb_opaque.
> Which is kind of strange these fields shold be the same for all instances of the shinfo.
> 
> > This code block happens only
> > if user doesn't provide memory for shared data (shinfo is NULL).
> > 
> > > > +		void *buf_end = RTE_PTR_ADD(buf_addr, buf_len);
> > > > +
> > > > +		shinfo = RTE_PTR_ALIGN_FLOOR(RTE_PTR_SUB(buf_end,
> > > > +					sizeof(*shinfo)), sizeof(uintptr_t));
> > > > +		if ((void *)shinfo <= buf_addr)
> > > > +			return NULL;
> > > > +
> > > > +		m->buf_len = RTE_PTR_DIFF(shinfo, buf_addr);
> > > > +		rte_mbuf_ext_refcnt_set(shinfo, 1);
> > > > +	} else {
> > > > +		m->buf_len = buf_len;
> > >
> > > I think you need to update shinfo>refcnt here too.
> > 
> > Like explained above, if shinfo is provided, it doesn't alter anything except
> > for callbacks and its arg.
> 
> Hm, but I have 2mbufs attached to the same external buffer via  same shinfo,
> shouldn't  shinfo.refcnt == 2?

If the shinfo is provided by user, user is responsible for managing the content.
Please refer to my reply to Andrew. I drew a few diagrams.


Thanks,
Yongseok
  
Yongseok Koh April 25, 2018, 6:22 p.m. UTC | #5
On Wed, Apr 25, 2018 at 11:02:36AM -0700, Yongseok Koh wrote:
> On Wed, Apr 25, 2018 at 05:23:20PM +0000, Ananyev, Konstantin wrote:
> > 
> > 
> > > -----Original Message-----
> > > From: Yongseok Koh [mailto:yskoh@mellanox.com]
> > > Sent: Wednesday, April 25, 2018 6:07 PM
> > > To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> > > Cc: Lu, Wenzhuo <wenzhuo.lu@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>; olivier.matz@6wind.com; dev@dpdk.org;
> > > arybchenko@solarflare.com; stephen@networkplumber.org; thomas@monjalon.net; adrien.mazarguil@6wind.com;
> > > nelio.laranjeiro@6wind.com
> > > Subject: Re: [PATCH v5 1/2] mbuf: support attaching external buffer to mbuf
> > > 
> > > On Wed, Apr 25, 2018 at 01:31:42PM +0000, Ananyev, Konstantin wrote:
> > > [...]
> > > > >  /** Mbuf prefetch */
> > > > >  #define RTE_MBUF_PREFETCH_TO_FREE(m) do {       \
> > > > >  	if ((m) != NULL)                        \
> > > > > @@ -1213,11 +1306,127 @@ static inline int rte_pktmbuf_alloc_bulk(struct rte_mempool *pool,
> > > > >  }
> > > > >
> > > > >  /**
> > > > > + * Attach an external buffer to a mbuf.
> > > > > + *
> > > > > + * User-managed anonymous buffer can be attached to an mbuf. When attaching
> > > > > + * it, corresponding free callback function and its argument should be
> > > > > + * provided. This callback function will be called once all the mbufs are
> > > > > + * detached from the buffer.
> > > > > + *
> > > > > + * The headroom for the attaching mbuf will be set to zero and this can be
> > > > > + * properly adjusted after attachment. For example, ``rte_pktmbuf_adj()``
> > > > > + * or ``rte_pktmbuf_reset_headroom()`` can be used.
> > > > > + *
> > > > > + * More mbufs can be attached to the same external buffer by
> > > > > + * ``rte_pktmbuf_attach()`` once the external buffer has been attached by
> > > > > + * this API.
> > > > > + *
> > > > > + * Detachment can be done by either ``rte_pktmbuf_detach_extbuf()`` or
> > > > > + * ``rte_pktmbuf_detach()``.
> > > > > + *
> > > > > + * Attaching an external buffer is quite similar to mbuf indirection in
> > > > > + * replacing buffer addresses and length of a mbuf, but a few differences:
> > > > > + * - When an indirect mbuf is attached, refcnt of the direct mbuf would be
> > > > > + *   2 as long as the direct mbuf itself isn't freed after the attachment.
> > > > > + *   In such cases, the buffer area of a direct mbuf must be read-only. But
> > > > > + *   external buffer has its own refcnt and it starts from 1. Unless
> > > > > + *   multiple mbufs are attached to a mbuf having an external buffer, the
> > > > > + *   external buffer is writable.
> > > > > + * - There's no need to allocate buffer from a mempool. Any buffer can be
> > > > > + *   attached with appropriate free callback and its IO address.
> > > > > + * - Smaller metadata is required to maintain shared data such as refcnt.
> > > > > + *
> > > > > + * @warning
> > > > > + * @b EXPERIMENTAL: This API may change without prior notice.
> > > > > + * Once external buffer is enabled by allowing experimental API,
> > > > > + * ``RTE_MBUF_DIRECT()`` and ``RTE_MBUF_INDIRECT()`` are no longer
> > > > > + * exclusive. A mbuf can be considered direct if it is neither indirect nor
> > > > > + * having external buffer.
> > > > > + *
> > > > > + * @param m
> > > > > + *   The pointer to the mbuf.
> > > > > + * @param buf_addr
> > > > > + *   The pointer to the external buffer we're attaching to.
> > > > > + * @param buf_iova
> > > > > + *   IO address of the external buffer we're attaching to.
> > > > > + * @param buf_len
> > > > > + *   The size of the external buffer we're attaching to. If memory for
> > > > > + *   shared data is not provided, buf_len must be larger than the size of
> > > > > + *   ``struct rte_mbuf_ext_shared_info`` and padding for alignment. If not
> > > > > + *   enough, this function will return NULL.
> > > > > + * @param shinfo
> > > > > + *   User-provided memory for shared data. If NULL, a few bytes in the
> > > > > + *   trailer of the provided buffer will be dedicated for shared data and
> > > > > + *   the shared data will be properly initialized. Otherwise, user must
> > > > > + *   initialize the content except for free callback and its argument. The
> > > > > + *   pointer of shared data will be stored in m->shinfo.
> > > > > + * @param free_cb
> > > > > + *   Free callback function to call when the external buffer needs to be
> > > > > + *   freed.
> > > > > + * @param fcb_opaque
> > > > > + *   Argument for the free callback function.
> > > > > + *
> > > > > + * @return
> > > > > + *   A pointer to the new start of the data on success, return NULL
> > > > > + *   otherwise.
> > > > > + */
> > > > > +static inline char * __rte_experimental
> > > > > +rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void *buf_addr,
> > > > > +	rte_iova_t buf_iova, uint16_t buf_len,
> > > > > +	struct rte_mbuf_ext_shared_info *shinfo,
> > > > > +	rte_mbuf_extbuf_free_callback_t free_cb, void *fcb_opaque)
> > > > > +{
> > > > > +	/* Additional attachment should be done by rte_pktmbuf_attach() */
> > > > > +	RTE_ASSERT(!RTE_MBUF_HAS_EXTBUF(m));
> > > >
> > > > Shouldn't we have here something like:
> > > > RTE_ASSERT(RTE_MBUF_DIRECT(m) && rte_mbuf_refcnt_read(m) == 1);
> > > > ?
> > > 
> > > Right. That's better. Attaching mbuf should be direct and writable.
> > > 
> > > > > +
> > > > > +	m->buf_addr = buf_addr;
> > > > > +	m->buf_iova = buf_iova;
> > > > > +
> > > > > +	if (shinfo == NULL) {
> > > >
> > > > Instead of allocating shinfo ourselves - wound's it be better to rely
> > > > on caller always allocating afeeling it for us (he can do that at the end/start of buffer,
> > > > or whenever he likes to.
> > > 
> > > It is just for convenience. For some users, external attachment could be
> > > occasional and casual, e.g. punt control traffic from kernel/hv. For such
> > > non-serious cases, it is good to provide this small utility.
> > 
> > For such users that small utility could be a separate function then:
> > shinfo_inside_buf() or so.
> 
> I like this idea! As this is an inline function and can be called in a datapath,
> shorter code is better if it isn't expected to be used frequently.
> 
> Will take this idea for the new version. Thanks.

However, if this API is called with shinfo=NULL (builtin constant), this code
block won't get included in compile time because it is an inline function.

What is disadvantage to keep this block here? More intuitive?

Advantage of keeping it here could be simplicity. No need to call the utility in
advance.

Or separating this code to another inline function could make the API prototype
simpler because free_cb and its arg should be passed via shinfo.

static inline char * __rte_experimental
rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void *buf_addr,
		rte_iova_t buf_iova, uint16_t buf_len,
		struct rte_mbuf_ext_shared_info *shinfo)

I'm still inclined to write the utility function like you suggested.
Thoughts?

Thanks,
Yongseok

> > > > Again in that case - caller can provide one shinfo to several mbufs (with different buf_addrs)
> > > > and would know for sure that free_cb wouldn't be overwritten by mistake.
> > > > I.E. mbuf code will only update refcnt inside shinfo.
> > > 
> > > I think you missed the discussion with other people yesterday. This change is
> > > exactly for that purpose. Like I documented above, if this API is called with
> > > shinfo being provided, it will use the user-provided shinfo instead of sparing a
> > > few byte in the trailer and won't touch the shinfo.
> > 
> > As I can see your current code always update free_cb and fcb_opaque.
> > Which is kind of strange these fields shold be the same for all instances of the shinfo.
  
Yongseok Koh April 25, 2018, 6:30 p.m. UTC | #6
> On Apr 25, 2018, at 11:22 AM, Yongseok Koh <yskoh@mellanox.com> wrote:
> 
> On Wed, Apr 25, 2018 at 11:02:36AM -0700, Yongseok Koh wrote:
>> On Wed, Apr 25, 2018 at 05:23:20PM +0000, Ananyev, Konstantin wrote:
>>> 
>>> 
>>>> -----Original Message-----
>>>> From: Yongseok Koh [mailto:yskoh@mellanox.com]
>>>> Sent: Wednesday, April 25, 2018 6:07 PM
>>>> To: Ananyev, Konstantin <konstantin.ananyev@intel.com>
>>>> Cc: Lu, Wenzhuo <wenzhuo.lu@intel.com>; Wu, Jingjing <jingjing.wu@intel.com>; olivier.matz@6wind.com; dev@dpdk.org;
>>>> arybchenko@solarflare.com; stephen@networkplumber.org; thomas@monjalon.net; adrien.mazarguil@6wind.com;
>>>> nelio.laranjeiro@6wind.com
>>>> Subject: Re: [PATCH v5 1/2] mbuf: support attaching external buffer to mbuf
>>>> 
>>>> On Wed, Apr 25, 2018 at 01:31:42PM +0000, Ananyev, Konstantin wrote:
>>>> [...]
>>>>>> /** Mbuf prefetch */
>>>>>> #define RTE_MBUF_PREFETCH_TO_FREE(m) do {       \
>>>>>> 	if ((m) != NULL)                        \
>>>>>> @@ -1213,11 +1306,127 @@ static inline int rte_pktmbuf_alloc_bulk(struct rte_mempool *pool,
>>>>>> }
>>>>>> 
>>>>>> /**
>>>>>> + * Attach an external buffer to a mbuf.
>>>>>> + *
>>>>>> + * User-managed anonymous buffer can be attached to an mbuf. When attaching
>>>>>> + * it, corresponding free callback function and its argument should be
>>>>>> + * provided. This callback function will be called once all the mbufs are
>>>>>> + * detached from the buffer.
>>>>>> + *
>>>>>> + * The headroom for the attaching mbuf will be set to zero and this can be
>>>>>> + * properly adjusted after attachment. For example, ``rte_pktmbuf_adj()``
>>>>>> + * or ``rte_pktmbuf_reset_headroom()`` can be used.
>>>>>> + *
>>>>>> + * More mbufs can be attached to the same external buffer by
>>>>>> + * ``rte_pktmbuf_attach()`` once the external buffer has been attached by
>>>>>> + * this API.
>>>>>> + *
>>>>>> + * Detachment can be done by either ``rte_pktmbuf_detach_extbuf()`` or
>>>>>> + * ``rte_pktmbuf_detach()``.
>>>>>> + *
>>>>>> + * Attaching an external buffer is quite similar to mbuf indirection in
>>>>>> + * replacing buffer addresses and length of a mbuf, but a few differences:
>>>>>> + * - When an indirect mbuf is attached, refcnt of the direct mbuf would be
>>>>>> + *   2 as long as the direct mbuf itself isn't freed after the attachment.
>>>>>> + *   In such cases, the buffer area of a direct mbuf must be read-only. But
>>>>>> + *   external buffer has its own refcnt and it starts from 1. Unless
>>>>>> + *   multiple mbufs are attached to a mbuf having an external buffer, the
>>>>>> + *   external buffer is writable.
>>>>>> + * - There's no need to allocate buffer from a mempool. Any buffer can be
>>>>>> + *   attached with appropriate free callback and its IO address.
>>>>>> + * - Smaller metadata is required to maintain shared data such as refcnt.
>>>>>> + *
>>>>>> + * @warning
>>>>>> + * @b EXPERIMENTAL: This API may change without prior notice.
>>>>>> + * Once external buffer is enabled by allowing experimental API,
>>>>>> + * ``RTE_MBUF_DIRECT()`` and ``RTE_MBUF_INDIRECT()`` are no longer
>>>>>> + * exclusive. A mbuf can be considered direct if it is neither indirect nor
>>>>>> + * having external buffer.
>>>>>> + *
>>>>>> + * @param m
>>>>>> + *   The pointer to the mbuf.
>>>>>> + * @param buf_addr
>>>>>> + *   The pointer to the external buffer we're attaching to.
>>>>>> + * @param buf_iova
>>>>>> + *   IO address of the external buffer we're attaching to.
>>>>>> + * @param buf_len
>>>>>> + *   The size of the external buffer we're attaching to. If memory for
>>>>>> + *   shared data is not provided, buf_len must be larger than the size of
>>>>>> + *   ``struct rte_mbuf_ext_shared_info`` and padding for alignment. If not
>>>>>> + *   enough, this function will return NULL.
>>>>>> + * @param shinfo
>>>>>> + *   User-provided memory for shared data. If NULL, a few bytes in the
>>>>>> + *   trailer of the provided buffer will be dedicated for shared data and
>>>>>> + *   the shared data will be properly initialized. Otherwise, user must
>>>>>> + *   initialize the content except for free callback and its argument. The
>>>>>> + *   pointer of shared data will be stored in m->shinfo.
>>>>>> + * @param free_cb
>>>>>> + *   Free callback function to call when the external buffer needs to be
>>>>>> + *   freed.
>>>>>> + * @param fcb_opaque
>>>>>> + *   Argument for the free callback function.
>>>>>> + *
>>>>>> + * @return
>>>>>> + *   A pointer to the new start of the data on success, return NULL
>>>>>> + *   otherwise.
>>>>>> + */
>>>>>> +static inline char * __rte_experimental
>>>>>> +rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void *buf_addr,
>>>>>> +	rte_iova_t buf_iova, uint16_t buf_len,
>>>>>> +	struct rte_mbuf_ext_shared_info *shinfo,
>>>>>> +	rte_mbuf_extbuf_free_callback_t free_cb, void *fcb_opaque)
>>>>>> +{
>>>>>> +	/* Additional attachment should be done by rte_pktmbuf_attach() */
>>>>>> +	RTE_ASSERT(!RTE_MBUF_HAS_EXTBUF(m));
>>>>> 
>>>>> Shouldn't we have here something like:
>>>>> RTE_ASSERT(RTE_MBUF_DIRECT(m) && rte_mbuf_refcnt_read(m) == 1);
>>>>> ?
>>>> 
>>>> Right. That's better. Attaching mbuf should be direct and writable.
>>>> 
>>>>>> +
>>>>>> +	m->buf_addr = buf_addr;
>>>>>> +	m->buf_iova = buf_iova;
>>>>>> +
>>>>>> +	if (shinfo == NULL) {
>>>>> 
>>>>> Instead of allocating shinfo ourselves - wound's it be better to rely
>>>>> on caller always allocating afeeling it for us (he can do that at the end/start of buffer,
>>>>> or whenever he likes to.
>>>> 
>>>> It is just for convenience. For some users, external attachment could be
>>>> occasional and casual, e.g. punt control traffic from kernel/hv. For such
>>>> non-serious cases, it is good to provide this small utility.
>>> 
>>> For such users that small utility could be a separate function then:
>>> shinfo_inside_buf() or so.
>> 
>> I like this idea! As this is an inline function and can be called in a datapath,
>> shorter code is better if it isn't expected to be used frequently.
>> 
>> Will take this idea for the new version. Thanks.
> 
> However, if this API is called with shinfo=NULL (builtin constant), this code
> block won't get included in compile time because it is an inline function.

Sorry, it was wrong. I said the exact opposite. Not enough sleep theses days. :-(
If shinfo is passed, the code block will be included anyway.

Please disregard the email.

Yongseok

> 
> What is disadvantage to keep this block here? More intuitive?
> 
> Advantage of keeping it here could be simplicity. No need to call the utility in
> advance.
> 
> Or separating this code to another inline function could make the API prototype
> simpler because free_cb and its arg should be passed via shinfo.
> 
> static inline char * __rte_experimental
> rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void *buf_addr,
> 		rte_iova_t buf_iova, uint16_t buf_len,
> 		struct rte_mbuf_ext_shared_info *shinfo)
> 
> I'm still inclined to write the utility function like you suggested.
> Thoughts?
> 
> Thanks,
> Yongseok
> 
>>>>> Again in that case - caller can provide one shinfo to several mbufs (with different buf_addrs)
>>>>> and would know for sure that free_cb wouldn't be overwritten by mistake.
>>>>> I.E. mbuf code will only update refcnt inside shinfo.
>>>> 
>>>> I think you missed the discussion with other people yesterday. This change is
>>>> exactly for that purpose. Like I documented above, if this API is called with
>>>> shinfo being provided, it will use the user-provided shinfo instead of sparing a
>>>> few byte in the trailer and won't touch the shinfo.
>>> 
>>> As I can see your current code always update free_cb and fcb_opaque.
>>> Which is kind of strange these fields shold be the same for all instances of the shinfo.
  

Patch

diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
index 43aaa9c5f..e2c12874a 100644
--- a/lib/librte_mbuf/rte_mbuf.h
+++ b/lib/librte_mbuf/rte_mbuf.h
@@ -344,7 +344,10 @@  extern "C" {
 		PKT_TX_MACSEC |		 \
 		PKT_TX_SEC_OFFLOAD)
 
-#define __RESERVED           (1ULL << 61) /**< reserved for future mbuf use */
+/**
+ * Mbuf having an external buffer attached. shinfo in mbuf must be filled.
+ */
+#define EXT_ATTACHED_MBUF    (1ULL << 61)
 
 #define IND_ATTACHED_MBUF    (1ULL << 62) /**< Indirect attached mbuf */
 
@@ -584,8 +587,27 @@  struct rte_mbuf {
 	/** Sequence number. See also rte_reorder_insert(). */
 	uint32_t seqn;
 
+	/** Shared data for external buffer attached to mbuf. See
+	 * rte_pktmbuf_attach_extbuf().
+	 */
+	struct rte_mbuf_ext_shared_info *shinfo;
+
 } __rte_cache_aligned;
 
+/**
+ * Function typedef of callback to free externally attached buffer.
+ */
+typedef void (*rte_mbuf_extbuf_free_callback_t)(void *addr, void *opaque);
+
+/**
+ * Shared data at the end of an external buffer.
+ */
+struct rte_mbuf_ext_shared_info {
+	rte_mbuf_extbuf_free_callback_t free_cb; /**< Free callback function */
+	void *fcb_opaque;                        /**< Free callback argument */
+	rte_atomic16_t refcnt_atomic;        /**< Atomically accessed refcnt */
+};
+
 /**< Maximum number of nb_segs allowed. */
 #define RTE_MBUF_MAX_NB_SEGS	UINT16_MAX
 
@@ -706,14 +728,33 @@  rte_mbuf_to_baddr(struct rte_mbuf *md)
 }
 
 /**
+ * Returns TRUE if given mbuf is cloned by mbuf indirection, or FALSE
+ * otherwise.
+ *
+ * If a mbuf has its data in another mbuf and references it by mbuf
+ * indirection, this mbuf can be defined as a cloned mbuf.
+ */
+#define RTE_MBUF_CLONED(mb)     ((mb)->ol_flags & IND_ATTACHED_MBUF)
+
+/**
  * Returns TRUE if given mbuf is indirect, or FALSE otherwise.
  */
-#define RTE_MBUF_INDIRECT(mb)   ((mb)->ol_flags & IND_ATTACHED_MBUF)
+#define RTE_MBUF_INDIRECT(mb)   RTE_MBUF_CLONED(mb)
+
+/**
+ * Returns TRUE if given mbuf has an external buffer, or FALSE otherwise.
+ *
+ * External buffer is a user-provided anonymous buffer.
+ */
+#define RTE_MBUF_HAS_EXTBUF(mb) ((mb)->ol_flags & EXT_ATTACHED_MBUF)
 
 /**
  * Returns TRUE if given mbuf is direct, or FALSE otherwise.
+ *
+ * If a mbuf embeds its own data after the rte_mbuf structure, this mbuf
+ * can be defined as a direct mbuf.
  */
-#define RTE_MBUF_DIRECT(mb)     (!RTE_MBUF_INDIRECT(mb))
+#define RTE_MBUF_DIRECT(mb) (!(RTE_MBUF_CLONED(mb) || RTE_MBUF_HAS_EXTBUF(mb)))
 
 /**
  * Private data in case of pktmbuf pool.
@@ -839,6 +880,58 @@  rte_mbuf_refcnt_set(struct rte_mbuf *m, uint16_t new_value)
 
 #endif /* RTE_MBUF_REFCNT_ATOMIC */
 
+/**
+ * Reads the refcnt of an external buffer.
+ *
+ * @param shinfo
+ *   Shared data of the external buffer.
+ * @return
+ *   Reference count number.
+ */
+static inline uint16_t
+rte_mbuf_ext_refcnt_read(const struct rte_mbuf_ext_shared_info *shinfo)
+{
+	return (uint16_t)(rte_atomic16_read(&shinfo->refcnt_atomic));
+}
+
+/**
+ * Set refcnt of an external buffer.
+ *
+ * @param shinfo
+ *   Shared data of the external buffer.
+ * @param new_value
+ *   Value set
+ */
+static inline void
+rte_mbuf_ext_refcnt_set(struct rte_mbuf_ext_shared_info *shinfo,
+	uint16_t new_value)
+{
+	rte_atomic16_set(&shinfo->refcnt_atomic, new_value);
+}
+
+/**
+ * Add given value to refcnt of an external buffer and return its new
+ * value.
+ *
+ * @param shinfo
+ *   Shared data of the external buffer.
+ * @param value
+ *   Value to add/subtract
+ * @return
+ *   Updated value
+ */
+static inline uint16_t
+rte_mbuf_ext_refcnt_update(struct rte_mbuf_ext_shared_info *shinfo,
+	int16_t value)
+{
+	if (likely(rte_mbuf_ext_refcnt_read(shinfo) == 1)) {
+		rte_mbuf_ext_refcnt_set(shinfo, 1 + value);
+		return 1 + value;
+	}
+
+	return (uint16_t)rte_atomic16_add_return(&shinfo->refcnt_atomic, value);
+}
+
 /** Mbuf prefetch */
 #define RTE_MBUF_PREFETCH_TO_FREE(m) do {       \
 	if ((m) != NULL)                        \
@@ -1213,11 +1306,127 @@  static inline int rte_pktmbuf_alloc_bulk(struct rte_mempool *pool,
 }
 
 /**
+ * Attach an external buffer to a mbuf.
+ *
+ * User-managed anonymous buffer can be attached to an mbuf. When attaching
+ * it, corresponding free callback function and its argument should be
+ * provided. This callback function will be called once all the mbufs are
+ * detached from the buffer.
+ *
+ * The headroom for the attaching mbuf will be set to zero and this can be
+ * properly adjusted after attachment. For example, ``rte_pktmbuf_adj()``
+ * or ``rte_pktmbuf_reset_headroom()`` can be used.
+ *
+ * More mbufs can be attached to the same external buffer by
+ * ``rte_pktmbuf_attach()`` once the external buffer has been attached by
+ * this API.
+ *
+ * Detachment can be done by either ``rte_pktmbuf_detach_extbuf()`` or
+ * ``rte_pktmbuf_detach()``.
+ *
+ * Attaching an external buffer is quite similar to mbuf indirection in
+ * replacing buffer addresses and length of a mbuf, but a few differences:
+ * - When an indirect mbuf is attached, refcnt of the direct mbuf would be
+ *   2 as long as the direct mbuf itself isn't freed after the attachment.
+ *   In such cases, the buffer area of a direct mbuf must be read-only. But
+ *   external buffer has its own refcnt and it starts from 1. Unless
+ *   multiple mbufs are attached to a mbuf having an external buffer, the
+ *   external buffer is writable.
+ * - There's no need to allocate buffer from a mempool. Any buffer can be
+ *   attached with appropriate free callback and its IO address.
+ * - Smaller metadata is required to maintain shared data such as refcnt.
+ *
+ * @warning
+ * @b EXPERIMENTAL: This API may change without prior notice.
+ * Once external buffer is enabled by allowing experimental API,
+ * ``RTE_MBUF_DIRECT()`` and ``RTE_MBUF_INDIRECT()`` are no longer
+ * exclusive. A mbuf can be considered direct if it is neither indirect nor
+ * having external buffer.
+ *
+ * @param m
+ *   The pointer to the mbuf.
+ * @param buf_addr
+ *   The pointer to the external buffer we're attaching to.
+ * @param buf_iova
+ *   IO address of the external buffer we're attaching to.
+ * @param buf_len
+ *   The size of the external buffer we're attaching to. If memory for
+ *   shared data is not provided, buf_len must be larger than the size of
+ *   ``struct rte_mbuf_ext_shared_info`` and padding for alignment. If not
+ *   enough, this function will return NULL.
+ * @param shinfo
+ *   User-provided memory for shared data. If NULL, a few bytes in the
+ *   trailer of the provided buffer will be dedicated for shared data and
+ *   the shared data will be properly initialized. Otherwise, user must
+ *   initialize the content except for free callback and its argument. The
+ *   pointer of shared data will be stored in m->shinfo.
+ * @param free_cb
+ *   Free callback function to call when the external buffer needs to be
+ *   freed.
+ * @param fcb_opaque
+ *   Argument for the free callback function.
+ *
+ * @return
+ *   A pointer to the new start of the data on success, return NULL
+ *   otherwise.
+ */
+static inline char * __rte_experimental
+rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void *buf_addr,
+	rte_iova_t buf_iova, uint16_t buf_len,
+	struct rte_mbuf_ext_shared_info *shinfo,
+	rte_mbuf_extbuf_free_callback_t free_cb, void *fcb_opaque)
+{
+	/* Additional attachment should be done by rte_pktmbuf_attach() */
+	RTE_ASSERT(!RTE_MBUF_HAS_EXTBUF(m));
+
+	m->buf_addr = buf_addr;
+	m->buf_iova = buf_iova;
+
+	if (shinfo == NULL) {
+		void *buf_end = RTE_PTR_ADD(buf_addr, buf_len);
+
+		shinfo = RTE_PTR_ALIGN_FLOOR(RTE_PTR_SUB(buf_end,
+					sizeof(*shinfo)), sizeof(uintptr_t));
+		if ((void *)shinfo <= buf_addr)
+			return NULL;
+
+		m->buf_len = RTE_PTR_DIFF(shinfo, buf_addr);
+		rte_mbuf_ext_refcnt_set(shinfo, 1);
+	} else {
+		m->buf_len = buf_len;
+	}
+
+	m->data_len = 0;
+	m->data_off = 0;
+
+	m->ol_flags |= EXT_ATTACHED_MBUF;
+	m->shinfo = shinfo;
+
+	shinfo->free_cb = free_cb;
+	shinfo->fcb_opaque = fcb_opaque;
+
+	return (char *)m->buf_addr + m->data_off;
+}
+
+/**
+ * Detach the external buffer attached to a mbuf, same as
+ * ``rte_pktmbuf_detach()``
+ *
+ * @param m
+ *   The mbuf having external buffer.
+ */
+#define rte_pktmbuf_detach_extbuf(m) rte_pktmbuf_detach(m)
+
+/**
  * Attach packet mbuf to another packet mbuf.
  *
- * After attachment we refer the mbuf we attached as 'indirect',
- * while mbuf we attached to as 'direct'.
- * The direct mbuf's reference counter is incremented.
+ * If the mbuf we are attaching to isn't a direct buffer and is attached to
+ * an external buffer, the mbuf being attached will be attached to the
+ * external buffer instead of mbuf indirection.
+ *
+ * Otherwise, the mbuf will be indirectly attached. After attachment we
+ * refer the mbuf we attached as 'indirect', while mbuf we attached to as
+ * 'direct'.  The direct mbuf's reference counter is incremented.
  *
  * Right now, not supported:
  *  - attachment for already indirect mbuf (e.g. - mi has to be direct).
@@ -1231,19 +1440,19 @@  static inline int rte_pktmbuf_alloc_bulk(struct rte_mempool *pool,
  */
 static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m)
 {
-	struct rte_mbuf *md;
-
 	RTE_ASSERT(RTE_MBUF_DIRECT(mi) &&
 	    rte_mbuf_refcnt_read(mi) == 1);
 
-	/* if m is not direct, get the mbuf that embeds the data */
-	if (RTE_MBUF_DIRECT(m))
-		md = m;
-	else
-		md = rte_mbuf_from_indirect(m);
+	if (RTE_MBUF_HAS_EXTBUF(m)) {
+		rte_mbuf_ext_refcnt_update(m->shinfo, 1);
+		mi->ol_flags = m->ol_flags;
+	} else {
+		/* if m is not direct, get the mbuf that embeds the data */
+		rte_mbuf_refcnt_update(rte_mbuf_from_indirect(m), 1);
+		mi->priv_size = m->priv_size;
+		mi->ol_flags = m->ol_flags | IND_ATTACHED_MBUF;
+	}
 
-	rte_mbuf_refcnt_update(md, 1);
-	mi->priv_size = m->priv_size;
 	mi->buf_iova = m->buf_iova;
 	mi->buf_addr = m->buf_addr;
 	mi->buf_len = m->buf_len;
@@ -1259,7 +1468,6 @@  static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m)
 	mi->next = NULL;
 	mi->pkt_len = mi->data_len;
 	mi->nb_segs = 1;
-	mi->ol_flags = m->ol_flags | IND_ATTACHED_MBUF;
 	mi->packet_type = m->packet_type;
 	mi->timestamp = m->timestamp;
 
@@ -1268,12 +1476,52 @@  static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m)
 }
 
 /**
- * Detach an indirect packet mbuf.
+ * @internal used by rte_pktmbuf_detach().
  *
+ * Decrement the reference counter of the external buffer. When the
+ * reference counter becomes 0, the buffer is freed by pre-registered
+ * callback.
+ */
+static inline void
+__rte_pktmbuf_free_extbuf(struct rte_mbuf *m)
+{
+	RTE_ASSERT(RTE_MBUF_HAS_EXTBUF(m));
+	RTE_ASSERT(m->shinfo != NULL);
+
+	if (rte_mbuf_ext_refcnt_update(m->shinfo, -1) == 0)
+		m->shinfo->free_cb(m->buf_addr, m->shinfo->fcb_opaque);
+}
+
+/**
+ * @internal used by rte_pktmbuf_detach().
+ *
+ * Decrement the direct mbuf's reference counter. When the reference
+ * counter becomes 0, the direct mbuf is freed.
+ */
+static inline void
+__rte_pktmbuf_free_direct(struct rte_mbuf *m)
+{
+	struct rte_mbuf *md;
+
+	RTE_ASSERT(RTE_MBUF_INDIRECT(m));
+
+	md = rte_mbuf_from_indirect(m);
+
+	if (rte_mbuf_refcnt_update(md, -1) == 0) {
+		md->next = NULL;
+		md->nb_segs = 1;
+		rte_mbuf_refcnt_set(md, 1);
+		rte_mbuf_raw_free(md);
+	}
+}
+
+/**
+ * Detach a packet mbuf from external buffer or direct buffer.
+ *
+ *  - decrement refcnt and free the external/direct buffer if refcnt
+ *    becomes zero.
  *  - restore original mbuf address and length values.
  *  - reset pktmbuf data and data_len to their default values.
- *  - decrement the direct mbuf's reference counter. When the
- *  reference counter becomes 0, the direct mbuf is freed.
  *
  * All other fields of the given packet mbuf will be left intact.
  *
@@ -1282,10 +1530,14 @@  static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m)
  */
 static inline void rte_pktmbuf_detach(struct rte_mbuf *m)
 {
-	struct rte_mbuf *md = rte_mbuf_from_indirect(m);
 	struct rte_mempool *mp = m->pool;
 	uint32_t mbuf_size, buf_len, priv_size;
 
+	if (RTE_MBUF_HAS_EXTBUF(m))
+		__rte_pktmbuf_free_extbuf(m);
+	else
+		__rte_pktmbuf_free_direct(m);
+
 	priv_size = rte_pktmbuf_priv_size(mp);
 	mbuf_size = sizeof(struct rte_mbuf) + priv_size;
 	buf_len = rte_pktmbuf_data_room_size(mp);
@@ -1297,13 +1549,6 @@  static inline void rte_pktmbuf_detach(struct rte_mbuf *m)
 	rte_pktmbuf_reset_headroom(m);
 	m->data_len = 0;
 	m->ol_flags = 0;
-
-	if (rte_mbuf_refcnt_update(md, -1) == 0) {
-		md->next = NULL;
-		md->nb_segs = 1;
-		rte_mbuf_refcnt_set(md, 1);
-		rte_mbuf_raw_free(md);
-	}
 }
 
 /**
@@ -1327,7 +1572,7 @@  rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
 
 	if (likely(rte_mbuf_refcnt_read(m) == 1)) {
 
-		if (RTE_MBUF_INDIRECT(m))
+		if (!RTE_MBUF_DIRECT(m))
 			rte_pktmbuf_detach(m);
 
 		if (m->next != NULL) {
@@ -1339,7 +1584,7 @@  rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
 
 	} else if (__rte_mbuf_refcnt_update(m, -1) == 0) {
 
-		if (RTE_MBUF_INDIRECT(m))
+		if (!RTE_MBUF_DIRECT(m))
 			rte_pktmbuf_detach(m);
 
 		if (m->next != NULL) {