[dpdk-dev] [PATCH v3 1/2] mbuf: support attaching external buffer to mbuf
Ananyev, Konstantin
konstantin.ananyev at intel.com
Mon Apr 23 13:53:04 CEST 2018
>
> This patch introduces a new way of attaching an external buffer to a mbuf.
>
> Attaching an external buffer is quite similar to mbuf indirection in
> replacing buffer addresses and length of a mbuf, but a few differences:
> - As refcnt of a direct mbuf is at least 2, the buffer area of a direct
> mbuf must be read-only. But external buffer has its own refcnt and it
> starts from 1. Unless multiple mbufs are attached to a mbuf having an
> external buffer, the external buffer is writable.
> - There's no need to allocate buffer from a mempool. Any buffer can be
> attached with appropriate free callback.
> - Smaller metadata is required to maintain shared data such as refcnt.
>
> Signed-off-by: Yongseok Koh <yskoh at mellanox.com>
> ---
>
> Submitting only non-mlx5 patches to meet deadline for RC1. mlx5 patches
> will be submitted separately rebased on a differnet patchset which
> accommodates new memory hotplug design to mlx PMDs.
>
> v3:
> * implement external buffer attachment instead of introducing buf_off for
> mbuf indirection.
>
> lib/librte_mbuf/rte_mbuf.h | 276 ++++++++++++++++++++++++++++++++++++++++-----
> 1 file changed, 248 insertions(+), 28 deletions(-)
>
> diff --git a/lib/librte_mbuf/rte_mbuf.h b/lib/librte_mbuf/rte_mbuf.h
> index 06eceba37..e64160c81 100644
> --- a/lib/librte_mbuf/rte_mbuf.h
> +++ b/lib/librte_mbuf/rte_mbuf.h
> @@ -326,7 +326,7 @@ extern "C" {
> PKT_TX_MACSEC | \
> PKT_TX_SEC_OFFLOAD)
>
> -#define __RESERVED (1ULL << 61) /**< reserved for future mbuf use */
> +#define EXT_ATTACHED_MBUF (1ULL << 61) /**< Mbuf having external buffer */
>
> #define IND_ATTACHED_MBUF (1ULL << 62) /**< Indirect attached mbuf */
>
> @@ -568,6 +568,24 @@ struct rte_mbuf {
>
> } __rte_cache_aligned;
>
> +/**
> + * Function typedef of callback to free externally attached buffer.
> + */
> +typedef void (*rte_mbuf_extbuf_free_callback_t)(void *addr, void *opaque);
> +
> +/**
> + * Shared data at the end of an external buffer.
> + */
> +struct rte_mbuf_ext_shared_info {
> + rte_mbuf_extbuf_free_callback_t free_cb; /**< Free callback function */
> + void *fcb_opaque; /**< Free callback argument */
> + RTE_STD_C11
> + union {
> + rte_atomic16_t refcnt_atomic; /**< Atomically accessed refcnt */
> + uint16_t refcnt; /**< Non-atomically accessed refcnt */
> + };
> +};
> +
> /**< Maximum number of nb_segs allowed. */
> #define RTE_MBUF_MAX_NB_SEGS UINT16_MAX
>
> @@ -693,9 +711,14 @@ rte_mbuf_to_baddr(struct rte_mbuf *md)
> #define RTE_MBUF_INDIRECT(mb) ((mb)->ol_flags & IND_ATTACHED_MBUF)
>
> /**
> + * Returns TRUE if given mbuf has external buffer, or FALSE otherwise.
> + */
> +#define RTE_MBUF_HAS_EXTBUF(mb) ((mb)->ol_flags & EXT_ATTACHED_MBUF)
> +
> +/**
> * Returns TRUE if given mbuf is direct, or FALSE otherwise.
> */
> -#define RTE_MBUF_DIRECT(mb) (!RTE_MBUF_INDIRECT(mb))
> +#define RTE_MBUF_DIRECT(mb) (!RTE_MBUF_INDIRECT(mb) && !RTE_MBUF_HAS_EXTBUF(mb))
As a nit:
RTE_MBUF_DIRECT(mb) (((mb)->ol_flags & (IND_ATTACHED_MBUF | EXT_ATTACHED_MBUF)) == 0)
>
> /**
> * Private data in case of pktmbuf pool.
> @@ -821,6 +844,58 @@ rte_mbuf_refcnt_set(struct rte_mbuf *m, uint16_t new_value)
>
> #endif /* RTE_MBUF_REFCNT_ATOMIC */
>
> +/**
> + * Reads the refcnt of an external buffer.
> + *
> + * @param shinfo
> + * Shared data of the external buffer.
> + * @return
> + * Reference count number.
> + */
> +static inline uint16_t
> +rte_extbuf_refcnt_read(const struct rte_mbuf_ext_shared_info *shinfo)
> +{
> + return (uint16_t)(rte_atomic16_read(&shinfo->refcnt_atomic));
> +}
> +
> +/**
> + * Set refcnt of an external buffer.
> + *
> + * @param shinfo
> + * Shared data of the external buffer.
> + * @param new_value
> + * Value set
> + */
> +static inline void
> +rte_extbuf_refcnt_set(struct rte_mbuf_ext_shared_info *shinfo,
> + uint16_t new_value)
> +{
> + rte_atomic16_set(&shinfo->refcnt_atomic, new_value);
> +}
> +
> +/**
> + * Add given value to refcnt of an external buffer and return its new
> + * value.
> + *
> + * @param shinfo
> + * Shared data of the external buffer.
> + * @param value
> + * Value to add/subtract
> + * @return
> + * Updated value
> + */
> +static inline uint16_t
> +rte_extbuf_refcnt_update(struct rte_mbuf_ext_shared_info *shinfo,
> + int16_t value)
> +{
> + if (likely(rte_extbuf_refcnt_read(shinfo) == 1)) {
> + rte_extbuf_refcnt_set(shinfo, 1 + value);
> + return 1 + value;
> + }
> +
> + return (uint16_t)rte_atomic16_add_return(&shinfo->refcnt_atomic, value);
> +}
> +
> /** Mbuf prefetch */
> #define RTE_MBUF_PREFETCH_TO_FREE(m) do { \
> if ((m) != NULL) \
> @@ -1195,11 +1270,120 @@ static inline int rte_pktmbuf_alloc_bulk(struct rte_mempool *pool,
> }
>
> /**
> + * Return shared data of external buffer of a mbuf.
> + *
> + * @param m
> + * The pointer to the mbuf.
> + * @return
> + * The address of the shared data.
> + */
> +static inline struct rte_mbuf_ext_shared_info *
> +rte_mbuf_ext_shinfo(struct rte_mbuf *m)
> +{
> + return (struct rte_mbuf_ext_shared_info *)
> + RTE_PTR_ADD(m->buf_addr, m->buf_len);
> +}
> +
> +/**
> + * Attach an external buffer to a mbuf.
> + *
> + * User-managed anonymous buffer can be attached to an mbuf. When attaching
> + * it, corresponding free callback function and its argument should be
> + * provided. This callback function will be called once all the mbufs are
> + * detached from the buffer.
> + *
> + * More mbufs can be attached to the same external buffer by
> + * ``rte_pktmbuf_attach()`` once the external buffer has been attached by
> + * this API.
> + *
> + * Detachment can be done by either ``rte_pktmbuf_detach_extbuf()`` or
> + * ``rte_pktmbuf_detach()``.
> + *
> + * A few bytes in the trailer of the provided buffer will be dedicated for
> + * shared data (``struct rte_mbuf_ext_shared_info``) to store refcnt,
> + * callback function and so on. The shared data can be referenced by
> + * ``rte_mbuf_ext_shinfo()``
> + *
> + * Attaching an external buffer is quite similar to mbuf indirection in
> + * replacing buffer addresses and length of a mbuf, but a few differences:
> + * - As refcnt of a direct mbuf is at least 2, the buffer area of a direct
> + * mbuf must be read-only. But external buffer has its own refcnt and it
> + * starts from 1. Unless multiple mbufs are attached to a mbuf having an
> + * external buffer, the external buffer is writable.
> + * - There's no need to allocate buffer from a mempool. Any buffer can be
> + * attached with appropriate free callback.
> + * - Smaller metadata is required to maintain shared data such as refcnt.
> + *
> + * @warning
> + * @b EXPERIMENTAL: This API may change without prior notice.
> + * Once external buffer is enabled by allowing experimental API,
> + * ``RTE_MBUF_DIRECT()`` and ``RTE_MBUF_INDIRECT()`` are no longer
> + * exclusive. A mbuf can be consiered direct if it is neither indirect nor
> + * having external buffer.
> + *
> + * @param m
> + * The pointer to the mbuf.
> + * @param buf_addr
> + * The pointer to the external buffer we're attaching to.
> + * @param buf_len
> + * The size of the external buffer we're attaching to. This must be larger
> + * than the size of ``struct rte_mbuf_ext_shared_info`` and padding for
> + * alignment. If not enough, this function will return NULL.
> + * @param free_cb
> + * Free callback function to call when the external buffer needs to be freed.
> + * @param fcb_opaque
> + * Argument for the free callback function.
> + * @return
> + * A pointer to the new start of the data on success, return NULL otherwise.
> + */
> +static inline char * __rte_experimental
> +rte_pktmbuf_attach_extbuf(struct rte_mbuf *m, void *buf_addr,
> + uint16_t buf_len, rte_mbuf_extbuf_free_callback_t free_cb,
> + void *fcb_opaque)
> +{
> + void *buf_end = RTE_PTR_ADD(buf_addr, buf_len);
> + struct rte_mbuf_ext_shared_info *shinfo;
> +
> + shinfo = RTE_PTR_ALIGN_FLOOR(RTE_PTR_SUB(buf_end, sizeof(*shinfo)),
> + sizeof(uintptr_t));
> +
> + if ((void *)shinfo <= buf_addr)
> + return NULL;
> +
> + m->buf_addr = buf_addr;
> + m->buf_iova = rte_mempool_virt2iova(buf_addr);
That wouldn't work for arbitrary extern buffer.
Only for the one that is an element in some other mempool.
For arbitrary external buffer - callee has to provide PA for it plus guarantee that
it's VA would be locked down.
>From other side - if your intention is just to use only elements of other mempools -
No need to have free_cb(). mempool_put should do.
> + m->buf_len = RTE_PTR_DIFF(shinfo, buf_addr);
> + m->data_len = 0;
> +
> + rte_pktmbuf_reset_headroom(m);
> + m->ol_flags |= EXT_ATTACHED_MBUF;
> +
> + rte_extbuf_refcnt_set(shinfo, 1);
> + shinfo->free_cb = free_cb;
> + shinfo->fcb_opaque = fcb_opaque;
> +
> + return (char *)m->buf_addr + m->data_off;
> +}
> +
> +/**
> + * Detach the external buffer attached to a mbuf, same as
> + * ``rte_pktmbuf_detach()``
> + *
> + * @param m
> + * The mbuf having external buffer.
> + */
> +#define rte_pktmbuf_detach_extbuf(m) rte_pktmbuf_detach(m)
> +
> +/**
> * Attach packet mbuf to another packet mbuf.
> *
> - * After attachment we refer the mbuf we attached as 'indirect',
> - * while mbuf we attached to as 'direct'.
> - * The direct mbuf's reference counter is incremented.
> + * If the mbuf we are attaching to isn't a direct buffer and is attached to
> + * an external buffer, the mbuf being attached will be attached to the
> + * external buffer instead of mbuf indirection.
> + *
> + * Otherwise, the mbuf will be indirectly attached. After attachment we
> + * refer the mbuf we attached as 'indirect', while mbuf we attached to as
> + * 'direct'. The direct mbuf's reference counter is incremented.
> *
> * Right now, not supported:
> * - attachment for already indirect mbuf (e.g. - mi has to be direct).
> @@ -1213,19 +1397,18 @@ static inline int rte_pktmbuf_alloc_bulk(struct rte_mempool *pool,
> */
> static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m)
> {
> - struct rte_mbuf *md;
> -
> RTE_ASSERT(RTE_MBUF_DIRECT(mi) &&
> rte_mbuf_refcnt_read(mi) == 1);
>
> - /* if m is not direct, get the mbuf that embeds the data */
> - if (RTE_MBUF_DIRECT(m))
> - md = m;
> - else
> - md = rte_mbuf_from_indirect(m);
> + if (RTE_MBUF_HAS_EXTBUF(m)) {
> + rte_extbuf_refcnt_update(rte_mbuf_ext_shinfo(m), 1);
> + mi->ol_flags = m->ol_flags;
> + } else {
> + rte_mbuf_refcnt_update(rte_mbuf_from_indirect(m), 1);
> + mi->priv_size = m->priv_size;
> + mi->ol_flags = m->ol_flags | IND_ATTACHED_MBUF;
> + }
>
> - rte_mbuf_refcnt_update(md, 1);
> - mi->priv_size = m->priv_size;
> mi->buf_iova = m->buf_iova;
> mi->buf_addr = m->buf_addr;
> mi->buf_len = m->buf_len;
> @@ -1241,7 +1424,6 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m)
> mi->next = NULL;
> mi->pkt_len = mi->data_len;
> mi->nb_segs = 1;
> - mi->ol_flags = m->ol_flags | IND_ATTACHED_MBUF;
> mi->packet_type = m->packet_type;
> mi->timestamp = m->timestamp;
>
> @@ -1250,12 +1432,53 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m)
> }
>
> /**
> - * Detach an indirect packet mbuf.
> + * @internal used by rte_pktmbuf_detach().
> + *
> + * Decrement the reference counter of the external buffer. When the
> + * reference counter becomes 0, the buffer is freed by pre-registered
> + * callback.
> + */
> +static inline void
> +__rte_pktmbuf_free_extbuf(struct rte_mbuf *m)
> +{
> + struct rte_mbuf_ext_shared_info *shinfo;
> +
> + RTE_ASSERT(RTE_MBUF_HAS_EXTBUF(m));
> +
> + shinfo = rte_mbuf_ext_shinfo(m);
> +
> + if (rte_extbuf_refcnt_update(shinfo, -1) == 0)
> + shinfo->free_cb(m->buf_addr, shinfo->fcb_opaque);
I understand the reason but extra function call for each external mbuf - seems quite expensive.
Wonder is it possible to group them somehow and amortize the cost?
> +}
> +
> +/**
> + * @internal used by rte_pktmbuf_detach().
> *
> + * Decrement the direct mbuf's reference counter. When the reference
> + * counter becomes 0, the direct mbuf is freed.
> + */
> +static inline void
> +__rte_pktmbuf_free_direct(struct rte_mbuf *m)
> +{
> + struct rte_mbuf *md = rte_mbuf_from_indirect(m);
> +
> + RTE_ASSERT(RTE_MBUF_INDIRECT(m));
> +
> + if (rte_mbuf_refcnt_update(md, -1) == 0) {
> + md->next = NULL;
> + md->nb_segs = 1;
> + rte_mbuf_refcnt_set(md, 1);
> + rte_mbuf_raw_free(md);
> + }
> +}
> +
> +/**
> + * Detach a packet mbuf from external buffer or direct buffer.
> + *
> + * - decrement refcnt and free the external/direct buffer if refcnt
> + * becomes zero.
> * - restore original mbuf address and length values.
> * - reset pktmbuf data and data_len to their default values.
> - * - decrement the direct mbuf's reference counter. When the
> - * reference counter becomes 0, the direct mbuf is freed.
> *
> * All other fields of the given packet mbuf will be left intact.
> *
> @@ -1264,10 +1487,14 @@ static inline void rte_pktmbuf_attach(struct rte_mbuf *mi, struct rte_mbuf *m)
> */
> static inline void rte_pktmbuf_detach(struct rte_mbuf *m)
> {
> - struct rte_mbuf *md = rte_mbuf_from_indirect(m);
> struct rte_mempool *mp = m->pool;
> uint32_t mbuf_size, buf_len, priv_size;
>
> + if (RTE_MBUF_HAS_EXTBUF(m))
> + __rte_pktmbuf_free_extbuf(m);
> + else
> + __rte_pktmbuf_free_direct(m);
> +
> priv_size = rte_pktmbuf_priv_size(mp);
> mbuf_size = sizeof(struct rte_mbuf) + priv_size;
> buf_len = rte_pktmbuf_data_room_size(mp);
> @@ -1279,13 +1506,6 @@ static inline void rte_pktmbuf_detach(struct rte_mbuf *m)
> rte_pktmbuf_reset_headroom(m);
> m->data_len = 0;
> m->ol_flags = 0;
> -
> - if (rte_mbuf_refcnt_update(md, -1) == 0) {
> - md->next = NULL;
> - md->nb_segs = 1;
> - rte_mbuf_refcnt_set(md, 1);
> - rte_mbuf_raw_free(md);
> - }
> }
>
> /**
> @@ -1309,7 +1529,7 @@ rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
>
> if (likely(rte_mbuf_refcnt_read(m) == 1)) {
>
> - if (RTE_MBUF_INDIRECT(m))
> + if (!RTE_MBUF_DIRECT(m))
> rte_pktmbuf_detach(m);
>
> if (m->next != NULL) {
> @@ -1321,7 +1541,7 @@ rte_pktmbuf_prefree_seg(struct rte_mbuf *m)
>
> } else if (__rte_mbuf_refcnt_update(m, -1) == 0) {
>
> - if (RTE_MBUF_INDIRECT(m))
> + if (!RTE_MBUF_DIRECT(m))
> rte_pktmbuf_detach(m);
>
> if (m->next != NULL) {
> --
> 2.11.0
More information about the dev
mailing list