[dpdk-dev] [RFC PATCH 3/6] mempool/bucket: implement bucket mempool manager

Olivier MATZ olivier.matz at 6wind.com
Thu Dec 14 14:38:22 CET 2017


On Fri, Nov 24, 2017 at 04:06:28PM +0000, Andrew Rybchenko wrote:
> From: "Artem V. Andreev" <Artem.Andreev at oktetlabs.ru>
> 
> The manager provides a way to allocate a physically and virtually
> contiguous set of objects.
> 
> Note: due to the way objects are organized in the bucket manager,
> the get_avail_count may return fewer objects than were enqueued.
> That breaks the expectation of mempool and mempool_perf tests.

To me, this can be problematic. The driver should respect the
API, or it will trigger hard-to-debug issues in applications. Can't
this be fixed one way or another?
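
For context, the invariant these tests rely on is roughly the following
(a minimal sketch, not the actual test code, and assuming the pool is
otherwise full and idle):

#include <rte_mempool.h>

/* Take n objects, give them all back, and check that the driver reports
 * every object as available again. With the bucket manager, objects
 * sitting in partially filled buckets are not reported, so a check of
 * this kind fails. */
static int
check_avail_after_put(struct rte_mempool *mp, void **objs, unsigned int n)
{
        if (rte_mempool_get_bulk(mp, objs, n) != 0)
                return -1;
        rte_mempool_put_bulk(mp, objs, n);

        return (rte_mempool_avail_count(mp) == mp->size) ? 0 : -1;
}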

[...]

> --- a/config/common_base
> +++ b/config/common_base
> @@ -608,6 +608,8 @@ CONFIG_RTE_LIBRTE_MEMPOOL_DEBUG=n
>  #
>  # Compile Mempool drivers
>  #
> +CONFIG_RTE_DRIVER_MEMPOOL_BUCKET=y
> +CONFIG_RTE_DRIVER_MEMPOOL_BUCKET_SIZE_KB=32
>  CONFIG_RTE_DRIVER_MEMPOOL_RING=y
>  CONFIG_RTE_DRIVER_MEMPOOL_STACK=y
>  

Why 32KB?
Why not more, or less?
Can it be a runtime parameter?
I guess it won't work with objects that are too large.
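
For instance (a rough sketch, assuming BUCKET_MEM_SIZE is derived
directly from this option, as the patch seems to do), the number of
objects per bucket drops quickly with the object size:

/* 32 KB bucket, as per the default config option */
#define BUCKET_MEM_SIZE (32 * 1024)

/* total_elt_sz = header + elt + trailer: a ~2 KB mbuf gives roughly 15
 * objects per bucket, a 9 KB jumbo mbuf only 3, and anything larger
 * than 32 KB gives 0, i.e. cannot be handled at all */
static unsigned int
objs_per_bucket(unsigned int total_elt_sz)
{
        return BUCKET_MEM_SIZE / total_elt_sz;
}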

[...]

> +struct bucket_data {
> +	unsigned int header_size;
> +	unsigned int chunk_size;
> +	unsigned int bucket_size;
> +	uintptr_t bucket_page_mask;
> +	struct rte_ring *shared_bucket_ring;
> +	struct bucket_stack *buckets[RTE_MAX_LCORE];
> +	/*
> +	 * Multi-producer single-consumer ring to hold objects that are
> +	 * returned to the mempool at a different lcore than initially
> +	 * dequeued
> +	 */
> +	struct rte_ring *adoption_buffer_rings[RTE_MAX_LCORE];
> +	struct rte_ring *shared_orphan_ring;
> +	struct rte_mempool *pool;
> +
> +};

I'm seeing per-core structures. Will it work on non-dataplane cores?
For instance, if a control thread wants to allocate a mbuf?
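
To illustrate the concern (a sketch only; the fallback to the shared
rings is my assumption, not something the patch implements):
rte_lcore_id() returns LCORE_ID_ANY on a non-EAL thread, so it cannot
be used as an index into the per-lcore arrays as it is today.

#include <rte_lcore.h>

static struct bucket_stack *
get_local_stack(struct bucket_data *data)
{
        unsigned int lcore_id = rte_lcore_id();

        /* control threads are not bound to an lcore: indexing
         * data->buckets[] with LCORE_ID_ANY would be out of bounds */
        if (lcore_id == LCORE_ID_ANY)
                return NULL; /* caller would fall back to shared rings */
        return data->buckets[lcore_id];
}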

If possible, these fields should be better documented (or just renamed).
For instance, I suggest chunk_size could be called obj_per_bucket, which
better describes the content of the field.

[...]

> +static int
> +bucket_enqueue_single(struct bucket_data *data, void *obj)
> +{
> +	int rc = 0;
> +	uintptr_t addr = (uintptr_t)obj;
> +	struct bucket_header *hdr;
> +	unsigned int lcore_id = rte_lcore_id();
> +
> +	addr &= data->bucket_page_mask;
> +	hdr = (struct bucket_header *)addr;
> +
> +	if (likely(hdr->lcore_id == lcore_id)) {
> +		if (hdr->fill_cnt < data->bucket_size - 1) {
> +			hdr->fill_cnt++;
> +		} else {
> +			hdr->fill_cnt = 0;
> +			/* Stack is big enough to put all buckets */
> +			bucket_stack_push(data->buckets[lcore_id], hdr);
> +		}
> +	} else if (hdr->lcore_id != LCORE_ID_ANY) {
> +		struct rte_ring *adopt_ring =
> +			data->adoption_buffer_rings[hdr->lcore_id];
> +
> +		rc = rte_ring_enqueue(adopt_ring, obj);
> +		/* Ring is big enough to put all objects */
> +		RTE_ASSERT(rc == 0);
> +	} else if (hdr->fill_cnt < data->bucket_size - 1) {
> +		hdr->fill_cnt++;
> +	} else {
> +		hdr->fill_cnt = 0;
> +		rc = rte_ring_enqueue(data->shared_bucket_ring, hdr);
> +		/* Ring is big enough to put all buckets */
> +		RTE_ASSERT(rc == 0);
> +	}
> +
> +	return rc;
> +}

[...]

> +static int
> +bucket_dequeue_buckets(struct bucket_data *data, void **obj_table,
> +		       unsigned int n_buckets)
> +{
> +	struct bucket_stack *cur_stack = data->buckets[rte_lcore_id()];
> +	unsigned int n_buckets_from_stack = RTE_MIN(n_buckets, cur_stack->top);
> +	void **obj_table_base = obj_table;
> +
> +	n_buckets -= n_buckets_from_stack;
> +	while (n_buckets_from_stack-- > 0) {
> +		void *obj = bucket_stack_pop_unsafe(cur_stack);
> +
> +		obj_table = bucket_fill_obj_table(data, &obj, obj_table,
> +						  data->bucket_size);
> +	}
> +	while (n_buckets-- > 0) {
> +		struct bucket_header *hdr;
> +
> +		if (unlikely(rte_ring_dequeue(data->shared_bucket_ring,
> +					      (void **)&hdr) != 0)) {
> +			/* Return the already-dequeued buffers
> +			 * back to the mempool
> +			 */
> +			bucket_enqueue(data->pool, obj_table_base,
> +				       obj_table - obj_table_base);
> +			rte_errno = ENOBUFS;
> +			return -rte_errno;
> +		}
> +		hdr->lcore_id = rte_lcore_id();
> +		obj_table = bucket_fill_obj_table(data, (void **)&hdr,
> +						  obj_table, data->bucket_size);
> +	}
> +
> +	return 0;
> +}

[...]

> +static int
> +bucket_dequeue(struct rte_mempool *mp, void **obj_table, unsigned int n)
> +{
> +	struct bucket_data *data = mp->pool_data;
> +	unsigned int n_buckets = n / data->bucket_size;
> +	unsigned int n_orphans = n - n_buckets * data->bucket_size;
> +	int rc = 0;
> +
> +	bucket_adopt_orphans(data);
> +
> +	if (unlikely(n_orphans > 0)) {
> +		rc = bucket_dequeue_orphans(data, obj_table +
> +					    (n_buckets * data->bucket_size),
> +					    n_orphans);
> +		if (rc != 0)
> +			return rc;
> +	}
> +
> +	if (likely(n_buckets > 0)) {
> +		rc = bucket_dequeue_buckets(data, obj_table, n_buckets);
> +		if (unlikely(rc != 0) && n_orphans > 0) {
> +			rte_ring_enqueue_bulk(data->shared_orphan_ring,
> +					      obj_table + (n_buckets *
> +							   data->bucket_size),
> +					      n_orphans, NULL);
> +		}
> +	}
> +
> +	return rc;
> +}

If my understanding is correct, at initialization, all full buckets will
go to the data->shared_bucket_ring ring, with lcore_id == ANY (this is
done in register_mem).

(note: I feel 'data' is not an ideal variable name for a struct
bucket_data pointer)

If core 0 allocates all the mbufs and then frees them all, they will be
stored in its per-core stack, with hdr->lcore_id == 0. Is that right?

If so, can core 1 allocate a mbuf after that?
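
To make the scenario concrete (a sketch, assuming 'objs' is large
enough to hold the whole pool):

/* on lcore 0: drain the pool, then give everything back; all buckets
 * end up on lcore 0's per-core stack with hdr->lcore_id == 0 */
static void
scenario_lcore0(struct rte_mempool *mp, void **objs)
{
        if (rte_mempool_get_bulk(mp, objs, mp->size) == 0)
                rte_mempool_put_bulk(mp, objs, mp->size);
}

/* on lcore 1: nothing on this core's stack and, as far as I can see,
 * nothing in shared_bucket_ring either -- does this fail even though
 * the pool is full? */
static int
scenario_lcore1(struct rte_mempool *mp)
{
        void *obj;

        return rte_mempool_get(mp, &obj);
}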


> +static unsigned int
> +bucket_get_count(const struct rte_mempool *mp)
> +{
> +	const struct bucket_data *data = mp->pool_data;
> +	const struct bucket_stack *local_bucket_stack =
> +		data->buckets[rte_lcore_id()];
> +
> +	return data->bucket_size * local_bucket_stack->top +
> +		data->bucket_size * rte_ring_count(data->shared_bucket_ring) +
> +		rte_ring_count(data->shared_orphan_ring);
> +}

It looks like get_count only relies on the current core's stack usage
and ignores the other cores' stacks.
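
A possible direction (only a sketch of what I mean, not a tested fix)
would be to sum over all per-lcore stacks and adoption rings. Even
then, objects sitting in partially filled buckets (hdr->fill_cnt)
would still be missed unless they are tracked separately:

static unsigned int
bucket_get_count_all_cores(const struct rte_mempool *mp)
{
        const struct bucket_data *data = mp->pool_data;
        unsigned int count;
        unsigned int i;

        count = data->bucket_size * rte_ring_count(data->shared_bucket_ring) +
                rte_ring_count(data->shared_orphan_ring);

        for (i = 0; i < RTE_MAX_LCORE; i++) {
                if (data->buckets[i] != NULL)
                        count += data->bucket_size * data->buckets[i]->top;
                if (data->adoption_buffer_rings[i] != NULL)
                        count += rte_ring_count(data->adoption_buffer_rings[i]);
        }

        return count;
}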

[...]

> +static int
> +bucket_register_memory_area(__rte_unused const struct rte_mempool *mp,
> +			    char *vaddr, __rte_unused phys_addr_t paddr,
> +			    size_t len)
> +{
> +	/* mp->pool_data may be still uninitialized at this point */
> +	unsigned int chunk_size = mp->header_size + mp->elt_size +
> +		mp->trailer_size;
> +	unsigned int bucket_mem_size =
> +		(BUCKET_MEM_SIZE / chunk_size) * chunk_size;
> +	unsigned int bucket_page_sz = rte_align32pow2(bucket_mem_size);
> +	uintptr_t align;
> +	char *iter;
> +
> +	align = RTE_PTR_ALIGN_CEIL(vaddr, bucket_page_sz) - vaddr;
> +
> +	for (iter = vaddr + align; iter < vaddr + len; iter += bucket_page_sz) {
> +		/* librte_mempool uses the header part for its own bookkeeping,
> +		 * but the librte_mempool's object header is adjacent to the
> +		 * data; it is small enough and the header is guaranteed to be
> +		 * at least CACHE_LINE_SIZE (i.e. 64) bytes, so we do have
> +		 * plenty of space at the start of the header. So the layout
> +		 * looks like this:
> +		 * [bucket_header] ... unused ... [rte_mempool_objhdr] [data...]
> +		 */

This is not always true.
If a user creates a mempool with the NO_CACHE_ALIGN flag, the header
will be small, without padding.
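
A defensive check along these lines (again only a sketch) would at
least refuse such a layout instead of silently overwriting the object
header:

#include <errno.h>
#include <rte_mempool.h>

static int
bucket_check_header_room(const struct rte_mempool *mp)
{
        /* the bucket header must fit in the padding that precedes the
         * rte_mempool object header; with MEMPOOL_F_NO_CACHE_ALIGN
         * there may be no padding at all */
        if (mp->header_size < sizeof(struct rte_mempool_objhdr) +
                        sizeof(struct bucket_header))
                return -EINVAL;
        return 0;
}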



