[v3,04/11] baseband/acc100: add queue configuration

Message ID 1597796731-57841-5-git-send-email-nicolas.chautru@intel.com (mailing list archive)
State Superseded, archived
Delegated to: akhil goyal
Headers
Series bbdev PMD ACC100

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Chautru, Nicolas Aug. 19, 2020, 12:25 a.m. UTC
  Adding function to create and configure queues for
the device. Still no capability.

Signed-off-by: Nicolas Chautru <nicolas.chautru@intel.com>
---
 drivers/baseband/acc100/rte_acc100_pmd.c | 420 ++++++++++++++++++++++++++++++-
 drivers/baseband/acc100/rte_acc100_pmd.h |  45 ++++
 2 files changed, 464 insertions(+), 1 deletion(-)
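
For reference, these driver ops are reached through the public bbdev API; a
rough usage sketch of the normal flow (not part of this patch; dev_id,
num_queues and queue_id are placeholders, error handling omitted):

	struct rte_bbdev_queue_conf qconf = {
		.socket = rte_socket_id(),
		.queue_size = 32,
		.priority = 0,
		.op_type = RTE_BBDEV_OP_LDPC_DEC,
	};

	/* invokes the driver's setup_queues op (sw ring allocation) */
	rte_bbdev_setup_queues(dev_id, num_queues, rte_socket_id());
	/* invokes the driver's queue_setup op for one queue */
	rte_bbdev_queue_configure(dev_id, queue_id, &qconf);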
  

Comments

Xu, Rosen Aug. 29, 2020, 10:39 a.m. UTC | #1
Hi,

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Nicolas Chautru
> Sent: Wednesday, August 19, 2020 8:25
> To: dev@dpdk.org; akhil.goyal@nxp.com
> Cc: Richardson, Bruce <bruce.richardson@intel.com>; Chautru, Nicolas
> <nicolas.chautru@intel.com>
> Subject: [dpdk-dev] [PATCH v3 04/11] baseband/acc100: add queue
> configuration
> 
> Adding function to create and configure queues for the device. Still no
> capability.
> 
> Signed-off-by: Nicolas Chautru <nicolas.chautru@intel.com>
> ---
>  drivers/baseband/acc100/rte_acc100_pmd.c | 420
> ++++++++++++++++++++++++++++++-
> drivers/baseband/acc100/rte_acc100_pmd.h |  45 ++++
>  2 files changed, 464 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/baseband/acc100/rte_acc100_pmd.c
> b/drivers/baseband/acc100/rte_acc100_pmd.c
> index 7807a30..7a21c57 100644
> --- a/drivers/baseband/acc100/rte_acc100_pmd.c
> +++ b/drivers/baseband/acc100/rte_acc100_pmd.c
> @@ -26,6 +26,22 @@
>  RTE_LOG_REGISTER(acc100_logtype, pmd.bb.acc100, NOTICE);  #endif
> 
> +/* Write to MMIO register address */
> +static inline void
> +mmio_write(void *addr, uint32_t value)
> +{
> +	*((volatile uint32_t *)(addr)) = rte_cpu_to_le_32(value); }
> +
> +/* Write a register of a ACC100 device */ static inline void
> +acc100_reg_write(struct acc100_device *d, uint32_t offset, uint32_t
> +payload) {
> +	void *reg_addr = RTE_PTR_ADD(d->mmio_base, offset);
> +	mmio_write(reg_addr, payload);
> +	usleep(1000);
> +}
> +
>  /* Read a register of a ACC100 device */  static inline uint32_t
> acc100_reg_read(struct acc100_device *d, uint32_t offset) @@ -36,6 +52,22
> @@
>  	return rte_le_to_cpu_32(ret);
>  }
> 
> +/* Basic Implementation of Log2 for exact 2^N */ static inline uint32_t
> +log2_basic(uint32_t value) {
> +	return (value == 0) ? 0 : __builtin_ctz(value); }
> +
> +/* Calculate memory alignment offset assuming alignment is 2^N */
> +static inline uint32_t calc_mem_alignment_offset(void
> +*unaligned_virt_mem, uint32_t alignment) {
> +	rte_iova_t unaligned_phy_mem =
> rte_malloc_virt2iova(unaligned_virt_mem);
> +	return (uint32_t)(alignment -
> +			(unaligned_phy_mem & (alignment-1))); }
> +
>  /* Calculate the offset of the enqueue register */  static inline uint32_t
> queue_offset(bool pf_device, uint8_t vf_id, uint8_t qgrp_id, uint16_t aq_id)
> @@ -204,10 +236,393 @@
>  			acc100_conf->q_dl_5g.aq_depth_log2);
>  }
> 
> +static void
> +free_base_addresses(void **base_addrs, int size) {
> +	int i;
> +	for (i = 0; i < size; i++)
> +		rte_free(base_addrs[i]);
> +}
> +
> +static inline uint32_t
> +get_desc_len(void)
> +{
> +	return sizeof(union acc100_dma_desc);
> +}
> +
> +/* Allocate the 2 * 64MB block for the sw rings */ static int
> +alloc_2x64mb_sw_rings_mem(struct rte_bbdev *dev, struct acc100_device
> *d,
> +		int socket)
> +{
> +	uint32_t sw_ring_size = ACC100_SIZE_64MBYTE;
> +	d->sw_rings_base = rte_zmalloc_socket(dev->device->driver-
> >name,
> +			2 * sw_ring_size, RTE_CACHE_LINE_SIZE, socket);
> +	if (d->sw_rings_base == NULL) {
> +		rte_bbdev_log(ERR, "Failed to allocate memory for %s:%u",
> +				dev->device->driver->name,
> +				dev->data->dev_id);
> +		return -ENOMEM;
> +	}
> +	memset(d->sw_rings_base, 0, ACC100_SIZE_64MBYTE);
> +	uint32_t next_64mb_align_offset = calc_mem_alignment_offset(
> +			d->sw_rings_base, ACC100_SIZE_64MBYTE);
> +	d->sw_rings = RTE_PTR_ADD(d->sw_rings_base,
> next_64mb_align_offset);
> +	d->sw_rings_phys = rte_malloc_virt2iova(d->sw_rings_base) +
> +			next_64mb_align_offset;
> +	d->sw_ring_size = MAX_QUEUE_DEPTH * get_desc_len();
> +	d->sw_ring_max_depth = d->sw_ring_size / get_desc_len();
> +
> +	return 0;
> +}

Why not use a common memory allocation function instead of a special function for each memory size?

> +/* Attempt to allocate minimised memory space for sw rings */ static
> +void alloc_sw_rings_min_mem(struct rte_bbdev *dev, struct
> acc100_device
> +*d,
> +		uint16_t num_queues, int socket)
> +{
> +	rte_iova_t sw_rings_base_phy, next_64mb_align_addr_phy;
> +	uint32_t next_64mb_align_offset;
> +	rte_iova_t sw_ring_phys_end_addr;
> +	void *base_addrs[SW_RING_MEM_ALLOC_ATTEMPTS];
> +	void *sw_rings_base;
> +	int i = 0;
> +	uint32_t q_sw_ring_size = MAX_QUEUE_DEPTH * get_desc_len();
> +	uint32_t dev_sw_ring_size = q_sw_ring_size * num_queues;
> +
> +	/* Find an aligned block of memory to store sw rings */
> +	while (i < SW_RING_MEM_ALLOC_ATTEMPTS) {
> +		/*
> +		 * sw_ring allocated memory is guaranteed to be aligned to
> +		 * q_sw_ring_size at the condition that the requested size is
> +		 * less than the page size
> +		 */
> +		sw_rings_base = rte_zmalloc_socket(
> +				dev->device->driver->name,
> +				dev_sw_ring_size, q_sw_ring_size, socket);
> +
> +		if (sw_rings_base == NULL) {
> +			rte_bbdev_log(ERR,
> +					"Failed to allocate memory
> for %s:%u",
> +					dev->device->driver->name,
> +					dev->data->dev_id);
> +			break;
> +		}
> +
> +		sw_rings_base_phy = rte_malloc_virt2iova(sw_rings_base);
> +		next_64mb_align_offset = calc_mem_alignment_offset(
> +				sw_rings_base, ACC100_SIZE_64MBYTE);
> +		next_64mb_align_addr_phy = sw_rings_base_phy +
> +				next_64mb_align_offset;
> +		sw_ring_phys_end_addr = sw_rings_base_phy +
> dev_sw_ring_size;
> +
> +		/* Check if the end of the sw ring memory block is before the
> +		 * start of next 64MB aligned mem address
> +		 */
> +		if (sw_ring_phys_end_addr < next_64mb_align_addr_phy) {
> +			d->sw_rings_phys = sw_rings_base_phy;
> +			d->sw_rings = sw_rings_base;
> +			d->sw_rings_base = sw_rings_base;
> +			d->sw_ring_size = q_sw_ring_size;
> +			d->sw_ring_max_depth = MAX_QUEUE_DEPTH;
> +			break;
> +		}
> +		/* Store the address of the unaligned mem block */
> +		base_addrs[i] = sw_rings_base;
> +		i++;
> +	}
> +
> +	/* Free all unaligned blocks of mem allocated in the loop */
> +	free_base_addresses(base_addrs, i);
> +}

It's strange to first allocate memory and then free it without any operations on this memory.

> +
> +/* Allocate 64MB memory used for all software rings */ static int
> +acc100_setup_queues(struct rte_bbdev *dev, uint16_t num_queues, int
> +socket_id) {
> +	uint32_t phys_low, phys_high, payload;
> +	struct acc100_device *d = dev->data->dev_private;
> +	const struct acc100_registry_addr *reg_addr;
> +
> +	if (d->pf_device && !d->acc100_conf.pf_mode_en) {
> +		rte_bbdev_log(NOTICE,
> +				"%s has PF mode disabled. This PF can't be
> used.",
> +				dev->data->name);
> +		return -ENODEV;
> +	}
> +
> +	alloc_sw_rings_min_mem(dev, d, num_queues, socket_id);
> +
> +	/* If minimal memory space approach failed, then allocate
> +	 * the 2 * 64MB block for the sw rings
> +	 */
> +	if (d->sw_rings == NULL)
> +		alloc_2x64mb_sw_rings_mem(dev, d, socket_id);
> +
> +	/* Configure ACC100 with the base address for DMA descriptor rings
> +	 * Same descriptor rings used for UL and DL DMA Engines
> +	 * Note : Assuming only VF0 bundle is used for PF mode
> +	 */
> +	phys_high = (uint32_t)(d->sw_rings_phys >> 32);
> +	phys_low  = (uint32_t)(d->sw_rings_phys &
> ~(ACC100_SIZE_64MBYTE-1));
> +
> +	/* Choose correct registry addresses for the device type */
> +	if (d->pf_device)
> +		reg_addr = &pf_reg_addr;
> +	else
> +		reg_addr = &vf_reg_addr;
> +
> +	/* Read the populated cfg from ACC100 registers */
> +	fetch_acc100_config(dev);
> +
> +	/* Mark as configured properly */
> +	d->configured = true;
> +
> +	/* Release AXI from PF */
> +	if (d->pf_device)
> +		acc100_reg_write(d, HWPfDmaAxiControl, 1);
> +
> +	acc100_reg_write(d, reg_addr->dma_ring_ul5g_hi, phys_high);
> +	acc100_reg_write(d, reg_addr->dma_ring_ul5g_lo, phys_low);
> +	acc100_reg_write(d, reg_addr->dma_ring_dl5g_hi, phys_high);
> +	acc100_reg_write(d, reg_addr->dma_ring_dl5g_lo, phys_low);
> +	acc100_reg_write(d, reg_addr->dma_ring_ul4g_hi, phys_high);
> +	acc100_reg_write(d, reg_addr->dma_ring_ul4g_lo, phys_low);
> +	acc100_reg_write(d, reg_addr->dma_ring_dl4g_hi, phys_high);
> +	acc100_reg_write(d, reg_addr->dma_ring_dl4g_lo, phys_low);
> +
> +	/*
> +	 * Configure Ring Size to the max queue ring size
> +	 * (used for wrapping purpose)
> +	 */
> +	payload = log2_basic(d->sw_ring_size / 64);
> +	acc100_reg_write(d, reg_addr->ring_size, payload);
> +
> +	/* Configure tail pointer for use when SDONE enabled */
> +	d->tail_ptrs = rte_zmalloc_socket(
> +			dev->device->driver->name,
> +			ACC100_NUM_QGRPS * ACC100_NUM_AQS *
> sizeof(uint32_t),
> +			RTE_CACHE_LINE_SIZE, socket_id);
> +	if (d->tail_ptrs == NULL) {
> +		rte_bbdev_log(ERR, "Failed to allocate tail ptr for %s:%u",
> +				dev->device->driver->name,
> +				dev->data->dev_id);
> +		rte_free(d->sw_rings);
> +		return -ENOMEM;
> +	}
> +	d->tail_ptr_phys = rte_malloc_virt2iova(d->tail_ptrs);
> +
> +	phys_high = (uint32_t)(d->tail_ptr_phys >> 32);
> +	phys_low  = (uint32_t)(d->tail_ptr_phys);
> +	acc100_reg_write(d, reg_addr->tail_ptrs_ul5g_hi, phys_high);
> +	acc100_reg_write(d, reg_addr->tail_ptrs_ul5g_lo, phys_low);
> +	acc100_reg_write(d, reg_addr->tail_ptrs_dl5g_hi, phys_high);
> +	acc100_reg_write(d, reg_addr->tail_ptrs_dl5g_lo, phys_low);
> +	acc100_reg_write(d, reg_addr->tail_ptrs_ul4g_hi, phys_high);
> +	acc100_reg_write(d, reg_addr->tail_ptrs_ul4g_lo, phys_low);
> +	acc100_reg_write(d, reg_addr->tail_ptrs_dl4g_hi, phys_high);
> +	acc100_reg_write(d, reg_addr->tail_ptrs_dl4g_lo, phys_low);
> +
> +	d->harq_layout = rte_zmalloc_socket("HARQ Layout",
> +			ACC100_HARQ_LAYOUT * sizeof(*d->harq_layout),
> +			RTE_CACHE_LINE_SIZE, dev->data->socket_id);
> +
> +	rte_bbdev_log_debug(
> +			"ACC100 (%s) configured  sw_rings = %p,
> sw_rings_phys = %#"
> +			PRIx64, dev->data->name, d->sw_rings, d-
> >sw_rings_phys);
> +
> +	return 0;
> +}
> +
>  /* Free 64MB memory used for software rings */  static int -
> acc100_dev_close(struct rte_bbdev *dev  __rte_unused)
> +acc100_dev_close(struct rte_bbdev *dev)
>  {
> +	struct acc100_device *d = dev->data->dev_private;
> +	if (d->sw_rings_base != NULL) {
> +		rte_free(d->tail_ptrs);
> +		rte_free(d->sw_rings_base);
> +		d->sw_rings_base = NULL;
> +	}
> +	usleep(1000);
> +	return 0;
> +}
> +
> +
> +/**
> + * Report a ACC100 queue index which is free
> + * Return 0 to 16k for a valid queue_idx or -1 when no queue is
> +available
> + * Note : Only supporting VF0 Bundle for PF mode  */ static int
> +acc100_find_free_queue_idx(struct rte_bbdev *dev,
> +		const struct rte_bbdev_queue_conf *conf) {
> +	struct acc100_device *d = dev->data->dev_private;
> +	int op_2_acc[5] = {0, UL_4G, DL_4G, UL_5G, DL_5G};
> +	int acc = op_2_acc[conf->op_type];
> +	struct rte_q_topology_t *qtop = NULL;
> +	qtopFromAcc(&qtop, acc, &(d->acc100_conf));
> +	if (qtop == NULL)
> +		return -1;
> +	/* Identify matching QGroup Index which are sorted in priority order
> */
> +	uint16_t group_idx = qtop->first_qgroup_index;
> +	group_idx += conf->priority;
> +	if (group_idx >= ACC100_NUM_QGRPS ||
> +			conf->priority >= qtop->num_qgroups) {
> +		rte_bbdev_log(INFO, "Invalid Priority on %s, priority %u",
> +				dev->data->name, conf->priority);
> +		return -1;
> +	}
> +	/* Find a free AQ_idx  */
> +	uint16_t aq_idx;
> +	for (aq_idx = 0; aq_idx < qtop->num_aqs_per_groups; aq_idx++) {
> +		if (((d->q_assigned_bit_map[group_idx] >> aq_idx) & 0x1)
> == 0) {
> +			/* Mark the Queue as assigned */
> +			d->q_assigned_bit_map[group_idx] |= (1 << aq_idx);
> +			/* Report the AQ Index */
> +			return (group_idx << GRP_ID_SHIFT) + aq_idx;
> +		}
> +	}
> +	rte_bbdev_log(INFO, "Failed to find free queue on %s, priority %u",
> +			dev->data->name, conf->priority);
> +	return -1;
> +}
> +
> +/* Setup ACC100 queue */
> +static int
> +acc100_queue_setup(struct rte_bbdev *dev, uint16_t queue_id,
> +		const struct rte_bbdev_queue_conf *conf) {
> +	struct acc100_device *d = dev->data->dev_private;
> +	struct acc100_queue *q;
> +	int16_t q_idx;
> +
> +	/* Allocate the queue data structure. */
> +	q = rte_zmalloc_socket(dev->device->driver->name, sizeof(*q),
> +			RTE_CACHE_LINE_SIZE, conf->socket);
> +	if (q == NULL) {
> +		rte_bbdev_log(ERR, "Failed to allocate queue memory");
> +		return -ENOMEM;
> +	}
> +
> +	q->d = d;
> +	q->ring_addr = RTE_PTR_ADD(d->sw_rings, (d->sw_ring_size *
> queue_id));
> +	q->ring_addr_phys = d->sw_rings_phys + (d->sw_ring_size *
> queue_id);
> +
> +	/* Prepare the Ring with default descriptor format */
> +	union acc100_dma_desc *desc = NULL;
> +	unsigned int desc_idx, b_idx;
> +	int fcw_len = (conf->op_type == RTE_BBDEV_OP_LDPC_ENC ?
> +		ACC100_FCW_LE_BLEN : (conf->op_type ==
> RTE_BBDEV_OP_TURBO_DEC ?
> +		ACC100_FCW_TD_BLEN : ACC100_FCW_LD_BLEN));
> +
> +	for (desc_idx = 0; desc_idx < d->sw_ring_max_depth; desc_idx++) {
> +		desc = q->ring_addr + desc_idx;
> +		desc->req.word0 = ACC100_DMA_DESC_TYPE;
> +		desc->req.word1 = 0; /**< Timestamp */
> +		desc->req.word2 = 0;
> +		desc->req.word3 = 0;
> +		uint64_t fcw_offset = (desc_idx << 8) +
> ACC100_DESC_FCW_OFFSET;
> +		desc->req.data_ptrs[0].address = q->ring_addr_phys +
> fcw_offset;
> +		desc->req.data_ptrs[0].blen = fcw_len;
> +		desc->req.data_ptrs[0].blkid = ACC100_DMA_BLKID_FCW;
> +		desc->req.data_ptrs[0].last = 0;
> +		desc->req.data_ptrs[0].dma_ext = 0;
> +		for (b_idx = 1; b_idx < ACC100_DMA_MAX_NUM_POINTERS
> - 1;
> +				b_idx++) {
> +			desc->req.data_ptrs[b_idx].blkid =
> ACC100_DMA_BLKID_IN;
> +			desc->req.data_ptrs[b_idx].last = 1;
> +			desc->req.data_ptrs[b_idx].dma_ext = 0;
> +			b_idx++;
> +			desc->req.data_ptrs[b_idx].blkid =
> +					ACC100_DMA_BLKID_OUT_ENC;
> +			desc->req.data_ptrs[b_idx].last = 1;
> +			desc->req.data_ptrs[b_idx].dma_ext = 0;
> +		}
> +		/* Preset some fields of LDPC FCW */
> +		desc->req.fcw_ld.FCWversion = ACC100_FCW_VER;
> +		desc->req.fcw_ld.gain_i = 1;
> +		desc->req.fcw_ld.gain_h = 1;
> +	}
> +
> +	q->lb_in = rte_zmalloc_socket(dev->device->driver->name,
> +			RTE_CACHE_LINE_SIZE,
> +			RTE_CACHE_LINE_SIZE, conf->socket);
> +	if (q->lb_in == NULL) {
> +		rte_bbdev_log(ERR, "Failed to allocate lb_in memory");
> +		return -ENOMEM;
> +	}
> +	q->lb_in_addr_phys = rte_malloc_virt2iova(q->lb_in);
> +	q->lb_out = rte_zmalloc_socket(dev->device->driver->name,
> +			RTE_CACHE_LINE_SIZE,
> +			RTE_CACHE_LINE_SIZE, conf->socket);
> +	if (q->lb_out == NULL) {
> +		rte_bbdev_log(ERR, "Failed to allocate lb_out memory");
> +		return -ENOMEM;
> +	}
> +	q->lb_out_addr_phys = rte_malloc_virt2iova(q->lb_out);
> +
> +	/*
> +	 * Software queue ring wraps synchronously with the HW when it
> reaches
> +	 * the boundary of the maximum allocated queue size, no matter
> what the
> +	 * sw queue size is. This wrapping is guarded by setting the
> wrap_mask
> +	 * to represent the maximum queue size as allocated at the time
> when
> +	 * the device has been setup (in configure()).
> +	 *
> +	 * The queue depth is set to the queue size value (conf-
> >queue_size).
> +	 * This limits the occupancy of the queue at any point of time, so that
> +	 * the queue does not get swamped with enqueue requests.
> +	 */
> +	q->sw_ring_depth = conf->queue_size;
> +	q->sw_ring_wrap_mask = d->sw_ring_max_depth - 1;
> +
> +	q->op_type = conf->op_type;
> +
> +	q_idx = acc100_find_free_queue_idx(dev, conf);
> +	if (q_idx == -1) {
> +		rte_free(q);
> +		return -1;
> +	}
> +
> +	q->qgrp_id = (q_idx >> GRP_ID_SHIFT) & 0xF;
> +	q->vf_id = (q_idx >> VF_ID_SHIFT)  & 0x3F;
> +	q->aq_id = q_idx & 0xF;
> +	q->aq_depth = (conf->op_type ==  RTE_BBDEV_OP_TURBO_DEC) ?
> +			(1 << d->acc100_conf.q_ul_4g.aq_depth_log2) :
> +			(1 << d->acc100_conf.q_dl_4g.aq_depth_log2);
> +
> +	q->mmio_reg_enqueue = RTE_PTR_ADD(d->mmio_base,
> +			queue_offset(d->pf_device,
> +					q->vf_id, q->qgrp_id, q->aq_id));
> +
> +	rte_bbdev_log_debug(
> +			"Setup dev%u q%u: qgrp_id=%u, vf_id=%u,
> aq_id=%u, aq_depth=%u, mmio_reg_enqueue=%p",
> +			dev->data->dev_id, queue_id, q->qgrp_id, q->vf_id,
> +			q->aq_id, q->aq_depth, q->mmio_reg_enqueue);
> +
> +	dev->data->queues[queue_id].queue_private = q;
> +	return 0;
> +}
> +
> +/* Release ACC100 queue */
> +static int
> +acc100_queue_release(struct rte_bbdev *dev, uint16_t q_id) {
> +	struct acc100_device *d = dev->data->dev_private;
> +	struct acc100_queue *q = dev->data->queues[q_id].queue_private;
> +
> +	if (q != NULL) {
> +		/* Mark the Queue as un-assigned */
> +		d->q_assigned_bit_map[q->qgrp_id] &= (0xFFFFFFFF -
> +				(1 << q->aq_id));
> +		rte_free(q->lb_in);
> +		rte_free(q->lb_out);
> +		rte_free(q);
> +		dev->data->queues[q_id].queue_private = NULL;
> +	}
> +
>  	return 0;
>  }
> 
> @@ -258,8 +673,11 @@
>  }
> 
>  static const struct rte_bbdev_ops acc100_bbdev_ops = {
> +	.setup_queues = acc100_setup_queues,
>  	.close = acc100_dev_close,
>  	.info_get = acc100_dev_info_get,
> +	.queue_setup = acc100_queue_setup,
> +	.queue_release = acc100_queue_release,
>  };
> 
>  /* ACC100 PCI PF address map */
> diff --git a/drivers/baseband/acc100/rte_acc100_pmd.h
> b/drivers/baseband/acc100/rte_acc100_pmd.h
> index 662e2c8..0e2b79c 100644
> --- a/drivers/baseband/acc100/rte_acc100_pmd.h
> +++ b/drivers/baseband/acc100/rte_acc100_pmd.h
> @@ -518,11 +518,56 @@ struct acc100_registry_addr {
>  	.ddr_range = HWVfDmaDdrBaseRangeRoVf,
>  };
> 
> +/* Structure associated with each queue. */ struct __rte_cache_aligned
> +acc100_queue {
> +	union acc100_dma_desc *ring_addr;  /* Virtual address of sw ring */
> +	rte_iova_t ring_addr_phys;  /* Physical address of software ring */
> +	uint32_t sw_ring_head;  /* software ring head */
> +	uint32_t sw_ring_tail;  /* software ring tail */
> +	/* software ring size (descriptors, not bytes) */
> +	uint32_t sw_ring_depth;
> +	/* mask used to wrap enqueued descriptors on the sw ring */
> +	uint32_t sw_ring_wrap_mask;
> +	/* MMIO register used to enqueue descriptors */
> +	void *mmio_reg_enqueue;
> +	uint8_t vf_id;  /* VF ID (max = 63) */
> +	uint8_t qgrp_id;  /* Queue Group ID */
> +	uint16_t aq_id;  /* Atomic Queue ID */
> +	uint16_t aq_depth;  /* Depth of atomic queue */
> +	uint32_t aq_enqueued;  /* Count how many "batches" have been
> enqueued */
> +	uint32_t aq_dequeued;  /* Count how many "batches" have been
> dequeued */
> +	uint32_t irq_enable;  /* Enable ops dequeue interrupts if set to 1 */
> +	struct rte_mempool *fcw_mempool;  /* FCW mempool */
> +	enum rte_bbdev_op_type op_type;  /* Type of this Queue: TE or TD
> */
> +	/* Internal Buffers for loopback input */
> +	uint8_t *lb_in;
> +	uint8_t *lb_out;
> +	rte_iova_t lb_in_addr_phys;
> +	rte_iova_t lb_out_addr_phys;
> +	struct acc100_device *d;
> +};
> +
>  /* Private data structure for each ACC100 device */  struct acc100_device {
>  	void *mmio_base;  /**< Base address of MMIO registers (BAR0) */
> +	void *sw_rings_base;  /* Base addr of un-aligned memory for sw
> rings */
> +	void *sw_rings;  /* 64MBs of 64MB aligned memory for sw rings */
> +	rte_iova_t sw_rings_phys;  /* Physical address of sw_rings */
> +	/* Virtual address of the info memory routed to the this function
> under
> +	 * operation, whether it is PF or VF.
> +	 */
> +	union acc100_harq_layout_data *harq_layout;
> +	uint32_t sw_ring_size;
>  	uint32_t ddr_size; /* Size in kB */
> +	uint32_t *tail_ptrs; /* Base address of response tail pointer buffer */
> +	rte_iova_t tail_ptr_phys; /* Physical address of tail pointers */
> +	/* Max number of entries available for each queue in device,
> depending
> +	 * on how many queues are enabled with configure()
> +	 */
> +	uint32_t sw_ring_max_depth;
>  	struct acc100_conf acc100_conf; /* ACC100 Initial configuration */
> +	/* Bitmap capturing which Queues have already been assigned */
> +	uint16_t q_assigned_bit_map[ACC100_NUM_QGRPS];
>  	bool pf_device; /**< True if this is a PF ACC100 device */
>  	bool configured; /**< True if this ACC100 device is configured */  };
> --
> 1.8.3.1
  
Chautru, Nicolas Aug. 29, 2020, 5:48 p.m. UTC | #2
Hi, 

> From: Xu, Rosen <rosen.xu@intel.com>
> 
> Hi,
> 
> > -----Original Message-----
> > From: dev <dev-bounces@dpdk.org> On Behalf Of Nicolas Chautru
> > Sent: Wednesday, August 19, 2020 8:25
> > To: dev@dpdk.org; akhil.goyal@nxp.com
> > Cc: Richardson, Bruce <bruce.richardson@intel.com>; Chautru, Nicolas
> > <nicolas.chautru@intel.com>
> > Subject: [dpdk-dev] [PATCH v3 04/11] baseband/acc100: add queue
> > configuration
> >
> > Adding function to create and configure queues for the device. Still
> > no capability.
> >
> > Signed-off-by: Nicolas Chautru <nicolas.chautru@intel.com>
> > ---
> >  drivers/baseband/acc100/rte_acc100_pmd.c | 420
> > ++++++++++++++++++++++++++++++-
> > drivers/baseband/acc100/rte_acc100_pmd.h |  45 ++++
> >  2 files changed, 464 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/baseband/acc100/rte_acc100_pmd.c
> > b/drivers/baseband/acc100/rte_acc100_pmd.c
> > index 7807a30..7a21c57 100644
> > --- a/drivers/baseband/acc100/rte_acc100_pmd.c
> > +++ b/drivers/baseband/acc100/rte_acc100_pmd.c
> > @@ -26,6 +26,22 @@
> >  RTE_LOG_REGISTER(acc100_logtype, pmd.bb.acc100, NOTICE);  #endif
> >
> > +/* Write to MMIO register address */
> > +static inline void
> > +mmio_write(void *addr, uint32_t value) {
> > +	*((volatile uint32_t *)(addr)) = rte_cpu_to_le_32(value); }
> > +
> > +/* Write a register of a ACC100 device */ static inline void
> > +acc100_reg_write(struct acc100_device *d, uint32_t offset, uint32_t
> > +payload) {
> > +	void *reg_addr = RTE_PTR_ADD(d->mmio_base, offset);
> > +	mmio_write(reg_addr, payload);
> > +	usleep(1000);
> > +}
> > +
> >  /* Read a register of a ACC100 device */  static inline uint32_t
> > acc100_reg_read(struct acc100_device *d, uint32_t offset) @@ -36,6
> > +52,22 @@
> >  	return rte_le_to_cpu_32(ret);
> >  }
> >
> > +/* Basic Implementation of Log2 for exact 2^N */ static inline
> > +uint32_t log2_basic(uint32_t value) {
> > +	return (value == 0) ? 0 : __builtin_ctz(value); }
> > +
> > +/* Calculate memory alignment offset assuming alignment is 2^N */
> > +static inline uint32_t calc_mem_alignment_offset(void
> > +*unaligned_virt_mem, uint32_t alignment) {
> > +	rte_iova_t unaligned_phy_mem =
> > rte_malloc_virt2iova(unaligned_virt_mem);
> > +	return (uint32_t)(alignment -
> > +			(unaligned_phy_mem & (alignment-1))); }
> > +
> >  /* Calculate the offset of the enqueue register */  static inline
> > uint32_t queue_offset(bool pf_device, uint8_t vf_id, uint8_t qgrp_id,
> > uint16_t aq_id) @@ -204,10 +236,393 @@
> >  			acc100_conf->q_dl_5g.aq_depth_log2);
> >  }
> >
> > +static void
> > +free_base_addresses(void **base_addrs, int size) {
> > +	int i;
> > +	for (i = 0; i < size; i++)
> > +		rte_free(base_addrs[i]);
> > +}
> > +
> > +static inline uint32_t
> > +get_desc_len(void)
> > +{
> > +	return sizeof(union acc100_dma_desc); }
> > +
> > +/* Allocate the 2 * 64MB block for the sw rings */ static int
> > +alloc_2x64mb_sw_rings_mem(struct rte_bbdev *dev, struct acc100_device
> > *d,
> > +		int socket)
> > +{
> > +	uint32_t sw_ring_size = ACC100_SIZE_64MBYTE;
> > +	d->sw_rings_base = rte_zmalloc_socket(dev->device->driver-
> > >name,
> > +			2 * sw_ring_size, RTE_CACHE_LINE_SIZE, socket);
> > +	if (d->sw_rings_base == NULL) {
> > +		rte_bbdev_log(ERR, "Failed to allocate memory for %s:%u",
> > +				dev->device->driver->name,
> > +				dev->data->dev_id);
> > +		return -ENOMEM;
> > +	}
> > +	memset(d->sw_rings_base, 0, ACC100_SIZE_64MBYTE);
> > +	uint32_t next_64mb_align_offset = calc_mem_alignment_offset(
> > +			d->sw_rings_base, ACC100_SIZE_64MBYTE);
> > +	d->sw_rings = RTE_PTR_ADD(d->sw_rings_base,
> > next_64mb_align_offset);
> > +	d->sw_rings_phys = rte_malloc_virt2iova(d->sw_rings_base) +
> > +			next_64mb_align_offset;
> > +	d->sw_ring_size = MAX_QUEUE_DEPTH * get_desc_len();
> > +	d->sw_ring_max_depth = d->sw_ring_size / get_desc_len();
> > +
> > +	return 0;
> > +}
> 
> Why not use a common memory allocation function instead of a special
> function for each memory size?

This is a bit convoluted, but it is due to the fact that the first method attempted, which is optimal (minimum memory), may not always find aligned memory.
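
To make the intent concrete, the flow in acc100_setup_queues() is (a
simplified sketch of the code in this patch, not new logic):

	/* First try the minimal, ring-size-aligned allocation */
	alloc_sw_rings_min_mem(dev, d, num_queues, socket_id);

	/* If no suitably 64MB-aligned window was found, fall back to the
	 * worst case: a 2 * 64MB block is guaranteed to contain a
	 * 64MB-aligned region large enough for the rings, at the cost of
	 * extra memory.
	 */
	if (d->sw_rings == NULL)
		alloc_2x64mb_sw_rings_mem(dev, d, socket_id);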


> 
> > +/* Attempt to allocate minimised memory space for sw rings */ static
> > +void alloc_sw_rings_min_mem(struct rte_bbdev *dev, struct
> > acc100_device
> > +*d,
> > +		uint16_t num_queues, int socket)
> > +{
> > +	rte_iova_t sw_rings_base_phy, next_64mb_align_addr_phy;
> > +	uint32_t next_64mb_align_offset;
> > +	rte_iova_t sw_ring_phys_end_addr;
> > +	void *base_addrs[SW_RING_MEM_ALLOC_ATTEMPTS];
> > +	void *sw_rings_base;
> > +	int i = 0;
> > +	uint32_t q_sw_ring_size = MAX_QUEUE_DEPTH * get_desc_len();
> > +	uint32_t dev_sw_ring_size = q_sw_ring_size * num_queues;
> > +
> > +	/* Find an aligned block of memory to store sw rings */
> > +	while (i < SW_RING_MEM_ALLOC_ATTEMPTS) {
> > +		/*
> > +		 * sw_ring allocated memory is guaranteed to be aligned to
> > +		 * q_sw_ring_size at the condition that the requested size is
> > +		 * less than the page size
> > +		 */
> > +		sw_rings_base = rte_zmalloc_socket(
> > +				dev->device->driver->name,
> > +				dev_sw_ring_size, q_sw_ring_size, socket);
> > +
> > +		if (sw_rings_base == NULL) {
> > +			rte_bbdev_log(ERR,
> > +					"Failed to allocate memory
> > for %s:%u",
> > +					dev->device->driver->name,
> > +					dev->data->dev_id);
> > +			break;
> > +		}
> > +
> > +		sw_rings_base_phy = rte_malloc_virt2iova(sw_rings_base);
> > +		next_64mb_align_offset = calc_mem_alignment_offset(
> > +				sw_rings_base, ACC100_SIZE_64MBYTE);
> > +		next_64mb_align_addr_phy = sw_rings_base_phy +
> > +				next_64mb_align_offset;
> > +		sw_ring_phys_end_addr = sw_rings_base_phy +
> > dev_sw_ring_size;
> > +
> > +		/* Check if the end of the sw ring memory block is before the
> > +		 * start of next 64MB aligned mem address
> > +		 */
> > +		if (sw_ring_phys_end_addr < next_64mb_align_addr_phy) {
> > +			d->sw_rings_phys = sw_rings_base_phy;
> > +			d->sw_rings = sw_rings_base;
> > +			d->sw_rings_base = sw_rings_base;
> > +			d->sw_ring_size = q_sw_ring_size;
> > +			d->sw_ring_max_depth = MAX_QUEUE_DEPTH;
> > +			break;
> > +		}
> > +		/* Store the address of the unaligned mem block */
> > +		base_addrs[i] = sw_rings_base;
> > +		i++;
> > +	}
> > +
> > +	/* Free all unaligned blocks of mem allocated in the loop */
> > +	free_base_addresses(base_addrs, i);
> > +}
> 
> It's strange to first allocate memory and then free it without any operations
> on this memory.

I may be missing your point. We are freeing the exact same memory we got from rte_zmalloc.
Note that the base_addrs array refers to multiple allocation attempts, not multiple operations on a ring.

> 
> > +
> > +/* Allocate 64MB memory used for all software rings */ static int
> > +acc100_setup_queues(struct rte_bbdev *dev, uint16_t num_queues, int
> > +socket_id) {
> > +	uint32_t phys_low, phys_high, payload;
> > +	struct acc100_device *d = dev->data->dev_private;
> > +	const struct acc100_registry_addr *reg_addr;
> > +
> > +	if (d->pf_device && !d->acc100_conf.pf_mode_en) {
> > +		rte_bbdev_log(NOTICE,
> > +				"%s has PF mode disabled. This PF can't be
> > used.",
> > +				dev->data->name);
> > +		return -ENODEV;
> > +	}
> > +
> > +	alloc_sw_rings_min_mem(dev, d, num_queues, socket_id);
> > +
> > +	/* If minimal memory space approach failed, then allocate
> > +	 * the 2 * 64MB block for the sw rings
> > +	 */
> > +	if (d->sw_rings == NULL)
> > +		alloc_2x64mb_sw_rings_mem(dev, d, socket_id);
> > +
> > +	/* Configure ACC100 with the base address for DMA descriptor rings
> > +	 * Same descriptor rings used for UL and DL DMA Engines
> > +	 * Note : Assuming only VF0 bundle is used for PF mode
> > +	 */
> > +	phys_high = (uint32_t)(d->sw_rings_phys >> 32);
> > +	phys_low  = (uint32_t)(d->sw_rings_phys &
> > ~(ACC100_SIZE_64MBYTE-1));
> > +
> > +	/* Choose correct registry addresses for the device type */
> > +	if (d->pf_device)
> > +		reg_addr = &pf_reg_addr;
> > +	else
> > +		reg_addr = &vf_reg_addr;
> > +
> > +	/* Read the populated cfg from ACC100 registers */
> > +	fetch_acc100_config(dev);
> > +
> > +	/* Mark as configured properly */
> > +	d->configured = true;
> > +
> > +	/* Release AXI from PF */
> > +	if (d->pf_device)
> > +		acc100_reg_write(d, HWPfDmaAxiControl, 1);
> > +
> > +	acc100_reg_write(d, reg_addr->dma_ring_ul5g_hi, phys_high);
> > +	acc100_reg_write(d, reg_addr->dma_ring_ul5g_lo, phys_low);
> > +	acc100_reg_write(d, reg_addr->dma_ring_dl5g_hi, phys_high);
> > +	acc100_reg_write(d, reg_addr->dma_ring_dl5g_lo, phys_low);
> > +	acc100_reg_write(d, reg_addr->dma_ring_ul4g_hi, phys_high);
> > +	acc100_reg_write(d, reg_addr->dma_ring_ul4g_lo, phys_low);
> > +	acc100_reg_write(d, reg_addr->dma_ring_dl4g_hi, phys_high);
> > +	acc100_reg_write(d, reg_addr->dma_ring_dl4g_lo, phys_low);
> > +
> > +	/*
> > +	 * Configure Ring Size to the max queue ring size
> > +	 * (used for wrapping purpose)
> > +	 */
> > +	payload = log2_basic(d->sw_ring_size / 64);
> > +	acc100_reg_write(d, reg_addr->ring_size, payload);
> > +
> > +	/* Configure tail pointer for use when SDONE enabled */
> > +	d->tail_ptrs = rte_zmalloc_socket(
> > +			dev->device->driver->name,
> > +			ACC100_NUM_QGRPS * ACC100_NUM_AQS *
> > sizeof(uint32_t),
> > +			RTE_CACHE_LINE_SIZE, socket_id);
> > +	if (d->tail_ptrs == NULL) {
> > +		rte_bbdev_log(ERR, "Failed to allocate tail ptr for %s:%u",
> > +				dev->device->driver->name,
> > +				dev->data->dev_id);
> > +		rte_free(d->sw_rings);
> > +		return -ENOMEM;
> > +	}
> > +	d->tail_ptr_phys = rte_malloc_virt2iova(d->tail_ptrs);
> > +
> > +	phys_high = (uint32_t)(d->tail_ptr_phys >> 32);
> > +	phys_low  = (uint32_t)(d->tail_ptr_phys);
> > +	acc100_reg_write(d, reg_addr->tail_ptrs_ul5g_hi, phys_high);
> > +	acc100_reg_write(d, reg_addr->tail_ptrs_ul5g_lo, phys_low);
> > +	acc100_reg_write(d, reg_addr->tail_ptrs_dl5g_hi, phys_high);
> > +	acc100_reg_write(d, reg_addr->tail_ptrs_dl5g_lo, phys_low);
> > +	acc100_reg_write(d, reg_addr->tail_ptrs_ul4g_hi, phys_high);
> > +	acc100_reg_write(d, reg_addr->tail_ptrs_ul4g_lo, phys_low);
> > +	acc100_reg_write(d, reg_addr->tail_ptrs_dl4g_hi, phys_high);
> > +	acc100_reg_write(d, reg_addr->tail_ptrs_dl4g_lo, phys_low);
> > +
> > +	d->harq_layout = rte_zmalloc_socket("HARQ Layout",
> > +			ACC100_HARQ_LAYOUT * sizeof(*d->harq_layout),
> > +			RTE_CACHE_LINE_SIZE, dev->data->socket_id);
> > +
> > +	rte_bbdev_log_debug(
> > +			"ACC100 (%s) configured  sw_rings = %p,
> > sw_rings_phys = %#"
> > +			PRIx64, dev->data->name, d->sw_rings, d-
> > >sw_rings_phys);
> > +
> > +	return 0;
> > +}
> > +
> >  /* Free 64MB memory used for software rings */  static int -
> > acc100_dev_close(struct rte_bbdev *dev  __rte_unused)
> > +acc100_dev_close(struct rte_bbdev *dev)
> >  {
> > +	struct acc100_device *d = dev->data->dev_private;
> > +	if (d->sw_rings_base != NULL) {
> > +		rte_free(d->tail_ptrs);
> > +		rte_free(d->sw_rings_base);
> > +		d->sw_rings_base = NULL;
> > +	}
> > +	usleep(1000);
> > +	return 0;
> > +}
> > +
> > +
> > +/**
> > + * Report a ACC100 queue index which is free
> > + * Return 0 to 16k for a valid queue_idx or -1 when no queue is
> > +available
> > + * Note : Only supporting VF0 Bundle for PF mode  */ static int
> > +acc100_find_free_queue_idx(struct rte_bbdev *dev,
> > +		const struct rte_bbdev_queue_conf *conf) {
> > +	struct acc100_device *d = dev->data->dev_private;
> > +	int op_2_acc[5] = {0, UL_4G, DL_4G, UL_5G, DL_5G};
> > +	int acc = op_2_acc[conf->op_type];
> > +	struct rte_q_topology_t *qtop = NULL;
> > +	qtopFromAcc(&qtop, acc, &(d->acc100_conf));
> > +	if (qtop == NULL)
> > +		return -1;
> > +	/* Identify matching QGroup Index which are sorted in priority order
> > */
> > +	uint16_t group_idx = qtop->first_qgroup_index;
> > +	group_idx += conf->priority;
> > +	if (group_idx >= ACC100_NUM_QGRPS ||
> > +			conf->priority >= qtop->num_qgroups) {
> > +		rte_bbdev_log(INFO, "Invalid Priority on %s, priority %u",
> > +				dev->data->name, conf->priority);
> > +		return -1;
> > +	}
> > +	/* Find a free AQ_idx  */
> > +	uint16_t aq_idx;
> > +	for (aq_idx = 0; aq_idx < qtop->num_aqs_per_groups; aq_idx++) {
> > +		if (((d->q_assigned_bit_map[group_idx] >> aq_idx) & 0x1)
> > == 0) {
> > +			/* Mark the Queue as assigned */
> > +			d->q_assigned_bit_map[group_idx] |= (1 << aq_idx);
> > +			/* Report the AQ Index */
> > +			return (group_idx << GRP_ID_SHIFT) + aq_idx;
> > +		}
> > +	}
> > +	rte_bbdev_log(INFO, "Failed to find free queue on %s, priority %u",
> > +			dev->data->name, conf->priority);
> > +	return -1;
> > +}
> > +
> > +/* Setup ACC100 queue */
> > +static int
> > +acc100_queue_setup(struct rte_bbdev *dev, uint16_t queue_id,
> > +		const struct rte_bbdev_queue_conf *conf) {
> > +	struct acc100_device *d = dev->data->dev_private;
> > +	struct acc100_queue *q;
> > +	int16_t q_idx;
> > +
> > +	/* Allocate the queue data structure. */
> > +	q = rte_zmalloc_socket(dev->device->driver->name, sizeof(*q),
> > +			RTE_CACHE_LINE_SIZE, conf->socket);
> > +	if (q == NULL) {
> > +		rte_bbdev_log(ERR, "Failed to allocate queue memory");
> > +		return -ENOMEM;
> > +	}
> > +
> > +	q->d = d;
> > +	q->ring_addr = RTE_PTR_ADD(d->sw_rings, (d->sw_ring_size *
> > queue_id));
> > +	q->ring_addr_phys = d->sw_rings_phys + (d->sw_ring_size *
> > queue_id);
> > +
> > +	/* Prepare the Ring with default descriptor format */
> > +	union acc100_dma_desc *desc = NULL;
> > +	unsigned int desc_idx, b_idx;
> > +	int fcw_len = (conf->op_type == RTE_BBDEV_OP_LDPC_ENC ?
> > +		ACC100_FCW_LE_BLEN : (conf->op_type ==
> > RTE_BBDEV_OP_TURBO_DEC ?
> > +		ACC100_FCW_TD_BLEN : ACC100_FCW_LD_BLEN));
> > +
> > +	for (desc_idx = 0; desc_idx < d->sw_ring_max_depth; desc_idx++) {
> > +		desc = q->ring_addr + desc_idx;
> > +		desc->req.word0 = ACC100_DMA_DESC_TYPE;
> > +		desc->req.word1 = 0; /**< Timestamp */
> > +		desc->req.word2 = 0;
> > +		desc->req.word3 = 0;
> > +		uint64_t fcw_offset = (desc_idx << 8) +
> > ACC100_DESC_FCW_OFFSET;
> > +		desc->req.data_ptrs[0].address = q->ring_addr_phys +
> > fcw_offset;
> > +		desc->req.data_ptrs[0].blen = fcw_len;
> > +		desc->req.data_ptrs[0].blkid = ACC100_DMA_BLKID_FCW;
> > +		desc->req.data_ptrs[0].last = 0;
> > +		desc->req.data_ptrs[0].dma_ext = 0;
> > +		for (b_idx = 1; b_idx < ACC100_DMA_MAX_NUM_POINTERS
> > - 1;
> > +				b_idx++) {
> > +			desc->req.data_ptrs[b_idx].blkid =
> > ACC100_DMA_BLKID_IN;
> > +			desc->req.data_ptrs[b_idx].last = 1;
> > +			desc->req.data_ptrs[b_idx].dma_ext = 0;
> > +			b_idx++;
> > +			desc->req.data_ptrs[b_idx].blkid =
> > +					ACC100_DMA_BLKID_OUT_ENC;
> > +			desc->req.data_ptrs[b_idx].last = 1;
> > +			desc->req.data_ptrs[b_idx].dma_ext = 0;
> > +		}
> > +		/* Preset some fields of LDPC FCW */
> > +		desc->req.fcw_ld.FCWversion = ACC100_FCW_VER;
> > +		desc->req.fcw_ld.gain_i = 1;
> > +		desc->req.fcw_ld.gain_h = 1;
> > +	}
> > +
> > +	q->lb_in = rte_zmalloc_socket(dev->device->driver->name,
> > +			RTE_CACHE_LINE_SIZE,
> > +			RTE_CACHE_LINE_SIZE, conf->socket);
> > +	if (q->lb_in == NULL) {
> > +		rte_bbdev_log(ERR, "Failed to allocate lb_in memory");
> > +		return -ENOMEM;
> > +	}
> > +	q->lb_in_addr_phys = rte_malloc_virt2iova(q->lb_in);
> > +	q->lb_out = rte_zmalloc_socket(dev->device->driver->name,
> > +			RTE_CACHE_LINE_SIZE,
> > +			RTE_CACHE_LINE_SIZE, conf->socket);
> > +	if (q->lb_out == NULL) {
> > +		rte_bbdev_log(ERR, "Failed to allocate lb_out memory");
> > +		return -ENOMEM;
> > +	}
> > +	q->lb_out_addr_phys = rte_malloc_virt2iova(q->lb_out);
> > +
> > +	/*
> > +	 * Software queue ring wraps synchronously with the HW when it
> > reaches
> > +	 * the boundary of the maximum allocated queue size, no matter
> > what the
> > +	 * sw queue size is. This wrapping is guarded by setting the
> > wrap_mask
> > +	 * to represent the maximum queue size as allocated at the time
> > when
> > +	 * the device has been setup (in configure()).
> > +	 *
> > +	 * The queue depth is set to the queue size value (conf-
> > >queue_size).
> > +	 * This limits the occupancy of the queue at any point of time, so that
> > +	 * the queue does not get swamped with enqueue requests.
> > +	 */
> > +	q->sw_ring_depth = conf->queue_size;
> > +	q->sw_ring_wrap_mask = d->sw_ring_max_depth - 1;
> > +
> > +	q->op_type = conf->op_type;
> > +
> > +	q_idx = acc100_find_free_queue_idx(dev, conf);
> > +	if (q_idx == -1) {
> > +		rte_free(q);
> > +		return -1;
> > +	}
> > +
> > +	q->qgrp_id = (q_idx >> GRP_ID_SHIFT) & 0xF;
> > +	q->vf_id = (q_idx >> VF_ID_SHIFT)  & 0x3F;
> > +	q->aq_id = q_idx & 0xF;
> > +	q->aq_depth = (conf->op_type ==  RTE_BBDEV_OP_TURBO_DEC) ?
> > +			(1 << d->acc100_conf.q_ul_4g.aq_depth_log2) :
> > +			(1 << d->acc100_conf.q_dl_4g.aq_depth_log2);
> > +
> > +	q->mmio_reg_enqueue = RTE_PTR_ADD(d->mmio_base,
> > +			queue_offset(d->pf_device,
> > +					q->vf_id, q->qgrp_id, q->aq_id));
> > +
> > +	rte_bbdev_log_debug(
> > +			"Setup dev%u q%u: qgrp_id=%u, vf_id=%u,
> > aq_id=%u, aq_depth=%u, mmio_reg_enqueue=%p",
> > +			dev->data->dev_id, queue_id, q->qgrp_id, q->vf_id,
> > +			q->aq_id, q->aq_depth, q->mmio_reg_enqueue);
> > +
> > +	dev->data->queues[queue_id].queue_private = q;
> > +	return 0;
> > +}
> > +
> > +/* Release ACC100 queue */
> > +static int
> > +acc100_queue_release(struct rte_bbdev *dev, uint16_t q_id) {
> > +	struct acc100_device *d = dev->data->dev_private;
> > +	struct acc100_queue *q = dev->data->queues[q_id].queue_private;
> > +
> > +	if (q != NULL) {
> > +		/* Mark the Queue as un-assigned */
> > +		d->q_assigned_bit_map[q->qgrp_id] &= (0xFFFFFFFF -
> > +				(1 << q->aq_id));
> > +		rte_free(q->lb_in);
> > +		rte_free(q->lb_out);
> > +		rte_free(q);
> > +		dev->data->queues[q_id].queue_private = NULL;
> > +	}
> > +
> >  	return 0;
> >  }
> >
> > @@ -258,8 +673,11 @@
> >  }
> >
> >  static const struct rte_bbdev_ops acc100_bbdev_ops = {
> > +	.setup_queues = acc100_setup_queues,
> >  	.close = acc100_dev_close,
> >  	.info_get = acc100_dev_info_get,
> > +	.queue_setup = acc100_queue_setup,
> > +	.queue_release = acc100_queue_release,
> >  };
> >
> >  /* ACC100 PCI PF address map */
> > diff --git a/drivers/baseband/acc100/rte_acc100_pmd.h
> > b/drivers/baseband/acc100/rte_acc100_pmd.h
> > index 662e2c8..0e2b79c 100644
> > --- a/drivers/baseband/acc100/rte_acc100_pmd.h
> > +++ b/drivers/baseband/acc100/rte_acc100_pmd.h
> > @@ -518,11 +518,56 @@ struct acc100_registry_addr {
> >  	.ddr_range = HWVfDmaDdrBaseRangeRoVf,  };
> >
> > +/* Structure associated with each queue. */ struct
> > +__rte_cache_aligned acc100_queue {
> > +	union acc100_dma_desc *ring_addr;  /* Virtual address of sw ring */
> > +	rte_iova_t ring_addr_phys;  /* Physical address of software ring */
> > +	uint32_t sw_ring_head;  /* software ring head */
> > +	uint32_t sw_ring_tail;  /* software ring tail */
> > +	/* software ring size (descriptors, not bytes) */
> > +	uint32_t sw_ring_depth;
> > +	/* mask used to wrap enqueued descriptors on the sw ring */
> > +	uint32_t sw_ring_wrap_mask;
> > +	/* MMIO register used to enqueue descriptors */
> > +	void *mmio_reg_enqueue;
> > +	uint8_t vf_id;  /* VF ID (max = 63) */
> > +	uint8_t qgrp_id;  /* Queue Group ID */
> > +	uint16_t aq_id;  /* Atomic Queue ID */
> > +	uint16_t aq_depth;  /* Depth of atomic queue */
> > +	uint32_t aq_enqueued;  /* Count how many "batches" have been
> > enqueued */
> > +	uint32_t aq_dequeued;  /* Count how many "batches" have been
> > dequeued */
> > +	uint32_t irq_enable;  /* Enable ops dequeue interrupts if set to 1 */
> > +	struct rte_mempool *fcw_mempool;  /* FCW mempool */
> > +	enum rte_bbdev_op_type op_type;  /* Type of this Queue: TE or TD
> > */
> > +	/* Internal Buffers for loopback input */
> > +	uint8_t *lb_in;
> > +	uint8_t *lb_out;
> > +	rte_iova_t lb_in_addr_phys;
> > +	rte_iova_t lb_out_addr_phys;
> > +	struct acc100_device *d;
> > +};
> > +
> >  /* Private data structure for each ACC100 device */  struct acc100_device {
> >  	void *mmio_base;  /**< Base address of MMIO registers (BAR0) */
> > +	void *sw_rings_base;  /* Base addr of un-aligned memory for sw
> > rings */
> > +	void *sw_rings;  /* 64MBs of 64MB aligned memory for sw rings */
> > +	rte_iova_t sw_rings_phys;  /* Physical address of sw_rings */
> > +	/* Virtual address of the info memory routed to the this function
> > under
> > +	 * operation, whether it is PF or VF.
> > +	 */
> > +	union acc100_harq_layout_data *harq_layout;
> > +	uint32_t sw_ring_size;
> >  	uint32_t ddr_size; /* Size in kB */
> > +	uint32_t *tail_ptrs; /* Base address of response tail pointer buffer */
> > +	rte_iova_t tail_ptr_phys; /* Physical address of tail pointers */
> > +	/* Max number of entries available for each queue in device,
> > depending
> > +	 * on how many queues are enabled with configure()
> > +	 */
> > +	uint32_t sw_ring_max_depth;
> >  	struct acc100_conf acc100_conf; /* ACC100 Initial configuration */
> > +	/* Bitmap capturing which Queues have already been assigned */
> > +	uint16_t q_assigned_bit_map[ACC100_NUM_QGRPS];
> >  	bool pf_device; /**< True if this is a PF ACC100 device */
> >  	bool configured; /**< True if this ACC100 device is configured */
> > };
> > --
> > 1.8.3.1
  
Xu, Rosen Sept. 3, 2020, 2:30 a.m. UTC | #3
Hi,

> -----Original Message-----
> From: Chautru, Nicolas <nicolas.chautru@intel.com>
> Sent: Sunday, August 30, 2020 1:48
> To: Xu, Rosen <rosen.xu@intel.com>; dev@dpdk.org; akhil.goyal@nxp.com
> Cc: Richardson, Bruce <bruce.richardson@intel.com>
> Subject: RE: [dpdk-dev] [PATCH v3 04/11] baseband/acc100: add queue
> configuration
> 
> Hi,
> 
> > From: Xu, Rosen <rosen.xu@intel.com>
> >
> > Hi,
> >
> > > -----Original Message-----
> > > From: dev <dev-bounces@dpdk.org> On Behalf Of Nicolas Chautru
> > > Sent: Wednesday, August 19, 2020 8:25
> > > To: dev@dpdk.org; akhil.goyal@nxp.com
> > > Cc: Richardson, Bruce <bruce.richardson@intel.com>; Chautru, Nicolas
> > > <nicolas.chautru@intel.com>
> > > Subject: [dpdk-dev] [PATCH v3 04/11] baseband/acc100: add queue
> > > configuration
> > >
> > > Adding function to create and configure queues for the device. Still
> > > no capability.
> > >
> > > Signed-off-by: Nicolas Chautru <nicolas.chautru@intel.com>
> > > ---
> > >  drivers/baseband/acc100/rte_acc100_pmd.c | 420
> > > ++++++++++++++++++++++++++++++-
> > > drivers/baseband/acc100/rte_acc100_pmd.h |  45 ++++
> > >  2 files changed, 464 insertions(+), 1 deletion(-)
> > >
> > > diff --git a/drivers/baseband/acc100/rte_acc100_pmd.c
> > > b/drivers/baseband/acc100/rte_acc100_pmd.c
> > > index 7807a30..7a21c57 100644
> > > --- a/drivers/baseband/acc100/rte_acc100_pmd.c
> > > +++ b/drivers/baseband/acc100/rte_acc100_pmd.c
> > > @@ -26,6 +26,22 @@
> > >  RTE_LOG_REGISTER(acc100_logtype, pmd.bb.acc100, NOTICE);  #endif
> > >
> > > +/* Write to MMIO register address */ static inline void
> > > +mmio_write(void *addr, uint32_t value) {
> > > +	*((volatile uint32_t *)(addr)) = rte_cpu_to_le_32(value); }
> > > +
> > > +/* Write a register of a ACC100 device */ static inline void
> > > +acc100_reg_write(struct acc100_device *d, uint32_t offset, uint32_t
> > > +payload) {
> > > +	void *reg_addr = RTE_PTR_ADD(d->mmio_base, offset);
> > > +	mmio_write(reg_addr, payload);
> > > +	usleep(1000);
> > > +}
> > > +
> > >  /* Read a register of a ACC100 device */  static inline uint32_t
> > > acc100_reg_read(struct acc100_device *d, uint32_t offset) @@ -36,6
> > > +52,22 @@
> > >  	return rte_le_to_cpu_32(ret);
> > >  }
> > >
> > > +/* Basic Implementation of Log2 for exact 2^N */ static inline
> > > +uint32_t log2_basic(uint32_t value) {
> > > +	return (value == 0) ? 0 : __builtin_ctz(value); }
> > > +
> > > +/* Calculate memory alignment offset assuming alignment is 2^N */
> > > +static inline uint32_t calc_mem_alignment_offset(void
> > > +*unaligned_virt_mem, uint32_t alignment) {
> > > +	rte_iova_t unaligned_phy_mem =
> > > rte_malloc_virt2iova(unaligned_virt_mem);
> > > +	return (uint32_t)(alignment -
> > > +			(unaligned_phy_mem & (alignment-1))); }
> > > +
> > >  /* Calculate the offset of the enqueue register */  static inline
> > > uint32_t queue_offset(bool pf_device, uint8_t vf_id, uint8_t
> > > qgrp_id, uint16_t aq_id) @@ -204,10 +236,393 @@
> > >  			acc100_conf->q_dl_5g.aq_depth_log2);
> > >  }
> > >
> > > +static void
> > > +free_base_addresses(void **base_addrs, int size) {
> > > +	int i;
> > > +	for (i = 0; i < size; i++)
> > > +		rte_free(base_addrs[i]);
> > > +}
> > > +
> > > +static inline uint32_t
> > > +get_desc_len(void)
> > > +{
> > > +	return sizeof(union acc100_dma_desc); }
> > > +
> > > +/* Allocate the 2 * 64MB block for the sw rings */ static int
> > > +alloc_2x64mb_sw_rings_mem(struct rte_bbdev *dev, struct
> > > +acc100_device
> > > *d,
> > > +		int socket)
> > > +{
> > > +	uint32_t sw_ring_size = ACC100_SIZE_64MBYTE;
> > > +	d->sw_rings_base = rte_zmalloc_socket(dev->device->driver-
> > > >name,
> > > +			2 * sw_ring_size, RTE_CACHE_LINE_SIZE, socket);
> > > +	if (d->sw_rings_base == NULL) {
> > > +		rte_bbdev_log(ERR, "Failed to allocate memory for %s:%u",
> > > +				dev->device->driver->name,
> > > +				dev->data->dev_id);
> > > +		return -ENOMEM;
> > > +	}
> > > +	memset(d->sw_rings_base, 0, ACC100_SIZE_64MBYTE);
> > > +	uint32_t next_64mb_align_offset = calc_mem_alignment_offset(
> > > +			d->sw_rings_base, ACC100_SIZE_64MBYTE);
> > > +	d->sw_rings = RTE_PTR_ADD(d->sw_rings_base,
> > > next_64mb_align_offset);
> > > +	d->sw_rings_phys = rte_malloc_virt2iova(d->sw_rings_base) +
> > > +			next_64mb_align_offset;
> > > +	d->sw_ring_size = MAX_QUEUE_DEPTH * get_desc_len();
> > > +	d->sw_ring_max_depth = d->sw_ring_size / get_desc_len();
> > > +
> > > +	return 0;
> > > +}
> >
> > Why not use a common memory allocation function instead of a special
> > function for each memory size?
> 
> This is a bit convoluted, but it is due to the fact that the first method
> attempted, which is optimal (minimum memory), may not always find aligned
> memory.

What's convoluted? Can you explain?
For packet processing, in most scenarios, don't we already get aligned memory when we allocate it?
> 
> >
> > > +/* Attempt to allocate minimised memory space for sw rings */
> > > +static void alloc_sw_rings_min_mem(struct rte_bbdev *dev, struct
> > > acc100_device
> > > +*d,
> > > +		uint16_t num_queues, int socket)
> > > +{
> > > +	rte_iova_t sw_rings_base_phy, next_64mb_align_addr_phy;
> > > +	uint32_t next_64mb_align_offset;
> > > +	rte_iova_t sw_ring_phys_end_addr;
> > > +	void *base_addrs[SW_RING_MEM_ALLOC_ATTEMPTS];
> > > +	void *sw_rings_base;
> > > +	int i = 0;
> > > +	uint32_t q_sw_ring_size = MAX_QUEUE_DEPTH * get_desc_len();
> > > +	uint32_t dev_sw_ring_size = q_sw_ring_size * num_queues;
> > > +
> > > +	/* Find an aligned block of memory to store sw rings */
> > > +	while (i < SW_RING_MEM_ALLOC_ATTEMPTS) {
> > > +		/*
> > > +		 * sw_ring allocated memory is guaranteed to be aligned to
> > > +		 * q_sw_ring_size at the condition that the requested size is
> > > +		 * less than the page size
> > > +		 */
> > > +		sw_rings_base = rte_zmalloc_socket(
> > > +				dev->device->driver->name,
> > > +				dev_sw_ring_size, q_sw_ring_size, socket);
> > > +
> > > +		if (sw_rings_base == NULL) {
> > > +			rte_bbdev_log(ERR,
> > > +					"Failed to allocate memory
> > > for %s:%u",
> > > +					dev->device->driver->name,
> > > +					dev->data->dev_id);
> > > +			break;
> > > +		}
> > > +
> > > +		sw_rings_base_phy = rte_malloc_virt2iova(sw_rings_base);
> > > +		next_64mb_align_offset = calc_mem_alignment_offset(
> > > +				sw_rings_base, ACC100_SIZE_64MBYTE);
> > > +		next_64mb_align_addr_phy = sw_rings_base_phy +
> > > +				next_64mb_align_offset;
> > > +		sw_ring_phys_end_addr = sw_rings_base_phy +
> > > dev_sw_ring_size;
> > > +
> > > +		/* Check if the end of the sw ring memory block is before the
> > > +		 * start of next 64MB aligned mem address
> > > +		 */
> > > +		if (sw_ring_phys_end_addr < next_64mb_align_addr_phy) {
> > > +			d->sw_rings_phys = sw_rings_base_phy;
> > > +			d->sw_rings = sw_rings_base;
> > > +			d->sw_rings_base = sw_rings_base;
> > > +			d->sw_ring_size = q_sw_ring_size;
> > > +			d->sw_ring_max_depth = MAX_QUEUE_DEPTH;
> > > +			break;
> > > +		}
> > > +		/* Store the address of the unaligned mem block */
> > > +		base_addrs[i] = sw_rings_base;
> > > +		i++;
> > > +	}
> > > +
> > > +	/* Free all unaligned blocks of mem allocated in the loop */
> > > +	free_base_addresses(base_addrs, i); }
> >
> > It's strange to first allocate memory and then free it without any
> > operations on this memory.
> 
> I may be missing your point. We are freeing the exact same memory we got
> from rte_zmalloc.
> Note that the base_addrs array refers to multiple allocation attempts, not
> multiple operations on a ring.

You allocate memory sw_rings_base, and after some translation you assign this memory to acc100_device *d,
but before the function returns, this memory has been freed.

> >
> > > +
> > > +/* Allocate 64MB memory used for all software rings */ static int
> > > +acc100_setup_queues(struct rte_bbdev *dev, uint16_t num_queues,
> int
> > > +socket_id) {
> > > +	uint32_t phys_low, phys_high, payload;
> > > +	struct acc100_device *d = dev->data->dev_private;
> > > +	const struct acc100_registry_addr *reg_addr;
> > > +
> > > +	if (d->pf_device && !d->acc100_conf.pf_mode_en) {
> > > +		rte_bbdev_log(NOTICE,
> > > +				"%s has PF mode disabled. This PF can't be
> > > used.",
> > > +				dev->data->name);
> > > +		return -ENODEV;
> > > +	}
> > > +
> > > +	alloc_sw_rings_min_mem(dev, d, num_queues, socket_id);
> > > +
> > > +	/* If minimal memory space approach failed, then allocate
> > > +	 * the 2 * 64MB block for the sw rings
> > > +	 */
> > > +	if (d->sw_rings == NULL)
> > > +		alloc_2x64mb_sw_rings_mem(dev, d, socket_id);
> > > +
> > > +	/* Configure ACC100 with the base address for DMA descriptor rings
> > > +	 * Same descriptor rings used for UL and DL DMA Engines
> > > +	 * Note : Assuming only VF0 bundle is used for PF mode
> > > +	 */
> > > +	phys_high = (uint32_t)(d->sw_rings_phys >> 32);
> > > +	phys_low  = (uint32_t)(d->sw_rings_phys &
> > > ~(ACC100_SIZE_64MBYTE-1));
> > > +
> > > +	/* Choose correct registry addresses for the device type */
> > > +	if (d->pf_device)
> > > +		reg_addr = &pf_reg_addr;
> > > +	else
> > > +		reg_addr = &vf_reg_addr;
> > > +
> > > +	/* Read the populated cfg from ACC100 registers */
> > > +	fetch_acc100_config(dev);
> > > +
> > > +	/* Mark as configured properly */
> > > +	d->configured = true;
> > > +
> > > +	/* Release AXI from PF */
> > > +	if (d->pf_device)
> > > +		acc100_reg_write(d, HWPfDmaAxiControl, 1);
> > > +
> > > +	acc100_reg_write(d, reg_addr->dma_ring_ul5g_hi, phys_high);
> > > +	acc100_reg_write(d, reg_addr->dma_ring_ul5g_lo, phys_low);
> > > +	acc100_reg_write(d, reg_addr->dma_ring_dl5g_hi, phys_high);
> > > +	acc100_reg_write(d, reg_addr->dma_ring_dl5g_lo, phys_low);
> > > +	acc100_reg_write(d, reg_addr->dma_ring_ul4g_hi, phys_high);
> > > +	acc100_reg_write(d, reg_addr->dma_ring_ul4g_lo, phys_low);
> > > +	acc100_reg_write(d, reg_addr->dma_ring_dl4g_hi, phys_high);
> > > +	acc100_reg_write(d, reg_addr->dma_ring_dl4g_lo, phys_low);
> > > +
> > > +	/*
> > > +	 * Configure Ring Size to the max queue ring size
> > > +	 * (used for wrapping purpose)
> > > +	 */
> > > +	payload = log2_basic(d->sw_ring_size / 64);
> > > +	acc100_reg_write(d, reg_addr->ring_size, payload);
> > > +
> > > +	/* Configure tail pointer for use when SDONE enabled */
> > > +	d->tail_ptrs = rte_zmalloc_socket(
> > > +			dev->device->driver->name,
> > > +			ACC100_NUM_QGRPS * ACC100_NUM_AQS *
> > > sizeof(uint32_t),
> > > +			RTE_CACHE_LINE_SIZE, socket_id);
> > > +	if (d->tail_ptrs == NULL) {
> > > +		rte_bbdev_log(ERR, "Failed to allocate tail ptr for %s:%u",
> > > +				dev->device->driver->name,
> > > +				dev->data->dev_id);
> > > +		rte_free(d->sw_rings);
> > > +		return -ENOMEM;
> > > +	}
> > > +	d->tail_ptr_phys = rte_malloc_virt2iova(d->tail_ptrs);
> > > +
> > > +	phys_high = (uint32_t)(d->tail_ptr_phys >> 32);
> > > +	phys_low  = (uint32_t)(d->tail_ptr_phys);
> > > +	acc100_reg_write(d, reg_addr->tail_ptrs_ul5g_hi, phys_high);
> > > +	acc100_reg_write(d, reg_addr->tail_ptrs_ul5g_lo, phys_low);
> > > +	acc100_reg_write(d, reg_addr->tail_ptrs_dl5g_hi, phys_high);
> > > +	acc100_reg_write(d, reg_addr->tail_ptrs_dl5g_lo, phys_low);
> > > +	acc100_reg_write(d, reg_addr->tail_ptrs_ul4g_hi, phys_high);
> > > +	acc100_reg_write(d, reg_addr->tail_ptrs_ul4g_lo, phys_low);
> > > +	acc100_reg_write(d, reg_addr->tail_ptrs_dl4g_hi, phys_high);
> > > +	acc100_reg_write(d, reg_addr->tail_ptrs_dl4g_lo, phys_low);
> > > +
> > > +	d->harq_layout = rte_zmalloc_socket("HARQ Layout",
> > > +			ACC100_HARQ_LAYOUT * sizeof(*d->harq_layout),
> > > +			RTE_CACHE_LINE_SIZE, dev->data->socket_id);
> > > +
> > > +	rte_bbdev_log_debug(
> > > +			"ACC100 (%s) configured  sw_rings = %p,
> > > sw_rings_phys = %#"
> > > +			PRIx64, dev->data->name, d->sw_rings, d-
> > > >sw_rings_phys);
> > > +
> > > +	return 0;
> > > +}
> > > +
> > >  /* Free 64MB memory used for software rings */  static int -
> > > acc100_dev_close(struct rte_bbdev *dev  __rte_unused)
> > > +acc100_dev_close(struct rte_bbdev *dev)
> > >  {
> > > +	struct acc100_device *d = dev->data->dev_private;
> > > +	if (d->sw_rings_base != NULL) {
> > > +		rte_free(d->tail_ptrs);
> > > +		rte_free(d->sw_rings_base);
> > > +		d->sw_rings_base = NULL;
> > > +	}
> > > +	usleep(1000);
> > > +	return 0;
> > > +}
> > > +
> > > +
> > > +/**
> > > + * Report a ACC100 queue index which is free
> > > + * Return 0 to 16k for a valid queue_idx or -1 when no queue is
> > > +available
> > > + * Note : Only supporting VF0 Bundle for PF mode  */ static int
> > > +acc100_find_free_queue_idx(struct rte_bbdev *dev,
> > > +		const struct rte_bbdev_queue_conf *conf) {
> > > +	struct acc100_device *d = dev->data->dev_private;
> > > +	int op_2_acc[5] = {0, UL_4G, DL_4G, UL_5G, DL_5G};
> > > +	int acc = op_2_acc[conf->op_type];
> > > +	struct rte_q_topology_t *qtop = NULL;
> > > +	qtopFromAcc(&qtop, acc, &(d->acc100_conf));
> > > +	if (qtop == NULL)
> > > +		return -1;
> > > +	/* Identify matching QGroup Index which are sorted in priority
> > > +order
> > > */
> > > +	uint16_t group_idx = qtop->first_qgroup_index;
> > > +	group_idx += conf->priority;
> > > +	if (group_idx >= ACC100_NUM_QGRPS ||
> > > +			conf->priority >= qtop->num_qgroups) {
> > > +		rte_bbdev_log(INFO, "Invalid Priority on %s, priority %u",
> > > +				dev->data->name, conf->priority);
> > > +		return -1;
> > > +	}
> > > +	/* Find a free AQ_idx  */
> > > +	uint16_t aq_idx;
> > > +	for (aq_idx = 0; aq_idx < qtop->num_aqs_per_groups; aq_idx++) {
> > > +		if (((d->q_assigned_bit_map[group_idx] >> aq_idx) & 0x1)
> > > == 0) {
> > > +			/* Mark the Queue as assigned */
> > > +			d->q_assigned_bit_map[group_idx] |= (1 << aq_idx);
> > > +			/* Report the AQ Index */
> > > +			return (group_idx << GRP_ID_SHIFT) + aq_idx;
> > > +		}
> > > +	}
> > > +	rte_bbdev_log(INFO, "Failed to find free queue on %s, priority %u",
> > > +			dev->data->name, conf->priority);
> > > +	return -1;
> > > +}
> > > +
> > > +/* Setup ACC100 queue */
> > > +static int
> > > +acc100_queue_setup(struct rte_bbdev *dev, uint16_t queue_id,
> > > +		const struct rte_bbdev_queue_conf *conf) {
> > > +	struct acc100_device *d = dev->data->dev_private;
> > > +	struct acc100_queue *q;
> > > +	int16_t q_idx;
> > > +
> > > +	/* Allocate the queue data structure. */
> > > +	q = rte_zmalloc_socket(dev->device->driver->name, sizeof(*q),
> > > +			RTE_CACHE_LINE_SIZE, conf->socket);
> > > +	if (q == NULL) {
> > > +		rte_bbdev_log(ERR, "Failed to allocate queue memory");
> > > +		return -ENOMEM;
> > > +	}
> > > +
> > > +	q->d = d;
> > > +	q->ring_addr = RTE_PTR_ADD(d->sw_rings, (d->sw_ring_size *
> > > queue_id));
> > > +	q->ring_addr_phys = d->sw_rings_phys + (d->sw_ring_size *
> > > queue_id);
> > > +
> > > +	/* Prepare the Ring with default descriptor format */
> > > +	union acc100_dma_desc *desc = NULL;
> > > +	unsigned int desc_idx, b_idx;
> > > +	int fcw_len = (conf->op_type == RTE_BBDEV_OP_LDPC_ENC ?
> > > +		ACC100_FCW_LE_BLEN : (conf->op_type ==
> > > RTE_BBDEV_OP_TURBO_DEC ?
> > > +		ACC100_FCW_TD_BLEN : ACC100_FCW_LD_BLEN));
> > > +
> > > +	for (desc_idx = 0; desc_idx < d->sw_ring_max_depth; desc_idx++) {
> > > +		desc = q->ring_addr + desc_idx;
> > > +		desc->req.word0 = ACC100_DMA_DESC_TYPE;
> > > +		desc->req.word1 = 0; /**< Timestamp */
> > > +		desc->req.word2 = 0;
> > > +		desc->req.word3 = 0;
> > > +		uint64_t fcw_offset = (desc_idx << 8) +
> > > ACC100_DESC_FCW_OFFSET;
> > > +		desc->req.data_ptrs[0].address = q->ring_addr_phys +
> > > fcw_offset;
> > > +		desc->req.data_ptrs[0].blen = fcw_len;
> > > +		desc->req.data_ptrs[0].blkid = ACC100_DMA_BLKID_FCW;
> > > +		desc->req.data_ptrs[0].last = 0;
> > > +		desc->req.data_ptrs[0].dma_ext = 0;
> > > +		for (b_idx = 1; b_idx < ACC100_DMA_MAX_NUM_POINTERS
> > > - 1;
> > > +				b_idx++) {
> > > +			desc->req.data_ptrs[b_idx].blkid =
> > > ACC100_DMA_BLKID_IN;
> > > +			desc->req.data_ptrs[b_idx].last = 1;
> > > +			desc->req.data_ptrs[b_idx].dma_ext = 0;
> > > +			b_idx++;
> > > +			desc->req.data_ptrs[b_idx].blkid =
> > > +					ACC100_DMA_BLKID_OUT_ENC;
> > > +			desc->req.data_ptrs[b_idx].last = 1;
> > > +			desc->req.data_ptrs[b_idx].dma_ext = 0;
> > > +		}
> > > +		/* Preset some fields of LDPC FCW */
> > > +		desc->req.fcw_ld.FCWversion = ACC100_FCW_VER;
> > > +		desc->req.fcw_ld.gain_i = 1;
> > > +		desc->req.fcw_ld.gain_h = 1;
> > > +	}
> > > +
> > > +	q->lb_in = rte_zmalloc_socket(dev->device->driver->name,
> > > +			RTE_CACHE_LINE_SIZE,
> > > +			RTE_CACHE_LINE_SIZE, conf->socket);
> > > +	if (q->lb_in == NULL) {
> > > +		rte_bbdev_log(ERR, "Failed to allocate lb_in memory");
> > > +		return -ENOMEM;
> > > +	}
> > > +	q->lb_in_addr_phys = rte_malloc_virt2iova(q->lb_in);
> > > +	q->lb_out = rte_zmalloc_socket(dev->device->driver->name,
> > > +			RTE_CACHE_LINE_SIZE,
> > > +			RTE_CACHE_LINE_SIZE, conf->socket);
> > > +	if (q->lb_out == NULL) {
> > > +		rte_bbdev_log(ERR, "Failed to allocate lb_out memory");
> > > +		return -ENOMEM;
> > > +	}
> > > +	q->lb_out_addr_phys = rte_malloc_virt2iova(q->lb_out);
> > > +
> > > +	/*
> > > +	 * Software queue ring wraps synchronously with the HW when it
> > > reaches
> > > +	 * the boundary of the maximum allocated queue size, no matter
> > > what the
> > > +	 * sw queue size is. This wrapping is guarded by setting the
> > > wrap_mask
> > > +	 * to represent the maximum queue size as allocated at the time
> > > when
> > > +	 * the device has been setup (in configure()).
> > > +	 *
> > > +	 * The queue depth is set to the queue size value (conf-
> > > >queue_size).
> > > +	 * This limits the occupancy of the queue at any point of time, so that
> > > +	 * the queue does not get swamped with enqueue requests.
> > > +	 */
> > > +	q->sw_ring_depth = conf->queue_size;
> > > +	q->sw_ring_wrap_mask = d->sw_ring_max_depth - 1;
> > > +
> > > +	q->op_type = conf->op_type;
> > > +
> > > +	q_idx = acc100_find_free_queue_idx(dev, conf);
> > > +	if (q_idx == -1) {
> > > +		rte_free(q);
> > > +		return -1;
> > > +	}
> > > +
> > > +	q->qgrp_id = (q_idx >> GRP_ID_SHIFT) & 0xF;
> > > +	q->vf_id = (q_idx >> VF_ID_SHIFT)  & 0x3F;
> > > +	q->aq_id = q_idx & 0xF;
> > > +	q->aq_depth = (conf->op_type ==  RTE_BBDEV_OP_TURBO_DEC) ?
> > > +			(1 << d->acc100_conf.q_ul_4g.aq_depth_log2) :
> > > +			(1 << d->acc100_conf.q_dl_4g.aq_depth_log2);
> > > +
> > > +	q->mmio_reg_enqueue = RTE_PTR_ADD(d->mmio_base,
> > > +			queue_offset(d->pf_device,
> > > +					q->vf_id, q->qgrp_id, q->aq_id));
> > > +
> > > +	rte_bbdev_log_debug(
> > > +			"Setup dev%u q%u: qgrp_id=%u, vf_id=%u,
> > > aq_id=%u, aq_depth=%u, mmio_reg_enqueue=%p",
> > > +			dev->data->dev_id, queue_id, q->qgrp_id, q->vf_id,
> > > +			q->aq_id, q->aq_depth, q->mmio_reg_enqueue);
> > > +
> > > +	dev->data->queues[queue_id].queue_private = q;
> > > +	return 0;
> > > +}
> > > +
> > > +/* Release ACC100 queue */
> > > +static int
> > > +acc100_queue_release(struct rte_bbdev *dev, uint16_t q_id) {
> > > +	struct acc100_device *d = dev->data->dev_private;
> > > +	struct acc100_queue *q = dev->data->queues[q_id].queue_private;
> > > +
> > > +	if (q != NULL) {
> > > +		/* Mark the Queue as un-assigned */
> > > +		d->q_assigned_bit_map[q->qgrp_id] &= (0xFFFFFFFF -
> > > +				(1 << q->aq_id));
> > > +		rte_free(q->lb_in);
> > > +		rte_free(q->lb_out);
> > > +		rte_free(q);
> > > +		dev->data->queues[q_id].queue_private = NULL;
> > > +	}
> > > +
> > >  	return 0;
> > >  }
> > >
> > > @@ -258,8 +673,11 @@
> > >  }
> > >
> > >  static const struct rte_bbdev_ops acc100_bbdev_ops = {
> > > +	.setup_queues = acc100_setup_queues,
> > >  	.close = acc100_dev_close,
> > >  	.info_get = acc100_dev_info_get,
> > > +	.queue_setup = acc100_queue_setup,
> > > +	.queue_release = acc100_queue_release,
> > >  };
> > >
> > >  /* ACC100 PCI PF address map */
> > > diff --git a/drivers/baseband/acc100/rte_acc100_pmd.h
> > > b/drivers/baseband/acc100/rte_acc100_pmd.h
> > > index 662e2c8..0e2b79c 100644
> > > --- a/drivers/baseband/acc100/rte_acc100_pmd.h
> > > +++ b/drivers/baseband/acc100/rte_acc100_pmd.h
> > > @@ -518,11 +518,56 @@ struct acc100_registry_addr {
> > >  	.ddr_range = HWVfDmaDdrBaseRangeRoVf,  };
> > >
> > > +/* Structure associated with each queue. */ struct
> > > +__rte_cache_aligned acc100_queue {
> > > +	union acc100_dma_desc *ring_addr;  /* Virtual address of sw ring */
> > > +	rte_iova_t ring_addr_phys;  /* Physical address of software ring */
> > > +	uint32_t sw_ring_head;  /* software ring head */
> > > +	uint32_t sw_ring_tail;  /* software ring tail */
> > > +	/* software ring size (descriptors, not bytes) */
> > > +	uint32_t sw_ring_depth;
> > > +	/* mask used to wrap enqueued descriptors on the sw ring */
> > > +	uint32_t sw_ring_wrap_mask;
> > > +	/* MMIO register used to enqueue descriptors */
> > > +	void *mmio_reg_enqueue;
> > > +	uint8_t vf_id;  /* VF ID (max = 63) */
> > > +	uint8_t qgrp_id;  /* Queue Group ID */
> > > +	uint16_t aq_id;  /* Atomic Queue ID */
> > > +	uint16_t aq_depth;  /* Depth of atomic queue */
> > > +	uint32_t aq_enqueued;  /* Count how many "batches" have been
> > > enqueued */
> > > +	uint32_t aq_dequeued;  /* Count how many "batches" have been
> > > dequeued */
> > > +	uint32_t irq_enable;  /* Enable ops dequeue interrupts if set to 1 */
> > > +	struct rte_mempool *fcw_mempool;  /* FCW mempool */
> > > +	enum rte_bbdev_op_type op_type;  /* Type of this Queue: TE or TD
> > > */
> > > +	/* Internal Buffers for loopback input */
> > > +	uint8_t *lb_in;
> > > +	uint8_t *lb_out;
> > > +	rte_iova_t lb_in_addr_phys;
> > > +	rte_iova_t lb_out_addr_phys;
> > > +	struct acc100_device *d;
> > > +};
> > > +
> > >  /* Private data structure for each ACC100 device */  struct acc100_device
> {
> > >  	void *mmio_base;  /**< Base address of MMIO registers (BAR0) */
> > > +	void *sw_rings_base;  /* Base addr of un-aligned memory for sw
> > > rings */
> > > +	void *sw_rings;  /* 64MBs of 64MB aligned memory for sw rings */
> > > +	rte_iova_t sw_rings_phys;  /* Physical address of sw_rings */
> > > +	/* Virtual address of the info memory routed to the this function
> > > under
> > > +	 * operation, whether it is PF or VF.
> > > +	 */
> > > +	union acc100_harq_layout_data *harq_layout;
> > > +	uint32_t sw_ring_size;
> > >  	uint32_t ddr_size; /* Size in kB */
> > > +	uint32_t *tail_ptrs; /* Base address of response tail pointer buffer */
> > > +	rte_iova_t tail_ptr_phys; /* Physical address of tail pointers */
> > > +	/* Max number of entries available for each queue in device,
> > > depending
> > > +	 * on how many queues are enabled with configure()
> > > +	 */
> > > +	uint32_t sw_ring_max_depth;
> > >  	struct acc100_conf acc100_conf; /* ACC100 Initial configuration */
> > > +	/* Bitmap capturing which Queues have already been assigned */
> > > +	uint16_t q_assigned_bit_map[ACC100_NUM_QGRPS];
> > >  	bool pf_device; /**< True if this is a PF ACC100 device */
> > >  	bool configured; /**< True if this ACC100 device is configured */
> > > };
> > > --
> > > 1.8.3.1
  
Chautru, Nicolas Sept. 3, 2020, 10:48 p.m. UTC | #4
> From: Xu, Rosen <rosen.xu@intel.com>
> 
> Hi,
> 
> > -----Original Message-----
> > From: Chautru, Nicolas <nicolas.chautru@intel.com>
> > Sent: Sunday, August 30, 2020 1:48
> > To: Xu, Rosen <rosen.xu@intel.com>; dev@dpdk.org; akhil.goyal@nxp.com
> > Cc: Richardson, Bruce <bruce.richardson@intel.com>
> > Subject: RE: [dpdk-dev] [PATCH v3 04/11] baseband/acc100: add queue
> > configuration
> >
> > Hi,
> >
> > > From: Xu, Rosen <rosen.xu@intel.com>
> > >
> > > Hi,
> > >
> > > > -----Original Message-----
> > > > From: dev <dev-bounces@dpdk.org> On Behalf Of Nicolas Chautru
> > > > Sent: Wednesday, August 19, 2020 8:25
> > > > To: dev@dpdk.org; akhil.goyal@nxp.com
> > > > Cc: Richardson, Bruce <bruce.richardson@intel.com>; Chautru,
> > > > Nicolas <nicolas.chautru@intel.com>
> > > > Subject: [dpdk-dev] [PATCH v3 04/11] baseband/acc100: add queue
> > > > configuration
> > > >
> > > > Adding function to create and configure queues for the device.
> > > > Still no capability.
> > > >
> > > > Signed-off-by: Nicolas Chautru <nicolas.chautru@intel.com>
> > > > ---
> > > >  drivers/baseband/acc100/rte_acc100_pmd.c | 420
> > > > ++++++++++++++++++++++++++++++-
> > > > drivers/baseband/acc100/rte_acc100_pmd.h |  45 ++++
> > > >  2 files changed, 464 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git a/drivers/baseband/acc100/rte_acc100_pmd.c
> > > > b/drivers/baseband/acc100/rte_acc100_pmd.c
> > > > index 7807a30..7a21c57 100644
> > > > --- a/drivers/baseband/acc100/rte_acc100_pmd.c
> > > > +++ b/drivers/baseband/acc100/rte_acc100_pmd.c
> > > > @@ -26,6 +26,22 @@
> > > >  RTE_LOG_REGISTER(acc100_logtype, pmd.bb.acc100, NOTICE);  #endif
> > > >
> > > > +/* Write to MMIO register address */ static inline void
> > > > +mmio_write(void *addr, uint32_t value) {
> > > > +	*((volatile uint32_t *)(addr)) = rte_cpu_to_le_32(value); }
> > > > +
> > > > +/* Write a register of a ACC100 device */ static inline void
> > > > +acc100_reg_write(struct acc100_device *d, uint32_t offset,
> > > > +uint32_t
> > > > +payload) {
> > > > +	void *reg_addr = RTE_PTR_ADD(d->mmio_base, offset);
> > > > +	mmio_write(reg_addr, payload);
> > > > +	usleep(1000);
> > > > +}
> > > > +
> > > >  /* Read a register of a ACC100 device */  static inline uint32_t
> > > > acc100_reg_read(struct acc100_device *d, uint32_t offset) @@ -36,6
> > > > +52,22 @@
> > > >  	return rte_le_to_cpu_32(ret);
> > > >  }
> > > >
> > > > +/* Basic Implementation of Log2 for exact 2^N */ static inline
> > > > +uint32_t log2_basic(uint32_t value) {
> > > > +	return (value == 0) ? 0 : __builtin_ctz(value); }
> > > > +
> > > > +/* Calculate memory alignment offset assuming alignment is 2^N */
> > > > +static inline uint32_t calc_mem_alignment_offset(void
> > > > +*unaligned_virt_mem, uint32_t alignment) {
> > > > +	rte_iova_t unaligned_phy_mem =
> > > > rte_malloc_virt2iova(unaligned_virt_mem);
> > > > +	return (uint32_t)(alignment -
> > > > +			(unaligned_phy_mem & (alignment-1))); }
> > > > +
> > > >  /* Calculate the offset of the enqueue register */  static inline
> > > > uint32_t queue_offset(bool pf_device, uint8_t vf_id, uint8_t
> > > > qgrp_id, uint16_t aq_id) @@ -204,10 +236,393 @@
> > > >  			acc100_conf->q_dl_5g.aq_depth_log2);
> > > >  }
> > > >
> > > > +static void
> > > > +free_base_addresses(void **base_addrs, int size) {
> > > > +	int i;
> > > > +	for (i = 0; i < size; i++)
> > > > +		rte_free(base_addrs[i]);
> > > > +}
> > > > +
> > > > +static inline uint32_t
> > > > +get_desc_len(void)
> > > > +{
> > > > +	return sizeof(union acc100_dma_desc); }
> > > > +
> > > > +/* Allocate the 2 * 64MB block for the sw rings */ static int
> > > > +alloc_2x64mb_sw_rings_mem(struct rte_bbdev *dev, struct
> > > > +acc100_device
> > > > *d,
> > > > +		int socket)
> > > > +{
> > > > +	uint32_t sw_ring_size = ACC100_SIZE_64MBYTE;
> > > > +	d->sw_rings_base = rte_zmalloc_socket(dev->device->driver-
> > > > >name,
> > > > +			2 * sw_ring_size, RTE_CACHE_LINE_SIZE, socket);
> > > > +	if (d->sw_rings_base == NULL) {
> > > > +		rte_bbdev_log(ERR, "Failed to allocate memory for %s:%u",
> > > > +				dev->device->driver->name,
> > > > +				dev->data->dev_id);
> > > > +		return -ENOMEM;
> > > > +	}
> > > > +	memset(d->sw_rings_base, 0, ACC100_SIZE_64MBYTE);
> > > > +	uint32_t next_64mb_align_offset = calc_mem_alignment_offset(
> > > > +			d->sw_rings_base, ACC100_SIZE_64MBYTE);
> > > > +	d->sw_rings = RTE_PTR_ADD(d->sw_rings_base,
> > > > next_64mb_align_offset);
> > > > +	d->sw_rings_phys = rte_malloc_virt2iova(d->sw_rings_base) +
> > > > +			next_64mb_align_offset;
> > > > +	d->sw_ring_size = MAX_QUEUE_DEPTH * get_desc_len();
> > > > +	d->sw_ring_max_depth = d->sw_ring_size / get_desc_len();
> > > > +
> > > > +	return 0;
> > > > +}
> > >
> > > Why not a common alloc memory function but special function for
> > > different memory size?
> >
> > This is a bit convoluted, but that is because the first-attempt method,
> > which is optimal (minimal footprint), may not always find aligned memory.
> 
> What's convoluted? Can you explain?
> For packet processing, in most scenarios, isn't the memory already aligned
> when we alloc memory?

Hi Rosen, 
This is related to both the alignment and the size of the contiguous block of pinned-down memory: a 64MB contiguous block aligned on a 64MB boundary of the physical address space (not the virtual one).
The first method can potentially fail, hence it is run incrementally, while the 2nd version is kept as a safe fallback; it is more wasteful in terms of footprint (hence not used by default).
That is the part I considered "convoluted" about reliably allocating this memory. It would be possible to only use the 2nd version, which would look cleaner in terms of code but would be more wasteful in memory usage.
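
For illustration only (this is not code from the patch), the condition checked by the first method boils down to: an allocation attempt is kept only if it ends before the next 64MB-aligned physical address, so the rings land in one contiguous region that does not straddle a 64MB physical boundary. A minimal standalone sketch of that check, with fits_before_next_64mb_boundary() and SIZE_64MB being names made up here:

#include <stdint.h>
#include <stdbool.h>

#define SIZE_64MB (64ULL * 1024 * 1024)

/* Sketch only: mirrors the boundary check in alloc_sw_rings_min_mem().
 * Returns true if [phys_base, phys_base + size) ends strictly before the
 * next 64MB-aligned physical address above phys_base.
 */
static bool
fits_before_next_64mb_boundary(uint64_t phys_base, uint32_t size)
{
	uint64_t next_boundary = (phys_base + SIZE_64MB) &
			~(uint64_t)(SIZE_64MB - 1);
	return phys_base + size < next_boundary;
}

If this check fails for every attempt, the 2nd method simply over-allocates 2 * 64MB so that a fully aligned 64MB window is guaranteed to exist somewhere inside the block.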



> >
> > >
> > > > +/* Attempt to allocate minimised memory space for sw rings */
> > > > +static void alloc_sw_rings_min_mem(struct rte_bbdev *dev, struct
> > > > acc100_device
> > > > +*d,
> > > > +		uint16_t num_queues, int socket) {
> > > > +	rte_iova_t sw_rings_base_phy, next_64mb_align_addr_phy;
> > > > +	uint32_t next_64mb_align_offset;
> > > > +	rte_iova_t sw_ring_phys_end_addr;
> > > > +	void *base_addrs[SW_RING_MEM_ALLOC_ATTEMPTS];
> > > > +	void *sw_rings_base;
> > > > +	int i = 0;
> > > > +	uint32_t q_sw_ring_size = MAX_QUEUE_DEPTH * get_desc_len();
> > > > +	uint32_t dev_sw_ring_size = q_sw_ring_size * num_queues;
> > > > +
> > > > +	/* Find an aligned block of memory to store sw rings */
> > > > +	while (i < SW_RING_MEM_ALLOC_ATTEMPTS) {
> > > > +		/*
> > > > +		 * sw_ring allocated memory is guaranteed to be aligned to
> > > > +		 * q_sw_ring_size at the condition that the requested size is
> > > > +		 * less than the page size
> > > > +		 */
> > > > +		sw_rings_base = rte_zmalloc_socket(
> > > > +				dev->device->driver->name,
> > > > +				dev_sw_ring_size, q_sw_ring_size, socket);
> > > > +
> > > > +		if (sw_rings_base == NULL) {
> > > > +			rte_bbdev_log(ERR,
> > > > +					"Failed to allocate memory
> > > > for %s:%u",
> > > > +					dev->device->driver->name,
> > > > +					dev->data->dev_id);
> > > > +			break;
> > > > +		}
> > > > +
> > > > +		sw_rings_base_phy = rte_malloc_virt2iova(sw_rings_base);
> > > > +		next_64mb_align_offset = calc_mem_alignment_offset(
> > > > +				sw_rings_base, ACC100_SIZE_64MBYTE);
> > > > +		next_64mb_align_addr_phy = sw_rings_base_phy +
> > > > +				next_64mb_align_offset;
> > > > +		sw_ring_phys_end_addr = sw_rings_base_phy +
> > > > dev_sw_ring_size;
> > > > +
> > > > +		/* Check if the end of the sw ring memory block is before the
> > > > +		 * start of next 64MB aligned mem address
> > > > +		 */
> > > > +		if (sw_ring_phys_end_addr < next_64mb_align_addr_phy) {
> > > > +			d->sw_rings_phys = sw_rings_base_phy;
> > > > +			d->sw_rings = sw_rings_base;
> > > > +			d->sw_rings_base = sw_rings_base;
> > > > +			d->sw_ring_size = q_sw_ring_size;
> > > > +			d->sw_ring_max_depth = MAX_QUEUE_DEPTH;
> > > > +			break;
> > > > +		}
> > > > +		/* Store the address of the unaligned mem block */
> > > > +		base_addrs[i] = sw_rings_base;
> > > > +		i++;
> > > > +	}
> > > > +
> > > > +	/* Free all unaligned blocks of mem allocated in the loop */
> > > > +	free_base_addresses(base_addrs, i); }
> > >
> > > > It's strange to first alloc memory and then free the memory, but with no
> > > > operations on this memory.
> >
> > I may be missing your point. We are freeing the exact same mem we did get
> > from rte_zmalloc.
> > Note that the base_addrs array refers to multiple attempts of mallocs,
> > not multiple operations on a ring.
> 
> You alloc memory sw_rings_base, and after some translation assign this memory
> to acc100_device *d, yet before the function returns this memory has been freed.

If you follow the logic, this actually only frees the memory from the attempts which were not successfully aligned, not the one which ends up being used for the sw rings.
The actual memory for the sw rings is obviously used and only gets freed when closing the device below, i.e. rte_free(d->sw_rings_base);
Let me know if this is unclear. I could add more comments if it is not obvious from the code, i.e. /* Free all _unaligned_ blocks of mem allocated in the loop */

Thanks for your review. I can see how it can look a bit odd initially. 
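
For what it's worth, the attempt loop reduces to the pattern below (a simplified standalone sketch, not the driver code: pick_aligned_block(), the ok() predicate and the plain malloc/free calls are stand-ins for rte_zmalloc_socket()/rte_free() and the 64MB boundary check):

#include <stdlib.h>

#define ATTEMPTS 5

/* Sketch only: keep the first candidate block that satisfies the predicate,
 * remember the rejected ones, and free only the rejected ones afterwards.
 * In the driver, the kept block becomes d->sw_rings_base and is released
 * later in acc100_dev_close().
 */
static void *
pick_aligned_block(size_t size, int (*ok)(void *))
{
	void *rejected[ATTEMPTS];
	void *kept = NULL;
	int n = 0, i;

	for (i = 0; i < ATTEMPTS; i++) {
		void *p = malloc(size);
		if (p == NULL)
			break;
		if (ok(p)) {
			kept = p;	/* not freed here */
			break;
		}
		rejected[n++] = p;	/* failed the check, freed below */
	}
	for (i = 0; i < n; i++)
		free(rejected[i]);
	return kept;
}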

> 
> > >
> > > > +
> > > > +/* Allocate 64MB memory used for all software rings */ static int
> > > > +acc100_setup_queues(struct rte_bbdev *dev, uint16_t num_queues,
> > int
> > > > +socket_id) {
> > > > +	uint32_t phys_low, phys_high, payload;
> > > > +	struct acc100_device *d = dev->data->dev_private;
> > > > +	const struct acc100_registry_addr *reg_addr;
> > > > +
> > > > +	if (d->pf_device && !d->acc100_conf.pf_mode_en) {
> > > > +		rte_bbdev_log(NOTICE,
> > > > +				"%s has PF mode disabled. This PF can't be
> > > > used.",
> > > > +				dev->data->name);
> > > > +		return -ENODEV;
> > > > +	}
> > > > +
> > > > +	alloc_sw_rings_min_mem(dev, d, num_queues, socket_id);
> > > > +
> > > > +	/* If minimal memory space approach failed, then allocate
> > > > +	 * the 2 * 64MB block for the sw rings
> > > > +	 */
> > > > +	if (d->sw_rings == NULL)
> > > > +		alloc_2x64mb_sw_rings_mem(dev, d, socket_id);
> > > > +
> > > > +	/* Configure ACC100 with the base address for DMA descriptor rings
> > > > +	 * Same descriptor rings used for UL and DL DMA Engines
> > > > +	 * Note : Assuming only VF0 bundle is used for PF mode
> > > > +	 */
> > > > +	phys_high = (uint32_t)(d->sw_rings_phys >> 32);
> > > > +	phys_low  = (uint32_t)(d->sw_rings_phys &
> > > > ~(ACC100_SIZE_64MBYTE-1));
> > > > +
> > > > +	/* Choose correct registry addresses for the device type */
> > > > +	if (d->pf_device)
> > > > +		reg_addr = &pf_reg_addr;
> > > > +	else
> > > > +		reg_addr = &vf_reg_addr;
> > > > +
> > > > +	/* Read the populated cfg from ACC100 registers */
> > > > +	fetch_acc100_config(dev);
> > > > +
> > > > +	/* Mark as configured properly */
> > > > +	d->configured = true;
> > > > +
> > > > +	/* Release AXI from PF */
> > > > +	if (d->pf_device)
> > > > +		acc100_reg_write(d, HWPfDmaAxiControl, 1);
> > > > +
> > > > +	acc100_reg_write(d, reg_addr->dma_ring_ul5g_hi, phys_high);
> > > > +	acc100_reg_write(d, reg_addr->dma_ring_ul5g_lo, phys_low);
> > > > +	acc100_reg_write(d, reg_addr->dma_ring_dl5g_hi, phys_high);
> > > > +	acc100_reg_write(d, reg_addr->dma_ring_dl5g_lo, phys_low);
> > > > +	acc100_reg_write(d, reg_addr->dma_ring_ul4g_hi, phys_high);
> > > > +	acc100_reg_write(d, reg_addr->dma_ring_ul4g_lo, phys_low);
> > > > +	acc100_reg_write(d, reg_addr->dma_ring_dl4g_hi, phys_high);
> > > > +	acc100_reg_write(d, reg_addr->dma_ring_dl4g_lo, phys_low);
> > > > +
> > > > +	/*
> > > > +	 * Configure Ring Size to the max queue ring size
> > > > +	 * (used for wrapping purpose)
> > > > +	 */
> > > > +	payload = log2_basic(d->sw_ring_size / 64);
> > > > +	acc100_reg_write(d, reg_addr->ring_size, payload);
> > > > +
> > > > +	/* Configure tail pointer for use when SDONE enabled */
> > > > +	d->tail_ptrs = rte_zmalloc_socket(
> > > > +			dev->device->driver->name,
> > > > +			ACC100_NUM_QGRPS * ACC100_NUM_AQS *
> > > > sizeof(uint32_t),
> > > > +			RTE_CACHE_LINE_SIZE, socket_id);
> > > > +	if (d->tail_ptrs == NULL) {
> > > > +		rte_bbdev_log(ERR, "Failed to allocate tail ptr for %s:%u",
> > > > +				dev->device->driver->name,
> > > > +				dev->data->dev_id);
> > > > +		rte_free(d->sw_rings);
> > > > +		return -ENOMEM;
> > > > +	}
> > > > +	d->tail_ptr_phys = rte_malloc_virt2iova(d->tail_ptrs);
> > > > +
> > > > +	phys_high = (uint32_t)(d->tail_ptr_phys >> 32);
> > > > +	phys_low  = (uint32_t)(d->tail_ptr_phys);
> > > > +	acc100_reg_write(d, reg_addr->tail_ptrs_ul5g_hi, phys_high);
> > > > +	acc100_reg_write(d, reg_addr->tail_ptrs_ul5g_lo, phys_low);
> > > > +	acc100_reg_write(d, reg_addr->tail_ptrs_dl5g_hi, phys_high);
> > > > +	acc100_reg_write(d, reg_addr->tail_ptrs_dl5g_lo, phys_low);
> > > > +	acc100_reg_write(d, reg_addr->tail_ptrs_ul4g_hi, phys_high);
> > > > +	acc100_reg_write(d, reg_addr->tail_ptrs_ul4g_lo, phys_low);
> > > > +	acc100_reg_write(d, reg_addr->tail_ptrs_dl4g_hi, phys_high);
> > > > +	acc100_reg_write(d, reg_addr->tail_ptrs_dl4g_lo, phys_low);
> > > > +
> > > > +	d->harq_layout = rte_zmalloc_socket("HARQ Layout",
> > > > +			ACC100_HARQ_LAYOUT * sizeof(*d->harq_layout),
> > > > +			RTE_CACHE_LINE_SIZE, dev->data->socket_id);
> > > > +
> > > > +	rte_bbdev_log_debug(
> > > > +			"ACC100 (%s) configured  sw_rings = %p,
> > > > sw_rings_phys = %#"
> > > > +			PRIx64, dev->data->name, d->sw_rings, d-
> > > > >sw_rings_phys);
> > > > +
> > > > +	return 0;
> > > > +}
> > > > +
> > > >  /* Free 64MB memory used for software rings */  static int -
> > > > acc100_dev_close(struct rte_bbdev *dev  __rte_unused)
> > > > +acc100_dev_close(struct rte_bbdev *dev)
> > > >  {
> > > > +	struct acc100_device *d = dev->data->dev_private;
> > > > +	if (d->sw_rings_base != NULL) {
> > > > +		rte_free(d->tail_ptrs);
> > > > +		rte_free(d->sw_rings_base);
> > > > +		d->sw_rings_base = NULL;
> > > > +	}
> > > > +	usleep(1000);
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +
> > > > +/**
> > > > + * Report a ACC100 queue index which is free
> > > > + * Return 0 to 16k for a valid queue_idx or -1 when no queue is
> > > > +available
> > > > + * Note : Only supporting VF0 Bundle for PF mode  */ static int
> > > > +acc100_find_free_queue_idx(struct rte_bbdev *dev,
> > > > +		const struct rte_bbdev_queue_conf *conf) {
> > > > +	struct acc100_device *d = dev->data->dev_private;
> > > > +	int op_2_acc[5] = {0, UL_4G, DL_4G, UL_5G, DL_5G};
> > > > +	int acc = op_2_acc[conf->op_type];
> > > > +	struct rte_q_topology_t *qtop = NULL;
> > > > +	qtopFromAcc(&qtop, acc, &(d->acc100_conf));
> > > > +	if (qtop == NULL)
> > > > +		return -1;
> > > > +	/* Identify matching QGroup Index which are sorted in priority
> > > > +order
> > > > */
> > > > +	uint16_t group_idx = qtop->first_qgroup_index;
> > > > +	group_idx += conf->priority;
> > > > +	if (group_idx >= ACC100_NUM_QGRPS ||
> > > > +			conf->priority >= qtop->num_qgroups) {
> > > > +		rte_bbdev_log(INFO, "Invalid Priority on %s, priority %u",
> > > > +				dev->data->name, conf->priority);
> > > > +		return -1;
> > > > +	}
> > > > +	/* Find a free AQ_idx  */
> > > > +	uint16_t aq_idx;
> > > > +	for (aq_idx = 0; aq_idx < qtop->num_aqs_per_groups; aq_idx++) {
> > > > +		if (((d->q_assigned_bit_map[group_idx] >> aq_idx) & 0x1)
> > > > == 0) {
> > > > +			/* Mark the Queue as assigned */
> > > > +			d->q_assigned_bit_map[group_idx] |= (1 << aq_idx);
> > > > +			/* Report the AQ Index */
> > > > +			return (group_idx << GRP_ID_SHIFT) + aq_idx;
> > > > +		}
> > > > +	}
> > > > +	rte_bbdev_log(INFO, "Failed to find free queue on %s, priority %u",
> > > > +			dev->data->name, conf->priority);
> > > > +	return -1;
> > > > +}
> > > > +
> > > > +/* Setup ACC100 queue */
> > > > +static int
> > > > +acc100_queue_setup(struct rte_bbdev *dev, uint16_t queue_id,
> > > > +		const struct rte_bbdev_queue_conf *conf) {
> > > > +	struct acc100_device *d = dev->data->dev_private;
> > > > +	struct acc100_queue *q;
> > > > +	int16_t q_idx;
> > > > +
> > > > +	/* Allocate the queue data structure. */
> > > > +	q = rte_zmalloc_socket(dev->device->driver->name, sizeof(*q),
> > > > +			RTE_CACHE_LINE_SIZE, conf->socket);
> > > > +	if (q == NULL) {
> > > > +		rte_bbdev_log(ERR, "Failed to allocate queue memory");
> > > > +		return -ENOMEM;
> > > > +	}
> > > > +
> > > > +	q->d = d;
> > > > +	q->ring_addr = RTE_PTR_ADD(d->sw_rings, (d->sw_ring_size *
> > > > queue_id));
> > > > +	q->ring_addr_phys = d->sw_rings_phys + (d->sw_ring_size *
> > > > queue_id);
> > > > +
> > > > +	/* Prepare the Ring with default descriptor format */
> > > > +	union acc100_dma_desc *desc = NULL;
> > > > +	unsigned int desc_idx, b_idx;
> > > > +	int fcw_len = (conf->op_type == RTE_BBDEV_OP_LDPC_ENC ?
> > > > +		ACC100_FCW_LE_BLEN : (conf->op_type ==
> > > > RTE_BBDEV_OP_TURBO_DEC ?
> > > > +		ACC100_FCW_TD_BLEN : ACC100_FCW_LD_BLEN));
> > > > +
> > > > +	for (desc_idx = 0; desc_idx < d->sw_ring_max_depth; desc_idx++) {
> > > > +		desc = q->ring_addr + desc_idx;
> > > > +		desc->req.word0 = ACC100_DMA_DESC_TYPE;
> > > > +		desc->req.word1 = 0; /**< Timestamp */
> > > > +		desc->req.word2 = 0;
> > > > +		desc->req.word3 = 0;
> > > > +		uint64_t fcw_offset = (desc_idx << 8) +
> > > > ACC100_DESC_FCW_OFFSET;
> > > > +		desc->req.data_ptrs[0].address = q->ring_addr_phys +
> > > > fcw_offset;
> > > > +		desc->req.data_ptrs[0].blen = fcw_len;
> > > > +		desc->req.data_ptrs[0].blkid = ACC100_DMA_BLKID_FCW;
> > > > +		desc->req.data_ptrs[0].last = 0;
> > > > +		desc->req.data_ptrs[0].dma_ext = 0;
> > > > +		for (b_idx = 1; b_idx < ACC100_DMA_MAX_NUM_POINTERS
> > > > - 1;
> > > > +				b_idx++) {
> > > > +			desc->req.data_ptrs[b_idx].blkid =
> > > > ACC100_DMA_BLKID_IN;
> > > > +			desc->req.data_ptrs[b_idx].last = 1;
> > > > +			desc->req.data_ptrs[b_idx].dma_ext = 0;
> > > > +			b_idx++;
> > > > +			desc->req.data_ptrs[b_idx].blkid =
> > > > +					ACC100_DMA_BLKID_OUT_ENC;
> > > > +			desc->req.data_ptrs[b_idx].last = 1;
> > > > +			desc->req.data_ptrs[b_idx].dma_ext = 0;
> > > > +		}
> > > > +		/* Preset some fields of LDPC FCW */
> > > > +		desc->req.fcw_ld.FCWversion = ACC100_FCW_VER;
> > > > +		desc->req.fcw_ld.gain_i = 1;
> > > > +		desc->req.fcw_ld.gain_h = 1;
> > > > +	}
> > > > +
> > > > +	q->lb_in = rte_zmalloc_socket(dev->device->driver->name,
> > > > +			RTE_CACHE_LINE_SIZE,
> > > > +			RTE_CACHE_LINE_SIZE, conf->socket);
> > > > +	if (q->lb_in == NULL) {
> > > > +		rte_bbdev_log(ERR, "Failed to allocate lb_in memory");
> > > > +		return -ENOMEM;
> > > > +	}
> > > > +	q->lb_in_addr_phys = rte_malloc_virt2iova(q->lb_in);
> > > > +	q->lb_out = rte_zmalloc_socket(dev->device->driver->name,
> > > > +			RTE_CACHE_LINE_SIZE,
> > > > +			RTE_CACHE_LINE_SIZE, conf->socket);
> > > > +	if (q->lb_out == NULL) {
> > > > +		rte_bbdev_log(ERR, "Failed to allocate lb_out memory");
> > > > +		return -ENOMEM;
> > > > +	}
> > > > +	q->lb_out_addr_phys = rte_malloc_virt2iova(q->lb_out);
> > > > +
> > > > +	/*
> > > > +	 * Software queue ring wraps synchronously with the HW when it
> > > > reaches
> > > > +	 * the boundary of the maximum allocated queue size, no matter
> > > > what the
> > > > +	 * sw queue size is. This wrapping is guarded by setting the
> > > > wrap_mask
> > > > +	 * to represent the maximum queue size as allocated at the time
> > > > when
> > > > +	 * the device has been setup (in configure()).
> > > > +	 *
> > > > +	 * The queue depth is set to the queue size value (conf-
> > > > >queue_size).
> > > > +	 * This limits the occupancy of the queue at any point of time, so that
> > > > +	 * the queue does not get swamped with enqueue requests.
> > > > +	 */
> > > > +	q->sw_ring_depth = conf->queue_size;
> > > > +	q->sw_ring_wrap_mask = d->sw_ring_max_depth - 1;
> > > > +
> > > > +	q->op_type = conf->op_type;
> > > > +
> > > > +	q_idx = acc100_find_free_queue_idx(dev, conf);
> > > > +	if (q_idx == -1) {
> > > > +		rte_free(q);
> > > > +		return -1;
> > > > +	}
> > > > +
> > > > +	q->qgrp_id = (q_idx >> GRP_ID_SHIFT) & 0xF;
> > > > +	q->vf_id = (q_idx >> VF_ID_SHIFT)  & 0x3F;
> > > > +	q->aq_id = q_idx & 0xF;
> > > > +	q->aq_depth = (conf->op_type ==  RTE_BBDEV_OP_TURBO_DEC) ?
> > > > +			(1 << d->acc100_conf.q_ul_4g.aq_depth_log2) :
> > > > +			(1 << d->acc100_conf.q_dl_4g.aq_depth_log2);
> > > > +
> > > > +	q->mmio_reg_enqueue = RTE_PTR_ADD(d->mmio_base,
> > > > +			queue_offset(d->pf_device,
> > > > +					q->vf_id, q->qgrp_id, q->aq_id));
> > > > +
> > > > +	rte_bbdev_log_debug(
> > > > +			"Setup dev%u q%u: qgrp_id=%u, vf_id=%u,
> > > > aq_id=%u, aq_depth=%u, mmio_reg_enqueue=%p",
> > > > +			dev->data->dev_id, queue_id, q->qgrp_id, q->vf_id,
> > > > +			q->aq_id, q->aq_depth, q->mmio_reg_enqueue);
> > > > +
> > > > +	dev->data->queues[queue_id].queue_private = q;
> > > > +	return 0;
> > > > +}
> > > > +
> > > > +/* Release ACC100 queue */
> > > > +static int
> > > > +acc100_queue_release(struct rte_bbdev *dev, uint16_t q_id) {
> > > > +	struct acc100_device *d = dev->data->dev_private;
> > > > +	struct acc100_queue *q = dev->data->queues[q_id].queue_private;
> > > > +
> > > > +	if (q != NULL) {
> > > > +		/* Mark the Queue as un-assigned */
> > > > +		d->q_assigned_bit_map[q->qgrp_id] &= (0xFFFFFFFF -
> > > > +				(1 << q->aq_id));
> > > > +		rte_free(q->lb_in);
> > > > +		rte_free(q->lb_out);
> > > > +		rte_free(q);
> > > > +		dev->data->queues[q_id].queue_private = NULL;
> > > > +	}
> > > > +
> > > >  	return 0;
> > > >  }
> > > >
> > > > @@ -258,8 +673,11 @@
> > > >  }
> > > >
> > > >  static const struct rte_bbdev_ops acc100_bbdev_ops = {
> > > > +	.setup_queues = acc100_setup_queues,
> > > >  	.close = acc100_dev_close,
> > > >  	.info_get = acc100_dev_info_get,
> > > > +	.queue_setup = acc100_queue_setup,
> > > > +	.queue_release = acc100_queue_release,
> > > >  };
> > > >
> > > >  /* ACC100 PCI PF address map */
> > > > diff --git a/drivers/baseband/acc100/rte_acc100_pmd.h
> > > > b/drivers/baseband/acc100/rte_acc100_pmd.h
> > > > index 662e2c8..0e2b79c 100644
> > > > --- a/drivers/baseband/acc100/rte_acc100_pmd.h
> > > > +++ b/drivers/baseband/acc100/rte_acc100_pmd.h
> > > > @@ -518,11 +518,56 @@ struct acc100_registry_addr {
> > > >  	.ddr_range = HWVfDmaDdrBaseRangeRoVf,  };
> > > >
> > > > +/* Structure associated with each queue. */ struct
> > > > +__rte_cache_aligned acc100_queue {
> > > > +	union acc100_dma_desc *ring_addr;  /* Virtual address of sw ring */
> > > > +	rte_iova_t ring_addr_phys;  /* Physical address of software ring */
> > > > +	uint32_t sw_ring_head;  /* software ring head */
> > > > +	uint32_t sw_ring_tail;  /* software ring tail */
> > > > +	/* software ring size (descriptors, not bytes) */
> > > > +	uint32_t sw_ring_depth;
> > > > +	/* mask used to wrap enqueued descriptors on the sw ring */
> > > > +	uint32_t sw_ring_wrap_mask;
> > > > +	/* MMIO register used to enqueue descriptors */
> > > > +	void *mmio_reg_enqueue;
> > > > +	uint8_t vf_id;  /* VF ID (max = 63) */
> > > > +	uint8_t qgrp_id;  /* Queue Group ID */
> > > > +	uint16_t aq_id;  /* Atomic Queue ID */
> > > > +	uint16_t aq_depth;  /* Depth of atomic queue */
> > > > +	uint32_t aq_enqueued;  /* Count how many "batches" have been
> > > > enqueued */
> > > > +	uint32_t aq_dequeued;  /* Count how many "batches" have been
> > > > dequeued */
> > > > +	uint32_t irq_enable;  /* Enable ops dequeue interrupts if set to 1 */
> > > > +	struct rte_mempool *fcw_mempool;  /* FCW mempool */
> > > > +	enum rte_bbdev_op_type op_type;  /* Type of this Queue: TE or TD
> > > > */
> > > > +	/* Internal Buffers for loopback input */
> > > > +	uint8_t *lb_in;
> > > > +	uint8_t *lb_out;
> > > > +	rte_iova_t lb_in_addr_phys;
> > > > +	rte_iova_t lb_out_addr_phys;
> > > > +	struct acc100_device *d;
> > > > +};
> > > > +
> > > >  /* Private data structure for each ACC100 device */  struct
> > > > acc100_device
> > {
> > > >  	void *mmio_base;  /**< Base address of MMIO registers (BAR0) */
> > > > +	void *sw_rings_base;  /* Base addr of un-aligned memory for sw
> > > > rings */
> > > > +	void *sw_rings;  /* 64MBs of 64MB aligned memory for sw rings */
> > > > +	rte_iova_t sw_rings_phys;  /* Physical address of sw_rings */
> > > > +	/* Virtual address of the info memory routed to the this
> > > > +function
> > > > under
> > > > +	 * operation, whether it is PF or VF.
> > > > +	 */
> > > > +	union acc100_harq_layout_data *harq_layout;
> > > > +	uint32_t sw_ring_size;
> > > >  	uint32_t ddr_size; /* Size in kB */
> > > > +	uint32_t *tail_ptrs; /* Base address of response tail pointer buffer */
> > > > +	rte_iova_t tail_ptr_phys; /* Physical address of tail pointers */
> > > > +	/* Max number of entries available for each queue in device,
> > > > depending
> > > > +	 * on how many queues are enabled with configure()
> > > > +	 */
> > > > +	uint32_t sw_ring_max_depth;
> > > >  	struct acc100_conf acc100_conf; /* ACC100 Initial configuration
> > > > */
> > > > +	/* Bitmap capturing which Queues have already been assigned */
> > > > +	uint16_t q_assigned_bit_map[ACC100_NUM_QGRPS];
> > > >  	bool pf_device; /**< True if this is a PF ACC100 device */
> > > >  	bool configured; /**< True if this ACC100 device is configured
> > > > */ };
> > > > --
> > > > 1.8.3.1
  
Xu, Rosen Sept. 4, 2020, 2:01 a.m. UTC | #5
Hi,

> -----Original Message-----
> From: Chautru, Nicolas <nicolas.chautru@intel.com>
> Sent: Friday, September 04, 2020 6:49
> To: Xu, Rosen <rosen.xu@intel.com>; dev@dpdk.org; akhil.goyal@nxp.com
> Cc: Richardson, Bruce <bruce.richardson@intel.com>
> Subject: RE: [dpdk-dev] [PATCH v3 04/11] baseband/acc100: add queue
> configuration
> 
> > From: Xu, Rosen <rosen.xu@intel.com>
> >
> > Hi,
> >
> > > -----Original Message-----
> > > From: Chautru, Nicolas <nicolas.chautru@intel.com>
> > > Sent: Sunday, August 30, 2020 1:48
> > > To: Xu, Rosen <rosen.xu@intel.com>; dev@dpdk.org;
> > > akhil.goyal@nxp.com
> > > Cc: Richardson, Bruce <bruce.richardson@intel.com>
> > > Subject: RE: [dpdk-dev] [PATCH v3 04/11] baseband/acc100: add queue
> > > configuration
> > >
> > > Hi,
> > >
> > > > From: Xu, Rosen <rosen.xu@intel.com>
> > > >
> > > > Hi,
> > > >
> > > > > -----Original Message-----
> > > > > From: dev <dev-bounces@dpdk.org> On Behalf Of Nicolas Chautru
> > > > > Sent: Wednesday, August 19, 2020 8:25
> > > > > To: dev@dpdk.org; akhil.goyal@nxp.com
> > > > > Cc: Richardson, Bruce <bruce.richardson@intel.com>; Chautru,
> > > > > Nicolas <nicolas.chautru@intel.com>
> > > > > Subject: [dpdk-dev] [PATCH v3 04/11] baseband/acc100: add queue
> > > > > configuration
> > > > >
> > > > > Adding function to create and configure queues for the device.
> > > > > Still no capability.
> > > > >
> > > > > Signed-off-by: Nicolas Chautru <nicolas.chautru@intel.com>
> > > > > ---
> > > > >  drivers/baseband/acc100/rte_acc100_pmd.c | 420
> > > > > ++++++++++++++++++++++++++++++-
> > > > > drivers/baseband/acc100/rte_acc100_pmd.h |  45 ++++
> > > > >  2 files changed, 464 insertions(+), 1 deletion(-)
> > > > >
> > > > > diff --git a/drivers/baseband/acc100/rte_acc100_pmd.c
> > > > > b/drivers/baseband/acc100/rte_acc100_pmd.c
> > > > > index 7807a30..7a21c57 100644
> > > > > --- a/drivers/baseband/acc100/rte_acc100_pmd.c
> > > > > +++ b/drivers/baseband/acc100/rte_acc100_pmd.c
> > > > > @@ -26,6 +26,22 @@
> > > > >  RTE_LOG_REGISTER(acc100_logtype, pmd.bb.acc100, NOTICE);
> > > > > #endif
> > > > >
> > > > > +/* Write to MMIO register address */ static inline void
> > > > > +mmio_write(void *addr, uint32_t value) {
> > > > > +	*((volatile uint32_t *)(addr)) = rte_cpu_to_le_32(value); }
> > > > > +
> > > > > +/* Write a register of a ACC100 device */ static inline void
> > > > > +acc100_reg_write(struct acc100_device *d, uint32_t offset,
> > > > > +uint32_t
> > > > > +payload) {
> > > > > +	void *reg_addr = RTE_PTR_ADD(d->mmio_base, offset);
> > > > > +	mmio_write(reg_addr, payload);
> > > > > +	usleep(1000);
> > > > > +}
> > > > > +
> > > > >  /* Read a register of a ACC100 device */  static inline
> > > > > uint32_t acc100_reg_read(struct acc100_device *d, uint32_t
> > > > > offset) @@ -36,6
> > > > > +52,22 @@
> > > > >  	return rte_le_to_cpu_32(ret);
> > > > >  }
> > > > >
> > > > > +/* Basic Implementation of Log2 for exact 2^N */ static inline
> > > > > +uint32_t log2_basic(uint32_t value) {
> > > > > +	return (value == 0) ? 0 : __builtin_ctz(value); }
> > > > > +
> > > > > +/* Calculate memory alignment offset assuming alignment is 2^N
> > > > > +*/ static inline uint32_t calc_mem_alignment_offset(void
> > > > > +*unaligned_virt_mem, uint32_t alignment) {
> > > > > +	rte_iova_t unaligned_phy_mem =
> > > > > rte_malloc_virt2iova(unaligned_virt_mem);
> > > > > +	return (uint32_t)(alignment -
> > > > > +			(unaligned_phy_mem & (alignment-1))); }
> > > > > +
> > > > >  /* Calculate the offset of the enqueue register */  static
> > > > > inline uint32_t queue_offset(bool pf_device, uint8_t vf_id,
> > > > > uint8_t qgrp_id, uint16_t aq_id) @@ -204,10 +236,393 @@
> > > > >  			acc100_conf->q_dl_5g.aq_depth_log2);
> > > > >  }
> > > > >
> > > > > +static void
> > > > > +free_base_addresses(void **base_addrs, int size) {
> > > > > +	int i;
> > > > > +	for (i = 0; i < size; i++)
> > > > > +		rte_free(base_addrs[i]);
> > > > > +}
> > > > > +
> > > > > +static inline uint32_t
> > > > > +get_desc_len(void)
> > > > > +{
> > > > > +	return sizeof(union acc100_dma_desc); }
> > > > > +
> > > > > +/* Allocate the 2 * 64MB block for the sw rings */ static int
> > > > > +alloc_2x64mb_sw_rings_mem(struct rte_bbdev *dev, struct
> > > > > +acc100_device
> > > > > *d,
> > > > > +		int socket)
> > > > > +{
> > > > > +	uint32_t sw_ring_size = ACC100_SIZE_64MBYTE;
> > > > > +	d->sw_rings_base = rte_zmalloc_socket(dev->device-
> >driver-
> > > > > >name,
> > > > > +			2 * sw_ring_size, RTE_CACHE_LINE_SIZE,
> socket);
> > > > > +	if (d->sw_rings_base == NULL) {
> > > > > +		rte_bbdev_log(ERR, "Failed to allocate memory
> for %s:%u",
> > > > > +				dev->device->driver->name,
> > > > > +				dev->data->dev_id);
> > > > > +		return -ENOMEM;
> > > > > +	}
> > > > > +	memset(d->sw_rings_base, 0, ACC100_SIZE_64MBYTE);
> > > > > +	uint32_t next_64mb_align_offset =
> calc_mem_alignment_offset(
> > > > > +			d->sw_rings_base, ACC100_SIZE_64MBYTE);
> > > > > +	d->sw_rings = RTE_PTR_ADD(d->sw_rings_base,
> > > > > next_64mb_align_offset);
> > > > > +	d->sw_rings_phys = rte_malloc_virt2iova(d->sw_rings_base)
> +
> > > > > +			next_64mb_align_offset;
> > > > > +	d->sw_ring_size = MAX_QUEUE_DEPTH * get_desc_len();
> > > > > +	d->sw_ring_max_depth = d->sw_ring_size / get_desc_len();
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > >
> > > > Why not a common alloc memory function but special function for
> > > > different memory size?
> > >
> > > This is a bit convoluted, but that is because the first-attempt
> > > method, which is optimal (minimal footprint), may not always find aligned memory.
> >
> > What's convoluted? Can you explain?
> > For packet processing, in most scenarios, isn't the memory already aligned
> > when we alloc memory?
> 
> Hi Rosen,
> This is related to both the alignment and the size of the contiguous block
> of pinned-down memory: a 64MB contiguous block aligned on a 64MB
> boundary of the physical address space (not the virtual one).
> The first method can potentially fail, hence it is run incrementally, while the 2nd
> version is kept as a safe fallback; it is more wasteful in terms of
> footprint (hence not used by default).
> That is the part I considered "convoluted" about reliably allocating this
> memory. It would be possible to only use the 2nd version, which would look
> cleaner in terms of code but would be more wasteful in memory usage.

As you mentioned, it's not cleaner; looking forward to your next version of the patch.

> 
> 
> > >
> > > >
> > > > > +/* Attempt to allocate minimised memory space for sw rings */
> > > > > +static void alloc_sw_rings_min_mem(struct rte_bbdev *dev, struct
> > > > > acc100_device
> > > > > +*d,
> > > > > +		uint16_t num_queues, int socket) {
> > > > > +	rte_iova_t sw_rings_base_phy, next_64mb_align_addr_phy;
> > > > > +	uint32_t next_64mb_align_offset;
> > > > > +	rte_iova_t sw_ring_phys_end_addr;
> > > > > +	void *base_addrs[SW_RING_MEM_ALLOC_ATTEMPTS];
> > > > > +	void *sw_rings_base;
> > > > > +	int i = 0;
> > > > > +	uint32_t q_sw_ring_size = MAX_QUEUE_DEPTH *
> get_desc_len();
> > > > > +	uint32_t dev_sw_ring_size = q_sw_ring_size * num_queues;
> > > > > +
> > > > > +	/* Find an aligned block of memory to store sw rings */
> > > > > +	while (i < SW_RING_MEM_ALLOC_ATTEMPTS) {
> > > > > +		/*
> > > > > +		 * sw_ring allocated memory is guaranteed to be
> aligned to
> > > > > +		 * q_sw_ring_size at the condition that the
> requested size is
> > > > > +		 * less than the page size
> > > > > +		 */
> > > > > +		sw_rings_base = rte_zmalloc_socket(
> > > > > +				dev->device->driver->name,
> > > > > +				dev_sw_ring_size, q_sw_ring_size,
> socket);
> > > > > +
> > > > > +		if (sw_rings_base == NULL) {
> > > > > +			rte_bbdev_log(ERR,
> > > > > +					"Failed to allocate memory
> > > > > for %s:%u",
> > > > > +					dev->device->driver->name,
> > > > > +					dev->data->dev_id);
> > > > > +			break;
> > > > > +		}
> > > > > +
> > > > > +		sw_rings_base_phy =
> rte_malloc_virt2iova(sw_rings_base);
> > > > > +		next_64mb_align_offset =
> calc_mem_alignment_offset(
> > > > > +				sw_rings_base,
> ACC100_SIZE_64MBYTE);
> > > > > +		next_64mb_align_addr_phy = sw_rings_base_phy +
> > > > > +				next_64mb_align_offset;
> > > > > +		sw_ring_phys_end_addr = sw_rings_base_phy +
> > > > > dev_sw_ring_size;
> > > > > +
> > > > > +		/* Check if the end of the sw ring memory block is
> before the
> > > > > +		 * start of next 64MB aligned mem address
> > > > > +		 */
> > > > > +		if (sw_ring_phys_end_addr <
> next_64mb_align_addr_phy) {
> > > > > +			d->sw_rings_phys = sw_rings_base_phy;
> > > > > +			d->sw_rings = sw_rings_base;
> > > > > +			d->sw_rings_base = sw_rings_base;
> > > > > +			d->sw_ring_size = q_sw_ring_size;
> > > > > +			d->sw_ring_max_depth =
> MAX_QUEUE_DEPTH;
> > > > > +			break;
> > > > > +		}
> > > > > +		/* Store the address of the unaligned mem block */
> > > > > +		base_addrs[i] = sw_rings_base;
> > > > > +		i++;
> > > > > +	}
> > > > > +
> > > > > +	/* Free all unaligned blocks of mem allocated in the loop */
> > > > > +	free_base_addresses(base_addrs, i); }
> > > >
> > > > It's strange to first alloc memory and then free the memory, but with no
> > > > operations on this memory.
> > >
> > > I may be missing your point. We are freeing the exact same mem we did get
> > > from rte_zmalloc.
> > > Note that the base_addrs array refers to multiple attempts of mallocs,
> > > not multiple operations on a ring.
> >
> > You alloc memory sw_rings_base, and after some translation assign this memory
> > to acc100_device *d, yet before the function returns this memory has been
> > freed.
> 
> If you follow the logic, this actually only frees the memory from the attempts
> which were not successfully aligned, not the one which ends up being
> used for the sw rings.
> The actual memory for the sw rings is obviously used and only gets freed
> when closing the device below, i.e. rte_free(d->sw_rings_base);
> Let me know if this is unclear. I could add more comments if it is not obvious from
> the code, i.e. /* Free all _unaligned_ blocks of mem allocated in the loop */
> 
> Thanks for your review. I can see how it can look a bit odd initially.

Please make sure your code works well in each branch.

> >
> > > >
> > > > > +
> > > > > +/* Allocate 64MB memory used for all software rings */ static int
> > > > > +acc100_setup_queues(struct rte_bbdev *dev, uint16_t
> num_queues,
> > > int
> > > > > +socket_id) {
> > > > > +	uint32_t phys_low, phys_high, payload;
> > > > > +	struct acc100_device *d = dev->data->dev_private;
> > > > > +	const struct acc100_registry_addr *reg_addr;
> > > > > +
> > > > > +	if (d->pf_device && !d->acc100_conf.pf_mode_en) {
> > > > > +		rte_bbdev_log(NOTICE,
> > > > > +				"%s has PF mode disabled. This PF
> can't be
> > > > > used.",
> > > > > +				dev->data->name);
> > > > > +		return -ENODEV;
> > > > > +	}
> > > > > +
> > > > > +	alloc_sw_rings_min_mem(dev, d, num_queues, socket_id);
> > > > > +
> > > > > +	/* If minimal memory space approach failed, then allocate
> > > > > +	 * the 2 * 64MB block for the sw rings
> > > > > +	 */
> > > > > +	if (d->sw_rings == NULL)
> > > > > +		alloc_2x64mb_sw_rings_mem(dev, d, socket_id);
> > > > > +
> > > > > +	/* Configure ACC100 with the base address for DMA
> descriptor rings
> > > > > +	 * Same descriptor rings used for UL and DL DMA Engines
> > > > > +	 * Note : Assuming only VF0 bundle is used for PF mode
> > > > > +	 */
> > > > > +	phys_high = (uint32_t)(d->sw_rings_phys >> 32);
> > > > > +	phys_low  = (uint32_t)(d->sw_rings_phys &
> > > > > ~(ACC100_SIZE_64MBYTE-1));
> > > > > +
> > > > > +	/* Choose correct registry addresses for the device type */
> > > > > +	if (d->pf_device)
> > > > > +		reg_addr = &pf_reg_addr;
> > > > > +	else
> > > > > +		reg_addr = &vf_reg_addr;
> > > > > +
> > > > > +	/* Read the populated cfg from ACC100 registers */
> > > > > +	fetch_acc100_config(dev);
> > > > > +
> > > > > +	/* Mark as configured properly */
> > > > > +	d->configured = true;
> > > > > +
> > > > > +	/* Release AXI from PF */
> > > > > +	if (d->pf_device)
> > > > > +		acc100_reg_write(d, HWPfDmaAxiControl, 1);
> > > > > +
> > > > > +	acc100_reg_write(d, reg_addr->dma_ring_ul5g_hi,
> phys_high);
> > > > > +	acc100_reg_write(d, reg_addr->dma_ring_ul5g_lo,
> phys_low);
> > > > > +	acc100_reg_write(d, reg_addr->dma_ring_dl5g_hi,
> phys_high);
> > > > > +	acc100_reg_write(d, reg_addr->dma_ring_dl5g_lo,
> phys_low);
> > > > > +	acc100_reg_write(d, reg_addr->dma_ring_ul4g_hi,
> phys_high);
> > > > > +	acc100_reg_write(d, reg_addr->dma_ring_ul4g_lo,
> phys_low);
> > > > > +	acc100_reg_write(d, reg_addr->dma_ring_dl4g_hi,
> phys_high);
> > > > > +	acc100_reg_write(d, reg_addr->dma_ring_dl4g_lo,
> phys_low);
> > > > > +
> > > > > +	/*
> > > > > +	 * Configure Ring Size to the max queue ring size
> > > > > +	 * (used for wrapping purpose)
> > > > > +	 */
> > > > > +	payload = log2_basic(d->sw_ring_size / 64);
> > > > > +	acc100_reg_write(d, reg_addr->ring_size, payload);
> > > > > +
> > > > > +	/* Configure tail pointer for use when SDONE enabled */
> > > > > +	d->tail_ptrs = rte_zmalloc_socket(
> > > > > +			dev->device->driver->name,
> > > > > +			ACC100_NUM_QGRPS * ACC100_NUM_AQS
> *
> > > > > sizeof(uint32_t),
> > > > > +			RTE_CACHE_LINE_SIZE, socket_id);
> > > > > +	if (d->tail_ptrs == NULL) {
> > > > > +		rte_bbdev_log(ERR, "Failed to allocate tail ptr
> for %s:%u",
> > > > > +				dev->device->driver->name,
> > > > > +				dev->data->dev_id);
> > > > > +		rte_free(d->sw_rings);
> > > > > +		return -ENOMEM;
> > > > > +	}
> > > > > +	d->tail_ptr_phys = rte_malloc_virt2iova(d->tail_ptrs);
> > > > > +
> > > > > +	phys_high = (uint32_t)(d->tail_ptr_phys >> 32);
> > > > > +	phys_low  = (uint32_t)(d->tail_ptr_phys);
> > > > > +	acc100_reg_write(d, reg_addr->tail_ptrs_ul5g_hi, phys_high);
> > > > > +	acc100_reg_write(d, reg_addr->tail_ptrs_ul5g_lo, phys_low);
> > > > > +	acc100_reg_write(d, reg_addr->tail_ptrs_dl5g_hi, phys_high);
> > > > > +	acc100_reg_write(d, reg_addr->tail_ptrs_dl5g_lo, phys_low);
> > > > > +	acc100_reg_write(d, reg_addr->tail_ptrs_ul4g_hi, phys_high);
> > > > > +	acc100_reg_write(d, reg_addr->tail_ptrs_ul4g_lo, phys_low);
> > > > > +	acc100_reg_write(d, reg_addr->tail_ptrs_dl4g_hi, phys_high);
> > > > > +	acc100_reg_write(d, reg_addr->tail_ptrs_dl4g_lo, phys_low);
> > > > > +
> > > > > +	d->harq_layout = rte_zmalloc_socket("HARQ Layout",
> > > > > +			ACC100_HARQ_LAYOUT * sizeof(*d-
> >harq_layout),
> > > > > +			RTE_CACHE_LINE_SIZE, dev->data-
> >socket_id);
> > > > > +
> > > > > +	rte_bbdev_log_debug(
> > > > > +			"ACC100 (%s) configured  sw_rings = %p,
> > > > > sw_rings_phys = %#"
> > > > > +			PRIx64, dev->data->name, d->sw_rings, d-
> > > > > >sw_rings_phys);
> > > > > +
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > >  /* Free 64MB memory used for software rings */  static int -
> > > > > acc100_dev_close(struct rte_bbdev *dev  __rte_unused)
> > > > > +acc100_dev_close(struct rte_bbdev *dev)
> > > > >  {
> > > > > +	struct acc100_device *d = dev->data->dev_private;
> > > > > +	if (d->sw_rings_base != NULL) {
> > > > > +		rte_free(d->tail_ptrs);
> > > > > +		rte_free(d->sw_rings_base);
> > > > > +		d->sw_rings_base = NULL;
> > > > > +	}
> > > > > +	usleep(1000);
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +
> > > > > +/**
> > > > > + * Report a ACC100 queue index which is free
> > > > > + * Return 0 to 16k for a valid queue_idx or -1 when no queue is
> > > > > +available
> > > > > + * Note : Only supporting VF0 Bundle for PF mode  */ static int
> > > > > +acc100_find_free_queue_idx(struct rte_bbdev *dev,
> > > > > +		const struct rte_bbdev_queue_conf *conf) {
> > > > > +	struct acc100_device *d = dev->data->dev_private;
> > > > > +	int op_2_acc[5] = {0, UL_4G, DL_4G, UL_5G, DL_5G};
> > > > > +	int acc = op_2_acc[conf->op_type];
> > > > > +	struct rte_q_topology_t *qtop = NULL;
> > > > > +	qtopFromAcc(&qtop, acc, &(d->acc100_conf));
> > > > > +	if (qtop == NULL)
> > > > > +		return -1;
> > > > > +	/* Identify matching QGroup Index which are sorted in
> priority
> > > > > +order
> > > > > */
> > > > > +	uint16_t group_idx = qtop->first_qgroup_index;
> > > > > +	group_idx += conf->priority;
> > > > > +	if (group_idx >= ACC100_NUM_QGRPS ||
> > > > > +			conf->priority >= qtop->num_qgroups) {
> > > > > +		rte_bbdev_log(INFO, "Invalid Priority on %s,
> priority %u",
> > > > > +				dev->data->name, conf->priority);
> > > > > +		return -1;
> > > > > +	}
> > > > > +	/* Find a free AQ_idx  */
> > > > > +	uint16_t aq_idx;
> > > > > +	for (aq_idx = 0; aq_idx < qtop->num_aqs_per_groups;
> aq_idx++) {
> > > > > +		if (((d->q_assigned_bit_map[group_idx] >> aq_idx)
> & 0x1)
> > > > > == 0) {
> > > > > +			/* Mark the Queue as assigned */
> > > > > +			d->q_assigned_bit_map[group_idx] |= (1 <<
> aq_idx);
> > > > > +			/* Report the AQ Index */
> > > > > +			return (group_idx << GRP_ID_SHIFT) +
> aq_idx;
> > > > > +		}
> > > > > +	}
> > > > > +	rte_bbdev_log(INFO, "Failed to find free queue on %s,
> priority %u",
> > > > > +			dev->data->name, conf->priority);
> > > > > +	return -1;
> > > > > +}
> > > > > +
> > > > > +/* Setup ACC100 queue */
> > > > > +static int
> > > > > +acc100_queue_setup(struct rte_bbdev *dev, uint16_t queue_id,
> > > > > +		const struct rte_bbdev_queue_conf *conf) {
> > > > > +	struct acc100_device *d = dev->data->dev_private;
> > > > > +	struct acc100_queue *q;
> > > > > +	int16_t q_idx;
> > > > > +
> > > > > +	/* Allocate the queue data structure. */
> > > > > +	q = rte_zmalloc_socket(dev->device->driver->name,
> sizeof(*q),
> > > > > +			RTE_CACHE_LINE_SIZE, conf->socket);
> > > > > +	if (q == NULL) {
> > > > > +		rte_bbdev_log(ERR, "Failed to allocate queue
> memory");
> > > > > +		return -ENOMEM;
> > > > > +	}
> > > > > +
> > > > > +	q->d = d;
> > > > > +	q->ring_addr = RTE_PTR_ADD(d->sw_rings, (d->sw_ring_size
> *
> > > > > queue_id));
> > > > > +	q->ring_addr_phys = d->sw_rings_phys + (d->sw_ring_size *
> > > > > queue_id);
> > > > > +
> > > > > +	/* Prepare the Ring with default descriptor format */
> > > > > +	union acc100_dma_desc *desc = NULL;
> > > > > +	unsigned int desc_idx, b_idx;
> > > > > +	int fcw_len = (conf->op_type == RTE_BBDEV_OP_LDPC_ENC ?
> > > > > +		ACC100_FCW_LE_BLEN : (conf->op_type ==
> > > > > RTE_BBDEV_OP_TURBO_DEC ?
> > > > > +		ACC100_FCW_TD_BLEN : ACC100_FCW_LD_BLEN));
> > > > > +
> > > > > +	for (desc_idx = 0; desc_idx < d->sw_ring_max_depth;
> desc_idx++) {
> > > > > +		desc = q->ring_addr + desc_idx;
> > > > > +		desc->req.word0 = ACC100_DMA_DESC_TYPE;
> > > > > +		desc->req.word1 = 0; /**< Timestamp */
> > > > > +		desc->req.word2 = 0;
> > > > > +		desc->req.word3 = 0;
> > > > > +		uint64_t fcw_offset = (desc_idx << 8) +
> > > > > ACC100_DESC_FCW_OFFSET;
> > > > > +		desc->req.data_ptrs[0].address = q->ring_addr_phys
> +
> > > > > fcw_offset;
> > > > > +		desc->req.data_ptrs[0].blen = fcw_len;
> > > > > +		desc->req.data_ptrs[0].blkid =
> ACC100_DMA_BLKID_FCW;
> > > > > +		desc->req.data_ptrs[0].last = 0;
> > > > > +		desc->req.data_ptrs[0].dma_ext = 0;
> > > > > +		for (b_idx = 1; b_idx <
> ACC100_DMA_MAX_NUM_POINTERS
> > > > > - 1;
> > > > > +				b_idx++) {
> > > > > +			desc->req.data_ptrs[b_idx].blkid =
> > > > > ACC100_DMA_BLKID_IN;
> > > > > +			desc->req.data_ptrs[b_idx].last = 1;
> > > > > +			desc->req.data_ptrs[b_idx].dma_ext = 0;
> > > > > +			b_idx++;
> > > > > +			desc->req.data_ptrs[b_idx].blkid =
> > > > > +
> 	ACC100_DMA_BLKID_OUT_ENC;
> > > > > +			desc->req.data_ptrs[b_idx].last = 1;
> > > > > +			desc->req.data_ptrs[b_idx].dma_ext = 0;
> > > > > +		}
> > > > > +		/* Preset some fields of LDPC FCW */
> > > > > +		desc->req.fcw_ld.FCWversion = ACC100_FCW_VER;
> > > > > +		desc->req.fcw_ld.gain_i = 1;
> > > > > +		desc->req.fcw_ld.gain_h = 1;
> > > > > +	}
> > > > > +
> > > > > +	q->lb_in = rte_zmalloc_socket(dev->device->driver->name,
> > > > > +			RTE_CACHE_LINE_SIZE,
> > > > > +			RTE_CACHE_LINE_SIZE, conf->socket);
> > > > > +	if (q->lb_in == NULL) {
> > > > > +		rte_bbdev_log(ERR, "Failed to allocate lb_in
> memory");
> > > > > +		return -ENOMEM;
> > > > > +	}
> > > > > +	q->lb_in_addr_phys = rte_malloc_virt2iova(q->lb_in);
> > > > > +	q->lb_out = rte_zmalloc_socket(dev->device->driver->name,
> > > > > +			RTE_CACHE_LINE_SIZE,
> > > > > +			RTE_CACHE_LINE_SIZE, conf->socket);
> > > > > +	if (q->lb_out == NULL) {
> > > > > +		rte_bbdev_log(ERR, "Failed to allocate lb_out memory");
> > > > > +		return -ENOMEM;
> > > > > +	}
> > > > > +	q->lb_out_addr_phys = rte_malloc_virt2iova(q->lb_out);
> > > > > +
> > > > > +	/*
> > > > > +	 * Software queue ring wraps synchronously with the HW when it reaches
> > > > > +	 * the boundary of the maximum allocated queue size, no matter what the
> > > > > +	 * sw queue size is. This wrapping is guarded by setting the wrap_mask
> > > > > +	 * to represent the maximum queue size as allocated at the time when
> > > > > +	 * the device has been setup (in configure()).
> > > > > +	 *
> > > > > +	 * The queue depth is set to the queue size value (conf->queue_size).
> > > > > +	 * This limits the occupancy of the queue at any point of time, so that
> > > > > +	 * the queue does not get swamped with enqueue requests.
> > > > > +	 */
> > > > > +	q->sw_ring_depth = conf->queue_size;
> > > > > +	q->sw_ring_wrap_mask = d->sw_ring_max_depth - 1;
> > > > > +
> > > > > +	q->op_type = conf->op_type;
> > > > > +
> > > > > +	q_idx = acc100_find_free_queue_idx(dev, conf);
> > > > > +	if (q_idx == -1) {
> > > > > +		rte_free(q);
> > > > > +		return -1;
> > > > > +	}
> > > > > +
> > > > > +	q->qgrp_id = (q_idx >> GRP_ID_SHIFT) & 0xF;
> > > > > +	q->vf_id = (q_idx >> VF_ID_SHIFT)  & 0x3F;
> > > > > +	q->aq_id = q_idx & 0xF;
> > > > > +	q->aq_depth = (conf->op_type ==  RTE_BBDEV_OP_TURBO_DEC) ?
> > > > > +			(1 << d->acc100_conf.q_ul_4g.aq_depth_log2) :
> > > > > +			(1 << d->acc100_conf.q_dl_4g.aq_depth_log2);
> > > > > +
> > > > > +	q->mmio_reg_enqueue = RTE_PTR_ADD(d->mmio_base,
> > > > > +			queue_offset(d->pf_device,
> > > > > +					q->vf_id, q->qgrp_id, q->aq_id));
> > > > > +
> > > > > +	rte_bbdev_log_debug(
> > > > > +			"Setup dev%u q%u: qgrp_id=%u, vf_id=%u, aq_id=%u, aq_depth=%u, mmio_reg_enqueue=%p",
> > > > > +			dev->data->dev_id, queue_id, q->qgrp_id, q->vf_id,
> > > > > +			q->aq_id, q->aq_depth, q->mmio_reg_enqueue);
> > > > > +
> > > > > +	dev->data->queues[queue_id].queue_private = q;
> > > > > +	return 0;
> > > > > +}
> > > > > +
> > > > > +/* Release ACC100 queue */
> > > > > +static int
> > > > > +acc100_queue_release(struct rte_bbdev *dev, uint16_t q_id)
> > > > > +{
> > > > > +	struct acc100_device *d = dev->data->dev_private;
> > > > > +	struct acc100_queue *q = dev->data->queues[q_id].queue_private;
> > > > > +
> > > > > +	if (q != NULL) {
> > > > > +		/* Mark the Queue as un-assigned */
> > > > > +		d->q_assigned_bit_map[q->qgrp_id] &= (0xFFFFFFFF -
> > > > > +				(1 << q->aq_id));
> > > > > +		rte_free(q->lb_in);
> > > > > +		rte_free(q->lb_out);
> > > > > +		rte_free(q);
> > > > > +		dev->data->queues[q_id].queue_private = NULL;
> > > > > +	}
> > > > > +
> > > > >  	return 0;
> > > > >  }
> > > > >
> > > > > @@ -258,8 +673,11 @@
> > > > >  }
> > > > >
> > > > >  static const struct rte_bbdev_ops acc100_bbdev_ops = {
> > > > > +	.setup_queues = acc100_setup_queues,
> > > > >  	.close = acc100_dev_close,
> > > > >  	.info_get = acc100_dev_info_get,
> > > > > +	.queue_setup = acc100_queue_setup,
> > > > > +	.queue_release = acc100_queue_release,
> > > > >  };
> > > > >
> > > > >  /* ACC100 PCI PF address map */
> > > > > diff --git a/drivers/baseband/acc100/rte_acc100_pmd.h b/drivers/baseband/acc100/rte_acc100_pmd.h
> > > > > index 662e2c8..0e2b79c 100644
> > > > > --- a/drivers/baseband/acc100/rte_acc100_pmd.h
> > > > > +++ b/drivers/baseband/acc100/rte_acc100_pmd.h
> > > > > @@ -518,11 +518,56 @@ struct acc100_registry_addr {
> > > > >  	.ddr_range = HWVfDmaDdrBaseRangeRoVf,
> > > > >  };
> > > > >
> > > > > +/* Structure associated with each queue. */
> > > > > +struct __rte_cache_aligned acc100_queue {
> > > > > +	union acc100_dma_desc *ring_addr;  /* Virtual address of sw ring */
> > > > > +	rte_iova_t ring_addr_phys;  /* Physical address of software ring */
> > > > > +	uint32_t sw_ring_head;  /* software ring head */
> > > > > +	uint32_t sw_ring_tail;  /* software ring tail */
> > > > > +	/* software ring size (descriptors, not bytes) */
> > > > > +	uint32_t sw_ring_depth;
> > > > > +	/* mask used to wrap enqueued descriptors on the sw ring */
> > > > > +	uint32_t sw_ring_wrap_mask;
> > > > > +	/* MMIO register used to enqueue descriptors */
> > > > > +	void *mmio_reg_enqueue;
> > > > > +	uint8_t vf_id;  /* VF ID (max = 63) */
> > > > > +	uint8_t qgrp_id;  /* Queue Group ID */
> > > > > +	uint16_t aq_id;  /* Atomic Queue ID */
> > > > > +	uint16_t aq_depth;  /* Depth of atomic queue */
> > > > > +	uint32_t aq_enqueued;  /* Count how many "batches" have been enqueued */
> > > > > +	uint32_t aq_dequeued;  /* Count how many "batches" have been dequeued */
> > > > > +	uint32_t irq_enable;  /* Enable ops dequeue interrupts if set to 1 */
> > > > > +	struct rte_mempool *fcw_mempool;  /* FCW mempool */
> > > > > +	enum rte_bbdev_op_type op_type;  /* Type of this Queue: TE or TD */
> > > > > +	/* Internal Buffers for loopback input */
> > > > > +	uint8_t *lb_in;
> > > > > +	uint8_t *lb_out;
> > > > > +	rte_iova_t lb_in_addr_phys;
> > > > > +	rte_iova_t lb_out_addr_phys;
> > > > > +	struct acc100_device *d;
> > > > > +};
> > > > > +
> > > > >  /* Private data structure for each ACC100 device */
> > > > >  struct acc100_device {
> > > > >  	void *mmio_base;  /**< Base address of MMIO registers (BAR0) */
> > > > > +	void *sw_rings_base;  /* Base addr of un-aligned memory for sw rings */
> > > > > +	void *sw_rings;  /* 64MBs of 64MB aligned memory for sw rings */
> > > > > +	rte_iova_t sw_rings_phys;  /* Physical address of sw_rings */
> > > > > +	/* Virtual address of the info memory routed to the this function under
> > > > > +	 * operation, whether it is PF or VF.
> > > > > +	 */
> > > > > +	union acc100_harq_layout_data *harq_layout;
> > > > > +	uint32_t sw_ring_size;
> > > > >  	uint32_t ddr_size; /* Size in kB */
> > > > > +	uint32_t *tail_ptrs; /* Base address of response tail pointer buffer */
> > > > > +	rte_iova_t tail_ptr_phys; /* Physical address of tail pointers */
> > > > > +	/* Max number of entries available for each queue in device, depending
> > > > > +	 * on how many queues are enabled with configure()
> > > > > +	 */
> > > > > +	uint32_t sw_ring_max_depth;
> > > > >  	struct acc100_conf acc100_conf; /* ACC100 Initial configuration */
> > > > > +	/* Bitmap capturing which Queues have already been assigned */
> > > > > +	uint16_t q_assigned_bit_map[ACC100_NUM_QGRPS];
> > > > >  	bool pf_device; /**< True if this is a PF ACC100 device */
> > > > >  	bool configured; /**< True if this ACC100 device is configured */
> > > > >  };
> > > > > --
> > > > > 1.8.3.1
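
The hunks quoted above pack a free queue index in acc100_find_free_queue_idx() and unpack it again in acc100_queue_setup(). The following is a minimal standalone sketch of that packing; it is not part of the patch, and the GRP_ID_SHIFT and VF_ID_SHIFT values are assumptions (the real definitions live in rte_acc100_pmd.h and are not shown in this excerpt), chosen only to stay consistent with the 0xF / 0x3F masks used above.

#include <stdio.h>

/* Assumed values, for illustration only -- not taken from this patch */
#define GRP_ID_SHIFT 10
#define VF_ID_SHIFT   4

int main(void)
{
	/* acc100_find_free_queue_idx() reports (group_idx << GRP_ID_SHIFT) + aq_idx */
	int q_idx = (2 << GRP_ID_SHIFT) + 3;

	/* acc100_queue_setup() splits the same index back into its fields */
	printf("qgrp_id=%d vf_id=%d aq_id=%d\n",
			(q_idx >> GRP_ID_SHIFT) & 0xF,
			(q_idx >> VF_ID_SHIFT) & 0x3F,
			q_idx & 0xF);
	return 0;
}

With these assumed shifts the sketch prints qgrp_id=2 vf_id=0 aq_id=3, i.e. the VF field stays zero, which is consistent with the patch only supporting the VF0 bundle in PF mode.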
  

Patch

diff --git a/drivers/baseband/acc100/rte_acc100_pmd.c b/drivers/baseband/acc100/rte_acc100_pmd.c
index 7807a30..7a21c57 100644
--- a/drivers/baseband/acc100/rte_acc100_pmd.c
+++ b/drivers/baseband/acc100/rte_acc100_pmd.c
@@ -26,6 +26,22 @@ 
 RTE_LOG_REGISTER(acc100_logtype, pmd.bb.acc100, NOTICE);
 #endif
 
+/* Write to MMIO register address */
+static inline void
+mmio_write(void *addr, uint32_t value)
+{
+	*((volatile uint32_t *)(addr)) = rte_cpu_to_le_32(value);
+}
+
+/* Write a register of a ACC100 device */
+static inline void
+acc100_reg_write(struct acc100_device *d, uint32_t offset, uint32_t payload)
+{
+	void *reg_addr = RTE_PTR_ADD(d->mmio_base, offset);
+	mmio_write(reg_addr, payload);
+	usleep(1000);
+}
+
 /* Read a register of a ACC100 device */
 static inline uint32_t
 acc100_reg_read(struct acc100_device *d, uint32_t offset)
@@ -36,6 +52,22 @@ 
 	return rte_le_to_cpu_32(ret);
 }
 
+/* Basic Implementation of Log2 for exact 2^N */
+static inline uint32_t
+log2_basic(uint32_t value)
+{
+	return (value == 0) ? 0 : __builtin_ctz(value);
+}
+
+/* Calculate memory alignment offset assuming alignment is 2^N */
+static inline uint32_t
+calc_mem_alignment_offset(void *unaligned_virt_mem, uint32_t alignment)
+{
+	rte_iova_t unaligned_phy_mem = rte_malloc_virt2iova(unaligned_virt_mem);
+	return (uint32_t)(alignment -
+			(unaligned_phy_mem & (alignment-1)));
+}
+
 /* Calculate the offset of the enqueue register */
 static inline uint32_t
 queue_offset(bool pf_device, uint8_t vf_id, uint8_t qgrp_id, uint16_t aq_id)
@@ -204,10 +236,393 @@ 
 			acc100_conf->q_dl_5g.aq_depth_log2);
 }
 
+static void
+free_base_addresses(void **base_addrs, int size)
+{
+	int i;
+	for (i = 0; i < size; i++)
+		rte_free(base_addrs[i]);
+}
+
+static inline uint32_t
+get_desc_len(void)
+{
+	return sizeof(union acc100_dma_desc);
+}
+
+/* Allocate the 2 * 64MB block for the sw rings */
+static int
+alloc_2x64mb_sw_rings_mem(struct rte_bbdev *dev, struct acc100_device *d,
+		int socket)
+{
+	uint32_t sw_ring_size = ACC100_SIZE_64MBYTE;
+	d->sw_rings_base = rte_zmalloc_socket(dev->device->driver->name,
+			2 * sw_ring_size, RTE_CACHE_LINE_SIZE, socket);
+	if (d->sw_rings_base == NULL) {
+		rte_bbdev_log(ERR, "Failed to allocate memory for %s:%u",
+				dev->device->driver->name,
+				dev->data->dev_id);
+		return -ENOMEM;
+	}
+	memset(d->sw_rings_base, 0, ACC100_SIZE_64MBYTE);
+	uint32_t next_64mb_align_offset = calc_mem_alignment_offset(
+			d->sw_rings_base, ACC100_SIZE_64MBYTE);
+	d->sw_rings = RTE_PTR_ADD(d->sw_rings_base, next_64mb_align_offset);
+	d->sw_rings_phys = rte_malloc_virt2iova(d->sw_rings_base) +
+			next_64mb_align_offset;
+	d->sw_ring_size = MAX_QUEUE_DEPTH * get_desc_len();
+	d->sw_ring_max_depth = d->sw_ring_size / get_desc_len();
+
+	return 0;
+}
+
+/* Attempt to allocate minimised memory space for sw rings */
+static void
+alloc_sw_rings_min_mem(struct rte_bbdev *dev, struct acc100_device *d,
+		uint16_t num_queues, int socket)
+{
+	rte_iova_t sw_rings_base_phy, next_64mb_align_addr_phy;
+	uint32_t next_64mb_align_offset;
+	rte_iova_t sw_ring_phys_end_addr;
+	void *base_addrs[SW_RING_MEM_ALLOC_ATTEMPTS];
+	void *sw_rings_base;
+	int i = 0;
+	uint32_t q_sw_ring_size = MAX_QUEUE_DEPTH * get_desc_len();
+	uint32_t dev_sw_ring_size = q_sw_ring_size * num_queues;
+
+	/* Find an aligned block of memory to store sw rings */
+	while (i < SW_RING_MEM_ALLOC_ATTEMPTS) {
+		/*
+		 * sw_ring allocated memory is guaranteed to be aligned to
+		 * q_sw_ring_size at the condition that the requested size is
+		 * less than the page size
+		 */
+		sw_rings_base = rte_zmalloc_socket(
+				dev->device->driver->name,
+				dev_sw_ring_size, q_sw_ring_size, socket);
+
+		if (sw_rings_base == NULL) {
+			rte_bbdev_log(ERR,
+					"Failed to allocate memory for %s:%u",
+					dev->device->driver->name,
+					dev->data->dev_id);
+			break;
+		}
+
+		sw_rings_base_phy = rte_malloc_virt2iova(sw_rings_base);
+		next_64mb_align_offset = calc_mem_alignment_offset(
+				sw_rings_base, ACC100_SIZE_64MBYTE);
+		next_64mb_align_addr_phy = sw_rings_base_phy +
+				next_64mb_align_offset;
+		sw_ring_phys_end_addr = sw_rings_base_phy + dev_sw_ring_size;
+
+		/* Check if the end of the sw ring memory block is before the
+		 * start of next 64MB aligned mem address
+		 */
+		if (sw_ring_phys_end_addr < next_64mb_align_addr_phy) {
+			d->sw_rings_phys = sw_rings_base_phy;
+			d->sw_rings = sw_rings_base;
+			d->sw_rings_base = sw_rings_base;
+			d->sw_ring_size = q_sw_ring_size;
+			d->sw_ring_max_depth = MAX_QUEUE_DEPTH;
+			break;
+		}
+		/* Store the address of the unaligned mem block */
+		base_addrs[i] = sw_rings_base;
+		i++;
+	}
+
+	/* Free all unaligned blocks of mem allocated in the loop */
+	free_base_addresses(base_addrs, i);
+}
+
+
+/* Allocate 64MB memory used for all software rings */
+static int
+acc100_setup_queues(struct rte_bbdev *dev, uint16_t num_queues, int socket_id)
+{
+	uint32_t phys_low, phys_high, payload;
+	struct acc100_device *d = dev->data->dev_private;
+	const struct acc100_registry_addr *reg_addr;
+
+	if (d->pf_device && !d->acc100_conf.pf_mode_en) {
+		rte_bbdev_log(NOTICE,
+				"%s has PF mode disabled. This PF can't be used.",
+				dev->data->name);
+		return -ENODEV;
+	}
+
+	alloc_sw_rings_min_mem(dev, d, num_queues, socket_id);
+
+	/* If minimal memory space approach failed, then allocate
+	 * the 2 * 64MB block for the sw rings
+	 */
+	if (d->sw_rings == NULL)
+		alloc_2x64mb_sw_rings_mem(dev, d, socket_id);
+
+	/* Configure ACC100 with the base address for DMA descriptor rings
+	 * Same descriptor rings used for UL and DL DMA Engines
+	 * Note : Assuming only VF0 bundle is used for PF mode
+	 */
+	phys_high = (uint32_t)(d->sw_rings_phys >> 32);
+	phys_low  = (uint32_t)(d->sw_rings_phys & ~(ACC100_SIZE_64MBYTE-1));
+
+	/* Choose correct registry addresses for the device type */
+	if (d->pf_device)
+		reg_addr = &pf_reg_addr;
+	else
+		reg_addr = &vf_reg_addr;
+
+	/* Read the populated cfg from ACC100 registers */
+	fetch_acc100_config(dev);
+
+	/* Mark as configured properly */
+	d->configured = true;
+
+	/* Release AXI from PF */
+	if (d->pf_device)
+		acc100_reg_write(d, HWPfDmaAxiControl, 1);
+
+	acc100_reg_write(d, reg_addr->dma_ring_ul5g_hi, phys_high);
+	acc100_reg_write(d, reg_addr->dma_ring_ul5g_lo, phys_low);
+	acc100_reg_write(d, reg_addr->dma_ring_dl5g_hi, phys_high);
+	acc100_reg_write(d, reg_addr->dma_ring_dl5g_lo, phys_low);
+	acc100_reg_write(d, reg_addr->dma_ring_ul4g_hi, phys_high);
+	acc100_reg_write(d, reg_addr->dma_ring_ul4g_lo, phys_low);
+	acc100_reg_write(d, reg_addr->dma_ring_dl4g_hi, phys_high);
+	acc100_reg_write(d, reg_addr->dma_ring_dl4g_lo, phys_low);
+
+	/*
+	 * Configure Ring Size to the max queue ring size
+	 * (used for wrapping purpose)
+	 */
+	payload = log2_basic(d->sw_ring_size / 64);
+	acc100_reg_write(d, reg_addr->ring_size, payload);
+
+	/* Configure tail pointer for use when SDONE enabled */
+	d->tail_ptrs = rte_zmalloc_socket(
+			dev->device->driver->name,
+			ACC100_NUM_QGRPS * ACC100_NUM_AQS * sizeof(uint32_t),
+			RTE_CACHE_LINE_SIZE, socket_id);
+	if (d->tail_ptrs == NULL) {
+		rte_bbdev_log(ERR, "Failed to allocate tail ptr for %s:%u",
+				dev->device->driver->name,
+				dev->data->dev_id);
+		rte_free(d->sw_rings);
+		return -ENOMEM;
+	}
+	d->tail_ptr_phys = rte_malloc_virt2iova(d->tail_ptrs);
+
+	phys_high = (uint32_t)(d->tail_ptr_phys >> 32);
+	phys_low  = (uint32_t)(d->tail_ptr_phys);
+	acc100_reg_write(d, reg_addr->tail_ptrs_ul5g_hi, phys_high);
+	acc100_reg_write(d, reg_addr->tail_ptrs_ul5g_lo, phys_low);
+	acc100_reg_write(d, reg_addr->tail_ptrs_dl5g_hi, phys_high);
+	acc100_reg_write(d, reg_addr->tail_ptrs_dl5g_lo, phys_low);
+	acc100_reg_write(d, reg_addr->tail_ptrs_ul4g_hi, phys_high);
+	acc100_reg_write(d, reg_addr->tail_ptrs_ul4g_lo, phys_low);
+	acc100_reg_write(d, reg_addr->tail_ptrs_dl4g_hi, phys_high);
+	acc100_reg_write(d, reg_addr->tail_ptrs_dl4g_lo, phys_low);
+
+	d->harq_layout = rte_zmalloc_socket("HARQ Layout",
+			ACC100_HARQ_LAYOUT * sizeof(*d->harq_layout),
+			RTE_CACHE_LINE_SIZE, dev->data->socket_id);
+
+	rte_bbdev_log_debug(
+			"ACC100 (%s) configured  sw_rings = %p, sw_rings_phys = %#"
+			PRIx64, dev->data->name, d->sw_rings, d->sw_rings_phys);
+
+	return 0;
+}
+
 /* Free 64MB memory used for software rings */
 static int
-acc100_dev_close(struct rte_bbdev *dev  __rte_unused)
+acc100_dev_close(struct rte_bbdev *dev)
 {
+	struct acc100_device *d = dev->data->dev_private;
+	if (d->sw_rings_base != NULL) {
+		rte_free(d->tail_ptrs);
+		rte_free(d->sw_rings_base);
+		d->sw_rings_base = NULL;
+	}
+	usleep(1000);
+	return 0;
+}
+
+
+/**
+ * Report an ACC100 queue index which is free
+ * Return 0 to 16k for a valid queue_idx or -1 when no queue is available
+ * Note: Only supporting VF0 Bundle for PF mode
+ */
+static int
+acc100_find_free_queue_idx(struct rte_bbdev *dev,
+		const struct rte_bbdev_queue_conf *conf)
+{
+	struct acc100_device *d = dev->data->dev_private;
+	int op_2_acc[5] = {0, UL_4G, DL_4G, UL_5G, DL_5G};
+	int acc = op_2_acc[conf->op_type];
+	struct rte_q_topology_t *qtop = NULL;
+	qtopFromAcc(&qtop, acc, &(d->acc100_conf));
+	if (qtop == NULL)
+		return -1;
+	/* Identify matching QGroup Index which are sorted in priority order */
+	uint16_t group_idx = qtop->first_qgroup_index;
+	group_idx += conf->priority;
+	if (group_idx >= ACC100_NUM_QGRPS ||
+			conf->priority >= qtop->num_qgroups) {
+		rte_bbdev_log(INFO, "Invalid Priority on %s, priority %u",
+				dev->data->name, conf->priority);
+		return -1;
+	}
+	/* Find a free AQ_idx  */
+	uint16_t aq_idx;
+	for (aq_idx = 0; aq_idx < qtop->num_aqs_per_groups; aq_idx++) {
+		if (((d->q_assigned_bit_map[group_idx] >> aq_idx) & 0x1) == 0) {
+			/* Mark the Queue as assigned */
+			d->q_assigned_bit_map[group_idx] |= (1 << aq_idx);
+			/* Report the AQ Index */
+			return (group_idx << GRP_ID_SHIFT) + aq_idx;
+		}
+	}
+	rte_bbdev_log(INFO, "Failed to find free queue on %s, priority %u",
+			dev->data->name, conf->priority);
+	return -1;
+}
+
+/* Setup ACC100 queue */
+static int
+acc100_queue_setup(struct rte_bbdev *dev, uint16_t queue_id,
+		const struct rte_bbdev_queue_conf *conf)
+{
+	struct acc100_device *d = dev->data->dev_private;
+	struct acc100_queue *q;
+	int16_t q_idx;
+
+	/* Allocate the queue data structure. */
+	q = rte_zmalloc_socket(dev->device->driver->name, sizeof(*q),
+			RTE_CACHE_LINE_SIZE, conf->socket);
+	if (q == NULL) {
+		rte_bbdev_log(ERR, "Failed to allocate queue memory");
+		return -ENOMEM;
+	}
+
+	q->d = d;
+	q->ring_addr = RTE_PTR_ADD(d->sw_rings, (d->sw_ring_size * queue_id));
+	q->ring_addr_phys = d->sw_rings_phys + (d->sw_ring_size * queue_id);
+
+	/* Prepare the Ring with default descriptor format */
+	union acc100_dma_desc *desc = NULL;
+	unsigned int desc_idx, b_idx;
+	int fcw_len = (conf->op_type == RTE_BBDEV_OP_LDPC_ENC ?
+		ACC100_FCW_LE_BLEN : (conf->op_type == RTE_BBDEV_OP_TURBO_DEC ?
+		ACC100_FCW_TD_BLEN : ACC100_FCW_LD_BLEN));
+
+	for (desc_idx = 0; desc_idx < d->sw_ring_max_depth; desc_idx++) {
+		desc = q->ring_addr + desc_idx;
+		desc->req.word0 = ACC100_DMA_DESC_TYPE;
+		desc->req.word1 = 0; /**< Timestamp */
+		desc->req.word2 = 0;
+		desc->req.word3 = 0;
+		uint64_t fcw_offset = (desc_idx << 8) + ACC100_DESC_FCW_OFFSET;
+		desc->req.data_ptrs[0].address = q->ring_addr_phys + fcw_offset;
+		desc->req.data_ptrs[0].blen = fcw_len;
+		desc->req.data_ptrs[0].blkid = ACC100_DMA_BLKID_FCW;
+		desc->req.data_ptrs[0].last = 0;
+		desc->req.data_ptrs[0].dma_ext = 0;
+		for (b_idx = 1; b_idx < ACC100_DMA_MAX_NUM_POINTERS - 1;
+				b_idx++) {
+			desc->req.data_ptrs[b_idx].blkid = ACC100_DMA_BLKID_IN;
+			desc->req.data_ptrs[b_idx].last = 1;
+			desc->req.data_ptrs[b_idx].dma_ext = 0;
+			b_idx++;
+			desc->req.data_ptrs[b_idx].blkid =
+					ACC100_DMA_BLKID_OUT_ENC;
+			desc->req.data_ptrs[b_idx].last = 1;
+			desc->req.data_ptrs[b_idx].dma_ext = 0;
+		}
+		/* Preset some fields of LDPC FCW */
+		desc->req.fcw_ld.FCWversion = ACC100_FCW_VER;
+		desc->req.fcw_ld.gain_i = 1;
+		desc->req.fcw_ld.gain_h = 1;
+	}
+
+	q->lb_in = rte_zmalloc_socket(dev->device->driver->name,
+			RTE_CACHE_LINE_SIZE,
+			RTE_CACHE_LINE_SIZE, conf->socket);
+	if (q->lb_in == NULL) {
+		rte_bbdev_log(ERR, "Failed to allocate lb_in memory");
+		return -ENOMEM;
+	}
+	q->lb_in_addr_phys = rte_malloc_virt2iova(q->lb_in);
+	q->lb_out = rte_zmalloc_socket(dev->device->driver->name,
+			RTE_CACHE_LINE_SIZE,
+			RTE_CACHE_LINE_SIZE, conf->socket);
+	if (q->lb_out == NULL) {
+		rte_bbdev_log(ERR, "Failed to allocate lb_out memory");
+		return -ENOMEM;
+	}
+	q->lb_out_addr_phys = rte_malloc_virt2iova(q->lb_out);
+
+	/*
+	 * Software queue ring wraps synchronously with the HW when it reaches
+	 * the boundary of the maximum allocated queue size, no matter what the
+	 * sw queue size is. This wrapping is guarded by setting the wrap_mask
+	 * to represent the maximum queue size as allocated at the time when
+	 * the device has been setup (in configure()).
+	 *
+	 * The queue depth is set to the queue size value (conf->queue_size).
+	 * This limits the occupancy of the queue at any point of time, so that
+	 * the queue does not get swamped with enqueue requests.
+	 */
+	q->sw_ring_depth = conf->queue_size;
+	q->sw_ring_wrap_mask = d->sw_ring_max_depth - 1;
+
+	q->op_type = conf->op_type;
+
+	q_idx = acc100_find_free_queue_idx(dev, conf);
+	if (q_idx == -1) {
+		rte_free(q);
+		return -1;
+	}
+
+	q->qgrp_id = (q_idx >> GRP_ID_SHIFT) & 0xF;
+	q->vf_id = (q_idx >> VF_ID_SHIFT)  & 0x3F;
+	q->aq_id = q_idx & 0xF;
+	q->aq_depth = (conf->op_type ==  RTE_BBDEV_OP_TURBO_DEC) ?
+			(1 << d->acc100_conf.q_ul_4g.aq_depth_log2) :
+			(1 << d->acc100_conf.q_dl_4g.aq_depth_log2);
+
+	q->mmio_reg_enqueue = RTE_PTR_ADD(d->mmio_base,
+			queue_offset(d->pf_device,
+					q->vf_id, q->qgrp_id, q->aq_id));
+
+	rte_bbdev_log_debug(
+			"Setup dev%u q%u: qgrp_id=%u, vf_id=%u, aq_id=%u, aq_depth=%u, mmio_reg_enqueue=%p",
+			dev->data->dev_id, queue_id, q->qgrp_id, q->vf_id,
+			q->aq_id, q->aq_depth, q->mmio_reg_enqueue);
+
+	dev->data->queues[queue_id].queue_private = q;
+	return 0;
+}
+
+/* Release ACC100 queue */
+static int
+acc100_queue_release(struct rte_bbdev *dev, uint16_t q_id)
+{
+	struct acc100_device *d = dev->data->dev_private;
+	struct acc100_queue *q = dev->data->queues[q_id].queue_private;
+
+	if (q != NULL) {
+		/* Mark the Queue as un-assigned */
+		d->q_assigned_bit_map[q->qgrp_id] &= (0xFFFFFFFF -
+				(1 << q->aq_id));
+		rte_free(q->lb_in);
+		rte_free(q->lb_out);
+		rte_free(q);
+		dev->data->queues[q_id].queue_private = NULL;
+	}
+
 	return 0;
 }
 
@@ -258,8 +673,11 @@ 
 }
 
 static const struct rte_bbdev_ops acc100_bbdev_ops = {
+	.setup_queues = acc100_setup_queues,
 	.close = acc100_dev_close,
 	.info_get = acc100_dev_info_get,
+	.queue_setup = acc100_queue_setup,
+	.queue_release = acc100_queue_release,
 };
 
 /* ACC100 PCI PF address map */
diff --git a/drivers/baseband/acc100/rte_acc100_pmd.h b/drivers/baseband/acc100/rte_acc100_pmd.h
index 662e2c8..0e2b79c 100644
--- a/drivers/baseband/acc100/rte_acc100_pmd.h
+++ b/drivers/baseband/acc100/rte_acc100_pmd.h
@@ -518,11 +518,56 @@  struct acc100_registry_addr {
 	.ddr_range = HWVfDmaDdrBaseRangeRoVf,
 };
 
+/* Structure associated with each queue. */
+struct __rte_cache_aligned acc100_queue {
+	union acc100_dma_desc *ring_addr;  /* Virtual address of sw ring */
+	rte_iova_t ring_addr_phys;  /* Physical address of software ring */
+	uint32_t sw_ring_head;  /* software ring head */
+	uint32_t sw_ring_tail;  /* software ring tail */
+	/* software ring size (descriptors, not bytes) */
+	uint32_t sw_ring_depth;
+	/* mask used to wrap enqueued descriptors on the sw ring */
+	uint32_t sw_ring_wrap_mask;
+	/* MMIO register used to enqueue descriptors */
+	void *mmio_reg_enqueue;
+	uint8_t vf_id;  /* VF ID (max = 63) */
+	uint8_t qgrp_id;  /* Queue Group ID */
+	uint16_t aq_id;  /* Atomic Queue ID */
+	uint16_t aq_depth;  /* Depth of atomic queue */
+	uint32_t aq_enqueued;  /* Count how many "batches" have been enqueued */
+	uint32_t aq_dequeued;  /* Count how many "batches" have been dequeued */
+	uint32_t irq_enable;  /* Enable ops dequeue interrupts if set to 1 */
+	struct rte_mempool *fcw_mempool;  /* FCW mempool */
+	enum rte_bbdev_op_type op_type;  /* Type of this Queue: TE or TD */
+	/* Internal Buffers for loopback input */
+	uint8_t *lb_in;
+	uint8_t *lb_out;
+	rte_iova_t lb_in_addr_phys;
+	rte_iova_t lb_out_addr_phys;
+	struct acc100_device *d;
+};
+
 /* Private data structure for each ACC100 device */
 struct acc100_device {
 	void *mmio_base;  /**< Base address of MMIO registers (BAR0) */
+	void *sw_rings_base;  /* Base addr of un-aligned memory for sw rings */
+	void *sw_rings;  /* 64MBs of 64MB aligned memory for sw rings */
+	rte_iova_t sw_rings_phys;  /* Physical address of sw_rings */
+	/* Virtual address of the info memory routed to the this function under
+	 * operation, whether it is PF or VF.
+	 */
+	union acc100_harq_layout_data *harq_layout;
+	uint32_t sw_ring_size;
 	uint32_t ddr_size; /* Size in kB */
+	uint32_t *tail_ptrs; /* Base address of response tail pointer buffer */
+	rte_iova_t tail_ptr_phys; /* Physical address of tail pointers */
+	/* Max number of entries available for each queue in device, depending
+	 * on how many queues are enabled with configure()
+	 */
+	uint32_t sw_ring_max_depth;
 	struct acc100_conf acc100_conf; /* ACC100 Initial configuration */
+	/* Bitmap capturing which Queues have already been assigned */
+	uint16_t q_assigned_bit_map[ACC100_NUM_QGRPS];
 	bool pf_device; /**< True if this is a PF ACC100 device */
 	bool configured; /**< True if this ACC100 device is configured */
 };
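
For context on how the new ops are reached at run time, here is a minimal sketch, not part of the patch, of an application driving them through the public bbdev API: rte_bbdev_setup_queues() lands in acc100_setup_queues() and each rte_bbdev_queue_configure() call lands in acc100_queue_setup(). Device id 0, four queues and the 4G decoder op type are assumptions chosen for illustration, and error handling is kept minimal.

#include <rte_eal.h>
#include <rte_bbdev.h>

int main(int argc, char **argv)
{
	struct rte_bbdev_info info;
	struct rte_bbdev_queue_conf qconf;
	uint16_t dev_id = 0, q_id, num_queues = 4; /* assumed values */

	if (rte_eal_init(argc, argv) < 0)
		return -1;

	rte_bbdev_info_get(dev_id, &info);

	/* Allocates the sw rings and tail pointers (acc100_setup_queues) */
	if (rte_bbdev_setup_queues(dev_id, num_queues, info.socket_id) < 0)
		return -1;

	/* Start from the driver defaults and pick an operation type */
	qconf = info.drv.default_queue_conf;
	qconf.op_type = RTE_BBDEV_OP_TURBO_DEC;

	/* Each call reserves one free atomic queue (acc100_queue_setup) */
	for (q_id = 0; q_id < num_queues; q_id++)
		if (rte_bbdev_queue_configure(dev_id, q_id, &qconf) < 0)
			return -1;

	return rte_bbdev_start(dev_id);
}

Whether a given op_type is actually accepted depends on the capabilities advertised by the PMD; at this point in the series no capability is exposed yet, so the sketch illustrates the call flow only.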