[v2,05/13] vdpa/mlx5: prepare HW queues

Message ID 1580292549-27439-6-git-send-email-matan@mellanox.com (mailing list archive)
State Superseded, archived
Delegated to: Maxime Coquelin
Headers
Series Introduce mlx5 vDPA driver |

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation fail apply issues

Commit Message

Matan Azrad Jan. 29, 2020, 10:09 a.m. UTC
  As an arrangement to the vitrio queues creation, a 2 QPs and CQ may be
created for the virtio queue.

The design is to trigger an event for the guest and for the vdpa driver
when a new CQE is posted by the HW after the packet transition.

This patch add the basic operations to create and destroy the above HW
objects  and to trigger the CQE events when a new CQE is posted.

Signed-off-by: Matan Azrad <matan@mellanox.com>
Acked-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
---
 drivers/common/mlx5/mlx5_prm.h      |   4 +
 drivers/vdpa/mlx5/Makefile          |   1 +
 drivers/vdpa/mlx5/meson.build       |   1 +
 drivers/vdpa/mlx5/mlx5_vdpa.h       |  89 ++++++++
 drivers/vdpa/mlx5/mlx5_vdpa_event.c | 399 ++++++++++++++++++++++++++++++++++++
 5 files changed, 494 insertions(+)
 create mode 100644 drivers/vdpa/mlx5/mlx5_vdpa_event.c
  

Comments

Maxime Coquelin Jan. 30, 2020, 6:17 p.m. UTC | #1
On 1/29/20 11:09 AM, Matan Azrad wrote:
> As an arrangement to the vitrio queues creation, a 2 QPs and CQ may be
> created for the virtio queue.
> 
> The design is to trigger an event for the guest and for the vdpa driver
> when a new CQE is posted by the HW after the packet transition.
> 
> This patch add the basic operations to create and destroy the above HW
> objects  and to trigger the CQE events when a new CQE is posted.
> 
> Signed-off-by: Matan Azrad <matan@mellanox.com>
> Acked-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> ---
>  drivers/common/mlx5/mlx5_prm.h      |   4 +
>  drivers/vdpa/mlx5/Makefile          |   1 +
>  drivers/vdpa/mlx5/meson.build       |   1 +
>  drivers/vdpa/mlx5/mlx5_vdpa.h       |  89 ++++++++
>  drivers/vdpa/mlx5/mlx5_vdpa_event.c | 399 ++++++++++++++++++++++++++++++++++++
>  5 files changed, 494 insertions(+)
>  create mode 100644 drivers/vdpa/mlx5/mlx5_vdpa_event.c
> 
> diff --git a/drivers/common/mlx5/mlx5_prm.h b/drivers/common/mlx5/mlx5_prm.h
> index b48cd0a..b533798 100644
> --- a/drivers/common/mlx5/mlx5_prm.h
> +++ b/drivers/common/mlx5/mlx5_prm.h
> @@ -392,6 +392,10 @@ struct mlx5_cqe {
>  /* CQE format value. */
>  #define MLX5_COMPRESSED 0x3
>  
> +/* CQ doorbell cmd types. */
> +#define MLX5_CQ_DBR_CMD_SOL_ONLY (1 << 24)
> +#define MLX5_CQ_DBR_CMD_ALL (0 << 24)
> +
>  /* Action type of header modification. */
>  enum {
>  	MLX5_MODIFICATION_TYPE_SET = 0x1,
> diff --git a/drivers/vdpa/mlx5/Makefile b/drivers/vdpa/mlx5/Makefile
> index 5472797..7f13756 100644
> --- a/drivers/vdpa/mlx5/Makefile
> +++ b/drivers/vdpa/mlx5/Makefile
> @@ -9,6 +9,7 @@ LIB = librte_pmd_mlx5_vdpa.a
>  # Sources.
>  SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa.c
>  SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa_mem.c
> +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa_event.c
>  
>  # Basic CFLAGS.
>  CFLAGS += -O3
> diff --git a/drivers/vdpa/mlx5/meson.build b/drivers/vdpa/mlx5/meson.build
> index 7e5dd95..c609f7c 100644
> --- a/drivers/vdpa/mlx5/meson.build
> +++ b/drivers/vdpa/mlx5/meson.build
> @@ -13,6 +13,7 @@ deps += ['hash', 'common_mlx5', 'vhost', 'bus_pci', 'eal', 'sched']
>  sources = files(
>  	'mlx5_vdpa.c',
>  	'mlx5_vdpa_mem.c',
> +	'mlx5_vdpa_event.c',
>  )
>  cflags_options = [
>  	'-std=c11',
> diff --git a/drivers/vdpa/mlx5/mlx5_vdpa.h b/drivers/vdpa/mlx5/mlx5_vdpa.h
> index e27baea..30030b7 100644
> --- a/drivers/vdpa/mlx5/mlx5_vdpa.h
> +++ b/drivers/vdpa/mlx5/mlx5_vdpa.h
> @@ -9,9 +9,40 @@
>  
>  #include <rte_vdpa.h>
>  #include <rte_vhost.h>
> +#include <rte_spinlock.h>
> +#include <rte_interrupts.h>
>  
>  #include <mlx5_glue.h>
>  #include <mlx5_devx_cmds.h>
> +#include <mlx5_prm.h>
> +
> +
> +#define MLX5_VDPA_INTR_RETRIES 256
> +#define MLX5_VDPA_INTR_RETRIES_USEC 1000
> +
> +struct mlx5_vdpa_cq {
> +	uint16_t log_desc_n;
> +	uint32_t cq_ci:24;
> +	uint32_t arm_sn:2;
> +	rte_spinlock_t sl;
> +	struct mlx5_devx_obj *cq;
> +	struct mlx5dv_devx_umem *umem_obj;
> +	union {
> +		volatile void *umem_buf;
> +		volatile struct mlx5_cqe *cqes;
> +	};
> +	volatile uint32_t *db_rec;
> +	uint64_t errors;
> +};
> +
> +struct mlx5_vdpa_event_qp {
> +	struct mlx5_vdpa_cq cq;
> +	struct mlx5_devx_obj *fw_qp;
> +	struct mlx5_devx_obj *sw_qp;
> +	struct mlx5dv_devx_umem *umem_obj;
> +	void *umem_buf;
> +	volatile uint32_t *db_rec;
> +};
>  
>  struct mlx5_vdpa_query_mr {
>  	SLIST_ENTRY(mlx5_vdpa_query_mr) next;
> @@ -34,6 +65,10 @@ struct mlx5_vdpa_priv {
>  	uint32_t gpa_mkey_index;
>  	struct ibv_mr *null_mr;
>  	struct rte_vhost_memory *vmem;
> +	uint32_t eqn;
> +	struct mlx5dv_devx_event_channel *eventc;
> +	struct mlx5dv_devx_uar *uar;
> +	struct rte_intr_handle intr_handle;
>  	SLIST_HEAD(mr_list, mlx5_vdpa_query_mr) mr_list;
>  };
>  
> @@ -57,4 +92,58 @@ struct mlx5_vdpa_priv {
>   */
>  int mlx5_vdpa_mem_register(struct mlx5_vdpa_priv *priv);
>  
> +
> +/**
> + * Create an event QP and all its related resources.
> + *
> + * @param[in] priv
> + *   The vdpa driver private structure.
> + * @param[in] desc_n
> + *   Number of descriptors.
> + * @param[in] callfd
> + *   The guest notification file descriptor.
> + * @param[in/out] eqp
> + *   Pointer to the event QP structure.
> + *
> + * @return
> + *   0 on success, -1 otherwise and rte_errno is set.
> + */
> +int mlx5_vdpa_event_qp_create(struct mlx5_vdpa_priv *priv, uint16_t desc_n,
> +			      int callfd, struct mlx5_vdpa_event_qp *eqp);
> +
> +/**
> + * Destroy an event QP and all its related resources.
> + *
> + * @param[in/out] eqp
> + *   Pointer to the event QP structure.
> + */
> +void mlx5_vdpa_event_qp_destroy(struct mlx5_vdpa_event_qp *eqp);
> +
> +/**
> + * Release all the event global resources.
> + *
> + * @param[in] priv
> + *   The vdpa driver private structure.
> + */
> +void mlx5_vdpa_event_qp_global_release(struct mlx5_vdpa_priv *priv);
> +
> +/**
> + * Setup CQE event.
> + *
> + * @param[in] priv
> + *   The vdpa driver private structure.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +int mlx5_vdpa_cqe_event_setup(struct mlx5_vdpa_priv *priv);
> +
> +/**
> + * Unset CQE event .
> + *
> + * @param[in] priv
> + *   The vdpa driver private structure.
> + */
> +void mlx5_vdpa_cqe_event_unset(struct mlx5_vdpa_priv *priv);
> +
>  #endif /* RTE_PMD_MLX5_VDPA_H_ */
> diff --git a/drivers/vdpa/mlx5/mlx5_vdpa_event.c b/drivers/vdpa/mlx5/mlx5_vdpa_event.c
> new file mode 100644
> index 0000000..35518ad
> --- /dev/null
> +++ b/drivers/vdpa/mlx5/mlx5_vdpa_event.c
> @@ -0,0 +1,399 @@
> +/* SPDX-License-Identifier: BSD-3-Clause
> + * Copyright 2019 Mellanox Technologies, Ltd
> + */
> +#include <unistd.h>
> +#include <stdint.h>
> +#include <fcntl.h>
> +
> +#include <rte_malloc.h>
> +#include <rte_errno.h>
> +#include <rte_lcore.h>
> +#include <rte_atomic.h>
> +#include <rte_common.h>
> +#include <rte_io.h>
> +
> +#include <mlx5_common.h>
> +
> +#include "mlx5_vdpa_utils.h"
> +#include "mlx5_vdpa.h"
> +
> +
> +void
> +mlx5_vdpa_event_qp_global_release(struct mlx5_vdpa_priv *priv)
> +{
> +	if (priv->uar) {
> +		mlx5_glue->devx_free_uar(priv->uar);
> +		priv->uar = NULL;
> +	}
> +	if (priv->eventc) {
> +		mlx5_glue->devx_destroy_event_channel(priv->eventc);
> +		priv->eventc = NULL;
> +	}
> +	priv->eqn = 0;
> +}
> +
> +/* Prepare all the global resources for all the event objects.*/
> +static int
> +mlx5_vdpa_event_qp_global_prepare(struct mlx5_vdpa_priv *priv)
> +{
> +	uint32_t lcore;
> +
> +	if (priv->eventc)
> +		return 0;
> +	lcore = (uint32_t)rte_lcore_to_cpu_id(-1);
> +	if (mlx5_glue->devx_query_eqn(priv->ctx, lcore, &priv->eqn)) {
> +		rte_errno = errno;
> +		DRV_LOG(ERR, "Failed to query EQ number %d.", rte_errno);
> +		return -1;
> +	}
> +	priv->eventc = mlx5_glue->devx_create_event_channel(priv->ctx,
> +			   MLX5DV_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA);
> +	if (!priv->eventc) {
> +		rte_errno = errno;
> +		DRV_LOG(ERR, "Failed to create event channel %d.",
> +			rte_errno);
> +		goto error;
> +	}
> +	priv->uar = mlx5_glue->devx_alloc_uar(priv->ctx, 0);
> +	if (!priv->uar) {
> +		rte_errno = errno;
> +		DRV_LOG(ERR, "Failed to allocate UAR.");
> +		goto error;
> +	}
> +	return 0;
> +error:
> +	mlx5_vdpa_event_qp_global_release(priv);
> +	return -1;
> +}
> +
> +static void
> +mlx5_vdpa_cq_destroy(struct mlx5_vdpa_cq *cq)
> +{
> +	if (cq->cq)
> +		claim_zero(mlx5_devx_cmd_destroy(cq->cq));
> +	if (cq->umem_obj)
> +		claim_zero(mlx5_glue->devx_umem_dereg(cq->umem_obj));
> +	if (cq->umem_buf)
> +		rte_free((void *)(uintptr_t)cq->umem_buf);
> +	memset(cq, 0, sizeof(*cq));
> +}
> +
> +static inline void
> +mlx5_vdpa_cq_arm(struct mlx5_vdpa_priv *priv, struct mlx5_vdpa_cq *cq)
> +{
> +	const unsigned int cqe_mask = (1 << cq->log_desc_n) - 1;
> +	uint32_t arm_sn = cq->arm_sn << MLX5_CQ_SQN_OFFSET;
> +	uint32_t cq_ci = cq->cq_ci & MLX5_CI_MASK & cqe_mask;
> +	uint32_t doorbell_hi = arm_sn | MLX5_CQ_DBR_CMD_ALL | cq_ci;
> +	uint64_t doorbell = ((uint64_t)doorbell_hi << 32) | cq->cq->id;
> +	uint64_t db_be = rte_cpu_to_be_64(doorbell);
> +	uint32_t *addr = RTE_PTR_ADD(priv->uar->base_addr, MLX5_CQ_DOORBELL);
> +
> +	rte_io_wmb();
> +	cq->db_rec[MLX5_CQ_ARM_DB] = rte_cpu_to_be_32(doorbell_hi);
> +	rte_wmb();
> +#ifdef RTE_ARCH_64
> +	*(uint64_t *)addr = db_be;
> +#else
> +	*(uint32_t *)addr = db_be;
> +	rte_io_wmb();
> +	*((uint32_t *)addr + 1) = db_be >> 32;
> +#endif
> +	cq->arm_sn++;
> +}
> +
> +static int
> +mlx5_vdpa_cq_create(struct mlx5_vdpa_priv *priv, uint16_t log_desc_n,
> +		    int callfd, struct mlx5_vdpa_cq *cq)
> +{
> +	struct mlx5_devx_cq_attr attr;
> +	size_t pgsize = sysconf(_SC_PAGESIZE);
> +	uint32_t umem_size;
> +	int ret;
> +	uint16_t event_nums[1] = {0};
> +
> +	cq->log_desc_n = log_desc_n;
> +	umem_size = sizeof(struct mlx5_cqe) * (1 << log_desc_n) +
> +							sizeof(*cq->db_rec) * 2;
> +	cq->umem_buf = rte_zmalloc(__func__, umem_size, 4096);
> +	if (!cq->umem_buf) {
> +		DRV_LOG(ERR, "Failed to allocate memory for CQ.");
> +		rte_errno = ENOMEM;
> +		return -ENOMEM;
> +	}
> +	cq->umem_obj = mlx5_glue->devx_umem_reg(priv->ctx,
> +						(void *)(uintptr_t)cq->umem_buf,
> +						umem_size,
> +						IBV_ACCESS_LOCAL_WRITE);
> +	if (!cq->umem_obj) {
> +		DRV_LOG(ERR, "Failed to register umem for CQ.");
> +		goto error;
> +	}
> +	attr.q_umem_valid = 1;
> +	attr.db_umem_valid = 1;
> +	attr.use_first_only = 0;
> +	attr.overrun_ignore = 0;
> +	attr.uar_page_id = priv->uar->page_id;
> +	attr.q_umem_id = cq->umem_obj->umem_id;
> +	attr.q_umem_offset = 0;
> +	attr.db_umem_id = cq->umem_obj->umem_id;
> +	attr.db_umem_offset = sizeof(struct mlx5_cqe) * (1 << log_desc_n);
> +	attr.eqn = priv->eqn;
> +	attr.log_cq_size = log_desc_n;
> +	attr.log_page_size = rte_log2_u32(pgsize);
> +	cq->cq = mlx5_devx_cmd_create_cq(priv->ctx, &attr);
> +	if (!cq->cq)
> +		goto error;
> +	cq->db_rec = RTE_PTR_ADD(cq->umem_buf, (uintptr_t)attr.db_umem_offset);
> +	cq->cq_ci = 0;
> +	rte_spinlock_init(&cq->sl);
> +	/* Subscribe CQ event to the event channel controlled by the driver. */
> +	ret = mlx5_glue->devx_subscribe_devx_event(priv->eventc, cq->cq->obj,
> +						   sizeof(event_nums),
> +						   event_nums,
> +						   (uint64_t)(uintptr_t)cq);
> +	if (ret) {
> +		DRV_LOG(ERR, "Failed to subscribe CQE event.");
> +		rte_errno = errno;
> +		goto error;
> +	}
> +	/* Subscribe CQ event to the guest FD only if it is not in poll mode. */
> +	if (callfd != -1) {
> +		ret = mlx5_glue->devx_subscribe_devx_event_fd(priv->eventc,
> +							      callfd,
> +							      cq->cq->obj, 0);
> +		if (ret) {
> +			DRV_LOG(ERR, "Failed to subscribe CQE event fd.");
> +			rte_errno = errno;
> +			goto error;
> +		}
> +	}
> +	/* First arming. */
> +	mlx5_vdpa_cq_arm(priv, cq);
> +	return 0;
> +error:
> +	mlx5_vdpa_cq_destroy(cq);
> +	return -1;
> +}
> +
> +static inline void __rte_unused
> +mlx5_vdpa_cq_poll(struct mlx5_vdpa_priv *priv __rte_unused,
> +		  struct mlx5_vdpa_cq *cq)
> +{
> +	struct mlx5_vdpa_event_qp *eqp =
> +				container_of(cq, struct mlx5_vdpa_event_qp, cq);
> +	const unsigned int cqe_size = 1 << cq->log_desc_n;
> +	const unsigned int cqe_mask = cqe_size - 1;
> +	int ret;
> +
> +	do {
> +		volatile struct mlx5_cqe *cqe = cq->cqes + (cq->cq_ci &
> +							    cqe_mask);
> +
> +		ret = check_cqe(cqe, cqe_size, cq->cq_ci);
> +		switch (ret) {
> +		case MLX5_CQE_STATUS_ERR:
> +			cq->errors++;
> +			/*fall-through*/
> +		case MLX5_CQE_STATUS_SW_OWN:
> +			cq->cq_ci++;
> +			break;
> +		case MLX5_CQE_STATUS_HW_OWN:
> +		default:
> +			break;
> +		}
> +	} while (ret != MLX5_CQE_STATUS_HW_OWN);

Isn't there a risk of endless loop here?

> +	rte_io_wmb();
> +	/* Ring CQ doorbell record. */
> +	cq->db_rec[0] = rte_cpu_to_be_32(cq->cq_ci);
> +	rte_io_wmb();
> +	/* Ring SW QP doorbell record. */
> +	eqp->db_rec[0] = rte_cpu_to_be_32(cq->cq_ci + cqe_size);
> +}
> +
  
Matan Azrad Jan. 31, 2020, 6:56 a.m. UTC | #2
Hi

From: Maxime Coquelin
> On 1/29/20 11:09 AM, Matan Azrad wrote:
> > As an arrangement to the vitrio queues creation, a 2 QPs and CQ may be
> > created for the virtio queue.
> >
> > The design is to trigger an event for the guest and for the vdpa
> > driver when a new CQE is posted by the HW after the packet transition.
> >
> > This patch add the basic operations to create and destroy the above HW
> > objects  and to trigger the CQE events when a new CQE is posted.
> >
> > Signed-off-by: Matan Azrad <matan@mellanox.com>
> > Acked-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
> > ---
> >  drivers/common/mlx5/mlx5_prm.h      |   4 +
> >  drivers/vdpa/mlx5/Makefile          |   1 +
> >  drivers/vdpa/mlx5/meson.build       |   1 +
> >  drivers/vdpa/mlx5/mlx5_vdpa.h       |  89 ++++++++
> >  drivers/vdpa/mlx5/mlx5_vdpa_event.c | 399
> > ++++++++++++++++++++++++++++++++++++
> >  5 files changed, 494 insertions(+)
> >  create mode 100644 drivers/vdpa/mlx5/mlx5_vdpa_event.c
> >
> > diff --git a/drivers/common/mlx5/mlx5_prm.h
> > b/drivers/common/mlx5/mlx5_prm.h index b48cd0a..b533798 100644
> > --- a/drivers/common/mlx5/mlx5_prm.h
> > +++ b/drivers/common/mlx5/mlx5_prm.h
> > @@ -392,6 +392,10 @@ struct mlx5_cqe {
> >  /* CQE format value. */
> >  #define MLX5_COMPRESSED 0x3
> >
> > +/* CQ doorbell cmd types. */
> > +#define MLX5_CQ_DBR_CMD_SOL_ONLY (1 << 24) #define
> > +MLX5_CQ_DBR_CMD_ALL (0 << 24)
> > +
> >  /* Action type of header modification. */  enum {
> >  	MLX5_MODIFICATION_TYPE_SET = 0x1,
> > diff --git a/drivers/vdpa/mlx5/Makefile b/drivers/vdpa/mlx5/Makefile
> > index 5472797..7f13756 100644
> > --- a/drivers/vdpa/mlx5/Makefile
> > +++ b/drivers/vdpa/mlx5/Makefile
> > @@ -9,6 +9,7 @@ LIB = librte_pmd_mlx5_vdpa.a  # Sources.
> >  SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa.c
> >  SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa_mem.c
> > +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa_event.c
> >
> >  # Basic CFLAGS.
> >  CFLAGS += -O3
> > diff --git a/drivers/vdpa/mlx5/meson.build
> > b/drivers/vdpa/mlx5/meson.build index 7e5dd95..c609f7c 100644
> > --- a/drivers/vdpa/mlx5/meson.build
> > +++ b/drivers/vdpa/mlx5/meson.build
> > @@ -13,6 +13,7 @@ deps += ['hash', 'common_mlx5', 'vhost', 'bus_pci',
> > 'eal', 'sched']  sources = files(
> >  	'mlx5_vdpa.c',
> >  	'mlx5_vdpa_mem.c',
> > +	'mlx5_vdpa_event.c',
> >  )
> >  cflags_options = [
> >  	'-std=c11',
> > diff --git a/drivers/vdpa/mlx5/mlx5_vdpa.h
> > b/drivers/vdpa/mlx5/mlx5_vdpa.h index e27baea..30030b7 100644
> > --- a/drivers/vdpa/mlx5/mlx5_vdpa.h
> > +++ b/drivers/vdpa/mlx5/mlx5_vdpa.h
> > @@ -9,9 +9,40 @@
> >
> >  #include <rte_vdpa.h>
> >  #include <rte_vhost.h>
> > +#include <rte_spinlock.h>
> > +#include <rte_interrupts.h>
> >
> >  #include <mlx5_glue.h>
> >  #include <mlx5_devx_cmds.h>
> > +#include <mlx5_prm.h>
> > +
> > +
> > +#define MLX5_VDPA_INTR_RETRIES 256
> > +#define MLX5_VDPA_INTR_RETRIES_USEC 1000
> > +
> > +struct mlx5_vdpa_cq {
> > +	uint16_t log_desc_n;
> > +	uint32_t cq_ci:24;
> > +	uint32_t arm_sn:2;
> > +	rte_spinlock_t sl;
> > +	struct mlx5_devx_obj *cq;
> > +	struct mlx5dv_devx_umem *umem_obj;
> > +	union {
> > +		volatile void *umem_buf;
> > +		volatile struct mlx5_cqe *cqes;
> > +	};
> > +	volatile uint32_t *db_rec;
> > +	uint64_t errors;
> > +};
> > +
> > +struct mlx5_vdpa_event_qp {
> > +	struct mlx5_vdpa_cq cq;
> > +	struct mlx5_devx_obj *fw_qp;
> > +	struct mlx5_devx_obj *sw_qp;
> > +	struct mlx5dv_devx_umem *umem_obj;
> > +	void *umem_buf;
> > +	volatile uint32_t *db_rec;
> > +};
> >
> >  struct mlx5_vdpa_query_mr {
> >  	SLIST_ENTRY(mlx5_vdpa_query_mr) next; @@ -34,6 +65,10 @@
> struct
> > mlx5_vdpa_priv {
> >  	uint32_t gpa_mkey_index;
> >  	struct ibv_mr *null_mr;
> >  	struct rte_vhost_memory *vmem;
> > +	uint32_t eqn;
> > +	struct mlx5dv_devx_event_channel *eventc;
> > +	struct mlx5dv_devx_uar *uar;
> > +	struct rte_intr_handle intr_handle;
> >  	SLIST_HEAD(mr_list, mlx5_vdpa_query_mr) mr_list;  };
> >
> > @@ -57,4 +92,58 @@ struct mlx5_vdpa_priv {
> >   */
> >  int mlx5_vdpa_mem_register(struct mlx5_vdpa_priv *priv);
> >
> > +
> > +/**
> > + * Create an event QP and all its related resources.
> > + *
> > + * @param[in] priv
> > + *   The vdpa driver private structure.
> > + * @param[in] desc_n
> > + *   Number of descriptors.
> > + * @param[in] callfd
> > + *   The guest notification file descriptor.
> > + * @param[in/out] eqp
> > + *   Pointer to the event QP structure.
> > + *
> > + * @return
> > + *   0 on success, -1 otherwise and rte_errno is set.
> > + */
> > +int mlx5_vdpa_event_qp_create(struct mlx5_vdpa_priv *priv, uint16_t
> desc_n,
> > +			      int callfd, struct mlx5_vdpa_event_qp *eqp);
> > +
> > +/**
> > + * Destroy an event QP and all its related resources.
> > + *
> > + * @param[in/out] eqp
> > + *   Pointer to the event QP structure.
> > + */
> > +void mlx5_vdpa_event_qp_destroy(struct mlx5_vdpa_event_qp *eqp);
> > +
> > +/**
> > + * Release all the event global resources.
> > + *
> > + * @param[in] priv
> > + *   The vdpa driver private structure.
> > + */
> > +void mlx5_vdpa_event_qp_global_release(struct mlx5_vdpa_priv *priv);
> > +
> > +/**
> > + * Setup CQE event.
> > + *
> > + * @param[in] priv
> > + *   The vdpa driver private structure.
> > + *
> > + * @return
> > + *   0 on success, a negative errno value otherwise and rte_errno is set.
> > + */
> > +int mlx5_vdpa_cqe_event_setup(struct mlx5_vdpa_priv *priv);
> > +
> > +/**
> > + * Unset CQE event .
> > + *
> > + * @param[in] priv
> > + *   The vdpa driver private structure.
> > + */
> > +void mlx5_vdpa_cqe_event_unset(struct mlx5_vdpa_priv *priv);
> > +
> >  #endif /* RTE_PMD_MLX5_VDPA_H_ */
> > diff --git a/drivers/vdpa/mlx5/mlx5_vdpa_event.c
> > b/drivers/vdpa/mlx5/mlx5_vdpa_event.c
> > new file mode 100644
> > index 0000000..35518ad
> > --- /dev/null
> > +++ b/drivers/vdpa/mlx5/mlx5_vdpa_event.c
> > @@ -0,0 +1,399 @@
> > +/* SPDX-License-Identifier: BSD-3-Clause
> > + * Copyright 2019 Mellanox Technologies, Ltd  */ #include <unistd.h>
> > +#include <stdint.h> #include <fcntl.h>
> > +
> > +#include <rte_malloc.h>
> > +#include <rte_errno.h>
> > +#include <rte_lcore.h>
> > +#include <rte_atomic.h>
> > +#include <rte_common.h>
> > +#include <rte_io.h>
> > +
> > +#include <mlx5_common.h>
> > +
> > +#include "mlx5_vdpa_utils.h"
> > +#include "mlx5_vdpa.h"
> > +
> > +
> > +void
> > +mlx5_vdpa_event_qp_global_release(struct mlx5_vdpa_priv *priv) {
> > +	if (priv->uar) {
> > +		mlx5_glue->devx_free_uar(priv->uar);
> > +		priv->uar = NULL;
> > +	}
> > +	if (priv->eventc) {
> > +		mlx5_glue->devx_destroy_event_channel(priv->eventc);
> > +		priv->eventc = NULL;
> > +	}
> > +	priv->eqn = 0;
> > +}
> > +
> > +/* Prepare all the global resources for all the event objects.*/
> > +static int mlx5_vdpa_event_qp_global_prepare(struct mlx5_vdpa_priv
> > +*priv) {
> > +	uint32_t lcore;
> > +
> > +	if (priv->eventc)
> > +		return 0;
> > +	lcore = (uint32_t)rte_lcore_to_cpu_id(-1);
> > +	if (mlx5_glue->devx_query_eqn(priv->ctx, lcore, &priv->eqn)) {
> > +		rte_errno = errno;
> > +		DRV_LOG(ERR, "Failed to query EQ number %d.", rte_errno);
> > +		return -1;
> > +	}
> > +	priv->eventc = mlx5_glue->devx_create_event_channel(priv->ctx,
> > +
> MLX5DV_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA);
> > +	if (!priv->eventc) {
> > +		rte_errno = errno;
> > +		DRV_LOG(ERR, "Failed to create event channel %d.",
> > +			rte_errno);
> > +		goto error;
> > +	}
> > +	priv->uar = mlx5_glue->devx_alloc_uar(priv->ctx, 0);
> > +	if (!priv->uar) {
> > +		rte_errno = errno;
> > +		DRV_LOG(ERR, "Failed to allocate UAR.");
> > +		goto error;
> > +	}
> > +	return 0;
> > +error:
> > +	mlx5_vdpa_event_qp_global_release(priv);
> > +	return -1;
> > +}
> > +
> > +static void
> > +mlx5_vdpa_cq_destroy(struct mlx5_vdpa_cq *cq) {
> > +	if (cq->cq)
> > +		claim_zero(mlx5_devx_cmd_destroy(cq->cq));
> > +	if (cq->umem_obj)
> > +		claim_zero(mlx5_glue->devx_umem_dereg(cq-
> >umem_obj));
> > +	if (cq->umem_buf)
> > +		rte_free((void *)(uintptr_t)cq->umem_buf);
> > +	memset(cq, 0, sizeof(*cq));
> > +}
> > +
> > +static inline void
> > +mlx5_vdpa_cq_arm(struct mlx5_vdpa_priv *priv, struct mlx5_vdpa_cq
> > +*cq) {
> > +	const unsigned int cqe_mask = (1 << cq->log_desc_n) - 1;
> > +	uint32_t arm_sn = cq->arm_sn << MLX5_CQ_SQN_OFFSET;
> > +	uint32_t cq_ci = cq->cq_ci & MLX5_CI_MASK & cqe_mask;
> > +	uint32_t doorbell_hi = arm_sn | MLX5_CQ_DBR_CMD_ALL | cq_ci;
> > +	uint64_t doorbell = ((uint64_t)doorbell_hi << 32) | cq->cq->id;
> > +	uint64_t db_be = rte_cpu_to_be_64(doorbell);
> > +	uint32_t *addr = RTE_PTR_ADD(priv->uar->base_addr,
> > +MLX5_CQ_DOORBELL);
> > +
> > +	rte_io_wmb();
> > +	cq->db_rec[MLX5_CQ_ARM_DB] = rte_cpu_to_be_32(doorbell_hi);
> > +	rte_wmb();
> > +#ifdef RTE_ARCH_64
> > +	*(uint64_t *)addr = db_be;
> > +#else
> > +	*(uint32_t *)addr = db_be;
> > +	rte_io_wmb();
> > +	*((uint32_t *)addr + 1) = db_be >> 32; #endif
> > +	cq->arm_sn++;
> > +}
> > +
> > +static int
> > +mlx5_vdpa_cq_create(struct mlx5_vdpa_priv *priv, uint16_t log_desc_n,
> > +		    int callfd, struct mlx5_vdpa_cq *cq) {
> > +	struct mlx5_devx_cq_attr attr;
> > +	size_t pgsize = sysconf(_SC_PAGESIZE);
> > +	uint32_t umem_size;
> > +	int ret;
> > +	uint16_t event_nums[1] = {0};
> > +
> > +	cq->log_desc_n = log_desc_n;
> > +	umem_size = sizeof(struct mlx5_cqe) * (1 << log_desc_n) +
> > +							sizeof(*cq->db_rec)
> * 2;
> > +	cq->umem_buf = rte_zmalloc(__func__, umem_size, 4096);
> > +	if (!cq->umem_buf) {
> > +		DRV_LOG(ERR, "Failed to allocate memory for CQ.");
> > +		rte_errno = ENOMEM;
> > +		return -ENOMEM;
> > +	}
> > +	cq->umem_obj = mlx5_glue->devx_umem_reg(priv->ctx,
> > +						(void *)(uintptr_t)cq-
> >umem_buf,
> > +						umem_size,
> > +						IBV_ACCESS_LOCAL_WRITE);
> > +	if (!cq->umem_obj) {
> > +		DRV_LOG(ERR, "Failed to register umem for CQ.");
> > +		goto error;
> > +	}
> > +	attr.q_umem_valid = 1;
> > +	attr.db_umem_valid = 1;
> > +	attr.use_first_only = 0;
> > +	attr.overrun_ignore = 0;
> > +	attr.uar_page_id = priv->uar->page_id;
> > +	attr.q_umem_id = cq->umem_obj->umem_id;
> > +	attr.q_umem_offset = 0;
> > +	attr.db_umem_id = cq->umem_obj->umem_id;
> > +	attr.db_umem_offset = sizeof(struct mlx5_cqe) * (1 << log_desc_n);
> > +	attr.eqn = priv->eqn;
> > +	attr.log_cq_size = log_desc_n;
> > +	attr.log_page_size = rte_log2_u32(pgsize);
> > +	cq->cq = mlx5_devx_cmd_create_cq(priv->ctx, &attr);
> > +	if (!cq->cq)
> > +		goto error;
> > +	cq->db_rec = RTE_PTR_ADD(cq->umem_buf,
> (uintptr_t)attr.db_umem_offset);
> > +	cq->cq_ci = 0;
> > +	rte_spinlock_init(&cq->sl);
> > +	/* Subscribe CQ event to the event channel controlled by the driver.
> */
> > +	ret = mlx5_glue->devx_subscribe_devx_event(priv->eventc, cq->cq-
> >obj,
> > +						   sizeof(event_nums),
> > +						   event_nums,
> > +						   (uint64_t)(uintptr_t)cq);
> > +	if (ret) {
> > +		DRV_LOG(ERR, "Failed to subscribe CQE event.");
> > +		rte_errno = errno;
> > +		goto error;
> > +	}
> > +	/* Subscribe CQ event to the guest FD only if it is not in poll mode. */
> > +	if (callfd != -1) {
> > +		ret = mlx5_glue->devx_subscribe_devx_event_fd(priv-
> >eventc,
> > +							      callfd,
> > +							      cq->cq->obj, 0);
> > +		if (ret) {
> > +			DRV_LOG(ERR, "Failed to subscribe CQE event fd.");
> > +			rte_errno = errno;
> > +			goto error;
> > +		}
> > +	}
> > +	/* First arming. */
> > +	mlx5_vdpa_cq_arm(priv, cq);
> > +	return 0;
> > +error:
> > +	mlx5_vdpa_cq_destroy(cq);
> > +	return -1;
> > +}
> > +
> > +static inline void __rte_unused
> > +mlx5_vdpa_cq_poll(struct mlx5_vdpa_priv *priv __rte_unused,
> > +		  struct mlx5_vdpa_cq *cq)
> > +{
> > +	struct mlx5_vdpa_event_qp *eqp =
> > +				container_of(cq, struct
> mlx5_vdpa_event_qp, cq);
> > +	const unsigned int cqe_size = 1 << cq->log_desc_n;
> > +	const unsigned int cqe_mask = cqe_size - 1;
> > +	int ret;
> > +
> > +	do {
> > +		volatile struct mlx5_cqe *cqe = cq->cqes + (cq->cq_ci &
> > +							    cqe_mask);
> > +
> > +		ret = check_cqe(cqe, cqe_size, cq->cq_ci);
> > +		switch (ret) {
> > +		case MLX5_CQE_STATUS_ERR:
> > +			cq->errors++;
> > +			/*fall-through*/
> > +		case MLX5_CQE_STATUS_SW_OWN:
> > +			cq->cq_ci++;
> > +			break;
> > +		case MLX5_CQE_STATUS_HW_OWN:
> > +		default:
> > +			break;
> > +		}
> > +	} while (ret != MLX5_CQE_STATUS_HW_OWN);
> 
> Isn't there a risk of endless loop here?

No. maximum iterations number is the CQ size , since HW cannot write more CQEs before the doorbell record is updated.

> 
> > +	rte_io_wmb();
> > +	/* Ring CQ doorbell record. */
> > +	cq->db_rec[0] = rte_cpu_to_be_32(cq->cq_ci);
> > +	rte_io_wmb();
> > +	/* Ring SW QP doorbell record. */
> > +	eqp->db_rec[0] = rte_cpu_to_be_32(cq->cq_ci + cqe_size); }
> > +
  
Maxime Coquelin Jan. 31, 2020, 2:47 p.m. UTC | #3
On 1/31/20 7:56 AM, Matan Azrad wrote:
> 
> Hi
> 
> From: Maxime Coquelin
>> On 1/29/20 11:09 AM, Matan Azrad wrote:
>>> As an arrangement to the vitrio queues creation, a 2 QPs and CQ may be
>>> created for the virtio queue.
>>>
>>> The design is to trigger an event for the guest and for the vdpa
>>> driver when a new CQE is posted by the HW after the packet transition.
>>>
>>> This patch add the basic operations to create and destroy the above HW
>>> objects  and to trigger the CQE events when a new CQE is posted.
>>>
>>> Signed-off-by: Matan Azrad <matan@mellanox.com>
>>> Acked-by: Viacheslav Ovsiienko <viacheslavo@mellanox.com>
>>> ---
>>>  drivers/common/mlx5/mlx5_prm.h      |   4 +
>>>  drivers/vdpa/mlx5/Makefile          |   1 +
>>>  drivers/vdpa/mlx5/meson.build       |   1 +
>>>  drivers/vdpa/mlx5/mlx5_vdpa.h       |  89 ++++++++
>>>  drivers/vdpa/mlx5/mlx5_vdpa_event.c | 399
>>> ++++++++++++++++++++++++++++++++++++
>>>  5 files changed, 494 insertions(+)
>>>  create mode 100644 drivers/vdpa/mlx5/mlx5_vdpa_event.c
>>>
>>> diff --git a/drivers/common/mlx5/mlx5_prm.h
>>> b/drivers/common/mlx5/mlx5_prm.h index b48cd0a..b533798 100644
>>> --- a/drivers/common/mlx5/mlx5_prm.h
>>> +++ b/drivers/common/mlx5/mlx5_prm.h
>>> @@ -392,6 +392,10 @@ struct mlx5_cqe {
>>>  /* CQE format value. */
>>>  #define MLX5_COMPRESSED 0x3
>>>
>>> +/* CQ doorbell cmd types. */
>>> +#define MLX5_CQ_DBR_CMD_SOL_ONLY (1 << 24) #define
>>> +MLX5_CQ_DBR_CMD_ALL (0 << 24)
>>> +
>>>  /* Action type of header modification. */  enum {
>>>  	MLX5_MODIFICATION_TYPE_SET = 0x1,
>>> diff --git a/drivers/vdpa/mlx5/Makefile b/drivers/vdpa/mlx5/Makefile
>>> index 5472797..7f13756 100644
>>> --- a/drivers/vdpa/mlx5/Makefile
>>> +++ b/drivers/vdpa/mlx5/Makefile
>>> @@ -9,6 +9,7 @@ LIB = librte_pmd_mlx5_vdpa.a  # Sources.
>>>  SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa.c
>>>  SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa_mem.c
>>> +SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa_event.c
>>>
>>>  # Basic CFLAGS.
>>>  CFLAGS += -O3
>>> diff --git a/drivers/vdpa/mlx5/meson.build
>>> b/drivers/vdpa/mlx5/meson.build index 7e5dd95..c609f7c 100644
>>> --- a/drivers/vdpa/mlx5/meson.build
>>> +++ b/drivers/vdpa/mlx5/meson.build
>>> @@ -13,6 +13,7 @@ deps += ['hash', 'common_mlx5', 'vhost', 'bus_pci',
>>> 'eal', 'sched']  sources = files(
>>>  	'mlx5_vdpa.c',
>>>  	'mlx5_vdpa_mem.c',
>>> +	'mlx5_vdpa_event.c',
>>>  )
>>>  cflags_options = [
>>>  	'-std=c11',
>>> diff --git a/drivers/vdpa/mlx5/mlx5_vdpa.h
>>> b/drivers/vdpa/mlx5/mlx5_vdpa.h index e27baea..30030b7 100644
>>> --- a/drivers/vdpa/mlx5/mlx5_vdpa.h
>>> +++ b/drivers/vdpa/mlx5/mlx5_vdpa.h
>>> @@ -9,9 +9,40 @@
>>>
>>>  #include <rte_vdpa.h>
>>>  #include <rte_vhost.h>
>>> +#include <rte_spinlock.h>
>>> +#include <rte_interrupts.h>
>>>
>>>  #include <mlx5_glue.h>
>>>  #include <mlx5_devx_cmds.h>
>>> +#include <mlx5_prm.h>
>>> +
>>> +
>>> +#define MLX5_VDPA_INTR_RETRIES 256
>>> +#define MLX5_VDPA_INTR_RETRIES_USEC 1000
>>> +
>>> +struct mlx5_vdpa_cq {
>>> +	uint16_t log_desc_n;
>>> +	uint32_t cq_ci:24;
>>> +	uint32_t arm_sn:2;
>>> +	rte_spinlock_t sl;
>>> +	struct mlx5_devx_obj *cq;
>>> +	struct mlx5dv_devx_umem *umem_obj;
>>> +	union {
>>> +		volatile void *umem_buf;
>>> +		volatile struct mlx5_cqe *cqes;
>>> +	};
>>> +	volatile uint32_t *db_rec;
>>> +	uint64_t errors;
>>> +};
>>> +
>>> +struct mlx5_vdpa_event_qp {
>>> +	struct mlx5_vdpa_cq cq;
>>> +	struct mlx5_devx_obj *fw_qp;
>>> +	struct mlx5_devx_obj *sw_qp;
>>> +	struct mlx5dv_devx_umem *umem_obj;
>>> +	void *umem_buf;
>>> +	volatile uint32_t *db_rec;
>>> +};
>>>
>>>  struct mlx5_vdpa_query_mr {
>>>  	SLIST_ENTRY(mlx5_vdpa_query_mr) next; @@ -34,6 +65,10 @@
>> struct
>>> mlx5_vdpa_priv {
>>>  	uint32_t gpa_mkey_index;
>>>  	struct ibv_mr *null_mr;
>>>  	struct rte_vhost_memory *vmem;
>>> +	uint32_t eqn;
>>> +	struct mlx5dv_devx_event_channel *eventc;
>>> +	struct mlx5dv_devx_uar *uar;
>>> +	struct rte_intr_handle intr_handle;
>>>  	SLIST_HEAD(mr_list, mlx5_vdpa_query_mr) mr_list;  };
>>>
>>> @@ -57,4 +92,58 @@ struct mlx5_vdpa_priv {
>>>   */
>>>  int mlx5_vdpa_mem_register(struct mlx5_vdpa_priv *priv);
>>>
>>> +
>>> +/**
>>> + * Create an event QP and all its related resources.
>>> + *
>>> + * @param[in] priv
>>> + *   The vdpa driver private structure.
>>> + * @param[in] desc_n
>>> + *   Number of descriptors.
>>> + * @param[in] callfd
>>> + *   The guest notification file descriptor.
>>> + * @param[in/out] eqp
>>> + *   Pointer to the event QP structure.
>>> + *
>>> + * @return
>>> + *   0 on success, -1 otherwise and rte_errno is set.
>>> + */
>>> +int mlx5_vdpa_event_qp_create(struct mlx5_vdpa_priv *priv, uint16_t
>> desc_n,
>>> +			      int callfd, struct mlx5_vdpa_event_qp *eqp);
>>> +
>>> +/**
>>> + * Destroy an event QP and all its related resources.
>>> + *
>>> + * @param[in/out] eqp
>>> + *   Pointer to the event QP structure.
>>> + */
>>> +void mlx5_vdpa_event_qp_destroy(struct mlx5_vdpa_event_qp *eqp);
>>> +
>>> +/**
>>> + * Release all the event global resources.
>>> + *
>>> + * @param[in] priv
>>> + *   The vdpa driver private structure.
>>> + */
>>> +void mlx5_vdpa_event_qp_global_release(struct mlx5_vdpa_priv *priv);
>>> +
>>> +/**
>>> + * Setup CQE event.
>>> + *
>>> + * @param[in] priv
>>> + *   The vdpa driver private structure.
>>> + *
>>> + * @return
>>> + *   0 on success, a negative errno value otherwise and rte_errno is set.
>>> + */
>>> +int mlx5_vdpa_cqe_event_setup(struct mlx5_vdpa_priv *priv);
>>> +
>>> +/**
>>> + * Unset CQE event .
>>> + *
>>> + * @param[in] priv
>>> + *   The vdpa driver private structure.
>>> + */
>>> +void mlx5_vdpa_cqe_event_unset(struct mlx5_vdpa_priv *priv);
>>> +
>>>  #endif /* RTE_PMD_MLX5_VDPA_H_ */
>>> diff --git a/drivers/vdpa/mlx5/mlx5_vdpa_event.c
>>> b/drivers/vdpa/mlx5/mlx5_vdpa_event.c
>>> new file mode 100644
>>> index 0000000..35518ad
>>> --- /dev/null
>>> +++ b/drivers/vdpa/mlx5/mlx5_vdpa_event.c
>>> @@ -0,0 +1,399 @@
>>> +/* SPDX-License-Identifier: BSD-3-Clause
>>> + * Copyright 2019 Mellanox Technologies, Ltd  */ #include <unistd.h>
>>> +#include <stdint.h> #include <fcntl.h>
>>> +
>>> +#include <rte_malloc.h>
>>> +#include <rte_errno.h>
>>> +#include <rte_lcore.h>
>>> +#include <rte_atomic.h>
>>> +#include <rte_common.h>
>>> +#include <rte_io.h>
>>> +
>>> +#include <mlx5_common.h>
>>> +
>>> +#include "mlx5_vdpa_utils.h"
>>> +#include "mlx5_vdpa.h"
>>> +
>>> +
>>> +void
>>> +mlx5_vdpa_event_qp_global_release(struct mlx5_vdpa_priv *priv) {
>>> +	if (priv->uar) {
>>> +		mlx5_glue->devx_free_uar(priv->uar);
>>> +		priv->uar = NULL;
>>> +	}
>>> +	if (priv->eventc) {
>>> +		mlx5_glue->devx_destroy_event_channel(priv->eventc);
>>> +		priv->eventc = NULL;
>>> +	}
>>> +	priv->eqn = 0;
>>> +}
>>> +
>>> +/* Prepare all the global resources for all the event objects.*/
>>> +static int mlx5_vdpa_event_qp_global_prepare(struct mlx5_vdpa_priv
>>> +*priv) {
>>> +	uint32_t lcore;
>>> +
>>> +	if (priv->eventc)
>>> +		return 0;
>>> +	lcore = (uint32_t)rte_lcore_to_cpu_id(-1);
>>> +	if (mlx5_glue->devx_query_eqn(priv->ctx, lcore, &priv->eqn)) {
>>> +		rte_errno = errno;
>>> +		DRV_LOG(ERR, "Failed to query EQ number %d.", rte_errno);
>>> +		return -1;
>>> +	}
>>> +	priv->eventc = mlx5_glue->devx_create_event_channel(priv->ctx,
>>> +
>> MLX5DV_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA);
>>> +	if (!priv->eventc) {
>>> +		rte_errno = errno;
>>> +		DRV_LOG(ERR, "Failed to create event channel %d.",
>>> +			rte_errno);
>>> +		goto error;
>>> +	}
>>> +	priv->uar = mlx5_glue->devx_alloc_uar(priv->ctx, 0);
>>> +	if (!priv->uar) {
>>> +		rte_errno = errno;
>>> +		DRV_LOG(ERR, "Failed to allocate UAR.");
>>> +		goto error;
>>> +	}
>>> +	return 0;
>>> +error:
>>> +	mlx5_vdpa_event_qp_global_release(priv);
>>> +	return -1;
>>> +}
>>> +
>>> +static void
>>> +mlx5_vdpa_cq_destroy(struct mlx5_vdpa_cq *cq) {
>>> +	if (cq->cq)
>>> +		claim_zero(mlx5_devx_cmd_destroy(cq->cq));
>>> +	if (cq->umem_obj)
>>> +		claim_zero(mlx5_glue->devx_umem_dereg(cq-
>>> umem_obj));
>>> +	if (cq->umem_buf)
>>> +		rte_free((void *)(uintptr_t)cq->umem_buf);
>>> +	memset(cq, 0, sizeof(*cq));
>>> +}
>>> +
>>> +static inline void
>>> +mlx5_vdpa_cq_arm(struct mlx5_vdpa_priv *priv, struct mlx5_vdpa_cq
>>> +*cq) {
>>> +	const unsigned int cqe_mask = (1 << cq->log_desc_n) - 1;
>>> +	uint32_t arm_sn = cq->arm_sn << MLX5_CQ_SQN_OFFSET;
>>> +	uint32_t cq_ci = cq->cq_ci & MLX5_CI_MASK & cqe_mask;
>>> +	uint32_t doorbell_hi = arm_sn | MLX5_CQ_DBR_CMD_ALL | cq_ci;
>>> +	uint64_t doorbell = ((uint64_t)doorbell_hi << 32) | cq->cq->id;
>>> +	uint64_t db_be = rte_cpu_to_be_64(doorbell);
>>> +	uint32_t *addr = RTE_PTR_ADD(priv->uar->base_addr,
>>> +MLX5_CQ_DOORBELL);
>>> +
>>> +	rte_io_wmb();
>>> +	cq->db_rec[MLX5_CQ_ARM_DB] = rte_cpu_to_be_32(doorbell_hi);
>>> +	rte_wmb();
>>> +#ifdef RTE_ARCH_64
>>> +	*(uint64_t *)addr = db_be;
>>> +#else
>>> +	*(uint32_t *)addr = db_be;
>>> +	rte_io_wmb();
>>> +	*((uint32_t *)addr + 1) = db_be >> 32; #endif
>>> +	cq->arm_sn++;
>>> +}
>>> +
>>> +static int
>>> +mlx5_vdpa_cq_create(struct mlx5_vdpa_priv *priv, uint16_t log_desc_n,
>>> +		    int callfd, struct mlx5_vdpa_cq *cq) {
>>> +	struct mlx5_devx_cq_attr attr;
>>> +	size_t pgsize = sysconf(_SC_PAGESIZE);
>>> +	uint32_t umem_size;
>>> +	int ret;
>>> +	uint16_t event_nums[1] = {0};
>>> +
>>> +	cq->log_desc_n = log_desc_n;
>>> +	umem_size = sizeof(struct mlx5_cqe) * (1 << log_desc_n) +
>>> +							sizeof(*cq->db_rec)
>> * 2;
>>> +	cq->umem_buf = rte_zmalloc(__func__, umem_size, 4096);
>>> +	if (!cq->umem_buf) {
>>> +		DRV_LOG(ERR, "Failed to allocate memory for CQ.");
>>> +		rte_errno = ENOMEM;
>>> +		return -ENOMEM;
>>> +	}
>>> +	cq->umem_obj = mlx5_glue->devx_umem_reg(priv->ctx,
>>> +						(void *)(uintptr_t)cq-
>>> umem_buf,
>>> +						umem_size,
>>> +						IBV_ACCESS_LOCAL_WRITE);
>>> +	if (!cq->umem_obj) {
>>> +		DRV_LOG(ERR, "Failed to register umem for CQ.");
>>> +		goto error;
>>> +	}
>>> +	attr.q_umem_valid = 1;
>>> +	attr.db_umem_valid = 1;
>>> +	attr.use_first_only = 0;
>>> +	attr.overrun_ignore = 0;
>>> +	attr.uar_page_id = priv->uar->page_id;
>>> +	attr.q_umem_id = cq->umem_obj->umem_id;
>>> +	attr.q_umem_offset = 0;
>>> +	attr.db_umem_id = cq->umem_obj->umem_id;
>>> +	attr.db_umem_offset = sizeof(struct mlx5_cqe) * (1 << log_desc_n);
>>> +	attr.eqn = priv->eqn;
>>> +	attr.log_cq_size = log_desc_n;
>>> +	attr.log_page_size = rte_log2_u32(pgsize);
>>> +	cq->cq = mlx5_devx_cmd_create_cq(priv->ctx, &attr);
>>> +	if (!cq->cq)
>>> +		goto error;
>>> +	cq->db_rec = RTE_PTR_ADD(cq->umem_buf,
>> (uintptr_t)attr.db_umem_offset);
>>> +	cq->cq_ci = 0;
>>> +	rte_spinlock_init(&cq->sl);
>>> +	/* Subscribe CQ event to the event channel controlled by the driver.
>> */
>>> +	ret = mlx5_glue->devx_subscribe_devx_event(priv->eventc, cq->cq-
>>> obj,
>>> +						   sizeof(event_nums),
>>> +						   event_nums,
>>> +						   (uint64_t)(uintptr_t)cq);
>>> +	if (ret) {
>>> +		DRV_LOG(ERR, "Failed to subscribe CQE event.");
>>> +		rte_errno = errno;
>>> +		goto error;
>>> +	}
>>> +	/* Subscribe CQ event to the guest FD only if it is not in poll mode. */
>>> +	if (callfd != -1) {
>>> +		ret = mlx5_glue->devx_subscribe_devx_event_fd(priv-
>>> eventc,
>>> +							      callfd,
>>> +							      cq->cq->obj, 0);
>>> +		if (ret) {
>>> +			DRV_LOG(ERR, "Failed to subscribe CQE event fd.");
>>> +			rte_errno = errno;
>>> +			goto error;
>>> +		}
>>> +	}
>>> +	/* First arming. */
>>> +	mlx5_vdpa_cq_arm(priv, cq);
>>> +	return 0;
>>> +error:
>>> +	mlx5_vdpa_cq_destroy(cq);
>>> +	return -1;
>>> +}
>>> +
>>> +static inline void __rte_unused
>>> +mlx5_vdpa_cq_poll(struct mlx5_vdpa_priv *priv __rte_unused,
>>> +		  struct mlx5_vdpa_cq *cq)
>>> +{
>>> +	struct mlx5_vdpa_event_qp *eqp =
>>> +				container_of(cq, struct
>> mlx5_vdpa_event_qp, cq);
>>> +	const unsigned int cqe_size = 1 << cq->log_desc_n;
>>> +	const unsigned int cqe_mask = cqe_size - 1;
>>> +	int ret;
>>> +
>>> +	do {
>>> +		volatile struct mlx5_cqe *cqe = cq->cqes + (cq->cq_ci &
>>> +							    cqe_mask);
>>> +
>>> +		ret = check_cqe(cqe, cqe_size, cq->cq_ci);
>>> +		switch (ret) {
>>> +		case MLX5_CQE_STATUS_ERR:
>>> +			cq->errors++;
>>> +			/*fall-through*/
>>> +		case MLX5_CQE_STATUS_SW_OWN:
>>> +			cq->cq_ci++;
>>> +			break;
>>> +		case MLX5_CQE_STATUS_HW_OWN:
>>> +		default:
>>> +			break;
>>> +		}
>>> +	} while (ret != MLX5_CQE_STATUS_HW_OWN);
>>
>> Isn't there a risk of endless loop here?
> 
> No. maximum iterations number is the CQ size , since HW cannot write more CQEs before the doorbell record is updated.

Ok.
  

Patch

diff --git a/drivers/common/mlx5/mlx5_prm.h b/drivers/common/mlx5/mlx5_prm.h
index b48cd0a..b533798 100644
--- a/drivers/common/mlx5/mlx5_prm.h
+++ b/drivers/common/mlx5/mlx5_prm.h
@@ -392,6 +392,10 @@  struct mlx5_cqe {
 /* CQE format value. */
 #define MLX5_COMPRESSED 0x3
 
+/* CQ doorbell cmd types. */
+#define MLX5_CQ_DBR_CMD_SOL_ONLY (1 << 24)
+#define MLX5_CQ_DBR_CMD_ALL (0 << 24)
+
 /* Action type of header modification. */
 enum {
 	MLX5_MODIFICATION_TYPE_SET = 0x1,
diff --git a/drivers/vdpa/mlx5/Makefile b/drivers/vdpa/mlx5/Makefile
index 5472797..7f13756 100644
--- a/drivers/vdpa/mlx5/Makefile
+++ b/drivers/vdpa/mlx5/Makefile
@@ -9,6 +9,7 @@  LIB = librte_pmd_mlx5_vdpa.a
 # Sources.
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa.c
 SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa_mem.c
+SRCS-$(CONFIG_RTE_LIBRTE_MLX5_VDPA_PMD) += mlx5_vdpa_event.c
 
 # Basic CFLAGS.
 CFLAGS += -O3
diff --git a/drivers/vdpa/mlx5/meson.build b/drivers/vdpa/mlx5/meson.build
index 7e5dd95..c609f7c 100644
--- a/drivers/vdpa/mlx5/meson.build
+++ b/drivers/vdpa/mlx5/meson.build
@@ -13,6 +13,7 @@  deps += ['hash', 'common_mlx5', 'vhost', 'bus_pci', 'eal', 'sched']
 sources = files(
 	'mlx5_vdpa.c',
 	'mlx5_vdpa_mem.c',
+	'mlx5_vdpa_event.c',
 )
 cflags_options = [
 	'-std=c11',
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa.h b/drivers/vdpa/mlx5/mlx5_vdpa.h
index e27baea..30030b7 100644
--- a/drivers/vdpa/mlx5/mlx5_vdpa.h
+++ b/drivers/vdpa/mlx5/mlx5_vdpa.h
@@ -9,9 +9,40 @@ 
 
 #include <rte_vdpa.h>
 #include <rte_vhost.h>
+#include <rte_spinlock.h>
+#include <rte_interrupts.h>
 
 #include <mlx5_glue.h>
 #include <mlx5_devx_cmds.h>
+#include <mlx5_prm.h>
+
+
+#define MLX5_VDPA_INTR_RETRIES 256
+#define MLX5_VDPA_INTR_RETRIES_USEC 1000
+
+struct mlx5_vdpa_cq {
+	uint16_t log_desc_n;
+	uint32_t cq_ci:24;
+	uint32_t arm_sn:2;
+	rte_spinlock_t sl;
+	struct mlx5_devx_obj *cq;
+	struct mlx5dv_devx_umem *umem_obj;
+	union {
+		volatile void *umem_buf;
+		volatile struct mlx5_cqe *cqes;
+	};
+	volatile uint32_t *db_rec;
+	uint64_t errors;
+};
+
+struct mlx5_vdpa_event_qp {
+	struct mlx5_vdpa_cq cq;
+	struct mlx5_devx_obj *fw_qp;
+	struct mlx5_devx_obj *sw_qp;
+	struct mlx5dv_devx_umem *umem_obj;
+	void *umem_buf;
+	volatile uint32_t *db_rec;
+};
 
 struct mlx5_vdpa_query_mr {
 	SLIST_ENTRY(mlx5_vdpa_query_mr) next;
@@ -34,6 +65,10 @@  struct mlx5_vdpa_priv {
 	uint32_t gpa_mkey_index;
 	struct ibv_mr *null_mr;
 	struct rte_vhost_memory *vmem;
+	uint32_t eqn;
+	struct mlx5dv_devx_event_channel *eventc;
+	struct mlx5dv_devx_uar *uar;
+	struct rte_intr_handle intr_handle;
 	SLIST_HEAD(mr_list, mlx5_vdpa_query_mr) mr_list;
 };
 
@@ -57,4 +92,58 @@  struct mlx5_vdpa_priv {
  */
 int mlx5_vdpa_mem_register(struct mlx5_vdpa_priv *priv);
 
+
+/**
+ * Create an event QP and all its related resources.
+ *
+ * @param[in] priv
+ *   The vdpa driver private structure.
+ * @param[in] desc_n
+ *   Number of descriptors.
+ * @param[in] callfd
+ *   The guest notification file descriptor.
+ * @param[in/out] eqp
+ *   Pointer to the event QP structure.
+ *
+ * @return
+ *   0 on success, -1 otherwise and rte_errno is set.
+ */
+int mlx5_vdpa_event_qp_create(struct mlx5_vdpa_priv *priv, uint16_t desc_n,
+			      int callfd, struct mlx5_vdpa_event_qp *eqp);
+
+/**
+ * Destroy an event QP and all its related resources.
+ *
+ * @param[in/out] eqp
+ *   Pointer to the event QP structure.
+ */
+void mlx5_vdpa_event_qp_destroy(struct mlx5_vdpa_event_qp *eqp);
+
+/**
+ * Release all the event global resources.
+ *
+ * @param[in] priv
+ *   The vdpa driver private structure.
+ */
+void mlx5_vdpa_event_qp_global_release(struct mlx5_vdpa_priv *priv);
+
+/**
+ * Setup CQE event.
+ *
+ * @param[in] priv
+ *   The vdpa driver private structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int mlx5_vdpa_cqe_event_setup(struct mlx5_vdpa_priv *priv);
+
+/**
+ * Unset CQE event .
+ *
+ * @param[in] priv
+ *   The vdpa driver private structure.
+ */
+void mlx5_vdpa_cqe_event_unset(struct mlx5_vdpa_priv *priv);
+
 #endif /* RTE_PMD_MLX5_VDPA_H_ */
diff --git a/drivers/vdpa/mlx5/mlx5_vdpa_event.c b/drivers/vdpa/mlx5/mlx5_vdpa_event.c
new file mode 100644
index 0000000..35518ad
--- /dev/null
+++ b/drivers/vdpa/mlx5/mlx5_vdpa_event.c
@@ -0,0 +1,399 @@ 
+/* SPDX-License-Identifier: BSD-3-Clause
+ * Copyright 2019 Mellanox Technologies, Ltd
+ */
+#include <unistd.h>
+#include <stdint.h>
+#include <fcntl.h>
+
+#include <rte_malloc.h>
+#include <rte_errno.h>
+#include <rte_lcore.h>
+#include <rte_atomic.h>
+#include <rte_common.h>
+#include <rte_io.h>
+
+#include <mlx5_common.h>
+
+#include "mlx5_vdpa_utils.h"
+#include "mlx5_vdpa.h"
+
+
+void
+mlx5_vdpa_event_qp_global_release(struct mlx5_vdpa_priv *priv)
+{
+	if (priv->uar) {
+		mlx5_glue->devx_free_uar(priv->uar);
+		priv->uar = NULL;
+	}
+	if (priv->eventc) {
+		mlx5_glue->devx_destroy_event_channel(priv->eventc);
+		priv->eventc = NULL;
+	}
+	priv->eqn = 0;
+}
+
+/* Prepare all the global resources for all the event objects.*/
+static int
+mlx5_vdpa_event_qp_global_prepare(struct mlx5_vdpa_priv *priv)
+{
+	uint32_t lcore;
+
+	if (priv->eventc)
+		return 0;
+	lcore = (uint32_t)rte_lcore_to_cpu_id(-1);
+	if (mlx5_glue->devx_query_eqn(priv->ctx, lcore, &priv->eqn)) {
+		rte_errno = errno;
+		DRV_LOG(ERR, "Failed to query EQ number %d.", rte_errno);
+		return -1;
+	}
+	priv->eventc = mlx5_glue->devx_create_event_channel(priv->ctx,
+			   MLX5DV_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA);
+	if (!priv->eventc) {
+		rte_errno = errno;
+		DRV_LOG(ERR, "Failed to create event channel %d.",
+			rte_errno);
+		goto error;
+	}
+	priv->uar = mlx5_glue->devx_alloc_uar(priv->ctx, 0);
+	if (!priv->uar) {
+		rte_errno = errno;
+		DRV_LOG(ERR, "Failed to allocate UAR.");
+		goto error;
+	}
+	return 0;
+error:
+	mlx5_vdpa_event_qp_global_release(priv);
+	return -1;
+}
+
+static void
+mlx5_vdpa_cq_destroy(struct mlx5_vdpa_cq *cq)
+{
+	if (cq->cq)
+		claim_zero(mlx5_devx_cmd_destroy(cq->cq));
+	if (cq->umem_obj)
+		claim_zero(mlx5_glue->devx_umem_dereg(cq->umem_obj));
+	if (cq->umem_buf)
+		rte_free((void *)(uintptr_t)cq->umem_buf);
+	memset(cq, 0, sizeof(*cq));
+}
+
+static inline void
+mlx5_vdpa_cq_arm(struct mlx5_vdpa_priv *priv, struct mlx5_vdpa_cq *cq)
+{
+	const unsigned int cqe_mask = (1 << cq->log_desc_n) - 1;
+	uint32_t arm_sn = cq->arm_sn << MLX5_CQ_SQN_OFFSET;
+	uint32_t cq_ci = cq->cq_ci & MLX5_CI_MASK & cqe_mask;
+	uint32_t doorbell_hi = arm_sn | MLX5_CQ_DBR_CMD_ALL | cq_ci;
+	uint64_t doorbell = ((uint64_t)doorbell_hi << 32) | cq->cq->id;
+	uint64_t db_be = rte_cpu_to_be_64(doorbell);
+	uint32_t *addr = RTE_PTR_ADD(priv->uar->base_addr, MLX5_CQ_DOORBELL);
+
+	rte_io_wmb();
+	cq->db_rec[MLX5_CQ_ARM_DB] = rte_cpu_to_be_32(doorbell_hi);
+	rte_wmb();
+#ifdef RTE_ARCH_64
+	*(uint64_t *)addr = db_be;
+#else
+	*(uint32_t *)addr = db_be;
+	rte_io_wmb();
+	*((uint32_t *)addr + 1) = db_be >> 32;
+#endif
+	cq->arm_sn++;
+}
+
+static int
+mlx5_vdpa_cq_create(struct mlx5_vdpa_priv *priv, uint16_t log_desc_n,
+		    int callfd, struct mlx5_vdpa_cq *cq)
+{
+	struct mlx5_devx_cq_attr attr;
+	size_t pgsize = sysconf(_SC_PAGESIZE);
+	uint32_t umem_size;
+	int ret;
+	uint16_t event_nums[1] = {0};
+
+	cq->log_desc_n = log_desc_n;
+	umem_size = sizeof(struct mlx5_cqe) * (1 << log_desc_n) +
+							sizeof(*cq->db_rec) * 2;
+	cq->umem_buf = rte_zmalloc(__func__, umem_size, 4096);
+	if (!cq->umem_buf) {
+		DRV_LOG(ERR, "Failed to allocate memory for CQ.");
+		rte_errno = ENOMEM;
+		return -ENOMEM;
+	}
+	cq->umem_obj = mlx5_glue->devx_umem_reg(priv->ctx,
+						(void *)(uintptr_t)cq->umem_buf,
+						umem_size,
+						IBV_ACCESS_LOCAL_WRITE);
+	if (!cq->umem_obj) {
+		DRV_LOG(ERR, "Failed to register umem for CQ.");
+		goto error;
+	}
+	attr.q_umem_valid = 1;
+	attr.db_umem_valid = 1;
+	attr.use_first_only = 0;
+	attr.overrun_ignore = 0;
+	attr.uar_page_id = priv->uar->page_id;
+	attr.q_umem_id = cq->umem_obj->umem_id;
+	attr.q_umem_offset = 0;
+	attr.db_umem_id = cq->umem_obj->umem_id;
+	attr.db_umem_offset = sizeof(struct mlx5_cqe) * (1 << log_desc_n);
+	attr.eqn = priv->eqn;
+	attr.log_cq_size = log_desc_n;
+	attr.log_page_size = rte_log2_u32(pgsize);
+	cq->cq = mlx5_devx_cmd_create_cq(priv->ctx, &attr);
+	if (!cq->cq)
+		goto error;
+	cq->db_rec = RTE_PTR_ADD(cq->umem_buf, (uintptr_t)attr.db_umem_offset);
+	cq->cq_ci = 0;
+	rte_spinlock_init(&cq->sl);
+	/* Subscribe CQ event to the event channel controlled by the driver. */
+	ret = mlx5_glue->devx_subscribe_devx_event(priv->eventc, cq->cq->obj,
+						   sizeof(event_nums),
+						   event_nums,
+						   (uint64_t)(uintptr_t)cq);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to subscribe CQE event.");
+		rte_errno = errno;
+		goto error;
+	}
+	/* Subscribe CQ event to the guest FD only if it is not in poll mode. */
+	if (callfd != -1) {
+		ret = mlx5_glue->devx_subscribe_devx_event_fd(priv->eventc,
+							      callfd,
+							      cq->cq->obj, 0);
+		if (ret) {
+			DRV_LOG(ERR, "Failed to subscribe CQE event fd.");
+			rte_errno = errno;
+			goto error;
+		}
+	}
+	/* First arming. */
+	mlx5_vdpa_cq_arm(priv, cq);
+	return 0;
+error:
+	mlx5_vdpa_cq_destroy(cq);
+	return -1;
+}
+
+static inline void __rte_unused
+mlx5_vdpa_cq_poll(struct mlx5_vdpa_priv *priv __rte_unused,
+		  struct mlx5_vdpa_cq *cq)
+{
+	struct mlx5_vdpa_event_qp *eqp =
+				container_of(cq, struct mlx5_vdpa_event_qp, cq);
+	const unsigned int cqe_size = 1 << cq->log_desc_n;
+	const unsigned int cqe_mask = cqe_size - 1;
+	int ret;
+
+	do {
+		volatile struct mlx5_cqe *cqe = cq->cqes + (cq->cq_ci &
+							    cqe_mask);
+
+		ret = check_cqe(cqe, cqe_size, cq->cq_ci);
+		switch (ret) {
+		case MLX5_CQE_STATUS_ERR:
+			cq->errors++;
+			/*fall-through*/
+		case MLX5_CQE_STATUS_SW_OWN:
+			cq->cq_ci++;
+			break;
+		case MLX5_CQE_STATUS_HW_OWN:
+		default:
+			break;
+		}
+	} while (ret != MLX5_CQE_STATUS_HW_OWN);
+	rte_io_wmb();
+	/* Ring CQ doorbell record. */
+	cq->db_rec[0] = rte_cpu_to_be_32(cq->cq_ci);
+	rte_io_wmb();
+	/* Ring SW QP doorbell record. */
+	eqp->db_rec[0] = rte_cpu_to_be_32(cq->cq_ci + cqe_size);
+}
+
+static void
+mlx5_vdpa_interrupt_handler(void *cb_arg)
+{
+#ifndef HAVE_IBV_DEVX_EVENT
+	(void)cb_arg;
+	return;
+#else
+	struct mlx5_vdpa_priv *priv = cb_arg;
+	union {
+		struct mlx5dv_devx_async_event_hdr event_resp;
+		uint8_t buf[sizeof(struct mlx5dv_devx_async_event_hdr) + 128];
+	} out;
+
+	while (mlx5_glue->devx_get_event(priv->eventc, &out.event_resp,
+					 sizeof(out.buf)) >=
+				       (ssize_t)sizeof(out.event_resp.cookie)) {
+		struct mlx5_vdpa_cq *cq = (struct mlx5_vdpa_cq *)
+					       (uintptr_t)out.event_resp.cookie;
+		rte_spinlock_lock(&cq->sl);
+		mlx5_vdpa_cq_poll(priv, cq);
+		mlx5_vdpa_cq_arm(priv, cq);
+		rte_spinlock_unlock(&cq->sl);
+		DRV_LOG(DEBUG, "CQ %p event: new cq_ci = %u.", cq, cq->cq_ci);
+	}
+#endif /* HAVE_IBV_DEVX_ASYNC */
+}
+
+int
+mlx5_vdpa_cqe_event_setup(struct mlx5_vdpa_priv *priv)
+{
+	int flags = fcntl(priv->eventc->fd, F_GETFL);
+	int ret = fcntl(priv->eventc->fd, F_SETFL, flags | O_NONBLOCK);
+	if (ret) {
+		DRV_LOG(ERR, "Failed to change event channel FD.");
+		rte_errno = errno;
+		return -rte_errno;
+	}
+	priv->intr_handle.fd = priv->eventc->fd;
+	priv->intr_handle.type = RTE_INTR_HANDLE_EXT;
+	if (rte_intr_callback_register(&priv->intr_handle,
+				       mlx5_vdpa_interrupt_handler, priv)) {
+		priv->intr_handle.fd = 0;
+		DRV_LOG(ERR, "Failed to register CQE interrupt %d.", rte_errno);
+		return -rte_errno;
+	}
+	return 0;
+}
+
+void
+mlx5_vdpa_cqe_event_unset(struct mlx5_vdpa_priv *priv)
+{
+	int retries = MLX5_VDPA_INTR_RETRIES;
+	int ret = -EAGAIN;
+
+	if (priv->intr_handle.fd) {
+		while (retries-- && ret == -EAGAIN) {
+			ret = rte_intr_callback_unregister(&priv->intr_handle,
+						    mlx5_vdpa_interrupt_handler,
+						    priv);
+			if (ret == -EAGAIN) {
+				DRV_LOG(DEBUG, "Try again to unregister fd %d "
+					"of CQ interrupt, retries = %d.",
+					priv->intr_handle.fd, retries);
+				usleep(MLX5_VDPA_INTR_RETRIES_USEC);
+			}
+		}
+		memset(&priv->intr_handle, 0, sizeof(priv->intr_handle));
+	}
+}
+
+void
+mlx5_vdpa_event_qp_destroy(struct mlx5_vdpa_event_qp *eqp)
+{
+	if (eqp->sw_qp)
+		claim_zero(mlx5_devx_cmd_destroy(eqp->sw_qp));
+	if (eqp->umem_obj)
+		claim_zero(mlx5_glue->devx_umem_dereg(eqp->umem_obj));
+	if (eqp->umem_buf)
+		rte_free(eqp->umem_buf);
+	if (eqp->fw_qp)
+		claim_zero(mlx5_devx_cmd_destroy(eqp->fw_qp));
+	mlx5_vdpa_cq_destroy(&eqp->cq);
+	memset(eqp, 0, sizeof(*eqp));
+}
+
+static int
+mlx5_vdpa_qps2rts(struct mlx5_vdpa_event_qp *eqp)
+{
+	if (mlx5_devx_cmd_modify_qp_state(eqp->fw_qp, MLX5_CMD_OP_RST2INIT_QP,
+					  eqp->sw_qp->id)) {
+		DRV_LOG(ERR, "Failed to modify FW QP to INIT state(%u).",
+			rte_errno);
+		return -1;
+	}
+	if (mlx5_devx_cmd_modify_qp_state(eqp->sw_qp, MLX5_CMD_OP_RST2INIT_QP,
+					  eqp->fw_qp->id)) {
+		DRV_LOG(ERR, "Failed to modify SW QP to INIT state(%u).",
+			rte_errno);
+		return -1;
+	}
+	if (mlx5_devx_cmd_modify_qp_state(eqp->fw_qp, MLX5_CMD_OP_INIT2RTR_QP,
+					  eqp->sw_qp->id)) {
+		DRV_LOG(ERR, "Failed to modify FW QP to RTR state(%u).",
+			rte_errno);
+		return -1;
+	}
+	if (mlx5_devx_cmd_modify_qp_state(eqp->sw_qp, MLX5_CMD_OP_INIT2RTR_QP,
+					  eqp->fw_qp->id)) {
+		DRV_LOG(ERR, "Failed to modify SW QP to RTR state(%u).",
+			rte_errno);
+		return -1;
+	}
+	if (mlx5_devx_cmd_modify_qp_state(eqp->fw_qp, MLX5_CMD_OP_RTR2RTS_QP,
+					  eqp->sw_qp->id)) {
+		DRV_LOG(ERR, "Failed to modify FW QP to RTS state(%u).",
+			rte_errno);
+		return -1;
+	}
+	if (mlx5_devx_cmd_modify_qp_state(eqp->sw_qp, MLX5_CMD_OP_RTR2RTS_QP,
+					  eqp->fw_qp->id)) {
+		DRV_LOG(ERR, "Failed to modify SW QP to RTS state(%u).",
+			rte_errno);
+		return -1;
+	}
+	return 0;
+}
+
+int
+mlx5_vdpa_event_qp_create(struct mlx5_vdpa_priv *priv, uint16_t desc_n,
+			  int callfd, struct mlx5_vdpa_event_qp *eqp)
+{
+	struct mlx5_devx_qp_attr attr = {0};
+	uint16_t log_desc_n = rte_log2_u32(desc_n);
+	uint32_t umem_size = (1 << log_desc_n) * MLX5_WSEG_SIZE +
+						       sizeof(*eqp->db_rec) * 2;
+
+	if (mlx5_vdpa_event_qp_global_prepare(priv))
+		return -1;
+	if (mlx5_vdpa_cq_create(priv, log_desc_n, callfd, &eqp->cq))
+		return -1;
+	attr.pd = priv->pdn;
+	eqp->fw_qp = mlx5_devx_cmd_create_qp(priv->ctx, &attr);
+	if (!eqp->fw_qp) {
+		DRV_LOG(ERR, "Failed to create FW QP(%u).", rte_errno);
+		goto error;
+	}
+	eqp->umem_buf = rte_zmalloc(__func__, umem_size, 4096);
+	if (!eqp->umem_buf) {
+		DRV_LOG(ERR, "Failed to allocate memory for SW QP.");
+		rte_errno = ENOMEM;
+		goto error;
+	}
+	eqp->umem_obj = mlx5_glue->devx_umem_reg(priv->ctx,
+					       (void *)(uintptr_t)eqp->umem_buf,
+					       umem_size,
+					       IBV_ACCESS_LOCAL_WRITE);
+	if (!eqp->umem_obj) {
+		DRV_LOG(ERR, "Failed to register umem for SW QP.");
+		goto error;
+	}
+	attr.uar_index = priv->uar->page_id;
+	attr.cqn = eqp->cq.cq->id;
+	attr.log_page_size = rte_log2_u32(sysconf(_SC_PAGESIZE));
+	attr.rq_size = 1 << log_desc_n;
+	attr.log_rq_stride = rte_log2_u32(MLX5_WSEG_SIZE);
+	attr.sq_size = 0; /* No need SQ. */
+	attr.dbr_umem_valid = 1;
+	attr.wq_umem_id = eqp->umem_obj->umem_id;
+	attr.wq_umem_offset = 0;
+	attr.dbr_umem_id = eqp->umem_obj->umem_id;
+	attr.dbr_address = (1 << log_desc_n) * MLX5_WSEG_SIZE;
+	eqp->sw_qp = mlx5_devx_cmd_create_qp(priv->ctx, &attr);
+	if (!eqp->sw_qp) {
+		DRV_LOG(ERR, "Failed to create SW QP(%u).", rte_errno);
+		goto error;
+	}
+	eqp->db_rec = RTE_PTR_ADD(eqp->umem_buf, (uintptr_t)attr.dbr_address);
+	if (mlx5_vdpa_qps2rts(eqp))
+		goto error;
+	/* First ringing. */
+	rte_write32(rte_cpu_to_be_32(1 << log_desc_n), &eqp->db_rec[0]);
+	return 0;
+error:
+	mlx5_vdpa_event_qp_destroy(eqp);
+	return -1;
+}