[v2,04/20] net/mlx5: support flow Ethernet item among with drop action

Message ID 957989d98f2363d6e2e377bd54893cf7b29e4985.1530111623.git.nelio.laranjeiro@6wind.com (mailing list archive)
State Superseded, archived
Delegated to: Shahaf Shuler
Series: net/mlx5: flow rework

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation fail apply issues

Commit Message

Nélio Laranjeiro June 27, 2018, 3:07 p.m. UTC
  Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
---
 drivers/net/mlx5/mlx5.c      |   1 +
 drivers/net/mlx5/mlx5_flow.c | 671 ++++++++++++++++++++++++++++++++---
 2 files changed, 630 insertions(+), 42 deletions(-)
  

Comments

Yongseok Koh July 3, 2018, 10:27 p.m. UTC | #1
On Wed, Jun 27, 2018 at 05:07:36PM +0200, Nelio Laranjeiro wrote:

net/mlx5: support flow Ethernet item among with drop action

Typo? "among" -> "along"?

> Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> ---
>  drivers/net/mlx5/mlx5.c      |   1 +
>  drivers/net/mlx5/mlx5_flow.c | 671 ++++++++++++++++++++++++++++++++---
>  2 files changed, 630 insertions(+), 42 deletions(-)
> 
> diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
> index c3c8dffae..665a3c31f 100644
> --- a/drivers/net/mlx5/mlx5.c
> +++ b/drivers/net/mlx5/mlx5.c
> @@ -241,6 +241,7 @@ mlx5_dev_close(struct rte_eth_dev *dev)
>  	/* In case mlx5_dev_stop() has not been called. */
>  	mlx5_dev_interrupt_handler_uninstall(dev);
>  	mlx5_traffic_disable(dev);
> +	mlx5_flow_flush(dev, NULL);
>  	/* Prevent crashes when queues are still in use. */
>  	dev->rx_pkt_burst = removed_rx_burst;
>  	dev->tx_pkt_burst = removed_tx_burst;
> diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
> index 5d3bc183d..bc8c5de33 100644
> --- a/drivers/net/mlx5/mlx5_flow.c
> +++ b/drivers/net/mlx5/mlx5_flow.c
> @@ -35,11 +35,50 @@
>  extern const struct eth_dev_ops mlx5_dev_ops;
>  extern const struct eth_dev_ops mlx5_dev_ops_isolate;
>  
> +/* Pattern Layer bits. */
> +#define MLX5_FLOW_LAYER_OUTER_L2 (1u << 0)
> +#define MLX5_FLOW_LAYER_OUTER_L3_IPV4 (1u << 1)
> +#define MLX5_FLOW_LAYER_OUTER_L3_IPV6 (1u << 2)
> +#define MLX5_FLOW_LAYER_OUTER_L4_UDP (1u << 3)
> +#define MLX5_FLOW_LAYER_OUTER_L4_TCP (1u << 4)
> +#define MLX5_FLOW_LAYER_OUTER_VLAN (1u << 5)
> +/* Masks. */
> +#define MLX5_FLOW_LAYER_OUTER_L3 \
> +	(MLX5_FLOW_LAYER_OUTER_L3_IPV4 | MLX5_FLOW_LAYER_OUTER_L3_IPV6)
> +#define MLX5_FLOW_LAYER_OUTER_L4 \
> +	(MLX5_FLOW_LAYER_OUTER_L4_UDP | MLX5_FLOW_LAYER_OUTER_L4_TCP)
> +
> +/* Action fate on the packet. */
> +#define MLX5_FLOW_FATE_DROP (1u << 0)

I understand what you mean by 'fate' but 'action fate' sounds strange to me.
How about 'terminal action' instead? E.g. flow->terminal_actions or
flow->term_actions instead of flow->fate.

And, I'm not sure why you define a new (redundant) macro. You could use
RTE_FLOW_ACTION_TYPE_DROP instead. I don't see any reason for setting different
action flag in flow->fate. If you want to specify a group of terminal actions,
you could've done:

#define MLX5_FLOW_TERMINAL_ACTIONS \
	(RTE_FLOW_ACTION_TYPE_DROP | RTE_FLOW_ACTION_TYPE_RSS | \
	 RTE_FLOW_ACTION_TYPE_QUEUE)

> +/** Handles information leading to a drop fate. */
> +struct mlx5_flow_verbs {
> +	unsigned int size; /**< Size of the attribute. */
> +	struct {
> +		struct ibv_flow_attr *attr;
> +		/**< Pointer to the Specification buffer. */
> +		uint8_t *specs; /**< Pointer to the specifications. */
> +	};
> +	struct ibv_flow *flow; /**< Verbs flow pointer. */
> +	struct mlx5_hrxq *hrxq; /**< Hash Rx queue object. */
> +};
> +
> +/* Flow structure. */
>  struct rte_flow {
>  	TAILQ_ENTRY(rte_flow) next; /**< Pointer to the next flow structure. */
> +	struct rte_flow_attr attributes; /**< User flow attribute. */
> +	uint32_t layers;
> +	/**< Bit-fields of present layers see MLX5_FLOW_ITEMS_*. */
> +	uint32_t fate;
> +	/**< Bit-fields of present fate see MLX5_FLOW_FATE_*. */
> +	struct mlx5_flow_verbs verbs; /* Verbs drop flow. */
>  };
>  
>  static const struct rte_flow_ops mlx5_flow_ops = {
> +	.validate = mlx5_flow_validate,
> +	.create = mlx5_flow_create,
> +	.destroy = mlx5_flow_destroy,
> +	.flush = mlx5_flow_flush,
>  	.isolate = mlx5_flow_isolate,
>  };
>  
> @@ -128,12 +167,418 @@ mlx5_flow_priorities(struct rte_eth_dev *dev)
>  }
>  
>  /**
> - * Convert a flow.
> + * Flow debug purpose function only available when
> + * CONFIG_RTE_LIBRTE_MLX5_DEBUG=y
> + *
> + * @param flow
> + *   Pointer to the flow structure to display.
> + */
> +void
> +mlx5_flow_print(struct rte_flow *flow __rte_unused)
> +{
> +#ifndef NDEBUG
> +	fprintf(stdout, "---------8<------------\n");
> +	fprintf(stdout, "%s: flow information\n", MLX5_DRIVER_NAME);
> +	fprintf(stdout, " attributes: group %u priority %u ingress %d egress %d"
> +		" transfer %d\n", flow->attributes.group,
> +		flow->attributes.priority,
> +		flow->attributes.ingress,
> +		flow->attributes.egress,
> +		flow->attributes.transfer);
> +	fprintf(stdout, " layers: %s/%s/%s\n",
> +		flow->layers & MLX5_FLOW_LAYER_OUTER_L2 ? "l2" : "-",
> +		flow->layers & MLX5_FLOW_LAYER_OUTER_L3 ? "l3" : "-",
> +		flow->layers & MLX5_FLOW_LAYER_OUTER_L4 ? "l4" : "-");
> +	if (flow->fate & MLX5_FLOW_FATE_DROP)
> +		fprintf(stdout, " fate: drop queue\n");
> +	if (flow->verbs.attr) {
> +		struct ibv_spec_header *hdr =
> +			(struct ibv_spec_header *)flow->verbs.specs;
> +		const int n = flow->verbs.attr->num_of_specs;
> +		int i;
> +
> +		fprintf(stdout, " Verbs attributes: specs_n %u\n",
> +			flow->verbs.attr->num_of_specs);
> +		for (i = 0; i != n; ++i) {
> +			rte_hexdump(stdout, " ", hdr, hdr->size);
> +			hdr = (struct ibv_spec_header *)
> +				((uint8_t *)hdr + hdr->size);
> +		}
> +	}
> +	fprintf(stdout, "--------->8------------\n");
> +#endif
> +}
> +
> +/**
> + * Validate Attributes provided by the user.

And it also stores attributes.

>   *
>   * @param dev
>   *   Pointer to Ethernet device.
> - * @param list
> - *   Pointer to a TAILQ flow list.
> + * @param attr
> + *   Pointer to flow attributes
> + * @param flow
> + *   Pointer to the rte_flow structure.
> + * @param error
> + *   Pointer to error structure.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +mlx5_flow_attributes(struct rte_eth_dev *dev, const struct rte_flow_attr *attr,
> +		     struct rte_flow *flow, struct rte_flow_error *error)

Better to have mlx5_flow_attributes_validate() like mlx5_flow_item_validate()?

> +{
> +	uint32_t priority_max =
> +		((struct priv *)dev->data->dev_private)->config.flow_prio;
> +
> +	if (attr->group)
> +		return rte_flow_error_set(error, ENOTSUP,
> +					  RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
> +					  NULL,
> +					  "groups are not supported");

"group is not supported"?

> +	if (attr->priority >= priority_max)
> +		return rte_flow_error_set(error, ENOTSUP,
> +					  RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
> +					  NULL,
> +					  "priority value is not supported");

"priority is out of range" or something else?

> +	if (attr->egress)
> +		return rte_flow_error_set(error, ENOTSUP,
> +					  RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
> +					  NULL,
> +					  "egress is not supported");
> +	if (attr->transfer)
> +		return rte_flow_error_set(error, ENOTSUP,
> +					  RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
> +					  NULL,
> +					  "transfer is not supported");
> +	if (!attr->ingress)
> +		return rte_flow_error_set(error, ENOTSUP,
> +					  RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
> +					  NULL,
> +					  "only ingress is supported");

Better to have "ingress should be specified" or something else?

> +	flow->attributes = *attr;
> +	return 0;
> +}
> +
> +/**
> + * Check support for a given item.
> + *
> + * @param item[in]
> + *   Item specification.
> + * @param default_mask[in]
> + *   Bit-masks covering supported fields to compare with spec, last and mask in
> + *   \item.

Remove the back-slash. Also, the description isn't clear; from the code, it seems
to be a default mask used when the item doesn't have one.

> + * @param nic_mask[in]
> + *   Bit-masks covering supported fields by the NIC to compare with user mask.

s/Bit-masks/Bit-mask


> + * @param size
> + *   Bit-Mask size in bytes.

s/Bit-Mask/Bit-mask

> + * @param error[out]
> + *   Pointer to error structure.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +mlx5_flow_item_validate(const struct rte_flow_item *item,
> +			const uint8_t *default_mask,
> +			const uint8_t *nic_mask,
> +			unsigned int size,
> +			struct rte_flow_error *error)
> +{
> +	const uint8_t *mask = item->mask ? item->mask : default_mask;

In the mlx5_flow_item_eth(), rte_flow_item_eth_mask is passed to default_mask if
item->mask is null, then why doing it again here?

> +	unsigned int i;
> +
> +	assert(nic_mask);
> +	for (i = 0; i < size; ++i)
> +		if ((nic_mask[i] | mask[i]) != nic_mask[i])
> +			return rte_flow_error_set(error, ENOTSUP,
> +						  RTE_FLOW_ERROR_TYPE_ITEM,
> +						  item,
> +						  "mask enables non supported"
> +						  " bits");

default_mask is determined by caller, which is rte_flow_item_eth_mask or
item->mask. And nic_mask is also given by PMD, which is a function local
structure in the caller func. Doesn't look like a clear definition. Instead of
having both default_mask and nic_mask here, you can take supported_mask, which
can be (rte_flow_item_{name}_mask | nic_mask) in the calling func. And check
validity of item->mask against supported_mask only if item->mask isn't null.
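For instance, something along these lines (the helper name is hypothetical, and
the OR has to be done byte-wise since item masks are structures, not integers):

static int
mlx5_flow_item_mask_validate(const struct rte_flow_item *item,
			     const uint8_t *supported_mask,
			     unsigned int size,
			     struct rte_flow_error *error)
{
	const uint8_t *mask = item->mask;
	unsigned int i;

	if (!mask)
		return 0;
	for (i = 0; i < size; ++i)
		if ((supported_mask[i] | mask[i]) != supported_mask[i])
			return rte_flow_error_set(error, ENOTSUP,
						  RTE_FLOW_ERROR_TYPE_ITEM,
						  item,
						  "mask enables unsupported"
						  " bits");
	return 0;
}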

> +	if (!item->spec && (item->mask || item->last))
> +		return rte_flow_error_set(error, EINVAL,
> +					  RTE_FLOW_ERROR_TYPE_ITEM,
> +					  item,
> +					  "mask/last without a spec is not"
> +					  " supported");
> +	if (item->spec && item->last) {
> +		uint8_t spec[size];
> +		uint8_t last[size];
> +		unsigned int i;
> +		int ret;
> +
> +		for (i = 0; i < size; ++i) {
> +			spec[i] = ((const uint8_t *)item->spec)[i] & mask[i];
> +			last[i] = ((const uint8_t *)item->last)[i] & mask[i];
> +		}
> +		ret = memcmp(spec, last, size);
> +		if (ret != 0)
> +			return rte_flow_error_set(error, ENOTSUP,
> +						  RTE_FLOW_ERROR_TYPE_ITEM,
> +						  item,
> +						  "spec and last different"
> +						  " is not supported");

Can you rephrase the error message?

> +	}
> +	return 0;
> +}
> +
> +/**
> + * Add a verbs specification.
> + *
> + * @param flow
> + *   Pointer to flow structure.
> + * @param src
> + *   Create specification.
> + * @param size
> + *   Size in bytes of the specification to copy.
> + */
> +static void
> +mlx5_flow_spec_verbs_add(struct rte_flow *flow, void *src, unsigned int size)
> +{
> +	if (flow->verbs.specs) {
> +		void *dst;
> +
> +		dst = (void *)(flow->verbs.specs + flow->verbs.size);
> +		memcpy(dst, src, size);
> +		++flow->verbs.attr->num_of_specs;
> +	}
> +	flow->verbs.size += size;
> +}
> +
> +/**
> + * Validate Ethernet layer and possibly create the Verbs specification.
> + *
> + * @param item[in]
> + *   Item specification.
> + * @param flow[in, out]
> + *   Pointer to flow structure.
> + * @param flow_size[in]
> + *   Size of the buffer to store the specification.
> + * @param error
> + *   Pointer to error structure.
> + *
> + * @return
> + *   size in bytes necessary for the conversion, a negative errno value
> + *   otherwise and rte_errno is set.
> + */
> +static int
> +mlx5_flow_item_eth(const struct rte_flow_item *item, struct rte_flow *flow,
> +		   const size_t flow_size, struct rte_flow_error *error)
> +{
> +	const struct rte_flow_item_eth *spec = item->spec;
> +	const struct rte_flow_item_eth *mask = item->mask;
> +	const struct rte_flow_item_eth nic_mask = {
> +		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> +		.src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> +		.type = RTE_BE16(0xffff),
> +	};

If it is a device-specific one, shouldn't it be located in a header file?

> +	const unsigned int size = sizeof(struct ibv_flow_spec_eth);
> +	struct ibv_flow_spec_eth eth = {
> +		.type = IBV_FLOW_SPEC_ETH,
> +		.size = size,
> +	};
> +	int ret;
> +
> +	if (flow->layers & MLX5_FLOW_LAYER_OUTER_L2)
> +		return rte_flow_error_set(error, ENOTSUP,
> +					  RTE_FLOW_ERROR_TYPE_ITEM,
> +					  item,
> +					  "L2 layers already configured");
> +	if (!mask)
> +		mask = &rte_flow_item_eth_mask;

What is the meaning of default_mask for mlx5_flow_item_validate()? Also refer
to my comments in mlx5_flow_item_validate().

> +	ret = mlx5_flow_item_validate(item, (const uint8_t *)mask,
> +				      (const uint8_t *)&nic_mask,
> +				      sizeof(struct rte_flow_item_eth),
> +				      error);
> +	if (ret)
> +		return ret;
> +	flow->layers |= MLX5_FLOW_LAYER_OUTER_L2;
> +	if (size > flow_size)
> +		return size;
> +	if (spec) {
> +		unsigned int i;
> +
> +		memcpy(&eth.val.dst_mac, spec->dst.addr_bytes, ETHER_ADDR_LEN);
> +		memcpy(&eth.val.src_mac, spec->src.addr_bytes, ETHER_ADDR_LEN);
> +		eth.val.ether_type = spec->type;
> +		memcpy(&eth.mask.dst_mac, mask->dst.addr_bytes, ETHER_ADDR_LEN);
> +		memcpy(&eth.mask.src_mac, mask->src.addr_bytes, ETHER_ADDR_LEN);
> +		eth.mask.ether_type = mask->type;
> +		/* Remove unwanted bits from values. */
> +		for (i = 0; i < ETHER_ADDR_LEN; ++i) {
> +			eth.val.dst_mac[i] &= eth.mask.dst_mac[i];
> +			eth.val.src_mac[i] &= eth.mask.src_mac[i];
> +		}
> +		eth.val.ether_type &= eth.mask.ether_type;
> +	}
> +	mlx5_flow_spec_verbs_add(flow, &eth, size);

Do you have a plan to change this redundant memcpy? I mean, writing directly to
the allocated memory for verbs spec instead of the local micro-scratchpad.
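A hypothetical sketch of that idea: instead of mlx5_flow_spec_verbs_add()
copying a stack-built spec, a reservation helper (name made up here) could hand
back a pointer into the verbs buffer for the item function to fill in place,
returning NULL during the sizing/validation pass when no buffer is attached:

static void *
mlx5_flow_spec_verbs_reserve(struct rte_flow *flow, unsigned int size)
{
	void *dst = NULL;

	if (flow->verbs.specs) {
		dst = (void *)(flow->verbs.specs + flow->verbs.size);
		++flow->verbs.attr->num_of_specs;
	}
	flow->verbs.size += size;
	return dst;
}

mlx5_flow_item_eth() would then build the ibv_flow_spec_eth directly in the
returned area when it is not NULL.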

> +	return size;
> +}
> +
> +/**
> + * Validate items provided by the user.
> + *
> + * @param items
> + *   Pointer to flow items array.
> + * @param flow
> + *   Pointer to the rte_flow structure.
> + * @param flow_size
> + *   Size in bytes of the available space for to store the flow information.
> + * @param error
> + *   Pointer to error structure.
> + *
> + * @return
> + *   size in bytes necessary for the conversion, a negative errno value
> + *   otherwise and rte_errno is set.
> + */
> +static int
> +mlx5_flow_items(const struct rte_flow_item items[],
> +		struct rte_flow *flow, const size_t flow_size,
> +		struct rte_flow_error *error)
> +{
> +	int remain = flow_size;
> +	size_t size = 0;
> +
> +	for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
> +		int ret = 0;
> +
> +		switch (items->type) {
> +		case RTE_FLOW_ITEM_TYPE_VOID:
> +			break;
> +		case RTE_FLOW_ITEM_TYPE_ETH:
> +			ret = mlx5_flow_item_eth(items, flow, remain, error);
> +			break;
> +		default:
> +			return rte_flow_error_set(error, ENOTSUP,
> +						  RTE_FLOW_ERROR_TYPE_ITEM,
> +						  items,
> +						  "item not supported");
> +		}
> +		if (ret < 0)
> +			return ret;
> +		if (remain > ret)
> +			remain -= ret;
> +		else
> +			remain = 0;
> +		size += ret;
> +	}
> +	if (!flow->layers) {
> +		const struct rte_flow_item item = {
> +			.type = RTE_FLOW_ITEM_TYPE_ETH,
> +		};
> +
> +		return mlx5_flow_item_eth(&item, flow, flow_size, error);
> +	}
> +	return size;
> +}
> +
> +/**
> + * Validate action drop provided by the user.
> + *
> + * @param actions
> + *   Pointer to flow actions array.
> + * @param flow
> + *   Pointer to the rte_flow structure.
> + * @param flow_size
> + *   Size in bytes of the available space for to store the flow information.
> + * @param error
> + *   Pointer to error structure.
> + *
> + * @return
> + *   size in bytes necessary for the conversion, a negative errno value
> + *   otherwise and rte_errno is set.
> + */
> +static int
> +mlx5_flow_action_drop(const struct rte_flow_action *actions,
> +		      struct rte_flow *flow, const size_t flow_size,
> +		      struct rte_flow_error *error)
> +{
> +	unsigned int size = sizeof(struct ibv_flow_spec_action_drop);
> +	struct ibv_flow_spec_action_drop drop = {
> +			.type = IBV_FLOW_SPEC_ACTION_DROP,
> +			.size = size,
> +	};
> +
> +	if (flow->fate)
> +		return rte_flow_error_set(error, ENOTSUP,
> +					  RTE_FLOW_ERROR_TYPE_ACTION,
> +					  actions,
> +					  "multiple fate actions are not"
> +					  " supported");
> +	if (size < flow_size)
> +		mlx5_flow_spec_verbs_add(flow, &drop, size);
> +	flow->fate |= MLX5_FLOW_FATE_DROP;
> +	return size;
> +}
> +
> +/**
> + * Validate actions provided by the user.
> + *
> + * @param dev
> + *   Pointer to Ethernet device structure.
> + * @param actions
> + *   Pointer to flow actions array.
> + * @param flow
> + *   Pointer to the rte_flow structure.
> + * @param flow_size
> + *   Size in bytes of the available space for to store the flow information.
> + * @param error
> + *   Pointer to error structure.
> + *
> + * @return
> + *   size in bytes necessary for the conversion, a negative errno value
> + *   otherwise and rte_errno is set.
> + */
> +static int
> +mlx5_flow_actions(struct rte_eth_dev *dev __rte_unused,
> +		  const struct rte_flow_action actions[],
> +		  struct rte_flow *flow, const size_t flow_size,
> +		  struct rte_flow_error *error)
> +{
> +	size_t size = 0;
> +	int remain = flow_size;
> +	int ret = 0;
> +
> +	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
> +		switch (actions->type) {
> +		case RTE_FLOW_ACTION_TYPE_VOID:
> +			break;
> +		case RTE_FLOW_ACTION_TYPE_DROP:
> +			ret = mlx5_flow_action_drop(actions, flow, remain,
> +						    error);
> +			break;
> +		default:
> +			return rte_flow_error_set(error, ENOTSUP,
> +						  RTE_FLOW_ERROR_TYPE_ACTION,
> +						  actions,
> +						  "action not supported");
> +		}
> +		if (ret < 0)
> +			return ret;
> +		if (remain > ret)
> +			remain -= ret;
> +		else
> +			remain = 0;
> +		size += ret;
> +	}
> +	if (!flow->fate)
> +		return rte_flow_error_set(error, ENOTSUP,
> +					  RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +					  NULL,
> +					  "no fate action found");
> +	return size;
> +}
> +
> +/**
> + * Validate the rule and return a flow structure filled accordingly.

Please add a more detailed description. This is the most important function in
mlx5_flow.c; it deserves more lines of explanation. Make sure to explain the
multiple roles it has - validate, get the required size, and execute the
translation only if it gets allocated memory. Plus, how to validate a flow,
e.g. by setting flow_size=0, in which case flow can be null.

> + *
> + * @param dev
> + *   Pointer to Ethernet device.
> + * @param flow
> + *   Pointer an flow structure.

s/an/to
shouldn't it be [out]?
And please mention flow can be null for testing call.

> + * @param flow_size
> + *   Size of the allocated space to store the flow information.

Please mention what happens if the size isn't enough or announce it can be
intentionally zero in order to validate the flow and to know the required size
for translation.

>   * @param[in] attr
>   *   Flow rule attributes.
>   * @param[in] pattern
> @@ -144,21 +589,39 @@ mlx5_flow_priorities(struct rte_eth_dev *dev)
>   *   Perform verbose error reporting if not NULL.
>   *
>   * @return
> - *   A flow on success, NULL otherwise and rte_errno is set.
> + *   The amount of bytes necessary to create the flow, a negative errno value
> + *   otherwise and rte_errno is set.

'amount of bytes' -> 'size of memory'
And not clear what is 'otherwise' because there's no definition of success.

>   */
> -static struct rte_flow *
> -mlx5_flow_list_create(struct rte_eth_dev *dev __rte_unused,
> -		      struct mlx5_flows *list __rte_unused,
> -		      const struct rte_flow_attr *attr __rte_unused,
> -		      const struct rte_flow_item items[] __rte_unused,
> -		      const struct rte_flow_action actions[] __rte_unused,
> -		      struct rte_flow_error *error)
> +static int
> +mlx5_flow_merge(struct rte_eth_dev *dev, struct rte_flow *flow,
> +		const size_t flow_size,
> +		const struct rte_flow_attr *attr,
> +		const struct rte_flow_item items[],
> +		const struct rte_flow_action actions[],
> +		struct rte_flow_error *error)
>  {
> -	rte_flow_error_set(error, ENOTSUP,
> -			   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> -			   NULL,
> -			   "action not supported");
> -	return NULL;
> +	struct rte_flow local_flow = { .layers = 0, };
> +	size_t size = sizeof(*flow) + sizeof(struct ibv_flow_attr);
> +	int remain = (flow_size > size) ? flow_size - size : 0;
> +	int ret;
> +
> +	if (!remain)
> +		flow = &local_flow;
> +	ret = mlx5_flow_attributes(dev, attr, flow, error);
> +	if (ret < 0)
> +		return ret;
> +	ret = mlx5_flow_items(items, flow, remain, error);
> +	if (ret < 0)
> +		return ret;
> +	size += ret;
> +	remain = (flow_size > size) ? flow_size - size : 0;
> +	ret = mlx5_flow_actions(dev, actions, flow, remain, error);
> +	if (ret < 0)
> +		return ret;
> +	size += ret;
> +	if (size <= flow_size)
> +		flow->verbs.attr->priority = flow->attributes.priority;
> +	return size;
>  }
>  
>  /**
> @@ -168,16 +631,142 @@ mlx5_flow_list_create(struct rte_eth_dev *dev __rte_unused,
>   * @see rte_flow_ops
>   */
>  int
> -mlx5_flow_validate(struct rte_eth_dev *dev __rte_unused,
> -		   const struct rte_flow_attr *attr __rte_unused,
> -		   const struct rte_flow_item items[] __rte_unused,
> -		   const struct rte_flow_action actions[] __rte_unused,
> +mlx5_flow_validate(struct rte_eth_dev *dev,
> +		   const struct rte_flow_attr *attr,
> +		   const struct rte_flow_item items[],
> +		   const struct rte_flow_action actions[],
>  		   struct rte_flow_error *error)
>  {
> -	return rte_flow_error_set(error, ENOTSUP,
> -				  RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> -				  NULL,
> -				  "action not supported");
> +	int ret = mlx5_flow_merge(dev, NULL, 0, attr, items, actions, error);
> +
> +	if (ret < 0)
> +		return ret;
> +	return 0;
> +}
> +
> +/**
> + * Remove the flow.
> + *
> + * @param dev
> + *   Pointer to Ethernet device.
> + * @param flow
> + *   Pointer to flow structure.
> + */
> +static void
> +mlx5_flow_fate_remove(struct rte_eth_dev *dev, struct rte_flow *flow)

Sounds weird. Is it removing a fate of a flow?
Description is just 'removing a flow'.

> +{
> +	if (flow->fate & MLX5_FLOW_FATE_DROP) {
> +		if (flow->verbs.flow) {
> +			claim_zero(mlx5_glue->destroy_flow(flow->verbs.flow));
> +			flow->verbs.flow = NULL;
> +		}
> +	}
> +	if (flow->verbs.hrxq) {
> +		mlx5_hrxq_drop_release(dev, flow->verbs.hrxq);
> +		flow->verbs.hrxq = NULL;
> +	}
> +}
> +
> +/**
> + * Apply the flow.
> + *
> + * @param dev
> + *   Pointer to Ethernet device structure.
> + * @param flow
> + *   Pointer to flow structure.
> + * @param error
> + *   Pointer to error structure.
> + *
> + * @return
> + *   0 on success, a negative errno value otherwise and rte_errno is set.
> + */
> +static int
> +mlx5_flow_fate_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
> +		     struct rte_flow_error *error)

Same here. Either change the name or add more description.

> +{
> +	flow->verbs.hrxq = mlx5_hrxq_drop_new(dev);
> +	if (!flow->verbs.hrxq)
> +		return rte_flow_error_set
> +			(error, errno,
> +			 RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +			 NULL,
> +			 "cannot allocate Drop queue");
> +	flow->verbs.flow =
> +		mlx5_glue->create_flow(flow->verbs.hrxq->qp, flow->verbs.attr);
> +	if (!flow->verbs.flow) {
> +		mlx5_hrxq_drop_release(dev, flow->verbs.hrxq);
> +		flow->verbs.hrxq = NULL;
> +		return rte_flow_error_set(error, errno,
> +					  RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +					  NULL,
> +					  "kernel module refuses to create"
> +					  " flow");
> +	}
> +	return 0;
> +}
> +
> +/**
> + * Create a flow.

And "insert to the provided list".

> + *
> + * @param dev
> + *   Pointer to Ethernet device.
> + * @param list
> + *   Pointer to a TAILQ flow list.
> + * @param[in] attr
> + *   Flow rule attributes.
> + * @param[in] items
> + *   Pattern specification (list terminated by the END pattern item).
> + * @param[in] actions
> + *   Associated actions (list terminated by the END action).
> + * @param[out] error
> + *   Perform verbose error reporting if not NULL.
> + *
> + * @return
> + *   A flow on success, NULL otherwise and rte_errno is set.
> + */
> +static struct rte_flow *
> +mlx5_flow_list_create(struct rte_eth_dev *dev,
> +		      struct mlx5_flows *list,
> +		      const struct rte_flow_attr *attr,
> +		      const struct rte_flow_item items[],
> +		      const struct rte_flow_action actions[],
> +		      struct rte_flow_error *error)
> +{
> +	struct rte_flow *flow;
> +	size_t size;
> +	int ret;
> +
> +	ret = mlx5_flow_merge(dev, NULL, 0, attr, items, actions, error);
> +	if (ret < 0)
> +		return NULL;
> +	size = ret;
> +	flow = rte_zmalloc(__func__, size, 0);
> +	if (!flow) {
> +		rte_flow_error_set(error, ENOMEM,
> +				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> +				   NULL,
> +				   "cannot allocate memory");
> +		return NULL;
> +	}
> +	flow->verbs.attr = (struct ibv_flow_attr *)(flow + 1);
> +	flow->verbs.specs = (uint8_t *)(flow->verbs.attr + 1);
> +	ret = mlx5_flow_merge(dev, flow, size, attr, items, actions, error);
> +	if (ret < 0)
> +		goto error;
> +	assert((size_t)ret == size);
> +	if (dev->data->dev_started) {
> +		ret = mlx5_flow_fate_apply(dev, flow, error);
> +		if (ret < 0)
> +			goto error;
> +	}
> +	TAILQ_INSERT_TAIL(list, flow, next);
> +	return flow;
> +error:
> +	ret = rte_errno; /* Save rte_errno before cleanup. */
> +	mlx5_flow_fate_remove(dev, flow);
> +	rte_free(flow);
> +	rte_errno = ret; /* Restore rte_errno. */
> +	return NULL;
>  }
>  
>  /**
> @@ -187,17 +776,15 @@ mlx5_flow_validate(struct rte_eth_dev *dev __rte_unused,
>   * @see rte_flow_ops
>   */
>  struct rte_flow *
> -mlx5_flow_create(struct rte_eth_dev *dev __rte_unused,
> -		 const struct rte_flow_attr *attr __rte_unused,
> -		 const struct rte_flow_item items[] __rte_unused,
> -		 const struct rte_flow_action actions[] __rte_unused,
> +mlx5_flow_create(struct rte_eth_dev *dev,
> +		 const struct rte_flow_attr *attr,
> +		 const struct rte_flow_item items[],
> +		 const struct rte_flow_action actions[],
>  		 struct rte_flow_error *error)
>  {
> -	rte_flow_error_set(error, ENOTSUP,
> -			   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
> -			   NULL,
> -			   "action not supported");
> -	return NULL;
> +	return mlx5_flow_list_create
> +		(dev, &((struct priv *)dev->data->dev_private)->flows,
> +		 attr, items, actions, error);
>  }
>  
>  /**
> @@ -211,10 +798,12 @@ mlx5_flow_create(struct rte_eth_dev *dev __rte_unused,
>   *   Flow to destroy.
>   */
>  static void
> -mlx5_flow_list_destroy(struct rte_eth_dev *dev __rte_unused,
> -		       struct mlx5_flows *list __rte_unused,
> -		       struct rte_flow *flow __rte_unused)
> +mlx5_flow_list_destroy(struct rte_eth_dev *dev, struct mlx5_flows *list,
> +		       struct rte_flow *flow)
>  {
> +	mlx5_flow_fate_remove(dev, flow);
> +	TAILQ_REMOVE(list, flow, next);
> +	rte_free(flow);
>  }
>  
>  /**
> @@ -422,9 +1011,8 @@ int
>  mlx5_flow_flush(struct rte_eth_dev *dev,
>  		struct rte_flow_error *error __rte_unused)
>  {
> -	struct priv *priv = dev->data->dev_private;
> -
> -	mlx5_flow_list_flush(dev, &priv->flows);
> +	mlx5_flow_list_flush(dev,
> +			     &((struct priv *)dev->data->dev_private)->flows);

Can you keep priv and make this line simpler?

>  	return 0;
>  }
>  
> @@ -737,9 +1325,8 @@ mlx5_fdir_filter_update(struct rte_eth_dev *dev,
>  static void
>  mlx5_fdir_filter_flush(struct rte_eth_dev *dev)
>  {
> -	struct priv *priv = dev->data->dev_private;
> -
> -	mlx5_flow_list_flush(dev, &priv->flows);
> +	mlx5_flow_list_flush(dev,
> +			     &((struct priv *)dev->data->dev_private)->flows);

Same here.

>  }
>  
>  /**
> -- 
> 2.18.0
>
  
Nélio Laranjeiro July 4, 2018, 9:24 a.m. UTC | #2
On Tue, Jul 03, 2018 at 03:27:18PM -0700, Yongseok Koh wrote:
> On Wed, Jun 27, 2018 at 05:07:36PM +0200, Nelio Laranjeiro wrote:
> 
> net/mlx5: support flow Ethernet item among with drop action
> 
> Typo? "among" -> "along"?
> 
> > Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
> > ---
> >  drivers/net/mlx5/mlx5.c      |   1 +
> >  drivers/net/mlx5/mlx5_flow.c | 671 ++++++++++++++++++++++++++++++++---
> >  2 files changed, 630 insertions(+), 42 deletions(-)
> > 
> > diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
> > index c3c8dffae..665a3c31f 100644
> > --- a/drivers/net/mlx5/mlx5.c
> > +++ b/drivers/net/mlx5/mlx5.c
> > @@ -241,6 +241,7 @@ mlx5_dev_close(struct rte_eth_dev *dev)
> >  	/* In case mlx5_dev_stop() has not been called. */
> >  	mlx5_dev_interrupt_handler_uninstall(dev);
> >  	mlx5_traffic_disable(dev);
> > +	mlx5_flow_flush(dev, NULL);
> >  	/* Prevent crashes when queues are still in use. */
> >  	dev->rx_pkt_burst = removed_rx_burst;
> >  	dev->tx_pkt_burst = removed_tx_burst;
> > diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
> > index 5d3bc183d..bc8c5de33 100644
> > --- a/drivers/net/mlx5/mlx5_flow.c
> > +++ b/drivers/net/mlx5/mlx5_flow.c
> > @@ -35,11 +35,50 @@
> >  extern const struct eth_dev_ops mlx5_dev_ops;
> >  extern const struct eth_dev_ops mlx5_dev_ops_isolate;
> >  
> > +/* Pattern Layer bits. */
> > +#define MLX5_FLOW_LAYER_OUTER_L2 (1u << 0)
> > +#define MLX5_FLOW_LAYER_OUTER_L3_IPV4 (1u << 1)
> > +#define MLX5_FLOW_LAYER_OUTER_L3_IPV6 (1u << 2)
> > +#define MLX5_FLOW_LAYER_OUTER_L4_UDP (1u << 3)
> > +#define MLX5_FLOW_LAYER_OUTER_L4_TCP (1u << 4)
> > +#define MLX5_FLOW_LAYER_OUTER_VLAN (1u << 5)
> > +/* Masks. */
> > +#define MLX5_FLOW_LAYER_OUTER_L3 \
> > +	(MLX5_FLOW_LAYER_OUTER_L3_IPV4 | MLX5_FLOW_LAYER_OUTER_L3_IPV6)
> > +#define MLX5_FLOW_LAYER_OUTER_L4 \
> > +	(MLX5_FLOW_LAYER_OUTER_L4_UDP | MLX5_FLOW_LAYER_OUTER_L4_TCP)
> > +
> > +/* Action fate on the packet. */
> > +#define MLX5_FLOW_FATE_DROP (1u << 0)
> 
> I understand what you mean by 'fate' but 'action fate' sounds strange to me.
> How about 'terminal action' instead? E.g. flow->terminal_actions or
> flow->term_actions instead of flow->fate.

It uses the same terminology as the flow API [1] to keep it consistent
and easier to find.
In addition, "terminal" in the API is also used for actions that terminate
the rule, e.g. JUMP.

> And, I'm not sure why you define a new (redundant) macro. You could use
> RTE_FLOW_ACTION_TYPE_DROP instead. I don't see any reason for setting different
> action flag in flow->fate. If you want to specify a group of terminal actions,
> you could've done:
> 
> #define MLX5_FLOW_TERMINAL_ACTIONS \
> 	(RTE_FLOW_ACTION_TYPE_DROP | RTE_FLOW_ACTION_TYPE_RSS | \
> 	 RTE_FLOW_ACTION_TYPE_QUEUE)

You are missing a point: RTE_FLOW_ACTION_TYPE_* are values, not bits (see
enum rte_flow_action_type), so doing this won't have the expected behavior.
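Turning the enum values into per-action bits first would look like this
(hypothetical sketch, not part of the patch):

#define MLX5_FLOW_ACTION_BIT(type) (1u << (type))

uint32_t terminal_actions =
	MLX5_FLOW_ACTION_BIT(RTE_FLOW_ACTION_TYPE_DROP) |
	MLX5_FLOW_ACTION_BIT(RTE_FLOW_ACTION_TYPE_QUEUE) |
	MLX5_FLOW_ACTION_BIT(RTE_FLOW_ACTION_TYPE_RSS);

ORing the enum values themselves, as in the suggested macro, mixes consecutive
integers and does not produce a usable bit-mask.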

>[...] 
> > +/**
> > + * Validate Attributes provided by the user.
> 
> And it also stores attributes.
> 
> >   *
> >   * @param dev
> >   *   Pointer to Ethernet device.
> > - * @param list
> > - *   Pointer to a TAILQ flow list.
> > + * @param attr
> > + *   Pointer to flow attributes
> > + * @param flow
> > + *   Pointer to the rte_flow structure.
> > + * @param error
> > + *   Pointer to error structure.
> > + *
> > + * @return
> > + *   0 on success, a negative errno value otherwise and rte_errno is set.
> > + */
> > +static int
> > +mlx5_flow_attributes(struct rte_eth_dev *dev, const struct rte_flow_attr *attr,
> > +		     struct rte_flow *flow, struct rte_flow_error *error)
> 
> Better to have mlx5_flow_attributes_validate() like mlx5_flow_item_validate()?

As it also stores the attributes, it cannot be named validate.

> > +{
> > +	uint32_t priority_max =
> > +		((struct priv *)dev->data->dev_private)->config.flow_prio;
> > +
> > +	if (attr->group)
> > +		return rte_flow_error_set(error, ENOTSUP,
> > +					  RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
> > +					  NULL,
> > +					  "groups are not supported");
> 
> "group is not supported"?
> 
> > +	if (attr->priority >= priority_max)
> > +		return rte_flow_error_set(error, ENOTSUP,
> > +					  RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
> > +					  NULL,
> > +					  "priority value is not supported");
> 
> "priority is out of range" or something else?
> 
> > +	if (attr->egress)
> > +		return rte_flow_error_set(error, ENOTSUP,
> > +					  RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
> > +					  NULL,
> > +					  "egress is not supported");
> > +	if (attr->transfer)
> > +		return rte_flow_error_set(error, ENOTSUP,
> > +					  RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
> > +					  NULL,
> > +					  "transfer is not supported");
> > +	if (!attr->ingress)
> > +		return rte_flow_error_set(error, ENOTSUP,
> > +					  RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
> > +					  NULL,
> > +					  "only ingress is supported");
> 
> Better to have "ingress should be specified" or something else?

Done,

> > +	flow->attributes = *attr;
> > +	return 0;
> > +}
> > +
> > +/**
> > + * Check support for a given item.
> > + *
> > + * @param item[in]
> > + *   Item specification.
> > + * @param default_mask[in]
> > + *   Bit-masks covering supported fields to compare with spec, last and mask in
> > + *   \item.
> 
> Remove the back-slash. Also, the description isn't clear; from the code, it seems
> to be a default mask used when the item doesn't have one.
> 
> > + * @param nic_mask[in]
> > + *   Bit-masks covering supported fields by the NIC to compare with user mask.
> 
> s/Bit-masks/Bit-mask

All over the literature it is bit-masks.

> > + * @param size
> > + *   Bit-Mask size in bytes.
> 
> s/Bit-Mask/Bit-mask
>
> > + * @param error[out]
> > + *   Pointer to error structure.
> > + *
> > + * @return
> > + *   0 on success, a negative errno value otherwise and rte_errno is set.
> > + */
> > +static int
> > +mlx5_flow_item_validate(const struct rte_flow_item *item,
> > +			const uint8_t *default_mask,
> > +			const uint8_t *nic_mask,
> > +			unsigned int size,
> > +			struct rte_flow_error *error)
> > +{
> > +	const uint8_t *mask = item->mask ? item->mask : default_mask;
> 
> In the mlx5_flow_item_eth(), rte_flow_item_eth_mask is passed to default_mask if
> item->mask is null, then why doing it again here?

You are right, it is not necessary here again.

> 
> > +	unsigned int i;
> > +
> > +	assert(nic_mask);
> > +	for (i = 0; i < size; ++i)
> > +		if ((nic_mask[i] | mask[i]) != nic_mask[i])
> > +			return rte_flow_error_set(error, ENOTSUP,
> > +						  RTE_FLOW_ERROR_TYPE_ITEM,
> > +						  item,
> > +						  "mask enables non supported"
> > +						  " bits");
> 
> default_mask is determined by caller, which is rte_flow_item_eth_mask or
> item->mask. And nic_mask is also given by PMD, which is a function local
> structure in the caller func. Doesn't look like a clear definition. Instead of
> having both default_mask and nic_mask here, you can take supported_mask, which
> can be (rte_flow_item_{name}_mask | nic_mask) in the calling func. And check
> validity of item->mask against supported_mask only if item->mask isn't null.

Moving the default_mask handling out of this function duplicates a lot of
code across the item functions, and there are several of them. In addition,
we need to compare the user/default mask against the supported NIC mask to
be able to inform the user when he tries to match bits unsupported by the
NIC, which also increases the amount of code: this for loop would have to
be copied/pasted into each item function.

> > +	if (!item->spec && (item->mask || item->last))
> > +		return rte_flow_error_set(error, EINVAL,
> > +					  RTE_FLOW_ERROR_TYPE_ITEM,
> > +					  item,
> > +					  "mask/last without a spec is not"
> > +					  " supported");
> > +	if (item->spec && item->last) {
> > +		uint8_t spec[size];
> > +		uint8_t last[size];
> > +		unsigned int i;
> > +		int ret;
> > +
> > +		for (i = 0; i < size; ++i) {
> > +			spec[i] = ((const uint8_t *)item->spec)[i] & mask[i];
> > +			last[i] = ((const uint8_t *)item->last)[i] & mask[i];
> > +		}
> > +		ret = memcmp(spec, last, size);
> > +		if (ret != 0)
> > +			return rte_flow_error_set(error, ENOTSUP,
> > +						  RTE_FLOW_ERROR_TYPE_ITEM,
> > +						  item,
> > +						  "spec and last different"
> > +						  " is not supported");
> 
> Can you rephrase the error message?

Done,

>[...] 
> > +static int
> > +mlx5_flow_item_eth(const struct rte_flow_item *item, struct rte_flow *flow,
> > +		   const size_t flow_size, struct rte_flow_error *error)
> > +{
> > +	const struct rte_flow_item_eth *spec = item->spec;
> > +	const struct rte_flow_item_eth *mask = item->mask;
> > +	const struct rte_flow_item_eth nic_mask = {
> > +		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> > +		.src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
> > +		.type = RTE_BE16(0xffff),
> > +	};
> 
> If it is a device-specific one, shouldn't it be located in a header file?

I would agree if it were used in several places.

> > +	const unsigned int size = sizeof(struct ibv_flow_spec_eth);
> > +	struct ibv_flow_spec_eth eth = {
> > +		.type = IBV_FLOW_SPEC_ETH,
> > +		.size = size,
> > +	};
> > +	int ret;
> > +
> > +	if (flow->layers & MLX5_FLOW_LAYER_OUTER_L2)
> > +		return rte_flow_error_set(error, ENOTSUP,
> > +					  RTE_FLOW_ERROR_TYPE_ITEM,
> > +					  item,
> > +					  "L2 layers already configured");
> > +	if (!mask)
> > +		mask = &rte_flow_item_eth_mask;
> 
> What is the meaning of default_mask for mlx5_flow_item_validate()? Also refer
> to my comments in mlx5_flow_item_validate().

An item should always have a mask defined; if the user does not set one, we
need to pick a mask, and in that situation it is the rte_flow default mask.
But the default mask may enable matching bits unsupported by the NIC.

> > +	ret = mlx5_flow_item_validate(item, (const uint8_t *)mask,
> > +				      (const uint8_t *)&nic_mask,
> > +				      sizeof(struct rte_flow_item_eth),
> > +				      error);
> > +	if (ret)
> > +		return ret;
> > +	flow->layers |= MLX5_FLOW_LAYER_OUTER_L2;
> > +	if (size > flow_size)
> > +		return size;
> > +	if (spec) {
> > +		unsigned int i;
> > +
> > +		memcpy(&eth.val.dst_mac, spec->dst.addr_bytes, ETHER_ADDR_LEN);
> > +		memcpy(&eth.val.src_mac, spec->src.addr_bytes, ETHER_ADDR_LEN);
> > +		eth.val.ether_type = spec->type;
> > +		memcpy(&eth.mask.dst_mac, mask->dst.addr_bytes, ETHER_ADDR_LEN);
> > +		memcpy(&eth.mask.src_mac, mask->src.addr_bytes, ETHER_ADDR_LEN);
> > +		eth.mask.ether_type = mask->type;
> > +		/* Remove unwanted bits from values. */
> > +		for (i = 0; i < ETHER_ADDR_LEN; ++i) {
> > +			eth.val.dst_mac[i] &= eth.mask.dst_mac[i];
> > +			eth.val.src_mac[i] &= eth.mask.src_mac[i];
> > +		}
> > +		eth.val.ether_type &= eth.mask.ether_type;
> > +	}
> > +	mlx5_flow_spec_verbs_add(flow, &eth, size);
> 
> Do you have a plan to change this redundant memcpy? I mean, writing directly to
> the allocated memory for verbs spec instead of the local micro-scratchpad.

Unfortunately I don't have time for this in this release, but it is not the
only improvement to make; I'll share them with you privately.

>[...]
> > +
> > +/**
> > + * Validate the rule and return a flow structure filled accordingly.
> 
> Please add a more detailed description. This is the most important function in
> mlx5_flow.c; it deserves more lines of explanation. Make sure to explain the
> multiple roles it has - validate, get the required size, and execute the
> translation only if it gets allocated memory. Plus, how to validate a flow,
> e.g. by setting flow_size=0, in which case flow can be null.

Agreed,
 
> > + *
> > + * @param dev
> > + *   Pointer to Ethernet device.
> > + * @param flow
> > + *   Pointer an flow structure.
> 
> s/an/to
> shouldn't it be [out]?

[in, out] in fact.

> And please mention flow can be null for testing call.

It can be anything as long as the flow_size is accurate. The testing is not
based on the flow nor on the size, but on the fact that the conversion may
require more space than the size provided.
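In other words, the calling convention is the two-pass pattern that
mlx5_flow_list_create() follows (condensed here, error handling omitted):

	/* First pass: no buffer, returns the required size or a negative
	 * errno. */
	ret = mlx5_flow_merge(dev, NULL, 0, attr, items, actions, error);
	size = ret;
	flow = rte_zmalloc(__func__, size, 0);
	flow->verbs.attr = (struct ibv_flow_attr *)(flow + 1);
	flow->verbs.specs = (uint8_t *)(flow->verbs.attr + 1);
	/* Second pass: same arguments with an accurate size, fills the
	 * buffer. */
	ret = mlx5_flow_merge(dev, flow, size, attr, items, actions, error);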

> > + * @param flow_size
> > + *   Size of the allocated space to store the flow information.
> 
> Please mention what happens if the size isn't enough or announce it can be
> intentionally zero in order to validate the flow and to know the required size
> for translation.
> 
> >   * @param[in] attr
> >   *   Flow rule attributes.
> >   * @param[in] pattern
> > @@ -144,21 +589,39 @@ mlx5_flow_priorities(struct rte_eth_dev *dev)
> >   *   Perform verbose error reporting if not NULL.
> >   *
> >   * @return
> > - *   A flow on success, NULL otherwise and rte_errno is set.
> > + *   The amount of bytes necessary to create the flow, a negative errno value
> > + *   otherwise and rte_errno is set.
> 
> 'amount of bytes' -> 'size of memory'
> And not clear what is 'otherwise' because there's no definition of success.

Reworded.

>[...]
> > +/**
> > + * Remove the flow.
> > + *
> > + * @param dev
> > + *   Pointer to Ethernet device.
> > + * @param flow
> > + *   Pointer to flow structure.
> > + */
> > +static void
> > +mlx5_flow_fate_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
> 
> Sounds weird. Is it removing a fate of a flow?
> Description is just 'removing a flow'.
>[...]
> > +/**
> > + * Apply the flow.
> > + *
> > + * @param dev
> > + *   Pointer to Ethernet device structure.
> > + * @param flow
> > + *   Pointer to flow structure.
> > + * @param error
> > + *   Pointer to error structure.
> > + *
> > + * @return
> > + *   0 on success, a negative errno value otherwise and rte_errno is set.
> > + */
> > +static int
> > +mlx5_flow_fate_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
> > +		     struct rte_flow_error *error)
> 
> Same here. Either change the name or add more description.

Agreed, "fate" removed in both functions.

>[...]
> > +/**
> > + * Create a flow.
> 
> And "insert to the provided list".

Added.

> > + *
> > + * @param dev
> > + *   Pointer to Ethernet device.
> > + * @param list
> > + *   Pointer to a TAILQ flow list.
> > + * @param[in] attr
> > + *   Flow rule attributes.
> > + * @param[in] items
> > + *   Pattern specification (list terminated by the END pattern item).
> > + * @param[in] actions
> > + *   Associated actions (list terminated by the END action).
> > + * @param[out] error
> > + *   Perform verbose error reporting if not NULL.
> > + *
> > + * @return
> > + *   A flow on success, NULL otherwise and rte_errno is set.
> > + */
> > +static struct rte_flow *
> > +mlx5_flow_list_create(struct rte_eth_dev *dev,
> > +		      struct mlx5_flows *list,
> > +		      const struct rte_flow_attr *attr,
> > +		      const struct rte_flow_item items[],
> > +		      const struct rte_flow_action actions[],
> > +		      struct rte_flow_error *error)
> > +{
>[...]
> >  /**
> > @@ -422,9 +1011,8 @@ int
> >  mlx5_flow_flush(struct rte_eth_dev *dev,
> >  		struct rte_flow_error *error __rte_unused)
> >  {
> > -	struct priv *priv = dev->data->dev_private;
> > -
> > -	mlx5_flow_list_flush(dev, &priv->flows);
> > +	mlx5_flow_list_flush(dev,
> > +			     &((struct priv *)dev->data->dev_private)->flows);
> 
> Can you keep priv and make this line simpler?
> 
> >  	return 0;
> >  }
> >  
> > @@ -737,9 +1325,8 @@ mlx5_fdir_filter_update(struct rte_eth_dev *dev,
> >  static void
> >  mlx5_fdir_filter_flush(struct rte_eth_dev *dev)
> >  {
> > -	struct priv *priv = dev->data->dev_private;
> > -
> > -	mlx5_flow_list_flush(dev, &priv->flows);
> > +	mlx5_flow_list_flush(dev,
> > +			     &((struct priv *)dev->data->dev_private)->flows);
> 
> Same here.

It was a request from Adrien (done anyway).

Thanks,

[1] https://doc.dpdk.org/guides/prog_guide/rte_flow.html?highlight=rte_flow#actions
  

Patch

diff --git a/drivers/net/mlx5/mlx5.c b/drivers/net/mlx5/mlx5.c
index c3c8dffae..665a3c31f 100644
--- a/drivers/net/mlx5/mlx5.c
+++ b/drivers/net/mlx5/mlx5.c
@@ -241,6 +241,7 @@  mlx5_dev_close(struct rte_eth_dev *dev)
 	/* In case mlx5_dev_stop() has not been called. */
 	mlx5_dev_interrupt_handler_uninstall(dev);
 	mlx5_traffic_disable(dev);
+	mlx5_flow_flush(dev, NULL);
 	/* Prevent crashes when queues are still in use. */
 	dev->rx_pkt_burst = removed_rx_burst;
 	dev->tx_pkt_burst = removed_tx_burst;
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 5d3bc183d..bc8c5de33 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -35,11 +35,50 @@ 
 extern const struct eth_dev_ops mlx5_dev_ops;
 extern const struct eth_dev_ops mlx5_dev_ops_isolate;
 
+/* Pattern Layer bits. */
+#define MLX5_FLOW_LAYER_OUTER_L2 (1u << 0)
+#define MLX5_FLOW_LAYER_OUTER_L3_IPV4 (1u << 1)
+#define MLX5_FLOW_LAYER_OUTER_L3_IPV6 (1u << 2)
+#define MLX5_FLOW_LAYER_OUTER_L4_UDP (1u << 3)
+#define MLX5_FLOW_LAYER_OUTER_L4_TCP (1u << 4)
+#define MLX5_FLOW_LAYER_OUTER_VLAN (1u << 5)
+/* Masks. */
+#define MLX5_FLOW_LAYER_OUTER_L3 \
+	(MLX5_FLOW_LAYER_OUTER_L3_IPV4 | MLX5_FLOW_LAYER_OUTER_L3_IPV6)
+#define MLX5_FLOW_LAYER_OUTER_L4 \
+	(MLX5_FLOW_LAYER_OUTER_L4_UDP | MLX5_FLOW_LAYER_OUTER_L4_TCP)
+
+/* Action fate on the packet. */
+#define MLX5_FLOW_FATE_DROP (1u << 0)
+
+/** Handles information leading to a drop fate. */
+struct mlx5_flow_verbs {
+	unsigned int size; /**< Size of the attribute. */
+	struct {
+		struct ibv_flow_attr *attr;
+		/**< Pointer to the Specification buffer. */
+		uint8_t *specs; /**< Pointer to the specifications. */
+	};
+	struct ibv_flow *flow; /**< Verbs flow pointer. */
+	struct mlx5_hrxq *hrxq; /**< Hash Rx queue object. */
+};
+
+/* Flow structure. */
 struct rte_flow {
 	TAILQ_ENTRY(rte_flow) next; /**< Pointer to the next flow structure. */
+	struct rte_flow_attr attributes; /**< User flow attribute. */
+	uint32_t layers;
+	/**< Bit-fields of present layers see MLX5_FLOW_ITEMS_*. */
+	uint32_t fate;
+	/**< Bit-fields of present fate see MLX5_FLOW_FATE_*. */
+	struct mlx5_flow_verbs verbs; /* Verbs drop flow. */
 };
 
 static const struct rte_flow_ops mlx5_flow_ops = {
+	.validate = mlx5_flow_validate,
+	.create = mlx5_flow_create,
+	.destroy = mlx5_flow_destroy,
+	.flush = mlx5_flow_flush,
 	.isolate = mlx5_flow_isolate,
 };
 
@@ -128,12 +167,418 @@  mlx5_flow_priorities(struct rte_eth_dev *dev)
 }
 
 /**
- * Convert a flow.
+ * Flow debug purpose function only available when
+ * CONFIG_RTE_LIBRTE_MLX5_DEBUG=y
+ *
+ * @param flow
+ *   Pointer to the flow structure to display.
+ */
+void
+mlx5_flow_print(struct rte_flow *flow __rte_unused)
+{
+#ifndef NDEBUG
+	fprintf(stdout, "---------8<------------\n");
+	fprintf(stdout, "%s: flow information\n", MLX5_DRIVER_NAME);
+	fprintf(stdout, " attributes: group %u priority %u ingress %d egress %d"
+		" transfer %d\n", flow->attributes.group,
+		flow->attributes.priority,
+		flow->attributes.ingress,
+		flow->attributes.egress,
+		flow->attributes.transfer);
+	fprintf(stdout, " layers: %s/%s/%s\n",
+		flow->layers & MLX5_FLOW_LAYER_OUTER_L2 ? "l2" : "-",
+		flow->layers & MLX5_FLOW_LAYER_OUTER_L3 ? "l3" : "-",
+		flow->layers & MLX5_FLOW_LAYER_OUTER_L4 ? "l4" : "-");
+	if (flow->fate & MLX5_FLOW_FATE_DROP)
+		fprintf(stdout, " fate: drop queue\n");
+	if (flow->verbs.attr) {
+		struct ibv_spec_header *hdr =
+			(struct ibv_spec_header *)flow->verbs.specs;
+		const int n = flow->verbs.attr->num_of_specs;
+		int i;
+
+		fprintf(stdout, " Verbs attributes: specs_n %u\n",
+			flow->verbs.attr->num_of_specs);
+		for (i = 0; i != n; ++i) {
+			rte_hexdump(stdout, " ", hdr, hdr->size);
+			hdr = (struct ibv_spec_header *)
+				((uint8_t *)hdr + hdr->size);
+		}
+	}
+	fprintf(stdout, "--------->8------------\n");
+#endif
+}
+
+/**
+ * Validate Attributes provided by the user.
  *
  * @param dev
  *   Pointer to Ethernet device.
- * @param list
- *   Pointer to a TAILQ flow list.
+ * @param attr
+ *   Pointer to flow attributes
+ * @param flow
+ *   Pointer to the rte_flow structure.
+ * @param error
+ *   Pointer to error structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_flow_attributes(struct rte_eth_dev *dev, const struct rte_flow_attr *attr,
+		     struct rte_flow *flow, struct rte_flow_error *error)
+{
+	uint32_t priority_max =
+		((struct priv *)dev->data->dev_private)->config.flow_prio;
+
+	if (attr->group)
+		return rte_flow_error_set(error, ENOTSUP,
+					  RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
+					  NULL,
+					  "groups are not supported");
+	if (attr->priority >= priority_max)
+		return rte_flow_error_set(error, ENOTSUP,
+					  RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
+					  NULL,
+					  "priority value is not supported");
+	if (attr->egress)
+		return rte_flow_error_set(error, ENOTSUP,
+					  RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
+					  NULL,
+					  "egress is not supported");
+	if (attr->transfer)
+		return rte_flow_error_set(error, ENOTSUP,
+					  RTE_FLOW_ERROR_TYPE_ATTR_TRANSFER,
+					  NULL,
+					  "transfer is not supported");
+	if (!attr->ingress)
+		return rte_flow_error_set(error, ENOTSUP,
+					  RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
+					  NULL,
+					  "only ingress is supported");
+	flow->attributes = *attr;
+	return 0;
+}
+
+/**
+ * Check support for a given item.
+ *
+ * @param item[in]
+ *   Item specification.
+ * @param default_mask[in]
+ *   Bit-masks covering supported fields to compare with spec, last and mask in
+ *   \item.
+ * @param nic_mask[in]
+ *   Bit-masks covering supported fields by the NIC to compare with user mask.
+ * @param size
+ *   Bit-Mask size in bytes.
+ * @param error[out]
+ *   Pointer to error structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_flow_item_validate(const struct rte_flow_item *item,
+			const uint8_t *default_mask,
+			const uint8_t *nic_mask,
+			unsigned int size,
+			struct rte_flow_error *error)
+{
+	const uint8_t *mask = item->mask ? item->mask : default_mask;
+	unsigned int i;
+
+	assert(nic_mask);
+	for (i = 0; i < size; ++i)
+		if ((nic_mask[i] | mask[i]) != nic_mask[i])
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ITEM,
+						  item,
+						  "mask enables non supported"
+						  " bits");
+	if (!item->spec && (item->mask || item->last))
+		return rte_flow_error_set(error, EINVAL,
+					  RTE_FLOW_ERROR_TYPE_ITEM,
+					  item,
+					  "mask/last without a spec is not"
+					  " supported");
+	if (item->spec && item->last) {
+		uint8_t spec[size];
+		uint8_t last[size];
+		unsigned int i;
+		int ret;
+
+		for (i = 0; i < size; ++i) {
+			spec[i] = ((const uint8_t *)item->spec)[i] & mask[i];
+			last[i] = ((const uint8_t *)item->last)[i] & mask[i];
+		}
+		ret = memcmp(spec, last, size);
+		if (ret != 0)
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ITEM,
+						  item,
+						  "spec and last different"
+						  " is not supported");
+	}
+	return 0;
+}
+
+/**
+ * Add a verbs specification.
+ *
+ * @param flow
+ *   Pointer to flow structure.
+ * @param src
+ *   Create specification.
+ * @param size
+ *   Size in bytes of the specification to copy.
+ */
+static void
+mlx5_flow_spec_verbs_add(struct rte_flow *flow, void *src, unsigned int size)
+{
+	if (flow->verbs.specs) {
+		void *dst;
+
+		dst = (void *)(flow->verbs.specs + flow->verbs.size);
+		memcpy(dst, src, size);
+		++flow->verbs.attr->num_of_specs;
+	}
+	flow->verbs.size += size;
+}
+
+/**
+ * Validate Ethernet layer and possibly create the Verbs specification.
+ *
+ * @param item[in]
+ *   Item specification.
+ * @param flow[in, out]
+ *   Pointer to flow structure.
+ * @param flow_size[in]
+ *   Size of the buffer to store the specification.
+ * @param error
+ *   Pointer to error structure.
+ *
+ * @return
+ *   size in bytes necessary for the conversion, a negative errno value
+ *   otherwise and rte_errno is set.
+ */
+static int
+mlx5_flow_item_eth(const struct rte_flow_item *item, struct rte_flow *flow,
+		   const size_t flow_size, struct rte_flow_error *error)
+{
+	const struct rte_flow_item_eth *spec = item->spec;
+	const struct rte_flow_item_eth *mask = item->mask;
+	const struct rte_flow_item_eth nic_mask = {
+		.dst.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+		.src.addr_bytes = "\xff\xff\xff\xff\xff\xff",
+		.type = RTE_BE16(0xffff),
+	};
+	const unsigned int size = sizeof(struct ibv_flow_spec_eth);
+	struct ibv_flow_spec_eth eth = {
+		.type = IBV_FLOW_SPEC_ETH,
+		.size = size,
+	};
+	int ret;
+
+	if (flow->layers & MLX5_FLOW_LAYER_OUTER_L2)
+		return rte_flow_error_set(error, ENOTSUP,
+					  RTE_FLOW_ERROR_TYPE_ITEM,
+					  item,
+					  "L2 layers already configured");
+	if (!mask)
+		mask = &rte_flow_item_eth_mask;
+	ret = mlx5_flow_item_validate(item, (const uint8_t *)mask,
+				      (const uint8_t *)&nic_mask,
+				      sizeof(struct rte_flow_item_eth),
+				      error);
+	if (ret)
+		return ret;
+	flow->layers |= MLX5_FLOW_LAYER_OUTER_L2;
+	if (size > flow_size)
+		return size;
+	if (spec) {
+		unsigned int i;
+
+		memcpy(&eth.val.dst_mac, spec->dst.addr_bytes, ETHER_ADDR_LEN);
+		memcpy(&eth.val.src_mac, spec->src.addr_bytes, ETHER_ADDR_LEN);
+		eth.val.ether_type = spec->type;
+		memcpy(&eth.mask.dst_mac, mask->dst.addr_bytes, ETHER_ADDR_LEN);
+		memcpy(&eth.mask.src_mac, mask->src.addr_bytes, ETHER_ADDR_LEN);
+		eth.mask.ether_type = mask->type;
+		/* Remove unwanted bits from values. */
+		for (i = 0; i < ETHER_ADDR_LEN; ++i) {
+			eth.val.dst_mac[i] &= eth.mask.dst_mac[i];
+			eth.val.src_mac[i] &= eth.mask.src_mac[i];
+		}
+		eth.val.ether_type &= eth.mask.ether_type;
+	}
+	mlx5_flow_spec_verbs_add(flow, &eth, size);
+	return size;
+}
+
+/**
+ * Validate items provided by the user.
+ *
+ * @param items
+ *   Pointer to flow items array.
+ * @param flow
+ *   Pointer to the rte_flow structure.
+ * @param flow_size
+ *   Size in bytes of the available space for to store the flow information.
+ * @param error
+ *   Pointer to error structure.
+ *
+ * @return
+ *   size in bytes necessary for the conversion, a negative errno value
+ *   otherwise and rte_errno is set.
+ */
+static int
+mlx5_flow_items(const struct rte_flow_item items[],
+		struct rte_flow *flow, const size_t flow_size,
+		struct rte_flow_error *error)
+{
+	int remain = flow_size;
+	size_t size = 0;
+
+	for (; items->type != RTE_FLOW_ITEM_TYPE_END; items++) {
+		int ret = 0;
+
+		switch (items->type) {
+		case RTE_FLOW_ITEM_TYPE_VOID:
+			break;
+		case RTE_FLOW_ITEM_TYPE_ETH:
+			ret = mlx5_flow_item_eth(items, flow, remain, error);
+			break;
+		default:
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ITEM,
+						  items,
+						  "item not supported");
+		}
+		if (ret < 0)
+			return ret;
+		if (remain > ret)
+			remain -= ret;
+		else
+			remain = 0;
+		size += ret;
+	}
+	if (!flow->layers) {
+		const struct rte_flow_item item = {
+			.type = RTE_FLOW_ITEM_TYPE_ETH,
+		};
+
+		return mlx5_flow_item_eth(&item, flow, flow_size, error);
+	}
+	return size;
+}
+
+/**
+ * Validate action drop provided by the user.
+ *
+ * @param actions
+ *   Pointer to flow actions array.
+ * @param flow
+ *   Pointer to the rte_flow structure.
+ * @param flow_size
+ *   Size in bytes of the available space for to store the flow information.
+ * @param error
+ *   Pointer to error structure.
+ *
+ * @return
+ *   size in bytes necessary for the conversion, a negative errno value
+ *   otherwise and rte_errno is set.
+ */
+static int
+mlx5_flow_action_drop(const struct rte_flow_action *actions,
+		      struct rte_flow *flow, const size_t flow_size,
+		      struct rte_flow_error *error)
+{
+	unsigned int size = sizeof(struct ibv_flow_spec_action_drop);
+	struct ibv_flow_spec_action_drop drop = {
+			.type = IBV_FLOW_SPEC_ACTION_DROP,
+			.size = size,
+	};
+
+	if (flow->fate)
+		return rte_flow_error_set(error, ENOTSUP,
+					  RTE_FLOW_ERROR_TYPE_ACTION,
+					  actions,
+					  "multiple fate actions are not"
+					  " supported");
+	if (size < flow_size)
+		mlx5_flow_spec_verbs_add(flow, &drop, size);
+	flow->fate |= MLX5_FLOW_FATE_DROP;
+	return size;
+}
+
+/**
+ * Validate actions provided by the user.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param actions
+ *   Pointer to flow actions array.
+ * @param flow
+ *   Pointer to the rte_flow structure.
+ * @param flow_size
+ *   Size in bytes of the available space for to store the flow information.
+ * @param error
+ *   Pointer to error structure.
+ *
+ * @return
+ *   size in bytes necessary for the conversion, a negative errno value
+ *   otherwise and rte_errno is set.
+ */
+static int
+mlx5_flow_actions(struct rte_eth_dev *dev __rte_unused,
+		  const struct rte_flow_action actions[],
+		  struct rte_flow *flow, const size_t flow_size,
+		  struct rte_flow_error *error)
+{
+	size_t size = 0;
+	int remain = flow_size;
+	int ret = 0;
+
+	for (; actions->type != RTE_FLOW_ACTION_TYPE_END; actions++) {
+		switch (actions->type) {
+		case RTE_FLOW_ACTION_TYPE_VOID:
+			break;
+		case RTE_FLOW_ACTION_TYPE_DROP:
+			ret = mlx5_flow_action_drop(actions, flow, remain,
+						    error);
+			break;
+		default:
+			return rte_flow_error_set(error, ENOTSUP,
+						  RTE_FLOW_ERROR_TYPE_ACTION,
+						  actions,
+						  "action not supported");
+		}
+		if (ret < 0)
+			return ret;
+		if (remain > ret)
+			remain -= ret;
+		else
+			remain = 0;
+		size += ret;
+	}
+	if (!flow->fate)
+		return rte_flow_error_set(error, ENOTSUP,
+					  RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+					  NULL,
+					  "no fate action found");
+	return size;
+}
+
+/**
+ * Validate the rule and fill the provided flow structure accordingly.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param flow
+ *   Pointer to a flow structure.
+ * @param flow_size
+ *   Size of the allocated space to store the flow information.
  * @param[in] attr
  *   Flow rule attributes.
  * @param[in] pattern
@@ -144,21 +589,39 @@  mlx5_flow_priorities(struct rte_eth_dev *dev)
  *   Perform verbose error reporting if not NULL.
  *
  * @return
- *   A flow on success, NULL otherwise and rte_errno is set.
+ *   The number of bytes necessary to create the flow on success, a negative
+ *   errno value otherwise and rte_errno is set.
  */
-static struct rte_flow *
-mlx5_flow_list_create(struct rte_eth_dev *dev __rte_unused,
-		      struct mlx5_flows *list __rte_unused,
-		      const struct rte_flow_attr *attr __rte_unused,
-		      const struct rte_flow_item items[] __rte_unused,
-		      const struct rte_flow_action actions[] __rte_unused,
-		      struct rte_flow_error *error)
+static int
+mlx5_flow_merge(struct rte_eth_dev *dev, struct rte_flow *flow,
+		const size_t flow_size,
+		const struct rte_flow_attr *attr,
+		const struct rte_flow_item items[],
+		const struct rte_flow_action actions[],
+		struct rte_flow_error *error)
 {
-	rte_flow_error_set(error, ENOTSUP,
-			   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
-			   NULL,
-			   "action not supported");
-	return NULL;
+	struct rte_flow local_flow = { .layers = 0, };
+	size_t size = sizeof(*flow) + sizeof(struct ibv_flow_attr);
+	int remain = (flow_size > size) ? flow_size - size : 0;
+	int ret;
+
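+	/*
+	 * When the buffer cannot even hold the flow and its Verbs attribute,
+	 * convert into a throw-away flow on the stack, the returned size
+	 * still tells the caller how much memory is required.
+	 */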
+	if (!remain)
+		flow = &local_flow;
+	ret = mlx5_flow_attributes(dev, attr, flow, error);
+	if (ret < 0)
+		return ret;
+	ret = mlx5_flow_items(items, flow, remain, error);
+	if (ret < 0)
+		return ret;
+	size += ret;
+	remain = (flow_size > size) ? flow_size - size : 0;
+	ret = mlx5_flow_actions(dev, actions, flow, remain, error);
+	if (ret < 0)
+		return ret;
+	size += ret;
+	if (size <= flow_size)
+		flow->verbs.attr->priority = flow->attributes.priority;
+	return size;
 }
 
 /**
@@ -168,16 +631,142 @@  mlx5_flow_list_create(struct rte_eth_dev *dev __rte_unused,
  * @see rte_flow_ops
  */
 int
-mlx5_flow_validate(struct rte_eth_dev *dev __rte_unused,
-		   const struct rte_flow_attr *attr __rte_unused,
-		   const struct rte_flow_item items[] __rte_unused,
-		   const struct rte_flow_action actions[] __rte_unused,
+mlx5_flow_validate(struct rte_eth_dev *dev,
+		   const struct rte_flow_attr *attr,
+		   const struct rte_flow_item items[],
+		   const struct rte_flow_action actions[],
 		   struct rte_flow_error *error)
 {
-	return rte_flow_error_set(error, ENOTSUP,
-				  RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
-				  NULL,
-				  "action not supported");
+	int ret = mlx5_flow_merge(dev, NULL, 0, attr, items, actions, error);
+
+	if (ret < 0)
+		return ret;
+	return 0;
+}
+
+/**
+ * Remove the applied flow resources (Verbs flow handle and drop queue).
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param flow
+ *   Pointer to flow structure.
+ */
+static void
+mlx5_flow_fate_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
+{
+	if (flow->fate & MLX5_FLOW_FATE_DROP) {
+		if (flow->verbs.flow) {
+			claim_zero(mlx5_glue->destroy_flow(flow->verbs.flow));
+			flow->verbs.flow = NULL;
+		}
+	}
+	if (flow->verbs.hrxq) {
+		mlx5_hrxq_drop_release(dev, flow->verbs.hrxq);
+		flow->verbs.hrxq = NULL;
+	}
+}
+
+/**
+ * Apply the flow.
+ *
+ * @param dev
+ *   Pointer to Ethernet device structure.
+ * @param flow
+ *   Pointer to flow structure.
+ * @param error
+ *   Pointer to error structure.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+static int
+mlx5_flow_fate_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
+		     struct rte_flow_error *error)
+{
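+	/* Drop flows are attached to a dedicated drop hash Rx queue. */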
+	flow->verbs.hrxq = mlx5_hrxq_drop_new(dev);
+	if (!flow->verbs.hrxq)
+		return rte_flow_error_set
+			(error, errno,
+			 RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+			 NULL,
+			 "cannot allocate Drop queue");
+	flow->verbs.flow =
+		mlx5_glue->create_flow(flow->verbs.hrxq->qp, flow->verbs.attr);
+	if (!flow->verbs.flow) {
+		mlx5_hrxq_drop_release(dev, flow->verbs.hrxq);
+		flow->verbs.hrxq = NULL;
+		return rte_flow_error_set(error, errno,
+					  RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+					  NULL,
+					  "kernel module refuses to create"
+					  " flow");
+	}
+	return 0;
+}
+
+/**
+ * Create a flow.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param list
+ *   Pointer to a TAILQ flow list.
+ * @param[in] attr
+ *   Flow rule attributes.
+ * @param[in] items
+ *   Pattern specification (list terminated by the END pattern item).
+ * @param[in] actions
+ *   Associated actions (list terminated by the END action).
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   A flow on success, NULL otherwise and rte_errno is set.
+ */
+static struct rte_flow *
+mlx5_flow_list_create(struct rte_eth_dev *dev,
+		      struct mlx5_flows *list,
+		      const struct rte_flow_attr *attr,
+		      const struct rte_flow_item items[],
+		      const struct rte_flow_action actions[],
+		      struct rte_flow_error *error)
+{
+	struct rte_flow *flow;
+	size_t size;
+	int ret;
+
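+	/* A first conversion pass only computes the required amount of memory. */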
+	ret = mlx5_flow_merge(dev, NULL, 0, attr, items, actions, error);
+	if (ret < 0)
+		return NULL;
+	size = ret;
+	flow = rte_zmalloc(__func__, size, 0);
+	if (!flow) {
+		rte_flow_error_set(error, ENOMEM,
+				   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
+				   NULL,
+				   "cannot allocate memory");
+		return NULL;
+	}
+	flow->verbs.attr = (struct ibv_flow_attr *)(flow + 1);
+	flow->verbs.specs = (uint8_t *)(flow->verbs.attr + 1);
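+	/* A second pass fills the freshly allocated flow. */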
+	ret = mlx5_flow_merge(dev, flow, size, attr, items, actions, error);
+	if (ret < 0)
+		goto error;
+	assert((size_t)ret == size);
+	if (dev->data->dev_started) {
+		ret = mlx5_flow_fate_apply(dev, flow, error);
+		if (ret < 0)
+			goto error;
+	}
+	TAILQ_INSERT_TAIL(list, flow, next);
+	return flow;
+error:
+	ret = rte_errno; /* Save rte_errno before cleanup. */
+	mlx5_flow_fate_remove(dev, flow);
+	rte_free(flow);
+	rte_errno = ret; /* Restore rte_errno. */
+	return NULL;
 }
 
 /**
@@ -187,17 +776,15 @@  mlx5_flow_validate(struct rte_eth_dev *dev __rte_unused,
  * @see rte_flow_ops
  */
 struct rte_flow *
-mlx5_flow_create(struct rte_eth_dev *dev __rte_unused,
-		 const struct rte_flow_attr *attr __rte_unused,
-		 const struct rte_flow_item items[] __rte_unused,
-		 const struct rte_flow_action actions[] __rte_unused,
+mlx5_flow_create(struct rte_eth_dev *dev,
+		 const struct rte_flow_attr *attr,
+		 const struct rte_flow_item items[],
+		 const struct rte_flow_action actions[],
 		 struct rte_flow_error *error)
 {
-	rte_flow_error_set(error, ENOTSUP,
-			   RTE_FLOW_ERROR_TYPE_UNSPECIFIED,
-			   NULL,
-			   "action not supported");
-	return NULL;
+	return mlx5_flow_list_create
+		(dev, &((struct priv *)dev->data->dev_private)->flows,
+		 attr, items, actions, error);
 }
 
 /**
@@ -211,10 +798,12 @@  mlx5_flow_create(struct rte_eth_dev *dev __rte_unused,
  *   Flow to destroy.
  */
 static void
-mlx5_flow_list_destroy(struct rte_eth_dev *dev __rte_unused,
-		       struct mlx5_flows *list __rte_unused,
-		       struct rte_flow *flow __rte_unused)
+mlx5_flow_list_destroy(struct rte_eth_dev *dev, struct mlx5_flows *list,
+		       struct rte_flow *flow)
 {
+	mlx5_flow_fate_remove(dev, flow);
+	TAILQ_REMOVE(list, flow, next);
+	rte_free(flow);
 }
 
 /**
@@ -422,9 +1011,8 @@  int
 mlx5_flow_flush(struct rte_eth_dev *dev,
 		struct rte_flow_error *error __rte_unused)
 {
-	struct priv *priv = dev->data->dev_private;
-
-	mlx5_flow_list_flush(dev, &priv->flows);
+	mlx5_flow_list_flush(dev,
+			     &((struct priv *)dev->data->dev_private)->flows);
 	return 0;
 }
 
@@ -737,9 +1325,8 @@  mlx5_fdir_filter_update(struct rte_eth_dev *dev,
 static void
 mlx5_fdir_filter_flush(struct rte_eth_dev *dev)
 {
-	struct priv *priv = dev->data->dev_private;
-
-	mlx5_flow_list_flush(dev, &priv->flows);
+	mlx5_flow_list_flush(dev,
+			     &((struct priv *)dev->data->dev_private)->flows);
 }
 
 /**