[v2,2/6] net/mlx5: add framework for switch flow rules

Message ID 20180713092910.26276-3-adrien.mazarguil@6wind.com (mailing list archive)
State Accepted, archived
Delegated to: Shahaf Shuler
Headers
Series net/mlx5: add support for switch flow rules |

Checks

Context Check Description
ci/checkpatch warning coding style issues
ci/Intel-compilation fail apply issues

Commit Message

Adrien Mazarguil July 13, 2018, 9:40 a.m. UTC
  Because mlx5 switch flow rules are configured through Netlink (TC
interface) and have little in common with Verbs, this patch adds a separate
parser function to handle them.

- mlx5_nl_flow_transpose() converts a rte_flow rule to its TC equivalent
  and stores the result in a buffer.

- mlx5_nl_flow_brand() gives a unique handle to a flow rule buffer.

- mlx5_nl_flow_create() instantiates a flow rule on the device based on
  such a buffer.

- mlx5_nl_flow_destroy() performs the reverse operation.

These functions are called by the existing implementation when encountering
flow rules which must be offloaded to the switch (currently relying on the
transfer attribute).

Signed-off-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
Signed-off-by: Nelio Laranjeiro <nelio.laranjeiro@6wind.com>
Acked-by: Yongseok Koh <yskoh@mellanox.com>
--
v2 changes:

- Replaced mlx5_domain_to_port_id() with mlx5_dev_to_port_id().
- Added definitions for NETLINK_CAP_ACK, TC_H_MIN_INGRESS,
  TCA_CLS_FLAGS_SKIP_SW, TCA_FLOWER_ACT and TCA_FLOWER_FLAGS in case they
  are missing from the host system (e.g. RHEL 7.2).
- Modified the size of buf_tmp[] in mlx5_nl_flow_transpose() as
  MNL_SOCKET_BUFFER_SIZE was insane. 1 kiB of message payload is plenty
  enough for the time being.
---
 drivers/net/mlx5/Makefile       |  10 ++
 drivers/net/mlx5/mlx5.h         |  18 ++
 drivers/net/mlx5/mlx5_flow.c    | 111 +++++++++++++
 drivers/net/mlx5/mlx5_nl_flow.c | 311 +++++++++++++++++++++++++++++++++++
 4 files changed, 450 insertions(+)
  

Patch

diff --git a/drivers/net/mlx5/Makefile b/drivers/net/mlx5/Makefile
index 8d3cb219b..1ccfbb594 100644
--- a/drivers/net/mlx5/Makefile
+++ b/drivers/net/mlx5/Makefile
@@ -199,6 +199,16 @@  mlx5_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
 		linux/if_link.h \
 		enum IFLA_PHYS_PORT_NAME \
 		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_ACT \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_ACT \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TCA_FLOWER_FLAGS \
+		linux/pkt_cls.h \
+		enum TCA_FLOWER_FLAGS \
+		$(AUTOCONF_OUTPUT)
 
 # Create mlx5_autoconf.h or update it in case it differs from the new one.
 
diff --git a/drivers/net/mlx5/mlx5.h b/drivers/net/mlx5/mlx5.h
index 98b6ec07d..5bad1b32b 100644
--- a/drivers/net/mlx5/mlx5.h
+++ b/drivers/net/mlx5/mlx5.h
@@ -156,6 +156,12 @@  struct mlx5_drop {
 	struct mlx5_rxq_ibv *rxq; /* Verbs Rx queue. */
 };
 
+/** DPDK port to network interface index (ifindex) conversion. */
+struct mlx5_nl_flow_ptoi {
+	uint16_t port_id; /**< DPDK port ID. */
+	unsigned int ifindex; /**< Network interface index. */
+};
+
 struct mnl_socket;
 
 struct priv {
@@ -385,6 +391,18 @@  int mlx5_nl_switch_info(int nl, unsigned int ifindex,
 
 /* mlx5_nl_flow.c */
 
+int mlx5_nl_flow_transpose(void *buf,
+			   size_t size,
+			   const struct mlx5_nl_flow_ptoi *ptoi,
+			   const struct rte_flow_attr *attr,
+			   const struct rte_flow_item *pattern,
+			   const struct rte_flow_action *actions,
+			   struct rte_flow_error *error);
+void mlx5_nl_flow_brand(void *buf, uint32_t handle);
+int mlx5_nl_flow_create(struct mnl_socket *nl, void *buf,
+			struct rte_flow_error *error);
+int mlx5_nl_flow_destroy(struct mnl_socket *nl, void *buf,
+			 struct rte_flow_error *error);
 int mlx5_nl_flow_init(struct mnl_socket *nl, unsigned int ifindex,
 		      struct rte_flow_error *error);
 struct mnl_socket *mlx5_nl_flow_socket_create(void);
diff --git a/drivers/net/mlx5/mlx5_flow.c b/drivers/net/mlx5/mlx5_flow.c
index 89bfc670f..890bf7d72 100644
--- a/drivers/net/mlx5/mlx5_flow.c
+++ b/drivers/net/mlx5/mlx5_flow.c
@@ -4,6 +4,7 @@ 
  */
 
 #include <sys/queue.h>
+#include <stdalign.h>
 #include <stdint.h>
 #include <string.h>
 
@@ -280,6 +281,7 @@  struct rte_flow {
 	struct rte_flow_action_rss rss;/**< RSS context. */
 	uint8_t key[MLX5_RSS_HASH_KEY_LEN]; /**< RSS hash key. */
 	uint16_t (*queue)[]; /**< Destination queues to redirect traffic to. */
+	void *nl_flow; /**< Netlink flow buffer if relevant. */
 };
 
 static const struct rte_flow_ops mlx5_flow_ops = {
@@ -2365,6 +2367,103 @@  mlx5_flow_actions(struct rte_eth_dev *dev,
 }
 
 /**
+ * Validate switch flow rule and fill flow structure accordingly.
+ *
+ * @param dev
+ *   Pointer to Ethernet device.
+ * @param[out] flow
+ *   Pointer to flow structure, followed by the Netlink message buffer.
+ * @param flow_size
+ *   Size of allocated space for @p flow.
+ * @param[in] attr
+ *   Flow rule attributes.
+ * @param[in] pattern
+ *   Pattern specification (list terminated by the END pattern item).
+ * @param[in] actions
+ *   Associated actions (list terminated by the END action).
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   A positive value representing the size of the flow object in bytes
+ *   regardless of @p flow_size on success, a negative errno value otherwise
+ *   and rte_errno is set.
+ */
+static int
+mlx5_flow_merge_switch(struct rte_eth_dev *dev,
+		       struct rte_flow *flow,
+		       size_t flow_size,
+		       const struct rte_flow_attr *attr,
+		       const struct rte_flow_item pattern[],
+		       const struct rte_flow_action actions[],
+		       struct rte_flow_error *error)
+{
+	unsigned int n = mlx5_dev_to_port_id(dev->device, NULL, 0);
+	uint16_t port_id[!n + n];
+	struct mlx5_nl_flow_ptoi ptoi[!n + n + 1];
+	size_t off = RTE_ALIGN_CEIL(sizeof(*flow), alignof(max_align_t));
+	unsigned int i;
+	unsigned int own = 0;
+	int ret;
+
+	/* At least one port is needed when no switch domain is present. */
+	if (!n) {
+		n = 1;
+		port_id[0] = dev->data->port_id;
+	} else {
+		n = RTE_MIN(mlx5_dev_to_port_id(dev->device, port_id, n), n);
+	}
+	for (i = 0; i != n; ++i) {
+		struct rte_eth_dev_info dev_info;
+
+		rte_eth_dev_info_get(port_id[i], &dev_info);
+		if (port_id[i] == dev->data->port_id)
+			own = i;
+		ptoi[i].port_id = port_id[i];
+		ptoi[i].ifindex = dev_info.if_index;
+	}
+	/* Swap entries so ptoi[0] is the current device, ptoi[n] is scratch. */
+	if (own) {
+		ptoi[n] = ptoi[0];
+		ptoi[0] = ptoi[own];
+		ptoi[own] = ptoi[n];
+	}
+	/* An entry with zero ifindex terminates ptoi[]. */
+	ptoi[n].port_id = 0;
+	ptoi[n].ifindex = 0;
+	if (flow_size < off)
+		flow_size = 0;
+	ret = mlx5_nl_flow_transpose((uint8_t *)flow + off,
+				     flow_size ? flow_size - off : 0,
+				     ptoi, attr, pattern, actions, error);
+	if (ret < 0)
+		return ret;
+	if (flow_size) {
+		*flow = (struct rte_flow){
+			.attributes = *attr,
+			.nl_flow = (uint8_t *)flow + off,
+		};
+		/*
+		 * Generate a reasonably unique handle based on the address
+		 * of the target buffer.
+		 *
+		 * This is straightforward on 32-bit systems where the flow
+		 * pointer can be used directly. Otherwise, its least
+		 * significant part is taken after shifting it by the
+		 * previous power of two of the pointed buffer size.
+		 */
+		if (sizeof(flow) <= 4)
+			mlx5_nl_flow_brand(flow->nl_flow, (uintptr_t)flow);
+		else
+			mlx5_nl_flow_brand
+				(flow->nl_flow,
+				 (uintptr_t)flow >>
+				 rte_log2_u32(rte_align32prevpow2(flow_size)));
+	}
+	return off + ret;
+}
+
+/**
  * Convert the @p attributes, @p pattern, @p action, into an flow for the NIC
  * after ensuring the NIC will understand and process it correctly.
  * The conversion is only performed item/action per item/action, each of
@@ -2418,6 +2517,10 @@  mlx5_flow_merge(struct rte_eth_dev *dev, struct rte_flow *flow,
 	int ret;
 	uint32_t i;
 
+	if (attributes->transfer)
+		return mlx5_flow_merge_switch(dev, flow, flow_size,
+					      attributes, pattern,
+					      actions, error);
 	if (size > flow_size)
 		flow = &local_flow;
 	ret = mlx5_flow_attributes(dev, attributes, flow, error);
@@ -2708,8 +2811,11 @@  mlx5_flow_validate(struct rte_eth_dev *dev,
 static void
 mlx5_flow_remove(struct rte_eth_dev *dev, struct rte_flow *flow)
 {
+	struct priv *priv = dev->data->dev_private;
 	struct mlx5_flow_verbs *verbs;
 
+	if (flow->nl_flow && priv->mnl_socket)
+		mlx5_nl_flow_destroy(priv->mnl_socket, flow->nl_flow, NULL);
 	LIST_FOREACH(verbs, &flow->verbs, next) {
 		if (verbs->flow) {
 			claim_zero(mlx5_glue->destroy_flow(verbs->flow));
@@ -2746,6 +2852,7 @@  static int
 mlx5_flow_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
 		struct rte_flow_error *error)
 {
+	struct priv *priv = dev->data->dev_private;
 	struct mlx5_flow_verbs *verbs;
 	int err;
 
@@ -2794,6 +2901,10 @@  mlx5_flow_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
 			goto error;
 		}
 	}
+	if (flow->nl_flow &&
+	    priv->mnl_socket &&
+	    mlx5_nl_flow_create(priv->mnl_socket, flow->nl_flow, error))
+		goto error;
 	return 0;
 error:
 	err = rte_errno; /* Save rte_errno before cleanup. */
diff --git a/drivers/net/mlx5/mlx5_nl_flow.c b/drivers/net/mlx5/mlx5_nl_flow.c
index 60a4493e5..a9a5bac49 100644
--- a/drivers/net/mlx5/mlx5_nl_flow.c
+++ b/drivers/net/mlx5/mlx5_nl_flow.c
@@ -5,7 +5,9 @@ 
 
 #include <errno.h>
 #include <libmnl/libmnl.h>
+#include <linux/if_ether.h>
 #include <linux/netlink.h>
+#include <linux/pkt_cls.h>
 #include <linux/pkt_sched.h>
 #include <linux/rtnetlink.h>
 #include <stdalign.h>
@@ -14,6 +16,7 @@ 
 #include <stdlib.h>
 #include <sys/socket.h>
 
+#include <rte_byteorder.h>
 #include <rte_errno.h>
 #include <rte_flow.h>
 
@@ -24,6 +27,258 @@ 
 #define NETLINK_CAP_ACK 10
 #endif
 
+/* Normally found in linux/pkt_sched.h. */
+#ifndef TC_H_MIN_INGRESS
+#define TC_H_MIN_INGRESS 0xfff2u
+#endif
+
+/* Normally found in linux/pkt_cls.h. */
+#ifndef TCA_CLS_FLAGS_SKIP_SW
+#define TCA_CLS_FLAGS_SKIP_SW (1 << 1)
+#endif
+#ifndef HAVE_TCA_FLOWER_ACT
+#define TCA_FLOWER_ACT 3
+#endif
+#ifndef HAVE_TCA_FLOWER_FLAGS
+#define TCA_FLOWER_FLAGS 22
+#endif
+
+/** Parser state definitions for mlx5_nl_flow_trans[]. */
+enum mlx5_nl_flow_trans {
+	INVALID, /**< Terminates a TRANS() list, no candidate state matched. */
+	BACK, /**< Return to the previous list of candidate states. */
+	ATTR, /**< Check flow rule attributes, start Netlink message. */
+	PATTERN, /**< Start "flower" filter options. */
+	ITEM_VOID, /**< Skip a VOID pattern item. */
+	ACTIONS, /**< End of pattern reached, start nested action list. */
+	ACTION_VOID, /**< Skip a VOID action. */
+	END, /**< End of pattern and actions, finalize message. */
+};
+
+#define TRANS(...) (const enum mlx5_nl_flow_trans []){ __VA_ARGS__, INVALID, }
+
+#define PATTERN_COMMON \
+	ITEM_VOID, ACTIONS
+#define ACTIONS_COMMON \
+	ACTION_VOID, END
+
+/** Parser state transitions used by mlx5_nl_flow_transpose(). */
+static const enum mlx5_nl_flow_trans *const mlx5_nl_flow_trans[] = {
+	[INVALID] = NULL,
+	[BACK] = NULL,
+	[ATTR] = TRANS(PATTERN),
+	[PATTERN] = TRANS(PATTERN_COMMON),
+	[ITEM_VOID] = TRANS(BACK),
+	[ACTIONS] = TRANS(ACTIONS_COMMON),
+	[ACTION_VOID] = TRANS(BACK),
+	[END] = NULL,
+};
+
+/**
+ * Transpose flow rule description to rtnetlink message.
+ *
+ * This function transposes a flow rule description to a traffic control
+ * (TC) filter creation message ready to be sent over Netlink.
+ *
+ * Target interface is specified as the first entry of the @p ptoi table.
+ * Subsequent entries enable this function to resolve other DPDK port IDs
+ * found in the flow rule.
+ *
+ * @param[out] buf
+ *   Output message buffer. May be NULL when @p size is 0.
+ * @param size
+ *   Size of @p buf. Message may be truncated if not large enough.
+ * @param[in] ptoi
+ *   DPDK port ID to network interface index translation table. This table
+ *   is terminated by an entry with a zero ifindex value.
+ * @param[in] attr
+ *   Flow rule attributes.
+ * @param[in] pattern
+ *   Pattern specification.
+ * @param[in] actions
+ *   Associated actions.
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   A positive value representing the exact size of the message in bytes
+ *   regardless of the @p size parameter on success, a negative errno value
+ *   otherwise and rte_errno is set.
+ */
+int
+mlx5_nl_flow_transpose(void *buf,
+		       size_t size,
+		       const struct mlx5_nl_flow_ptoi *ptoi,
+		       const struct rte_flow_attr *attr,
+		       const struct rte_flow_item *pattern,
+		       const struct rte_flow_action *actions,
+		       struct rte_flow_error *error)
+{
+	alignas(struct nlmsghdr)
+	uint8_t buf_tmp[mnl_nlmsg_size(sizeof(struct tcmsg) + 1024)];
+	const struct rte_flow_item *item;
+	const struct rte_flow_action *action;
+	unsigned int n;
+	struct nlattr *na_flower;
+	struct nlattr *na_flower_act;
+	const enum mlx5_nl_flow_trans *trans;
+	const enum mlx5_nl_flow_trans *back;
+
+	if (!size)
+		goto error_nobufs;
+init:
+	item = pattern;
+	action = actions;
+	n = 0;
+	na_flower = NULL;
+	na_flower_act = NULL;
+	trans = TRANS(ATTR);
+	back = trans;
+trans:
+	switch (trans[n++]) {
+		struct nlmsghdr *nlh;
+		struct tcmsg *tcm;
+
+	case INVALID:
+		if (item->type)
+			return rte_flow_error_set
+				(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ITEM,
+				 item, "unsupported pattern item combination");
+		else if (action->type)
+			return rte_flow_error_set
+				(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_ACTION,
+				 action, "unsupported action combination");
+		return rte_flow_error_set
+			(error, ENOTSUP, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+			 "flow rule lacks some kind of fate action");
+	case BACK:
+		trans = back;
+		n = 0;
+		goto trans;
+	case ATTR:
+		/*
+		 * Supported attributes: no groups, some priorities and
+		 * ingress only. Don't care about transfer as it is the
+		 * caller's problem.
+		 */
+		if (attr->group)
+			return rte_flow_error_set
+				(error, ENOTSUP,
+				 RTE_FLOW_ERROR_TYPE_ATTR_GROUP,
+				 attr, "groups are not supported");
+		if (attr->priority > 0xfffe)
+			return rte_flow_error_set
+				(error, ENOTSUP,
+				 RTE_FLOW_ERROR_TYPE_ATTR_PRIORITY,
+				 attr, "lowest priority level is 0xfffe");
+		if (!attr->ingress)
+			return rte_flow_error_set
+				(error, ENOTSUP,
+				 RTE_FLOW_ERROR_TYPE_ATTR_INGRESS,
+				 attr, "only ingress is supported");
+		if (attr->egress)
+			return rte_flow_error_set
+				(error, ENOTSUP,
+				 RTE_FLOW_ERROR_TYPE_ATTR_EGRESS,
+				 attr, "egress is not supported");
+		if (size < mnl_nlmsg_size(sizeof(*tcm)))
+			goto error_nobufs;
+		nlh = mnl_nlmsg_put_header(buf);
+		nlh->nlmsg_type = 0;
+		nlh->nlmsg_flags = 0;
+		nlh->nlmsg_seq = 0;
+		tcm = mnl_nlmsg_put_extra_header(nlh, sizeof(*tcm));
+		tcm->tcm_family = AF_UNSPEC;
+		tcm->tcm_ifindex = ptoi[0].ifindex;
+		/*
+		 * Let kernel pick a handle by default. A predictable handle
+		 * can be set by the caller on the resulting buffer through
+		 * mlx5_nl_flow_brand().
+		 */
+		tcm->tcm_handle = 0;
+		tcm->tcm_parent = TC_H_MAKE(TC_H_INGRESS, TC_H_MIN_INGRESS);
+		/*
+		 * Priority cannot be zero to prevent the kernel from
+		 * picking one automatically.
+		 */
+		tcm->tcm_info = TC_H_MAKE((attr->priority + 1) << 16,
+					  RTE_BE16(ETH_P_ALL));
+		break;
+	case PATTERN:
+		if (!mnl_attr_put_strz_check(buf, size, TCA_KIND, "flower"))
+			goto error_nobufs;
+		na_flower = mnl_attr_nest_start_check(buf, size, TCA_OPTIONS);
+		if (!na_flower)
+			goto error_nobufs;
+		if (!mnl_attr_put_u32_check(buf, size, TCA_FLOWER_FLAGS,
+					    TCA_CLS_FLAGS_SKIP_SW))
+			goto error_nobufs;
+		break;
+	case ITEM_VOID:
+		if (item->type != RTE_FLOW_ITEM_TYPE_VOID)
+			goto trans;
+		++item;
+		break;
+	case ACTIONS:
+		if (item->type != RTE_FLOW_ITEM_TYPE_END)
+			goto trans;
+		assert(na_flower);
+		assert(!na_flower_act);
+		na_flower_act =
+			mnl_attr_nest_start_check(buf, size, TCA_FLOWER_ACT);
+		if (!na_flower_act)
+			goto error_nobufs;
+		break;
+	case ACTION_VOID:
+		if (action->type != RTE_FLOW_ACTION_TYPE_VOID)
+			goto trans;
+		++action;
+		break;
+	case END:
+		if (item->type != RTE_FLOW_ITEM_TYPE_END ||
+		    action->type != RTE_FLOW_ACTION_TYPE_END)
+			goto trans;
+		if (na_flower_act)
+			mnl_attr_nest_end(buf, na_flower_act);
+		if (na_flower)
+			mnl_attr_nest_end(buf, na_flower);
+		nlh = buf;
+		return nlh->nlmsg_len;
+	}
+	back = trans;
+	trans = mlx5_nl_flow_trans[trans[n - 1]];
+	n = 0;
+	goto trans;
+error_nobufs:
+	if (buf != buf_tmp) {
+		buf = buf_tmp;
+		size = sizeof(buf_tmp);
+		goto init;
+	}
+	return rte_flow_error_set
+		(error, ENOBUFS, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+		 "generated TC message is too large");
+}
+
+/**
+ * Brand rtnetlink buffer with unique handle.
+ *
+ * This handle should be unique for a given network interface to avoid
+ * collisions.
+ *
+ * @param[in, out] buf
+ *   Flow rule buffer previously initialized by mlx5_nl_flow_transpose().
+ * @param handle
+ *   Unique 32-bit handle to store in the tcm_handle field of @p buf.
+ */
+void
+mlx5_nl_flow_brand(void *buf, uint32_t handle)
+{
+	struct tcmsg *tcm = mnl_nlmsg_get_payload(buf);
+
+	tcm->tcm_handle = handle;
+}
+
 /**
  * Send Netlink message with acknowledgment.
  *
@@ -60,6 +315,62 @@  mlx5_nl_flow_nl_ack(struct mnl_socket *nl, struct nlmsghdr *nlh)
 }
 
 /**
+ * Create a Netlink flow rule by sending @p buf as a RTM_NEWTFILTER request.
+ *
+ * @param nl
+ *   Libmnl socket to use.
+ * @param[in, out] buf
+ *   Flow rule buffer previously initialized by mlx5_nl_flow_transpose().
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_nl_flow_create(struct mnl_socket *nl, void *buf,
+		    struct rte_flow_error *error)
+{
+	struct nlmsghdr *nlh = buf;
+
+	nlh->nlmsg_type = RTM_NEWTFILTER;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL;
+	if (!mlx5_nl_flow_nl_ack(nl, nlh))
+		return 0;
+	return rte_flow_error_set
+		(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+		 "netlink: failed to create TC flow rule");
+}
+
+/**
+ * Destroy a Netlink flow rule by sending @p buf as a RTM_DELTFILTER request.
+ *
+ * @param nl
+ *   Libmnl socket to use.
+ * @param[in, out] buf
+ *   Flow rule buffer previously initialized by mlx5_nl_flow_transpose().
+ * @param[out] error
+ *   Perform verbose error reporting if not NULL.
+ *
+ * @return
+ *   0 on success, a negative errno value otherwise and rte_errno is set.
+ */
+int
+mlx5_nl_flow_destroy(struct mnl_socket *nl, void *buf,
+		     struct rte_flow_error *error)
+{
+	struct nlmsghdr *nlh = buf;
+
+	nlh->nlmsg_type = RTM_DELTFILTER;
+	nlh->nlmsg_flags = NLM_F_REQUEST;
+	if (!mlx5_nl_flow_nl_ack(nl, nlh))
+		return 0;
+	return rte_flow_error_set
+		(error, rte_errno, RTE_FLOW_ERROR_TYPE_UNSPECIFIED, NULL,
+		 "netlink: failed to destroy TC flow rule");
+}
+
+/**
  * Initialize ingress qdisc of a given network interface.
  *
  * @param nl