[dpdk-dev] [PATCH v2 2/2] net/tap: implement RSS with eBPF classifier and action

Ophir Munk ophirmu at mellanox.com
Thu Dec 28 11:09:05 CET 2017


Add BPF classifier for each TAP queue. According to this classifier packets
marked with an RSS queue will be directed to this queue using a traffic
control with "skbedit" action.
The BPF classifier program is downloaded to the kernel once per TAP Rx
queue.

Add RSS BPF map entry for each new RSS rule. A BPF map entry contains the
RSS queues of that rule.

Add BPF action for each new RSS rule in which the Toeplitz RSS hash is
calculated based on L3 addresses and L4 ports. Mark the packet with the RSS
queue according the resulting RSS hash, then reclassify the packet.

Signed-off-by: Ophir Munk <ophirmu at mellanox.com>
---
 drivers/net/tap/Makefile        |  20 ++
 drivers/net/tap/rte_eth_tap.h   |   9 +-
 drivers/net/tap/tap_bpf_insns.c |  60 ++++
 drivers/net/tap/tap_flow.c      | 625 ++++++++++++++++++++++++++++++++++------
 drivers/net/tap/tap_flow.h      |  10 +
 drivers/net/tap/tap_tcmsgs.h    |   4 +
 6 files changed, 632 insertions(+), 96 deletions(-)

diff --git a/drivers/net/tap/Makefile b/drivers/net/tap/Makefile
index d7439b0..ffd8061 100644
--- a/drivers/net/tap/Makefile
+++ b/drivers/net/tap/Makefile
@@ -87,6 +87,26 @@ tap_autoconf.h.new: $(RTE_SDK)/buildtools/auto-config-h.sh
 		enum TCA_FLOWER_KEY_VLAN_PRIO \
 		$(AUTOCONF_OUTPUT)
 	$Q sh -- '$<' '$@' \
+		HAVE_TC_BPF \
+		linux/pkt_cls.h \
+		enum TCA_BPF_UNSPEC \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TC_BPF_FD \
+		linux/pkt_cls.h \
+		enum TCA_BPF_FD \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TC_ACT_BPF \
+		linux/tc_act/tc_bpf.h \
+		enum TCA_ACT_BPF_UNSPEC \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
+		HAVE_TC_ACT_BPF_FD \
+		linux/tc_act/tc_bpf.h \
+		enum TCA_ACT_BPF_FD \
+		$(AUTOCONF_OUTPUT)
+	$Q sh -- '$<' '$@' \
 		HAVE_BPF_PROG_LOAD \
 		linux/bpf.h \
 		enum BPF_PROG_LOAD \
diff --git a/drivers/net/tap/rte_eth_tap.h b/drivers/net/tap/rte_eth_tap.h
index 829f32f..c185473 100644
--- a/drivers/net/tap/rte_eth_tap.h
+++ b/drivers/net/tap/rte_eth_tap.h
@@ -45,7 +45,7 @@
 #include <rte_ether.h>
 
 #ifdef IFF_MULTI_QUEUE
-#define RTE_PMD_TAP_MAX_QUEUES	16
+#define RTE_PMD_TAP_MAX_QUEUES	TAP_MAX_QUEUES
 #else
 #define RTE_PMD_TAP_MAX_QUEUES	1
 #endif
@@ -90,6 +90,13 @@ struct pmd_internals {
 	int ioctl_sock;                   /* socket for ioctl calls */
 	int nlsk_fd;                      /* Netlink socket fd */
 	int flow_isolate;                 /* 1 if flow isolation is enabled */
+	int flower_support;               /* 1 if kernel supports, else 0 */
+	int flower_vlan_support;          /* 1 if kernel supports, else 0 */
+	int rss_enabled;                  /* 1 if RSS is enabled, else 0 */
+	/* implicit rules set when RSS is enabled */
+	int map_fd;                       /* BPF RSS map fd */
+	int bpf_fd[RTE_PMD_TAP_MAX_QUEUES];/* List of bpf fds per queue */
+	LIST_HEAD(tap_rss_flows, rte_flow) rss_flows;
 	LIST_HEAD(tap_flows, rte_flow) flows;        /* rte_flow rules */
 	/* implicit rte_flow rules set when a remote device is active */
 	LIST_HEAD(tap_implicit_flows, rte_flow) implicit_flows;
diff --git a/drivers/net/tap/tap_bpf_insns.c b/drivers/net/tap/tap_bpf_insns.c
index 2db3fce..409d688 100644
--- a/drivers/net/tap/tap_bpf_insns.c
+++ b/drivers/net/tap/tap_bpf_insns.c
@@ -697,3 +697,63 @@ static int bpf_load(enum bpf_prog_type type,
 
 	return sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
 }
+
+/**
+ * Create BPF map for RSS rules
+ *
+ * @param[in] key_size
+ *   map RSS key size
+ *
+ * @param[in] value_size
+ *   Map RSS value size
+ *
+ * @param[in] max_entries
+ *   Map max number of RSS entries (limit on max RSS rules)
+ *
+ * @return
+ *   -1 if BPF map couldn't be created, map fd otherwise
+ */
+int tap_flow_bpf_rss_map_create(unsigned int key_size,
+		unsigned int value_size,
+		unsigned int max_entries)
+{
+	union bpf_attr attr;
+
+	bzero(&attr, sizeof(attr));
+	attr.map_type    = BPF_MAP_TYPE_HASH;
+	attr.key_size    = key_size;
+	attr.value_size  = value_size;
+	attr.max_entries = max_entries;
+
+	return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
+}
+
+/**
+ * Update RSS entry in BPF map
+ *
+ * @param[in] fd
+ *   RSS map fd
+ *
+ * @param[in] key
+ *   Pointer to RSS key whose entry is updated
+ *
+ * @param[in] value
+ *   Pointer to RSS new updated value
+ *
+ * @return
+ *   -1 if RSS entry failed to be updated, 0 otherwise
+ */
+int tap_flow_bpf_update_rss_elem(int fd, void *key, void *value)
+{
+	union bpf_attr attr;
+
+	bzero(&attr, sizeof(attr));
+
+	attr.map_type = BPF_MAP_TYPE_HASH;
+	attr.map_fd = fd;
+	attr.key = ptr_to_u64(key);
+	attr.value = ptr_to_u64(value);
+	attr.flags = BPF_ANY;
+
+	return sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
+}
diff --git a/drivers/net/tap/tap_flow.c b/drivers/net/tap/tap_flow.c
index ffc0b85..8345c86 100644
--- a/drivers/net/tap/tap_flow.c
+++ b/drivers/net/tap/tap_flow.c
@@ -33,6 +33,7 @@
 
 #include <errno.h>
 #include <string.h>
+#include <unistd.h>
 #include <sys/queue.h>
 
 #include <rte_byteorder.h>
@@ -42,6 +43,7 @@
 #include <tap_flow.h>
 #include <tap_autoconf.h>
 #include <tap_tcmsgs.h>
+#include <tap_rss.h>
 
 #ifndef HAVE_TC_FLOWER
 /*
@@ -81,12 +83,78 @@ enum {
 	TCA_FLOWER_KEY_VLAN_ETH_TYPE,   /* be16 */
 };
 #endif
+/*
+ * For kernels < 4.2 BPF related enums may not be defined.
+ * Runtime checks will be carried out to gracefully report on TC messages that
+ * are rejected by the kernel. Rejection reasons may be due to:
+ * 1. enum is not defined
+ * 2. enum is defined but kerenl is not configured to support BPF system calls,
+ *    BPF classifications or BPF actions.
+ */
+#ifndef HAVE_TC_BPF
+enum {
+	TCA_BPF_UNSPEC,
+	TCA_BPF_ACT,
+	TCA_BPF_POLICE,
+	TCA_BPF_CLASSID,
+	TCA_BPF_OPS_LEN,
+	TCA_BPF_OPS,
+};
+#endif
+#ifndef HAVE_TC_BPF_FD
+enum {
+	TCA_BPF_FD = TCA_BPF_OPS + 1,
+	TCA_BPF_NAME,
+};
+#endif
+#ifndef HAVE_TC_ACT_BPF
+#define tc_gen \
+	__u32                 index; \
+	__u32                 capab; \
+	int                   action; \
+	int                   refcnt; \
+	int                   bindcnt
+
+struct tc_act_bpf {
+	tc_gen;
+};
+
+enum {
+	TCA_ACT_BPF_UNSPEC,
+	TCA_ACT_BPF_TM,
+	TCA_ACT_BPF_PARMS,
+	TCA_ACT_BPF_OPS_LEN,
+	TCA_ACT_BPF_OPS,
+};
+
+#endif
+#ifndef HAVE_TC_ACT_BPF_FD
+enum {
+	TCA_ACT_BPF_FD = TCA_ACT_BPF_OPS + 1,
+	TCA_ACT_BPF_NAME,
+};
+#endif
+
+/* RSS key management */
+enum bpf_rss_key_e {
+	KEY_CMD_GET = 1,
+	KEY_CMD_RELEASE,
+	KEY_CMD_INIT,
+	KEY_CMD_DEINIT,
+};
+
+enum key_status_e {
+	KEY_STAT_UNSPEC,
+	KEY_STAT_USED,
+	KEY_STAT_AVAILABLE,
+};
 
 #define ISOLATE_HANDLE 1
 
 struct rte_flow {
 	LIST_ENTRY(rte_flow) next; /* Pointer to the next rte_flow structure */
 	struct rte_flow *remote_flow; /* associated remote flow */
+	int bpf_fd[SEC_MAX]; /* list of bfs fds per ELF section */
 	struct nlmsg msg;
 };
 
@@ -104,6 +172,24 @@ struct remote_rule {
 	int mirred;
 };
 
+struct action_data {
+	char id[16];
+
+	union {
+		struct tc_gact gact;
+		struct tc_mirred mirred;
+		struct skbedit {
+			struct tc_skbedit skbedit;
+			uint16_t queue;
+		} skbedit;
+		struct bpf {
+			struct tc_act_bpf bpf;
+			int bpf_fd;
+			const char *annotation;
+		} bpf;
+	};
+};
+
 static int tap_flow_create_eth(const struct rte_flow_item *item, void *data);
 static int tap_flow_create_vlan(const struct rte_flow_item *item, void *data);
 static int tap_flow_create_ipv4(const struct rte_flow_item *item, void *data);
@@ -134,6 +220,14 @@ tap_flow_isolate(struct rte_eth_dev *dev,
 		 int set,
 		 struct rte_flow_error *error);
 
+static int bpf_rss_key(enum bpf_rss_key_e cmd, __u32 *key_idx);
+static int rss_enable(struct pmd_internals *pmd,
+			const struct rte_flow_attr *attr,
+			struct rte_flow_error *error);
+static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
+			const struct rte_flow_action_rss *rss,
+			struct rte_flow_error *error);
+
 static const struct rte_flow_ops tap_flow_ops = {
 	.validate = tap_flow_validate,
 	.create = tap_flow_create,
@@ -816,111 +910,95 @@ tap_flow_item_validate(const struct rte_flow_item *item,
 }
 
 /**
- * Transform a DROP/PASSTHRU action item in the provided flow for TC.
+ * Configure the kernel with a TC action and its configured parameters
+ * Handled actions: "gact", "mirred", "skbedit", "bpf"
  *
- * @param[in, out] flow
- *   Flow to be filled.
- * @param[in] action
- *   Appropriate action to be set in the TCA_GACT_PARMS structure.
+ * @param[in] flow
+ *   Pointer to rte flow containing the netlink message
  *
- * @return
- *   0 if checks are alright, -1 otherwise.
- */
-static int
-add_action_gact(struct rte_flow *flow, int action)
-{
-	struct nlmsg *msg = &flow->msg;
-	size_t act_index = 1;
-	struct tc_gact p = {
-		.action = action
-	};
-
-	if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
-		return -1;
-	if (nlattr_nested_start(msg, act_index++) < 0)
-		return -1;
-	nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("gact"), "gact");
-	if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
-		return -1;
-	nlattr_add(&msg->nh, TCA_GACT_PARMS, sizeof(p), &p);
-	nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
-	nlattr_nested_finish(msg); /* nested act_index */
-	nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
-	return 0;
-}
-
-/**
- * Transform a MIRRED action item in the provided flow for TC.
+ * @param[in, out] act_index
+ *   Pointer to action sequence number in the TC command
  *
- * @param[in, out] flow
- *   Flow to be filled.
- * @param[in] ifindex
- *   Netdevice ifindex, where to mirror/redirect packet to.
- * @param[in] action_type
- *   Either TCA_EGRESS_REDIR for redirection or TCA_EGRESS_MIRROR for mirroring.
+ * @param[in] adata
+ *  Pointer to struct holding the action parameters
  *
  * @return
- *   0 if checks are alright, -1 otherwise.
+ *   -1 on failure, 0 on success
  */
 static int
-add_action_mirred(struct rte_flow *flow, uint16_t ifindex, uint16_t action_type)
+add_action(struct rte_flow *flow, size_t *act_index, struct action_data *adata)
 {
 	struct nlmsg *msg = &flow->msg;
-	size_t act_index = 1;
-	struct tc_mirred p = {
-		.eaction = action_type,
-		.ifindex = ifindex,
-	};
 
-	if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
-		return -1;
-	if (nlattr_nested_start(msg, act_index++) < 0)
+	if (nlattr_nested_start(msg, (*act_index)++) < 0)
 		return -1;
-	nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("mirred"), "mirred");
+
+	nlattr_add(&msg->nh, TCA_ACT_KIND, strlen(adata->id) + 1, adata->id);
 	if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
 		return -1;
-	if (action_type == TCA_EGRESS_MIRROR)
-		p.action = TC_ACT_PIPE;
-	else /* REDIRECT */
-		p.action = TC_ACT_STOLEN;
-	nlattr_add(&msg->nh, TCA_MIRRED_PARMS, sizeof(p), &p);
+	if (strcmp("gact", adata->id) == 0) {
+		nlattr_add(&msg->nh, TCA_GACT_PARMS, sizeof(adata->gact),
+			   &adata->gact);
+	} else if (strcmp("mirred", adata->id) == 0) {
+		if (adata->mirred.eaction == TCA_EGRESS_MIRROR)
+			adata->mirred.action = TC_ACT_PIPE;
+		else /* REDIRECT */
+			adata->mirred.action = TC_ACT_STOLEN;
+		nlattr_add(&msg->nh, TCA_MIRRED_PARMS, sizeof(adata->mirred),
+			   &adata->mirred);
+	} else if (strcmp("skbedit", adata->id) == 0) {
+		nlattr_add(&msg->nh, TCA_SKBEDIT_PARMS,
+			   sizeof(adata->skbedit.skbedit),
+			   &adata->skbedit.skbedit);
+		nlattr_add16(&msg->nh, TCA_SKBEDIT_QUEUE_MAPPING,
+			     adata->skbedit.queue);
+	} else if (strcmp("bpf", adata->id) == 0) {
+		nlattr_add32(&msg->nh, TCA_ACT_BPF_FD, adata->bpf.bpf_fd);
+		nlattr_add(&msg->nh, TCA_ACT_BPF_NAME,
+			   strlen(adata->bpf.annotation) + 1,
+			   adata->bpf.annotation);
+		nlattr_add(&msg->nh, TCA_ACT_BPF_PARMS,
+			   sizeof(adata->bpf.bpf),
+			   &adata->bpf.bpf);
+	} else {
+		return -1;
+	}
 	nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
 	nlattr_nested_finish(msg); /* nested act_index */
-	nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
 	return 0;
 }
 
 /**
- * Transform a QUEUE action item in the provided flow for TC.
+ * Helper function to send a serie of TC actions to the kernel
  *
- * @param[in, out] flow
- *   Flow to be filled.
- * @param[in] queue
- *   Queue id to use.
+ * @param[in] flow
+ *   Pointer to rte flow containing the netlink message
+ *
+ * @param[in] nb_actions
+ *   Number of actions in an array of action structs
+ *
+ * @param[in] data
+ *   Pointer to an array of action structs
+ *
+ * @param[in] classifier_actions
+ *   The classifier on behave of which the actions are configured
  *
  * @return
- *   0 if checks are alright, -1 otherwise.
+ *   -1 on failure, 0 on success
  */
 static int
-add_action_skbedit(struct rte_flow *flow, uint16_t queue)
+add_actions(struct rte_flow *flow, int nb_actions, struct action_data *data,
+	    int classifier_action)
 {
 	struct nlmsg *msg = &flow->msg;
 	size_t act_index = 1;
-	struct tc_skbedit p = {
-		.action = TC_ACT_PIPE
-	};
+	int i;
 
-	if (nlattr_nested_start(msg, TCA_FLOWER_ACT) < 0)
-		return -1;
-	if (nlattr_nested_start(msg, act_index++) < 0)
+	if (nlattr_nested_start(msg, classifier_action) < 0)
 		return -1;
-	nlattr_add(&msg->nh, TCA_ACT_KIND, sizeof("skbedit"), "skbedit");
-	if (nlattr_nested_start(msg, TCA_ACT_OPTIONS) < 0)
-		return -1;
-	nlattr_add(&msg->nh, TCA_SKBEDIT_PARMS, sizeof(p), &p);
-	nlattr_add16(&msg->nh, TCA_SKBEDIT_QUEUE_MAPPING, queue);
-	nlattr_nested_finish(msg); /* nested TCA_ACT_OPTIONS */
-	nlattr_nested_finish(msg); /* nested act_index */
+	for (i = 0; i < nb_actions; i++)
+		if (add_action(flow, &act_index, data + i) < 0)
+			return -1;
 	nlattr_nested_finish(msg); /* nested TCA_FLOWER_ACT */
 	return 0;
 }
@@ -984,7 +1062,8 @@ priv_flow_process(struct pmd_internals *pmd,
 		return -rte_errno;
 	} else if (flow) {
 		uint16_t group = attr->group << GROUP_SHIFT;
-		uint16_t prio = group | (attr->priority + PRIORITY_OFFSET);
+		uint16_t prio = group | (attr->priority +
+				RSS_PRIORITY_OFFSET + PRIORITY_OFFSET);
 		flow->msg.t.tcm_info = TC_H_MAKE(prio << 16,
 						 flow->msg.t.tcm_info);
 	}
@@ -1053,7 +1132,12 @@ priv_flow_process(struct pmd_internals *pmd,
 		}
 	}
 	if (mirred && flow) {
-		uint16_t if_index = pmd->if_index;
+		struct action_data adata = {
+			.id = "mirred",
+			.mirred = {
+				.eaction = mirred,
+			},
+		};
 
 		/*
 		 * If attr->egress && mirred, then this is a special
@@ -1061,9 +1145,13 @@ priv_flow_process(struct pmd_internals *pmd,
 		 * redirect packets coming from the DPDK App, out
 		 * through the remote netdevice.
 		 */
-		if (attr->egress)
-			if_index = pmd->remote_if_index;
-		if (add_action_mirred(flow, if_index, mirred) < 0)
+		adata.mirred.ifindex = attr->ingress ? pmd->if_index :
+			pmd->remote_if_index;
+		if (mirred == TCA_EGRESS_MIRROR)
+			adata.mirred.action = TC_ACT_PIPE;
+		else
+			adata.mirred.action = TC_ACT_STOLEN;
+		if (add_actions(flow, 1, &adata, TCA_FLOWER_ACT) < 0)
 			goto exit_action_not_supported;
 		else
 			goto end;
@@ -1077,14 +1165,33 @@ priv_flow_process(struct pmd_internals *pmd,
 			if (action)
 				goto exit_action_not_supported;
 			action = 1;
-			if (flow)
-				err = add_action_gact(flow, TC_ACT_SHOT);
+			if (flow) {
+				struct action_data adata = {
+					.id = "gact",
+					.gact = {
+						.action = TC_ACT_SHOT,
+					},
+				};
+
+				err = add_actions(flow, 1, &adata,
+						  TCA_FLOWER_ACT);
+			}
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_PASSTHRU) {
 			if (action)
 				goto exit_action_not_supported;
 			action = 1;
-			if (flow)
-				err = add_action_gact(flow, TC_ACT_UNSPEC);
+			if (flow) {
+				struct action_data adata = {
+					.id = "gact",
+					.gact = {
+						/* continue */
+						.action = TC_ACT_UNSPEC,
+					},
+				};
+
+				err = add_actions(flow, 1, &adata,
+						  TCA_FLOWER_ACT);
+			}
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_QUEUE) {
 			const struct rte_flow_action_queue *queue =
 				(const struct rte_flow_action_queue *)
@@ -1096,22 +1203,35 @@ priv_flow_process(struct pmd_internals *pmd,
 			if (!queue ||
 			    (queue->index > pmd->dev->data->nb_rx_queues - 1))
 				goto exit_action_not_supported;
-			if (flow)
-				err = add_action_skbedit(flow, queue->index);
+			if (flow) {
+				struct action_data adata = {
+					.id = "skbedit",
+					.skbedit = {
+						.skbedit = {
+							.action = TC_ACT_PIPE,
+						},
+						.queue = queue->index,
+					},
+				};
+
+				err = add_actions(flow, 1, &adata,
+					TCA_FLOWER_ACT);
+			}
 		} else if (actions->type == RTE_FLOW_ACTION_TYPE_RSS) {
-			/* Fake RSS support. */
 			const struct rte_flow_action_rss *rss =
 				(const struct rte_flow_action_rss *)
 				actions->conf;
 
-			if (action)
-				goto exit_action_not_supported;
-			action = 1;
-			if (!rss || rss->num < 1 ||
-			    (rss->queue[0] > pmd->dev->data->nb_rx_queues - 1))
+			if (action++)
 				goto exit_action_not_supported;
-			if (flow)
-				err = add_action_skbedit(flow, rss->queue[0]);
+
+			if (!pmd->rss_enabled) {
+				err = rss_enable(pmd, attr, error);
+				if (err)
+					goto exit_action_not_supported;
+			}
+			if (rss)
+				err = rss_add_actions(flow, pmd, rss, error);
 		} else {
 			goto exit_action_not_supported;
 		}
@@ -1323,6 +1443,7 @@ tap_flow_destroy_pmd(struct pmd_internals *pmd,
 		     struct rte_flow_error *error)
 {
 	struct rte_flow *remote_flow = flow->remote_flow;
+	int i;
 	int ret = 0;
 
 	LIST_REMOVE(flow, next);
@@ -1348,6 +1469,13 @@ tap_flow_destroy_pmd(struct pmd_internals *pmd,
 			"couldn't receive kernel ack to our request");
 		goto end;
 	}
+	/* Close opened BPF file descriptors of this flow */
+	for (i = 0; i < SEC_MAX; i++)
+		if (flow->bpf_fd[i] != 0) {
+			close(flow->bpf_fd[i]);
+			flow->bpf_fd[i] = 0;
+		}
+
 	if (remote_flow) {
 		remote_flow->msg.nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK;
 		remote_flow->msg.nh.nlmsg_type = RTM_DELTFILTER;
@@ -1632,6 +1760,313 @@ tap_flow_implicit_flush(struct pmd_internals *pmd, struct rte_flow_error *error)
 	return 0;
 }
 
+#define MAX_RSS_KEYS 256
+#define SEC_NAME_CLS_Q "cls_q"
+
+const char *sec_name[SEC_MAX] = {
+	[SEC_L3_L4] = "l3_l4",
+};
+
+/**
+ * Enable RSS on tap: create TC rules for queuing.
+ *
+ * @param[in, out] pmd
+ *   Pointer to private structure.
+ *
+ * @param[in] attr
+ *   Pointer to rte_flow to get flow group
+ *
+ * @param[out] error
+ *   Pointer to error reporting if not NULL.
+ *
+ * @return 0 on success, negative value on failure.
+ */
+static int rss_enable(struct pmd_internals *pmd,
+			const struct rte_flow_attr *attr,
+			struct rte_flow_error *error)
+{
+	struct rte_flow *rss_flow = NULL;
+	struct nlmsg *msg = NULL;
+	/* 4096 is the maximum number of instructions for a BPF program */
+	char annotation[64];
+	int i;
+	int err = 0;
+
+	 /* Get a new map key for a new RSS rule */
+	err = bpf_rss_key(KEY_CMD_INIT, NULL);
+	if (err < 0) {
+		rte_flow_error_set(
+			error, EINVAL, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
+			"Failed to initialize BPF RSS keys");
+
+		return -1;
+	}
+
+	/*
+	 *  Create BPF RSS MAP
+	 */
+	pmd->map_fd = tap_flow_bpf_rss_map_create(sizeof(__u32), /* key size */
+				sizeof(struct rss_key),
+				MAX_RSS_KEYS);
+	if (pmd->map_fd < 0) {
+		RTE_LOG(ERR, PMD,
+			"Failed to create BPF map (%d): %s\n",
+				errno, strerror(errno));
+		rte_flow_error_set(
+			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
+			"Kernel too old or not configured "
+			"to support BPF maps");
+
+		return -ENOTSUP;
+	}
+
+	/*
+	 * Add a rule per queue to match reclassified packets and direct them to
+	 * the correct queue.
+	 */
+	for (i = 0; i < pmd->dev->data->nb_rx_queues; i++) {
+		pmd->bpf_fd[i] = tap_flow_bpf_cls_q(i);
+		if (pmd->bpf_fd[i] < 0) {
+			RTE_LOG(ERR, PMD,
+				"Failed to load BPF section %s for queue %d",
+				SEC_NAME_CLS_Q, i);
+			rte_flow_error_set(
+				error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE,
+				NULL,
+				"Kernel too old or not configured "
+				"to support BPF programs loading");
+
+			return -ENOTSUP;
+		}
+
+		rss_flow = rte_malloc(__func__, sizeof(struct rte_flow), 0);
+		if (!rss_flow) {
+			RTE_LOG(ERR, PMD,
+				"Cannot allocate memory for rte_flow");
+			return -1;
+		}
+		msg = &rss_flow->msg;
+		tc_init_msg(msg, pmd->if_index, RTM_NEWTFILTER, NLM_F_REQUEST |
+			    NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE);
+		msg->t.tcm_info = TC_H_MAKE(0, htons(ETH_P_ALL));
+		tap_flow_set_handle(rss_flow);
+		uint16_t group = attr->group << GROUP_SHIFT;
+		uint16_t prio = group | (i + PRIORITY_OFFSET);
+		msg->t.tcm_info = TC_H_MAKE(prio << 16, msg->t.tcm_info);
+		msg->t.tcm_parent = TC_H_MAKE(MULTIQ_MAJOR_HANDLE, 0);
+
+		nlattr_add(&msg->nh, TCA_KIND, sizeof("bpf"), "bpf");
+		if (nlattr_nested_start(msg, TCA_OPTIONS) < 0)
+			return -1;
+		nlattr_add32(&msg->nh, TCA_BPF_FD, pmd->bpf_fd[i]);
+		snprintf(annotation, sizeof(annotation), "[%s%d]",
+			SEC_NAME_CLS_Q, i);
+		nlattr_add(&msg->nh, TCA_BPF_NAME, strlen(annotation) + 1,
+			   annotation);
+		/* Actions */
+		{
+			struct action_data adata = {
+				.id = "skbedit",
+				.skbedit = {
+					.skbedit = {
+						.action = TC_ACT_PIPE,
+					},
+					.queue = i,
+				},
+			};
+			if (add_actions(rss_flow, 1, &adata, TCA_BPF_ACT) < 0)
+				return -1;
+		}
+		nlattr_nested_finish(msg); /* nested TCA_OPTIONS */
+
+		/* Netlink message is now ready to be sent */
+		if (nl_send(pmd->nlsk_fd, &msg->nh) < 0)
+			return -1;
+		err = nl_recv_ack(pmd->nlsk_fd);
+		if (err < 0) {
+			RTE_LOG(ERR, PMD,
+				"Kernel refused TC filter rule creation (%d): %s\n",
+				errno, strerror(errno));
+			return err;
+		}
+		LIST_INSERT_HEAD(&pmd->rss_flows, rss_flow, next);
+	}
+
+	pmd->rss_enabled = 1;
+	return err;
+}
+
+/**
+ * Manage bpf RSS keys repository with operations: init, get, release
+ *
+ * @param[in] cmd
+ *   Command on RSS keys: init, get, release
+ *
+ * @param[in, out] key_idx
+ *   Pointer to RSS Key index (out for get command, in for release command)
+ *
+ * @return -1 if couldn't get, release or init the RSS keys, 0 otherwise.
+ */
+static int bpf_rss_key(enum bpf_rss_key_e cmd, __u32 *key_idx)
+{
+	__u32 i;
+	int err = -1;
+	static __u32 num_used_keys;
+	static __u32 rss_keys[MAX_RSS_KEYS] = {KEY_STAT_UNSPEC};
+	static __u32 rss_keys_initialized;
+
+	switch (cmd) {
+	case KEY_CMD_GET:
+		if (!rss_keys_initialized)
+			break;
+
+		if (num_used_keys == ARRAY_SIZE(rss_keys))
+			break;
+
+		*key_idx = num_used_keys % ARRAY_SIZE(rss_keys);
+		while (rss_keys[*key_idx] == KEY_STAT_USED)
+			*key_idx = (*key_idx + 1) % ARRAY_SIZE(rss_keys);
+
+		rss_keys[*key_idx] = KEY_STAT_USED;
+		num_used_keys++;
+		err = 0;
+	break;
+
+	case KEY_CMD_RELEASE:
+		if (!rss_keys_initialized)
+			break;
+
+		if (rss_keys[*key_idx] == KEY_STAT_USED) {
+			rss_keys[*key_idx] = KEY_STAT_AVAILABLE;
+			num_used_keys--;
+			err = 0;
+		}
+	break;
+
+	case KEY_CMD_INIT:
+		for (i = 0; i < ARRAY_SIZE(rss_keys); i++)
+			rss_keys[i] = KEY_STAT_AVAILABLE;
+
+		rss_keys_initialized = 1;
+		num_used_keys = 0;
+		err = 0;
+	break;
+
+	case KEY_CMD_DEINIT:
+		for (i = 0; i < ARRAY_SIZE(rss_keys); i++)
+			rss_keys[i] = KEY_STAT_UNSPEC;
+
+		rss_keys_initialized = 0;
+		num_used_keys = 0;
+		err = 0;
+	break;
+
+	default:
+		break;
+	}
+
+	return err;
+}
+
+/**
+ * Add RSS hash calculations and queue selection
+ *
+ * @param[in, out] pmd
+ *   Pointer to internal structure. Used to set/get RSS map fd
+ *
+ * @param[in] rss
+ *   Pointer to RSS flow actions
+ *
+ * @param[out] error
+ *   Pointer to error reporting if not NULL.
+ *
+ * @return 0 on success, negative value on failure
+ */
+static int rss_add_actions(struct rte_flow *flow, struct pmd_internals *pmd,
+			   const struct rte_flow_action_rss *rss,
+			   struct rte_flow_error *error)
+{
+	/* 4096 is the maximum number of instructions for a BPF program */
+	int i;
+	__u32 key_idx;
+	int err;
+	struct rss_key rss_entry = { .hash_fields = 0,
+				     .key_size = 0 };
+
+	/* Get a new map key for a new RSS rule */
+	err = bpf_rss_key(KEY_CMD_GET, &key_idx);
+	if (err < 0) {
+		rte_flow_error_set(
+			error, EINVAL, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
+			"Failed to get BPF RSS key");
+
+		return -1;
+	}
+
+	/* Update RSS map entry with queues */
+	rss_entry.nb_queues = rss->num;
+	for (i = 0; i < rss->num; i++)
+		rss_entry.queues[i] = rss->queue[i];
+	rss_entry.hash_fields =
+		(1 << HASH_FIELD_IPV4_L3_L4) | (1 << HASH_FIELD_IPV6_L3_L4);
+
+	/* Add this RSS entry to map */
+	err = tap_flow_bpf_update_rss_elem(pmd->map_fd, &key_idx, &rss_entry);
+
+	if (err) {
+		RTE_LOG(ERR, PMD,
+			"Failed to update BPF map entry #%u (%d): %s\n",
+			key_idx, errno, strerror(errno));
+		rte_flow_error_set(
+			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
+			"Kernel too old or not configured "
+			"to support BPF maps updates");
+
+		return -ENOTSUP;
+	}
+
+
+	/*
+	 * Load bpf rules to calculate hash for this key_idx
+	 */
+
+	flow->bpf_fd[SEC_L3_L4] =
+		tap_flow_bpf_calc_l3_l4_hash(key_idx, pmd->map_fd);
+	if (flow->bpf_fd[SEC_L3_L4] < 0) {
+		RTE_LOG(ERR, PMD,
+			"Failed to load BPF section %s (%d): %s\n",
+				sec_name[SEC_L3_L4], errno, strerror(errno));
+		rte_flow_error_set(
+			error, ENOTSUP, RTE_FLOW_ERROR_TYPE_HANDLE, NULL,
+			"Kernel too old or not configured "
+			"to support BPF program loading");
+
+		return -ENOTSUP;
+	}
+
+	/* Actions */
+	{
+		struct action_data adata[] = {
+			{
+				.id = "bpf",
+				.bpf = {
+					.bpf_fd = flow->bpf_fd[SEC_L3_L4],
+					.annotation = sec_name[SEC_L3_L4],
+					.bpf = {
+						.action = TC_ACT_PIPE,
+					},
+				},
+			},
+		};
+
+		if (add_actions(flow, ARRAY_SIZE(adata), adata,
+			TCA_FLOWER_ACT) < 0)
+			return -1;
+	}
+
+	return 0;
+}
+
 /**
  * Manage filter operations.
  *
diff --git a/drivers/net/tap/tap_flow.h b/drivers/net/tap/tap_flow.h
index 6cc01b4..dbb7ff6 100644
--- a/drivers/net/tap/tap_flow.h
+++ b/drivers/net/tap/tap_flow.h
@@ -37,6 +37,7 @@
 #include <rte_flow.h>
 #include <rte_flow_driver.h>
 #include <rte_eth_tap.h>
+#include <tap_autoconf.h>
 
 /**
  * In TC, priority 0 means we require the kernel to allocate one for us.
@@ -49,6 +50,7 @@
 #define GROUP_MASK (0xf)
 #define GROUP_SHIFT 12
 #define MAX_GROUP GROUP_MASK
+#define RSS_PRIORITY_OFFSET 256
 
 #define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0]))
 
@@ -69,6 +71,11 @@ enum implicit_rule_index {
 	TAP_REMOTE_MAX_IDX,
 };
 
+enum bpf_fd_idx {
+	SEC_L3_L4,
+	SEC_MAX,
+};
+
 int tap_dev_filter_ctrl(struct rte_eth_dev *dev,
 			enum rte_filter_type filter_type,
 			enum rte_filter_op filter_op,
@@ -84,5 +91,8 @@ int tap_flow_implicit_flush(struct pmd_internals *pmd,
 
 int tap_flow_bpf_cls_q(__u32 queue_idx);
 int tap_flow_bpf_calc_l3_l4_hash(__u32 key_idx, int map_fd);
+int tap_flow_bpf_rss_map_create(unsigned int key_size, unsigned int value_size,
+			unsigned int max_entries);
+int tap_flow_bpf_update_rss_elem(int fd, void *key, void *value);
 
 #endif /* _TAP_FLOW_H_ */
diff --git a/drivers/net/tap/tap_tcmsgs.h b/drivers/net/tap/tap_tcmsgs.h
index 7895957..07c5074 100644
--- a/drivers/net/tap/tap_tcmsgs.h
+++ b/drivers/net/tap/tap_tcmsgs.h
@@ -34,6 +34,7 @@
 #ifndef _TAP_TCMSGS_H_
 #define _TAP_TCMSGS_H_
 
+#include <tap_autoconf.h>
 #include <linux/if_ether.h>
 #include <linux/rtnetlink.h>
 #include <linux/pkt_sched.h>
@@ -41,6 +42,9 @@
 #include <linux/tc_act/tc_mirred.h>
 #include <linux/tc_act/tc_gact.h>
 #include <linux/tc_act/tc_skbedit.h>
+#ifdef HAVE_TC_ACT_BPF
+#include <linux/tc_act/tc_bpf.h>
+#endif
 #include <inttypes.h>
 
 #include <rte_ether.h>
-- 
2.7.4



More information about the dev mailing list