@@ -843,6 +843,8 @@ struct mlx5_dev_spawn_data {
close(priv->nl_socket_route);
if (priv->nl_socket_rdma >= 0)
close(priv->nl_socket_rdma);
+ if (priv->esxi_context)
+ mlx5_vlan_esxi_exit(priv->esxi_context);
if (priv->sh) {
/*
* Free the shared context in last turn, because the cleanup
@@ -1989,6 +1991,8 @@ struct mlx5_dev_spawn_data {
mlx5_set_min_inline(spawn, &config);
/* Store device configuration on private structure. */
priv->config = config;
+ /* Create context for virtual machine VLAN workaround. */
+ priv->esxi_context = mlx5_vlan_esxi_init(eth_dev, spawn->ifindex);
if (config.dv_flow_en) {
err = mlx5_alloc_shared_dr(priv);
if (err)
@@ -2015,6 +2019,8 @@ struct mlx5_dev_spawn_data {
close(priv->nl_socket_route);
if (priv->nl_socket_rdma >= 0)
close(priv->nl_socket_rdma);
+ if (priv->esxi_context)
+ mlx5_vlan_esxi_exit(priv->esxi_context);
if (own_domain_id)
claim_zero(rte_eth_switch_domain_free(priv->domain_id));
rte_free(priv);
@@ -355,6 +355,27 @@ enum mlx5_verbs_alloc_type {
MLX5_VERBS_ALLOC_TYPE_RX_QUEUE,
};
+/* VLAN netdev for ESXi VLAN workaround. */
+struct mlx5_vlan_dev {
+	uint32_t refcnt; /**< Reference counter, how many flows use this netdev. */
+	uint32_t ifindex; /**< Own interface index. */
+};
+
+/* Structure for VF ESXi VLAN workaround. */
+struct mlx5_vf_vlan {
+	uint32_t tag:12; /**< VLAN tag (VID), 12 bits per IEEE 802.1Q. */
+	uint32_t created:1; /**< Whether the VLAN netdev has been created. */
+};
+
+/* Array of VLAN devices created on the base of VF */
+struct mlx5_vlan_esxi_context {
+	int nl_socket; /**< Netlink socket (NETLINK_ROUTE). */
+	uint32_t nl_sn; /**< Netlink message sequence number. */
+	uint32_t vf_ifindex; /**< Base VF network interface index. */
+	struct rte_eth_dev *dev; /**< Owner Ethernet device. */
+	/* One slot per possible VLAN ID (12-bit tag -> 4096 values). */
+	struct mlx5_vlan_dev vlan_dev[4096];
+};
+
/**
* Verbs allocator needs a context to know in the callback which kind of
* resources it is allocating.
@@ -631,6 +652,7 @@ struct mlx5_priv {
int nl_socket_route; /* Netlink socket (NETLINK_ROUTE). */
uint32_t nl_sn; /* Netlink message sequence number. */
LIST_HEAD(dbrpage, mlx5_devx_dbr_page) dbrpgs; /* Door-bell pages. */
+ struct mlx5_vlan_esxi_context *esxi_context; /* ESXi VLAN context. */
#ifndef RTE_ARCH_64
rte_spinlock_t uar_lock_cq; /* CQs share a common distinct UAR */
rte_spinlock_t uar_lock[MLX5_UAR_PAGE_NUM_MAX];
@@ -830,6 +852,14 @@ int mlx5_nl_mac_addr_remove(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
int mlx5_nl_switch_info(int nl, unsigned int ifindex,
struct mlx5_switch_info *info);
+struct mlx5_vlan_esxi_context *mlx5_vlan_esxi_init(struct rte_eth_dev *dev,
+ uint32_t ifindex);
+void mlx5_vlan_esxi_exit(struct mlx5_vlan_esxi_context *ctx);
+void mlx5_vlan_esxi_release(struct rte_eth_dev *dev,
+ struct mlx5_vf_vlan *vf_vlan);
+void mlx5_vlan_esxi_acquire(struct rte_eth_dev *dev,
+ struct mlx5_vf_vlan *vf_vlan);
+
/* mlx5_devx_cmds.c */
struct mlx5_devx_obj *mlx5_devx_cmd_flow_counter_alloc(struct ibv_context *ctx,
@@ -1204,6 +1204,8 @@ uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
* Item specification.
* @param[in] item_flags
* Bit-fields that holds the items detected until now.
+ * @param[in] dev
+ * Ethernet device flow is being created on.
* @param[out] error
* Pointer to error structure.
*
@@ -1213,6 +1215,7 @@ uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
int
mlx5_flow_validate_item_vlan(const struct rte_flow_item *item,
uint64_t item_flags,
+ struct rte_eth_dev *dev,
struct rte_flow_error *error)
{
const struct rte_flow_item_vlan *spec = item->spec;
@@ -1247,6 +1250,25 @@ uint32_t mlx5_flow_adjust_priority(struct rte_eth_dev *dev, int32_t priority,
error);
if (ret)
return ret;
+ if (!tunnel && mask->tci != RTE_BE16(0x0fff)) {
+ struct mlx5_priv *priv = dev->data->dev_private;
+
+ if (priv->esxi_context) {
+ /*
+ * Non-NULL context means we have a virtual machine
+ * and SR-IOV enabled, we have to create VLAN interface
+ * to make hypervisor (ESXi) to setup E-Switch vport
+ * context correctly. We avoid creating the multiple
+ * VLAN interfaces, so we cannot support VLAN tag mask.
+ */
+ return rte_flow_error_set(error, EINVAL,
+ RTE_FLOW_ERROR_TYPE_ITEM,
+ item,
+ "VLAN tag mask is not"
+ " supported in virtual"
+ " environment");
+ }
+ }
if (spec) {
vlan_tag = spec->tci;
vlan_tag &= mask->tci;
@@ -330,6 +330,8 @@ struct mlx5_flow_dv {
/**< Pointer to the jump action resource. */
struct mlx5_flow_dv_port_id_action_resource *port_id_action;
/**< Pointer to port ID action resource. */
+ struct mlx5_vf_vlan vf_vlan;
+ /**< Structure for VF ESXi VLAN workaround. */
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
void *actions[MLX5_DV_MAX_NUMBER_OF_ACTIONS];
/**< Action list. */
@@ -355,6 +357,8 @@ struct mlx5_flow_verbs {
struct ibv_flow *flow; /**< Verbs flow pointer. */
struct mlx5_hrxq *hrxq; /**< Hash Rx queue object. */
uint64_t hash_fields; /**< Verbs hash Rx queue hash fields. */
+ struct mlx5_vf_vlan vf_vlan;
+ /**< Structure for VF ESXi VLAN workaround. */
};
/** Device flow structure. */
@@ -505,6 +509,7 @@ int mlx5_flow_validate_item_udp(const struct rte_flow_item *item,
struct rte_flow_error *error);
int mlx5_flow_validate_item_vlan(const struct rte_flow_item *item,
uint64_t item_flags,
+ struct rte_eth_dev *dev,
struct rte_flow_error *error);
int mlx5_flow_validate_item_vxlan(const struct rte_flow_item *item,
uint64_t item_flags,
@@ -2892,7 +2892,7 @@ struct field_modify_info modify_tcp[] = {
break;
case RTE_FLOW_ITEM_TYPE_VLAN:
ret = mlx5_flow_validate_item_vlan(items, item_flags,
- error);
+ dev, error);
if (ret < 0)
return ret;
last_item = tunnel ? MLX5_FLOW_LAYER_INNER_VLAN :
@@ -3450,6 +3450,8 @@ struct field_modify_info modify_tcp[] = {
/**
* Add VLAN item to matcher and to the value.
*
+ * @param[in, out] dev_flow
+ * Flow descriptor.
* @param[in, out] matcher
* Flow matcher.
* @param[in, out] key
@@ -3460,7 +3462,8 @@ struct field_modify_info modify_tcp[] = {
* Item is inner pattern.
*/
static void
-flow_dv_translate_item_vlan(void *matcher, void *key,
+flow_dv_translate_item_vlan(struct mlx5_flow *dev_flow,
+ void *matcher, void *key,
const struct rte_flow_item *item,
int inner)
{
@@ -3487,6 +3490,12 @@ struct field_modify_info modify_tcp[] = {
headers_m = MLX5_ADDR_OF(fte_match_param, matcher,
outer_headers);
headers_v = MLX5_ADDR_OF(fte_match_param, key, outer_headers);
+ /*
+ * This is workaround, masks are not supported,
+ * and pre-validated.
+ */
+ dev_flow->dv.vf_vlan.tag =
+ rte_be_to_cpu_16(vlan_v->tci) & 0x0fff;
}
tci_m = rte_be_to_cpu_16(vlan_m->tci);
tci_v = rte_be_to_cpu_16(vlan_m->tci & vlan_v->tci);
@@ -4995,7 +5004,8 @@ struct field_modify_info modify_tcp[] = {
MLX5_FLOW_LAYER_OUTER_L2;
break;
case RTE_FLOW_ITEM_TYPE_VLAN:
- flow_dv_translate_item_vlan(match_mask, match_value,
+ flow_dv_translate_item_vlan(dev_flow,
+ match_mask, match_value,
items, tunnel);
matcher.priority = MLX5_PRIORITY_MAP_L2;
last_item = tunnel ? (MLX5_FLOW_LAYER_INNER_L2 |
@@ -5211,6 +5221,17 @@ struct field_modify_info modify_tcp[] = {
"hardware refuses to create flow");
goto error;
}
+ if (priv->esxi_context &&
+ dev_flow->dv.vf_vlan.tag &&
+ !dev_flow->dv.vf_vlan.created) {
+ /*
+ * The rule contains the VLAN pattern.
+ * For VF we are going to create VLAN
+ * interface to make ESXi set correct
+ * e-Switch vport context.
+ */
+ mlx5_vlan_esxi_acquire(dev, &dev_flow->dv.vf_vlan);
+ }
}
return 0;
error:
@@ -5224,6 +5245,9 @@ struct field_modify_info modify_tcp[] = {
mlx5_hrxq_release(dev, dv->hrxq);
dv->hrxq = NULL;
}
+ if (dev_flow->dv.vf_vlan.tag &&
+ dev_flow->dv.vf_vlan.created)
+ mlx5_vlan_esxi_release(dev, &dev_flow->dv.vf_vlan);
}
rte_errno = err; /* Restore rte_errno. */
return -rte_errno;
@@ -5424,6 +5448,9 @@ struct field_modify_info modify_tcp[] = {
mlx5_hrxq_release(dev, dv->hrxq);
dv->hrxq = NULL;
}
+ if (dev_flow->dv.vf_vlan.tag &&
+ dev_flow->dv.vf_vlan.created)
+ mlx5_vlan_esxi_release(dev, &dev_flow->dv.vf_vlan);
}
}
@@ -391,6 +391,9 @@
flow_verbs_spec_add(&dev_flow->verbs, ð, size);
else
flow_verbs_item_vlan_update(dev_flow->verbs.attr, ð);
+ if (!tunnel)
+ dev_flow->verbs.vf_vlan.tag =
+ rte_be_to_cpu_16(spec->tci) & 0x0fff;
}
/**
@@ -1054,7 +1057,7 @@
break;
case RTE_FLOW_ITEM_TYPE_VLAN:
ret = mlx5_flow_validate_item_vlan(items, item_flags,
- error);
+ dev, error);
if (ret < 0)
return ret;
last_item = tunnel ? (MLX5_FLOW_LAYER_INNER_L2 |
@@ -1592,6 +1595,10 @@
mlx5_hrxq_release(dev, verbs->hrxq);
verbs->hrxq = NULL;
}
+ if (dev_flow->verbs.vf_vlan.tag &&
+ dev_flow->verbs.vf_vlan.created) {
+ mlx5_vlan_esxi_release(dev, &dev_flow->verbs.vf_vlan);
+ }
}
}
@@ -1639,6 +1646,7 @@
flow_verbs_apply(struct rte_eth_dev *dev, struct rte_flow *flow,
struct rte_flow_error *error)
{
+ struct mlx5_priv *priv = dev->data->dev_private;
struct mlx5_flow_verbs *verbs;
struct mlx5_flow *dev_flow;
int err;
@@ -1688,6 +1696,17 @@
"hardware refuses to create flow");
goto error;
}
+ if (priv->esxi_context &&
+ dev_flow->verbs.vf_vlan.tag &&
+ !dev_flow->verbs.vf_vlan.created) {
+ /*
+ * The rule contains the VLAN pattern.
+ * For VF we are going to create VLAN
+ * interface to make ESXi set correct
+ * e-Switch vport context.
+ */
+ mlx5_vlan_esxi_acquire(dev, &dev_flow->verbs.vf_vlan);
+ }
}
return 0;
error:
@@ -1701,6 +1720,10 @@
mlx5_hrxq_release(dev, verbs->hrxq);
verbs->hrxq = NULL;
}
+ if (dev_flow->verbs.vf_vlan.tag &&
+ dev_flow->verbs.vf_vlan.created) {
+ mlx5_vlan_esxi_release(dev, &dev_flow->verbs.vf_vlan);
+ }
}
rte_errno = err; /* Restore rte_errno. */
return -rte_errno;
@@ -12,11 +12,14 @@
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
+#include <stdalign.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>
#include <rte_errno.h>
+#include <rte_malloc.h>
+#include <rte_hypervisor.h>
#include "mlx5.h"
#include "mlx5_utils.h"
@@ -28,6 +31,8 @@
/* Receive buffer size for the Netlink socket */
#define MLX5_RECV_BUF_SIZE 32768
+/** Parameters of VLAN devices created by driver. */
+#define MLX5_ESXI_VLAN_DEVICE_PFX "evmlx"
/*
* Define NDA_RTA as defined in iproute2 sources.
*
@@ -987,3 +992,277 @@ struct mlx5_nl_ifindex_data {
}
return ret;
}
+
+/*
+ * Delete VLAN network device by ifindex.
+ *
+ * @param[in] esxi
+ *   Context object initialized by mlx5_vlan_esxi_init().
+ * @param[in] ifindex
+ *   Interface index of network device to delete, 0 is silently ignored.
+ */
+static void
+mlx5_vlan_esxi_delete(struct mlx5_vlan_esxi_context *esxi,
+		      uint32_t ifindex)
+{
+	int ret;
+	struct {
+		struct nlmsghdr nh;
+		struct ifinfomsg info;
+	} req = {
+		.nh = {
+			.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
+			.nlmsg_type = RTM_DELLINK,
+			.nlmsg_flags = NLM_F_REQUEST | NLM_F_ACK,
+		},
+		.info = {
+			.ifi_family = AF_UNSPEC,
+			.ifi_index = ifindex,
+		},
+	};
+
+	if (ifindex) {
+		/* Sequence number 0 is reserved, skip it on wrap-around. */
+		++esxi->nl_sn;
+		if (!esxi->nl_sn)
+			++esxi->nl_sn;
+		ret = mlx5_nl_send(esxi->nl_socket, &req.nh, esxi->nl_sn);
+		if (ret >= 0)
+			ret = mlx5_nl_recv(esxi->nl_socket,
+					   esxi->nl_sn,
+					   NULL, NULL);
+		/* Best effort: failure to delete is only reported, not fatal. */
+		if (ret < 0)
+			DRV_LOG(WARNING, "netlink: error deleting"
+					 " VLAN ESXi ifindex %u, %d",
+					 ifindex, ret);
+	}
+}
+
+/* Set of subroutines to build Netlink message. */
+
+/* Return a pointer to the first byte past the aligned message payload. */
+static struct nlattr *
+nl_msg_tail(struct nlmsghdr *nlh)
+{
+	uint8_t *base = (uint8_t *)nlh;
+
+	return (struct nlattr *)(base + NLMSG_ALIGN(nlh->nlmsg_len));
+}
+
+/*
+ * Append a Netlink attribute (header plus payload) to the message.
+ *
+ * @param[in, out] nlh
+ *   Message to extend, nlmsg_len is updated accordingly.
+ * @param[in] type
+ *   Attribute type (nla_type).
+ * @param[in] data
+ *   Attribute payload, may be NULL if alen is 0 (nest start).
+ * @param[in] alen
+ *   Payload length in bytes.
+ */
+static void
+nl_attr_put(struct nlmsghdr *nlh, int type, const void *data, int alen)
+{
+	struct nlattr *nla = nl_msg_tail(nlh);
+
+	nla->nla_type = type;
+	/*
+	 * Per netlink convention nla_len covers the attribute header and
+	 * payload only; the trailing alignment padding is accounted in
+	 * nlmsg_len, not in nla_len.
+	 */
+	nla->nla_len = NLMSG_ALIGN(sizeof(struct nlattr)) + alen;
+	nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + NLMSG_ALIGN(nla->nla_len);
+	if (alen)
+		memcpy((uint8_t *)nla + sizeof(struct nlattr), data, alen);
+}
+
+/*
+ * Start a nested attribute, returning its header so that
+ * nl_attr_nest_end() can patch the total length afterwards.
+ *
+ * @param[in, out] nlh
+ *   Message to extend.
+ * @param[in] type
+ *   Nested attribute type (e.g. IFLA_LINKINFO).
+ *
+ * @return
+ *   Pointer to the nest attribute header within the message buffer.
+ */
+static struct nlattr *
+nl_attr_nest_start(struct nlmsghdr *nlh, int type)
+{
+	/* nl_msg_tail() already returns struct nlattr *, no cast needed. */
+	struct nlattr *nest = nl_msg_tail(nlh);
+
+	nl_attr_put(nlh, type, NULL, 0);
+	return nest;
+}
+
+/* Close a nested attribute by fixing up its total length. */
+static void
+nl_attr_nest_end(struct nlmsghdr *nlh, struct nlattr *nest)
+{
+	nest->nla_len = (uint8_t *)nl_msg_tail(nlh) - (uint8_t *)nest;
+}
+
+/*
+ * Create network VLAN device with specified VLAN tag.
+ *
+ * @param[in] esxi
+ *   Context object initialized by mlx5_vlan_esxi_init().
+ * @param[in] ifindex
+ *   Base network interface index.
+ * @param[in] tag
+ *   VLAN tag for VLAN network device to create.
+ *
+ * @return
+ *   Interface index of the created (or pre-existing) VLAN network
+ *   device, 0 on failure.
+ */
+static uint32_t
+mlx5_vlan_esxi_create(struct mlx5_vlan_esxi_context *esxi,
+		      uint32_t ifindex,
+		      uint16_t tag)
+{
+	struct nlmsghdr *nlh;
+	struct ifinfomsg *ifm;
+	char name[sizeof(MLX5_ESXI_VLAN_DEVICE_PFX) + 32];
+
+	/* Buffer sized for the largest message assembled below. */
+	alignas(RTE_CACHE_LINE_SIZE)
+	uint8_t buf[NLMSG_ALIGN(sizeof(struct nlmsghdr)) +
+		    NLMSG_ALIGN(sizeof(struct ifinfomsg)) +
+		    NLMSG_ALIGN(sizeof(struct nlattr)) * 8 +
+		    NLMSG_ALIGN(sizeof(uint32_t)) +
+		    NLMSG_ALIGN(sizeof(name)) +
+		    NLMSG_ALIGN(sizeof("vlan")) +
+		    NLMSG_ALIGN(sizeof(uint32_t)) +
+		    NLMSG_ALIGN(sizeof(uint16_t)) + 16];
+	struct nlattr *na_info;
+	struct nlattr *na_vlan;
+	int ret;
+
+	memset(buf, 0, sizeof(buf));
+	/* Sequence number 0 is reserved, skip it on wrap-around. */
+	++esxi->nl_sn;
+	if (!esxi->nl_sn)
+		++esxi->nl_sn;
+	nlh = (struct nlmsghdr *)buf;
+	nlh->nlmsg_len = sizeof(struct nlmsghdr);
+	nlh->nlmsg_type = RTM_NEWLINK;
+	nlh->nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE |
+			   NLM_F_EXCL | NLM_F_ACK;
+	ifm = (struct ifinfomsg *)nl_msg_tail(nlh);
+	nlh->nlmsg_len += sizeof(struct ifinfomsg);
+	ifm->ifi_family = AF_UNSPEC;
+	ifm->ifi_type = 0;
+	ifm->ifi_index = 0;
+	ifm->ifi_flags = IFF_UP;
+	ifm->ifi_change = 0xffffffff;
+	nl_attr_put(nlh, IFLA_LINK, &ifindex, sizeof(ifindex));
+	ret = snprintf(name, sizeof(name), "%s.%u.%u",
+		       MLX5_ESXI_VLAN_DEVICE_PFX, ifindex, tag);
+	nl_attr_put(nlh, IFLA_IFNAME, name, ret + 1);
+	na_info = nl_attr_nest_start(nlh, IFLA_LINKINFO);
+	nl_attr_put(nlh, IFLA_INFO_KIND, "vlan", sizeof("vlan"));
+	na_vlan = nl_attr_nest_start(nlh, IFLA_INFO_DATA);
+	nl_attr_put(nlh, IFLA_VLAN_ID, &tag, sizeof(tag));
+	nl_attr_nest_end(nlh, na_vlan);
+	nl_attr_nest_end(nlh, na_info);
+	assert(sizeof(buf) >= nlh->nlmsg_len);
+	ret = mlx5_nl_send(esxi->nl_socket, nlh, esxi->nl_sn);
+	if (ret >= 0)
+		ret = mlx5_nl_recv(esxi->nl_socket, esxi->nl_sn, NULL, NULL);
+	if (ret < 0) {
+		DRV_LOG(WARNING,
+			"netlink: VLAN %s create failure (%d)",
+			name, ret);
+	}
+	/* Try to get ifindex of created or pre-existing device. */
+	ret = if_nametoindex(name);
+	if (!ret) {
+		DRV_LOG(WARNING,
+			"VLAN %s failed to get index (%d)",
+			name, errno);
+		return 0;
+	}
+	return ret;
+}
+
+/**
+ * Release VLAN network device, created for ESXi workaround.
+ *
+ * @param[in] dev
+ *   Ethernet device object, Netlink context provider.
+ * @param[in] vlan
+ *   Object representing the network device to release.
+ */
+void mlx5_vlan_esxi_release(struct rte_eth_dev *dev,
+			    struct mlx5_vf_vlan *vlan)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_vlan_esxi_context *esxi = priv->esxi_context;
+	struct mlx5_vlan_dev *vlan_dev;
+
+	assert(vlan->created);
+	assert(priv->esxi_context);
+	/* Validate the context first - esxi may be NULL here. */
+	if (!vlan->created || !esxi)
+		return;
+	vlan->created = 0;
+	vlan_dev = &esxi->vlan_dev[0];
+	assert(vlan_dev[vlan->tag].refcnt);
+	/* Delete the VLAN netdev when the last reference is gone. */
+	if (--vlan_dev[vlan->tag].refcnt == 0 &&
+	    vlan_dev[vlan->tag].ifindex) {
+		mlx5_vlan_esxi_delete(esxi, vlan_dev[vlan->tag].ifindex);
+		vlan_dev[vlan->tag].ifindex = 0;
+	}
+}
+
+/**
+ * Acquire VLAN interface with specified tag for ESXi workaround.
+ *
+ * @param[in] dev
+ *   Ethernet device object, Netlink context provider.
+ * @param[in] vlan
+ *   Object representing the network device to acquire.
+ */
+void mlx5_vlan_esxi_acquire(struct rte_eth_dev *dev,
+			    struct mlx5_vf_vlan *vlan)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_vlan_esxi_context *esxi = priv->esxi_context;
+	struct mlx5_vlan_dev *vlan_dev;
+
+	assert(!vlan->created);
+	assert(priv->esxi_context);
+	/* Validate the context first - esxi may be NULL here. */
+	if (vlan->created || !esxi)
+		return;
+	vlan_dev = &esxi->vlan_dev[0];
+	/* Create the VLAN netdev on first use of this tag. */
+	if (vlan_dev[vlan->tag].refcnt == 0) {
+		assert(!vlan_dev[vlan->tag].ifindex);
+		vlan_dev[vlan->tag].ifindex =
+			mlx5_vlan_esxi_create(esxi,
+					      esxi->vf_ifindex,
+					      vlan->tag);
+	}
+	/* Mark as created only when the netdev actually exists. */
+	if (vlan_dev[vlan->tag].ifindex) {
+		vlan_dev[vlan->tag].refcnt++;
+		vlan->created = 1;
+	}
+}
+
+/*
+ * Create per ethernet device VLAN ESXi workaround context.
+ *
+ * @param[in] dev
+ *   Ethernet device object.
+ * @param[in] ifindex
+ *   Network interface index of the VF the workaround is created for.
+ *
+ * @return
+ *   Pointer to the allocated context, NULL if the workaround is not
+ *   needed (PF, or no hypervisor detected) or on failure.
+ */
+struct mlx5_vlan_esxi_context *
+mlx5_vlan_esxi_init(struct rte_eth_dev *dev,
+		    uint32_t ifindex)
+{
+	struct mlx5_priv *priv = dev->data->dev_private;
+	struct mlx5_dev_config *config = &priv->config;
+	struct mlx5_vlan_esxi_context *esxi;
+
+	/* Do not engage workaround over PF. */
+	if (!config->vf)
+		return NULL;
+	/* Check whether there is virtual environment */
+	if (rte_hypervisor_get() == RTE_HYPERVISOR_NONE)
+		return NULL;
+	esxi = rte_zmalloc(__func__, sizeof(*esxi), sizeof(uint32_t));
+	if (!esxi) {
+		DRV_LOG(WARNING,
+			"Can not allocate memory"
+			" for ESXi VLAN context");
+		return NULL;
+	}
+	esxi->nl_socket = mlx5_nl_init(NETLINK_ROUTE);
+	if (esxi->nl_socket < 0) {
+		DRV_LOG(WARNING,
+			"Can not create Netlink socket"
+			" for ESXi VLAN context");
+		rte_free(esxi);
+		return NULL;
+	}
+	/* Random seed avoids clashing with other Netlink users. */
+	esxi->nl_sn = random();
+	esxi->vf_ifindex = ifindex;
+	esxi->dev = dev;
+	/*
+	 * NOTE(review): cleanup of stale VLAN devices possibly left over
+	 * from a previous run is not implemented here - confirm whether
+	 * pre-existing "evmlx" netdevs should be removed at this point.
+	 */
+	return esxi;
+}
+
+/*
+ * Destroy per ethernet device VLAN ESXi workaround context.
+ *
+ * @param[in] esxi
+ *   Context object to destroy, must not be NULL.
+ */
+void mlx5_vlan_esxi_exit(struct mlx5_vlan_esxi_context *esxi)
+{
+	unsigned int i;
+
+	/* Delete all remaining VLAN devices. */
+	for (i = 0; i < RTE_DIM(esxi->vlan_dev); i++) {
+		if (esxi->vlan_dev[i].ifindex)
+			mlx5_vlan_esxi_delete(esxi, esxi->vlan_dev[i].ifindex);
+	}
+	/* mlx5_nl_init() may have failed, only close a valid socket. */
+	if (esxi->nl_socket >= 0)
+		close(esxi->nl_socket);
+	rte_free(esxi);
+}