[dpdk-dev] [RFC][PATCH V2 3/3] examples/vhost: Add VMDQ vswitch device

Pankaj Chauhan pankaj.chauhan at nxp.com
Mon Sep 5 12:54:31 CEST 2016


Add support for VMDQ vswitch device. This patch takes
out all VMDQ-specific code from vhost/main.[c,h] and
moves it to vmdq.[c,h]. The vmdq.[c,h] files then plug
the VMDQ vswitch device implementation into the
vhost-switch using the vswitch framework.

The main vhost/main.[c,h] code is now generic and can support
any switch implementation that conforms to the vswitch framework.

Please note that the core VMDQ logic remains as it was in
vhost/main.c; this patch just moves it to a different file and
fits it into the ops provided by the framework.
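
For reference, a switch implementation plugs into the framework by
filling a vswitch_ops structure with its callbacks and registering it,
roughly as done at the end of this patch (abbreviated sketch; the names
are the ones introduced by this series):

    static struct vswitch_ops vmdq_switch_ops = {
        .switch_init  = vmdq_switch_init,   /* apply configuration flags */
        .add_port     = vmdq_add_port,      /* add a phys/virtio port */
        .port_start   = vmdq_port_start,
        .lookup_n_fwd = vmdq_lookup_n_fwd,  /* forward a burst of packets */
    };

    /* Called explicitly from main() until constructor-based registration
     * of switch implementations is in place.
     */
    void vmdq_switch_impl_init(void)
    {
        vmdq_switch_dev_g = vs_register_switch("vmdq",
                sizeof(struct vmdq_switch_priv),
                VMDQ_MAX_VIRTIO_PORTS, &vmdq_switch_ops);
    }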

Signed-off-by: Pankaj Chauhan <pankaj.chauhan at nxp.com>
---
 examples/vhost/main.c | 486 +++++-------------------------------
 examples/vhost/main.h |  10 +
 examples/vhost/vmdq.c | 669 ++++++++++++++++++++++++++++++++++++++++++++++++++
 examples/vhost/vmdq.h |  57 +++++
 4 files changed, 794 insertions(+), 428 deletions(-)
 create mode 100644 examples/vhost/vmdq.c
 create mode 100644 examples/vhost/vmdq.h

diff --git a/examples/vhost/main.c b/examples/vhost/main.c
index a4e51ae..096339b 100644
--- a/examples/vhost/main.c
+++ b/examples/vhost/main.c
@@ -54,6 +54,7 @@
 #include <rte_tcp.h>
 
 #include "main.h"
+#include "vswitch_common.h"
 
 #ifndef MAX_QUEUES
 #define MAX_QUEUES 128
@@ -65,7 +66,6 @@
 #define MBUF_CACHE_SIZE	128
 #define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE
 
-#define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
 #define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */
 
 #define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
@@ -103,7 +103,6 @@ static uint32_t enabled_port_mask = 0;
 static uint32_t promiscuous;
 
 /* number of devices/queues to support*/
-static uint32_t num_queues = 0;
 static uint32_t num_devices;
 
 static struct rte_mempool *mbuf_pool;
@@ -112,6 +111,8 @@ static int mergeable;
 /* Do vlan strip on host, enabled on default */
 static uint32_t vlan_strip = 1;
 
+static uint32_t jumbo_frame_en = 0;
+
 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
 typedef enum {
 	VM2VM_DISABLED = 0,
@@ -146,74 +147,16 @@ static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
 static char switch_dev[MAX_BASENAME_SZ] = "vmdq";
 static uint32_t switch_max_ports = MAX_DEVICES;
 
-/* empty vmdq configuration structure. Filled in programatically */
-static struct rte_eth_conf vmdq_conf_default = {
-	.rxmode = {
-		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
-		.split_hdr_size = 0,
-		.header_split   = 0, /**< Header Split disabled */
-		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
-		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
-		/*
-		 * It is necessary for 1G NIC such as I350,
-		 * this fixes bug of ipv4 forwarding in guest can't
-		 * forward pakets from one virtio dev to another virtio dev.
-		 */
-		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
-		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
-		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
-	},
-
-	.txmode = {
-		.mq_mode = ETH_MQ_TX_NONE,
-	},
-	.rx_adv_conf = {
-		/*
-		 * should be overridden separately in code with
-		 * appropriate values
-		 */
-		.vmdq_rx_conf = {
-			.nb_queue_pools = ETH_8_POOLS,
-			.enable_default_pool = 0,
-			.default_pool = 0,
-			.nb_pool_maps = 0,
-			.pool_map = {{0, 0},},
-		},
-	},
-};
-
+struct vswitch_dev *vswitch_dev_g;
 static unsigned lcore_ids[RTE_MAX_LCORE];
 static uint8_t ports[RTE_MAX_ETHPORTS];
 static unsigned num_ports = 0; /**< The number of ports specified in command line */
-static uint16_t num_pf_queues, num_vmdq_queues;
-static uint16_t vmdq_pool_base, vmdq_queue_base;
-static uint16_t queues_per_pool;
-
-const uint16_t vlan_tags[] = {
-	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
-	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
-	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
-	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
-	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
-	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
-	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
-	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
-};
-
-/* ethernet addresses of ports */
-static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
 
 static struct vhost_dev_tailq_list vhost_dev_list =
 	TAILQ_HEAD_INITIALIZER(vhost_dev_list);
 
 static struct lcore_info lcore_info[RTE_MAX_LCORE];
 
-/* Used for queueing bursts of TX packets. */
-struct mbuf_table {
-	unsigned len;
-	unsigned txq_id;
-	struct rte_mbuf *m_table[MAX_PKT_BURST];
-};
 
 /* TX queue for each data core. */
 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
@@ -223,35 +166,6 @@ struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
 #define VLAN_HLEN       4
 
 /*
- * Builds up the correct configuration for VMDQ VLAN pool map
- * according to the pool & queue limits.
- */
-static inline int
-get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
-{
-	struct rte_eth_vmdq_rx_conf conf;
-	struct rte_eth_vmdq_rx_conf *def_conf =
-		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
-	unsigned i;
-
-	memset(&conf, 0, sizeof(conf));
-	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
-	conf.nb_pool_maps = num_devices;
-	conf.enable_loop_back = def_conf->enable_loop_back;
-	conf.rx_mode = def_conf->rx_mode;
-
-	for (i = 0; i < conf.nb_pool_maps; i++) {
-		conf.pool_map[i].vlan_id = vlan_tags[ i ];
-		conf.pool_map[i].pools = (1UL << i);
-	}
-
-	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
-	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
-		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
-	return 0;
-}
-
-/*
  * Validate the device number according to the max pool number gotten form
  * dev_info. If the device number is invalid, give the error message and
  * return -1. Each device must have its own pool.
@@ -274,16 +188,25 @@ static inline int
 port_init(uint8_t port)
 {
 	struct rte_eth_dev_info dev_info;
-	struct rte_eth_conf port_conf;
 	struct rte_eth_rxconf *rxconf;
 	struct rte_eth_txconf *txconf;
 	int16_t rx_rings, tx_rings;
 	uint16_t rx_ring_size, tx_ring_size;
+	struct vswitch_port *vs_port;
 	int retval;
 	uint16_t q;
 
+	if (port >= rte_eth_dev_count()) return -1;
+
+	vs_port = vs_add_port(vswitch_dev_g, port, VSWITCH_PTYPE_PHYS, NULL);
+
+	if (!vs_port) {
+		rte_exit(EXIT_FAILURE, "Failed to add port [%d] to vsdev %s\n",
+			 port, vswitch_dev_g->name);
+	}
+
 	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
-	rte_eth_dev_info_get (port, &dev_info);
+	rte_eth_dev_info_get (vs_port->port_id, &dev_info);
 
 	if (dev_info.max_rx_queues > MAX_QUEUES) {
 		rte_exit(EXIT_FAILURE,
@@ -298,33 +221,10 @@ port_init(uint8_t port)
 	/* Enable vlan offload */
 	txconf->txq_flags &= ~ETH_TXQ_FLAGS_NOVLANOFFL;
 
-	/*configure the number of supported virtio devices based on VMDQ limits */
-	num_devices = dev_info.max_vmdq_pools;
-
 	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
 	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
 	tx_rings = (uint16_t)rte_lcore_count();
 
-	retval = validate_num_devices(MAX_DEVICES);
-	if (retval < 0)
-		return retval;
-
-	/* Get port configuration. */
-	retval = get_eth_conf(&port_conf, num_devices);
-	if (retval < 0)
-		return retval;
-	/* NIC queues are divided into pf queues and vmdq queues.  */
-	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
-	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
-	num_vmdq_queues = num_devices * queues_per_pool;
-	num_queues = num_pf_queues + num_vmdq_queues;
-	vmdq_queue_base = dev_info.vmdq_queue_base;
-	vmdq_pool_base  = dev_info.vmdq_pool_base;
-	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
-		num_pf_queues, num_devices, queues_per_pool);
-
-	if (port >= rte_eth_dev_count()) return -1;
-
 	if (enable_tx_csum == 0)
 		rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_CSUM);
 
@@ -337,10 +237,11 @@ port_init(uint8_t port)
 
 	rx_rings = (uint16_t)dev_info.max_rx_queues;
 	/* Configure ethernet device. */
-	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
+	retval = rte_eth_dev_configure(vs_port->port_id, rx_rings, tx_rings,
+				       &vs_port->port_conf);
 	if (retval != 0) {
 		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
-			port, strerror(-retval));
+			vs_port->port_id, strerror(-retval));
 		return retval;
 	}
 
@@ -353,7 +254,7 @@ port_init(uint8_t port)
 		if (retval < 0) {
 			RTE_LOG(ERR, VHOST_PORT,
 				"Failed to setup rx queue %u of port %u: %s.\n",
-				q, port, strerror(-retval));
+				q, vs_port->port_id, strerror(-retval));
 			return retval;
 		}
 	}
@@ -369,28 +270,10 @@ port_init(uint8_t port)
 		}
 	}
 
-	/* Start the device. */
-	retval  = rte_eth_dev_start(port);
-	if (retval < 0) {
-		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
-			port, strerror(-retval));
-		return retval;
-	}
-
 	if (promiscuous)
 		rte_eth_promiscuous_enable(port);
 
-	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
-	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
-	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
-			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
-			(unsigned)port,
-			vmdq_ports_eth_addr[port].addr_bytes[0],
-			vmdq_ports_eth_addr[port].addr_bytes[1],
-			vmdq_ports_eth_addr[port].addr_bytes[2],
-			vmdq_ports_eth_addr[port].addr_bytes[3],
-			vmdq_ports_eth_addr[port].addr_bytes[4],
-			vmdq_ports_eth_addr[port].addr_bytes[5]);
+	vs_port_start(vs_port);
 
 	return 0;
 }
@@ -542,9 +425,6 @@ us_vhost_parse_args(int argc, char **argv)
 
 		case 'P':
 			promiscuous = 1;
-			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
-				ETH_VMDQ_ACCEPT_BROADCAST |
-				ETH_VMDQ_ACCEPT_MULTICAST;
 			rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
 
 			break;
@@ -632,11 +512,8 @@ us_vhost_parse_args(int argc, char **argv)
 					return -1;
 				} else {
 					mergeable = !!ret;
-					if (ret) {
-						vmdq_conf_default.rxmode.jumbo_frame = 1;
-						vmdq_conf_default.rxmode.max_rx_pkt_len
-							= JUMBO_FRAME_MAX_SIZE;
-					}
+					if (ret)
+						jumbo_frame_en = 1;
 				}
 			}
 
@@ -651,8 +528,6 @@ us_vhost_parse_args(int argc, char **argv)
 					return -1;
 				} else {
 					vlan_strip = !!ret;
-					vmdq_conf_default.rxmode.hw_vlan_strip =
-						vlan_strip;
 				}
 			}
 
@@ -747,8 +622,7 @@ static unsigned check_ports_num(unsigned nb_ports)
 	return valid_num_ports;
 }
 
-static inline struct vhost_dev *__attribute__((always_inline))
-find_vhost_dev(struct ether_addr *mac)
+struct vhost_dev *find_vhost_dev(struct ether_addr *mac)
 {
 	struct vhost_dev *vdev;
 
@@ -761,95 +635,6 @@ find_vhost_dev(struct ether_addr *mac)
 	return NULL;
 }
 
-/*
- * This function learns the MAC address of the device and registers this along with a
- * vlan tag to a VMDQ.
- */
-static int
-link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
-{
-	struct ether_hdr *pkt_hdr;
-	int i, ret;
-
-	/* Learn MAC address of guest device from packet */
-	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
-
-	if (find_vhost_dev(&pkt_hdr->s_addr)) {
-		RTE_LOG(ERR, VHOST_DATA,
-			"(%d) device is using a registered MAC!\n",
-			vdev->vid);
-		return -1;
-	}
-
-	for (i = 0; i < ETHER_ADDR_LEN; i++)
-		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
-
-	/* vlan_tag currently uses the device_id. */
-	vdev->vlan_tag = vlan_tags[vdev->vid];
-
-	/* Print out VMDQ registration info. */
-	RTE_LOG(INFO, VHOST_DATA,
-		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
-		vdev->vid,
-		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
-		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
-		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
-		vdev->vlan_tag);
-
-	/* Register the MAC address. */
-	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
-				(uint32_t)vdev->vid + vmdq_pool_base);
-	if (ret)
-		RTE_LOG(ERR, VHOST_DATA,
-			"(%d) failed to add device MAC address to VMDQ\n",
-			vdev->vid);
-
-	/* Enable stripping of the vlan tag as we handle routing. */
-	if (vlan_strip)
-		rte_eth_dev_set_vlan_strip_on_queue(ports[0],
-			(uint16_t)vdev->vmdq_rx_q, 1);
-
-	/* Set device as ready for RX. */
-	vdev->ready = DEVICE_RX;
-
-	return 0;
-}
-
-/*
- * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
- * queue before disabling RX on the device.
- */
-static inline void
-unlink_vmdq(struct vhost_dev *vdev)
-{
-	unsigned i = 0;
-	unsigned rx_count;
-	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
-
-	if (vdev->ready == DEVICE_RX) {
-		/*clear MAC and VLAN settings*/
-		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
-		for (i = 0; i < 6; i++)
-			vdev->mac_address.addr_bytes[i] = 0;
-
-		vdev->vlan_tag = 0;
-
-		/*Clear out the receive buffers*/
-		rx_count = rte_eth_rx_burst(ports[0],
-					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
-
-		while (rx_count) {
-			for (i = 0; i < rx_count; i++)
-				rte_pktmbuf_free(pkts_burst[i]);
-
-			rx_count = rte_eth_rx_burst(ports[0],
-					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
-		}
-
-		vdev->ready = DEVICE_MAC_LEARNING;
-	}
-}
-
 static inline void __attribute__((always_inline))
 virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
 	    struct rte_mbuf *m)
@@ -876,8 +661,7 @@ virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
  * Check if the packet destination MAC address is for a local device. If so then put
  * the packet on that devices RX queue. If not then return.
  */
-static inline int __attribute__((always_inline))
-virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
+int virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
 {
 	struct ether_hdr *pkt_hdr;
 	struct vhost_dev *dst_vdev;
@@ -908,69 +692,9 @@ virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
 	return 0;
 }
 
-/*
- * Check if the destination MAC of a packet is one local VM,
- * and get its vlan tag, and offset if it is.
- */
-static inline int __attribute__((always_inline))
-find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
-	uint32_t *offset, uint16_t *vlan_tag)
+struct mbuf_table *vhost_switch_get_txq(uint16_t core_id)
 {
-	struct vhost_dev *dst_vdev;
-	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
-
-	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
-	if (!dst_vdev)
-		return 0;
-
-	if (vdev->vid == dst_vdev->vid) {
-		RTE_LOG(DEBUG, VHOST_DATA,
-			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
-			vdev->vid);
-		return -1;
-	}
-
-	/*
-	 * HW vlan strip will reduce the packet length
-	 * by minus length of vlan tag, so need restore
-	 * the packet length by plus it.
-	 */
-	*offset  = VLAN_HLEN;
-	*vlan_tag = vlan_tags[vdev->vid];
-
-	RTE_LOG(DEBUG, VHOST_DATA,
-		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
-		vdev->vid, dst_vdev->vid, *vlan_tag);
-
-	return 0;
-}
-
-static uint16_t
-get_psd_sum(void *l3_hdr, uint64_t ol_flags)
-{
-	if (ol_flags & PKT_TX_IPV4)
-		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
-	else /* assume ethertype == ETHER_TYPE_IPv6 */
-		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
-}
-
-static void virtio_tx_offload(struct rte_mbuf *m)
-{
-	void *l3_hdr;
-	struct ipv4_hdr *ipv4_hdr = NULL;
-	struct tcp_hdr *tcp_hdr = NULL;
-	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
-
-	l3_hdr = (char *)eth_hdr + m->l2_len;
-
-	if (m->ol_flags & PKT_TX_IPV4) {
-		ipv4_hdr = l3_hdr;
-		ipv4_hdr->hdr_checksum = 0;
-		m->ol_flags |= PKT_TX_IP_CKSUM;
-	}
-
-	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
-	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
+	return &lcore_tx_queue[core_id];
 }
 
 static inline void
@@ -980,8 +704,7 @@ free_pkts(struct rte_mbuf **pkts, uint16_t n)
 		rte_pktmbuf_free(pkts[n]);
 }
 
-static inline void __attribute__((always_inline))
-do_drain_mbuf_table(struct mbuf_table *tx_q)
+void do_drain_mbuf_table(struct mbuf_table *tx_q)
 {
 	uint16_t count;
 	struct vswitch_port *tx_port;
@@ -1006,100 +729,10 @@ do_drain_mbuf_table(struct mbuf_table *tx_q)
 	}
 
 	tx_q->len = 0;
+out:
+	return;
 }
 
-/*
- * This function routes the TX packet to the correct interface. This
- * may be a local device or the physical port.
- */
-static inline void __attribute__((always_inline))
-virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
-{
-	struct mbuf_table *tx_q;
-	unsigned offset = 0;
-	const uint16_t lcore_id = rte_lcore_id();
-	struct ether_hdr *nh;
-
-
-	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
-	if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
-		struct vhost_dev *vdev2;
-
-		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
-			virtio_xmit(vdev2, vdev, m);
-		}
-		goto queue2nic;
-	}
-
-	/*check if destination is local VM*/
-	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
-		rte_pktmbuf_free(m);
-		return;
-	}
-
-	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
-		if (unlikely(find_local_dest(vdev, m, &offset,
-					     &vlan_tag) != 0)) {
-			rte_pktmbuf_free(m);
-			return;
-		}
-	}
-
-	RTE_LOG(DEBUG, VHOST_DATA,
-		"(%d) TX: MAC address is external\n", vdev->vid);
-
-queue2nic:
-
-	/*Add packet to the port tx queue*/
-	tx_q = &lcore_tx_queue[lcore_id];
-
-	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
-	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
-		/* Guest has inserted the vlan tag. */
-		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
-		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
-		if ((vm2vm_mode == VM2VM_HARDWARE) &&
-			(vh->vlan_tci != vlan_tag_be))
-			vh->vlan_tci = vlan_tag_be;
-	} else {
-		m->ol_flags |= PKT_TX_VLAN_PKT;
-
-		/*
-		 * Find the right seg to adjust the data len when offset is
-		 * bigger than tail room size.
-		 */
-		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
-			if (likely(offset <= rte_pktmbuf_tailroom(m)))
-				m->data_len += offset;
-			else {
-				struct rte_mbuf *seg = m;
-
-				while ((seg->next != NULL) &&
-					(offset > rte_pktmbuf_tailroom(seg)))
-					seg = seg->next;
-
-				seg->data_len += offset;
-			}
-			m->pkt_len += offset;
-		}
-
-		m->vlan_tci = vlan_tag;
-	}
-
-	if (m->ol_flags & PKT_TX_TCP_SEG)
-		virtio_tx_offload(m);
-
-	tx_q->m_table[tx_q->len++] = m;
-	if (enable_stats) {
-		vdev->stats.tx_total++;
-		vdev->stats.tx++;
-	}
-
-	if (unlikely(tx_q->len == MAX_PKT_BURST))
-		do_drain_mbuf_table(tx_q);
-}
-
-
 static inline void __attribute__((always_inline))
 drain_mbuf_table(struct mbuf_table *tx_q)
 {
@@ -1123,7 +756,7 @@ drain_mbuf_table(struct mbuf_table *tx_q)
 static inline void __attribute__((always_inline))
 drain_eth_rx(struct vhost_dev *vdev)
 {
-	uint16_t rx_count, enqueue_count;
+	uint16_t rx_count;
 	struct rte_mbuf *pkts[MAX_PKT_BURST];
 	uint16_t rxq, core_id;
 	struct vswitch_port *rx_port;
@@ -1148,37 +781,18 @@ drain_eth_rx(struct vhost_dev *vdev)
 	rxq = rx_port->get_rxq(rx_port, vdev, core_id);
 	rx_count = rx_port->do_rx(rx_port, rxq, NULL, pkts, MAX_PKT_BURST);
 
-	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
-				    pkts, MAX_PKT_BURST);
 	if (!rx_count)
 		return;
 
-	/*
-	 * When "enable_retry" is set, here we wait and retry when there
-	 * is no enough free slots in the queue to hold @rx_count packets,
-	 * to diminish packet loss.
-	 */
-	if (enable_retry &&
-	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
-			VIRTIO_RXQ))) {
-		uint32_t retry;
-
-		for (retry = 0; retry < burst_rx_retry_num; retry++) {
-			rte_delay_us(burst_rx_delay_time);
-			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
-					VIRTIO_RXQ))
-				break;
-		}
-	}
+	vs_lookup_n_fwd(rx_port, pkts, rx_count, rxq);
 
-	enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
-						pkts, rx_count);
 	if (enable_stats) {
 		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
-		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
 	}
 
 	free_pkts(pkts, rx_count);
+out:
+	return;
 }
 
 static inline void __attribute__((always_inline))
@@ -1263,7 +877,7 @@ switch_worker(void *arg __rte_unused)
 		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
 			      lcore_vdev_entry) {
 			if (unlikely(vdev->remove)) {
-				unlink_vmdq(vdev);
+				vs_unlearn_port(vdev->vs_port);
 				vdev->ready = DEVICE_SAFE_REMOVE;
 				continue;
 			}
@@ -1289,6 +903,7 @@ static void
 destroy_device(int vid)
 {
 	struct vhost_dev *vdev = NULL;
+	struct vswitch_port *vs_port;
 	int lcore;
 
 	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
@@ -1324,6 +939,10 @@ destroy_device(int vid)
 
 	lcore_info[vdev->coreid].device_num--;
 
+	vs_port = vdev->vs_port;
+	vs_port_stop(vs_port);
+	vs_del_port(vs_port);
+
 	RTE_LOG(INFO, VHOST_DATA,
 		"(%d) device has been removed from data core\n",
 		vdev->vid);
@@ -1341,6 +960,7 @@ new_device(int vid)
 	int lcore, core_add = 0;
 	uint32_t device_num_min;
 	struct vhost_dev *vdev;
+	struct vswitch_port *vs_port;
 
 	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
 	if (vdev == NULL) {
@@ -1362,7 +982,6 @@ new_device(int vid)
 	vdev->vs_port = vs_port;
 
 	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
-	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
 
 	/*reset ready flag*/
 	vdev->ready = DEVICE_MAC_LEARNING;
@@ -1552,7 +1171,7 @@ main(int argc, char *argv[])
 	uint8_t portid;
 	static pthread_t tid;
 	char thread_name[RTE_MAX_THREAD_NAME_LEN];
-	uint64_t flags = 0;
+	uint64_t flags = 0, vswitch_conf_flags;
 
 	signal(SIGINT, sigint_handler);
 
@@ -1563,11 +1182,19 @@ main(int argc, char *argv[])
 	argc -= ret;
 	argv += ret;
 
+	vs_vswitch_init();
+
+	/* TBD:XXX: This needs to be removed here, when constructor mechanism
+	 * for registering switches is in place
+	 */
+	vmdq_switch_impl_init();
+
 	/* parse app arguments */
 	ret = us_vhost_parse_args(argc, argv);
 	if (ret < 0)
 		rte_exit(EXIT_FAILURE, "Invalid argument\n");
 
+	/*TBD:XXX: vdev list or the vdev ports */
 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
 		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
 
@@ -1592,6 +1219,13 @@ main(int argc, char *argv[])
 		return -1;
 	}
 
+	vswitch_dev_g = vs_get_vswitch_dev(switch_dev);
+	if (!vswitch_dev_g) {
+		RTE_LOG(INFO, VHOST_CONFIG, "switch dev %s not supported\n",
+			switch_dev);
+		return -1;
+	}
+
 	/*
 	 * FIXME: here we are trying to allocate mbufs big enough for
 	 * @MAX_QUEUES, but the truth is we're never going to use that
@@ -1601,12 +1235,8 @@ main(int argc, char *argv[])
 	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
 			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
 
-	if (vm2vm_mode == VM2VM_HARDWARE) {
-		/* Enable VT loop back to let L2 switch to do it. */
-		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
-		RTE_LOG(DEBUG, VHOST_CONFIG,
-			"Enable loop back for L2 switch in vmdq.\n");
-	}
+	vswitch_conf_flags = get_vswitch_conf_flags();
+	vs_switch_dev_init(vswitch_dev_g, vswitch_conf_flags);
 
 	/* initialize all ports */
 	for (portid = 0; portid < nb_ports; portid++) {
diff --git a/examples/vhost/main.h b/examples/vhost/main.h
index 6bb42e8..d6b23f6 100644
--- a/examples/vhost/main.h
+++ b/examples/vhost/main.h
@@ -68,6 +68,7 @@ struct vhost_dev {
 	struct device_statistics stats;
 	TAILQ_ENTRY(vhost_dev) global_vdev_entry;
 	TAILQ_ENTRY(vhost_dev) lcore_vdev_entry;
+	struct vswitch_port *vs_port;
 } __rte_cache_aligned;
 
 TAILQ_HEAD(vhost_dev_tailq_list, vhost_dev);
@@ -88,4 +89,13 @@ struct lcore_info {
 	struct vhost_dev_tailq_list vdev_list;
 };
 
+#define MAX_PKT_BURST 32		/* Max burst size for RX/TX */
+
+/* Used for queueing bursts of TX packets. */
+struct mbuf_table {
+	unsigned len;
+	unsigned txq_id;
+	struct rte_mbuf *m_table[MAX_PKT_BURST];
+};
+
 #endif /* _MAIN_H_ */
diff --git a/examples/vhost/vmdq.c b/examples/vhost/vmdq.c
new file mode 100644
index 0000000..ca26195
--- /dev/null
+++ b/examples/vhost/vmdq.c
@@ -0,0 +1,669 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <rte_atomic.h>
+#include <rte_cycles.h>
+#include <rte_ethdev.h>
+#include <rte_log.h>
+#include <rte_string_fns.h>
+#include <rte_malloc.h>
+#include <rte_virtio_net.h>
+#include <rte_ip.h>
+#include <rte_tcp.h>
+
+#include "vswitch_common.h"
+#include "vmdq.h"
+
+#define JUMBO_FRAME_MAX_SIZE    0x2600
+/* State of virtio device. */
+#define DEVICE_MAC_LEARNING 0
+#define DEVICE_RX			1
+#define DEVICE_SAFE_REMOVE	2
+
+#define VLAN_HLEN       4
+
+static struct vswitch_dev *vmdq_switch_dev_g;
+
+const uint16_t vlan_tags[] = {
+	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
+	1008, 1009, 1010, 1011,	1012, 1013, 1014, 1015,
+	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
+	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
+	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
+	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
+	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
+	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
+};
+
+/* empty vmdq configuration structure. Filled in programmatically */
+static struct rte_eth_conf vmdq_conf_default = {
+	.rxmode = {
+		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
+		.split_hdr_size = 0,
+		.header_split   = 0, /**< Header Split disabled */
+		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
+		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
+		/*
+		 * It is necessary for 1G NICs such as the I350; this fixes
+		 * a bug where ipv4 forwarding in the guest cannot forward
+		 * packets from one virtio dev to another virtio dev.
+		 */
+		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
+		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
+		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
+	},
+
+	.txmode = {
+		.mq_mode = ETH_MQ_TX_NONE,
+	},
+	.rx_adv_conf = {
+		/*
+		 * should be overridden separately in code with
+		 * appropriate values
+		 */
+		.vmdq_rx_conf = {
+			.nb_queue_pools = ETH_8_POOLS,
+			.enable_default_pool = 0,
+			.default_pool = 0,
+			.nb_pool_maps = 0,
+			.pool_map = {{0, 0},},
+		},
+	},
+};
+
+
+static int vmdq_switch_init(__attribute__((unused))struct vswitch_dev *vs_dev,
+			    uint64_t conf_flags)
+{
+	uint32_t enable;
+
+	if (conf_flags & VS_CNF_FLG_VM2VM_HARDWARE) {
+		/* Enable VT loop back to let L2 switch to do it. */
+		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
+		RTE_LOG(DEBUG, VHOST_CONFIG,
+			"Enable loop back for L2 switch in vmdq.\n");
+	}
+
+	if (conf_flags & VS_CNF_FLG_PROMISCOUS_EN) {
+		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
+				ETH_VMDQ_ACCEPT_BROADCAST |
+				ETH_VMDQ_ACCEPT_MULTICAST;
+	}
+
+	if (conf_flags & VS_CNF_FLG_JUMBO_EN) {
+		vmdq_conf_default.rxmode.jumbo_frame = 1;
+		vmdq_conf_default.rxmode.max_rx_pkt_len = JUMBO_FRAME_MAX_SIZE;
+	}
+
+	enable = !!(conf_flags & VS_CNF_FLG_VLAN_STRIP_EN);
+	vmdq_conf_default.rxmode.hw_vlan_strip = enable;
+
+	return 0;
+}
+
+
+static int vmdq_get_max_vdevs(struct vswitch_dev *vs_dev)
+{
+	struct vmdq_switch_priv *priv = vs_dev->priv;
+
+	return priv->num_devices;
+}
+
+/*
+ * Builds up the correct configuration for VMDQ VLAN pool map
+ * according to the pool & queue limits.
+ */
+static inline int
+vmdq_get_eth_conf(struct vswitch_port *vs_port, struct rte_eth_conf *eth_conf)
+{
+	struct rte_eth_vmdq_rx_conf conf;
+	struct rte_eth_vmdq_rx_conf *def_conf =
+		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
+	struct vmdq_switch_priv *priv = vs_port->vs_dev->priv;
+	unsigned i;
+
+	memset(&conf, 0, sizeof(conf));
+	conf.nb_queue_pools = (enum rte_eth_nb_pools)priv->num_devices;
+	conf.nb_pool_maps = priv->num_devices;
+	conf.enable_loop_back = def_conf->enable_loop_back;
+	conf.rx_mode = def_conf->rx_mode;
+
+	for (i = 0; i < conf.nb_pool_maps; i++) {
+		conf.pool_map[i].vlan_id = vlan_tags[ i ];
+		conf.pool_map[i].pools = (1UL << i);
+	}
+
+	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
+	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
+		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
+	return 0;
+}
+
+static int vmdq_add_port_phys(struct vswitch_port *vs_port)
+{
+	struct rte_eth_dev_info dev_info;
+	struct vmdq_switch_priv *priv = vs_port->vs_dev->priv;
+	uint16_t queues_per_pool;
+	int rc = 0;
+
+	if (priv->phys_port_count >= VMDQ_MAX_PHYS_PORTS) {
+		RTE_LOG(INFO, VHOST_CONFIG,
+			"Physical ports exceed the maximum supported (%d)\n",
+			VMDQ_MAX_PHYS_PORTS);
+		rc = -EBUSY;
+		goto out;
+	}
+
+	rte_eth_dev_info_get (vs_port->port_id, &dev_info);
+	if (dev_info.max_vmdq_pools > VMDQ_MAX_VIRTIO_PORTS) {
+		RTE_LOG(INFO, VHOST_CONFIG,
+			"Num devices (%d) greater than Max (%d)\n",
+			dev_info.max_vmdq_pools, VMDQ_MAX_VIRTIO_PORTS);
+		rc = -EINVAL;
+		goto out;
+
+	}
+
+	priv->num_devices = dev_info.max_vmdq_pools;
+	priv->phys_port_count++;
+
+	priv->num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
+	queues_per_pool =  dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
+	priv->queues_per_pool = queues_per_pool;
+	priv->num_vmdq_queues = priv->phys_port_count * queues_per_pool;
+	priv->num_queues = priv->num_pf_queues + priv->num_vmdq_queues;
+	priv->vmdq_queue_base = dev_info.vmdq_queue_base;
+	priv->vmdq_pool_base  = dev_info.vmdq_pool_base;
+
+	rc = vmdq_get_eth_conf(vs_port, &vs_port->port_conf);
+	if (rc < 0) {
+		goto out;
+	}
+
+	/* In the VMDQ vhost switch only one physical port is required; keep
+	 * it global so it can be accessed when doing tx/rx on the physical port.
+	 */
+	priv->phys_port = vs_port;
+
+
+	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
+		priv->num_pf_queues, priv->num_devices, priv->queues_per_pool);
+
+	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n",
+		priv->num_devices);
+out:
+	if (rc) {
+		priv->phys_port_count--;
+	}
+	return rc;
+}
+
+static int vmdq_add_port_virtio(struct vswitch_port *vs_port)
+{
+	struct vmdq_switch_priv *priv = vs_port->vs_dev->priv;
+	uint16_t rxq;
+
+	rxq = vs_port->port_id * priv->queues_per_pool + priv->vmdq_queue_base;
+	vs_port->phys_port_rxq = rxq;
+
+	priv->virtio_port_map[rxq] = vs_port;
+
+	RTE_LOG(INFO, VHOST_PORT, "Added virtio port %d, vmdq rxq %d\n",
+	vs_port->port_id, rxq);
+
+	return 0;
+}
+
+static int vmdq_add_port(struct vswitch_port *port)
+{
+	int rc = 0;
+
+	switch(port->type) {
+	case VSWITCH_PTYPE_PHYS:
+		rc = vmdq_add_port_phys(port);
+		break;
+	case VSWITCH_PTYPE_VIRTIO:
+		rc = vmdq_add_port_virtio(port);
+		break;
+	default:
+		RTE_LOG(INFO, VHOST_CONFIG, "Unknown port[id %d] type %d\n",
+			port->port_id, port->type);
+		rc = -EINVAL;
+	}
+
+	return rc;
+}
+
+static int vmdq_port_start(struct vswitch_port *port)
+{
+	int rc = 0;
+
+	switch(port->type) {
+	case VSWITCH_PTYPE_PHYS:
+		rc  = rte_eth_dev_start(port->port_id);
+		RTE_LOG(INFO, VHOST_PORT, "Started PHYS port %d, rc %d\n", port->port_id, rc);
+		break;
+	case VSWITCH_PTYPE_VIRTIO:
+		/* No specific function to start a virtio dev (?? check) */
+		rc = 0;
+		break;
+	default:
+		RTE_LOG(INFO, VHOST_CONFIG, "Unknown port[id %d] type %d\n",
+			port->port_id, port->type);
+		rc = -EINVAL;
+	}
+
+
+	/* Start the device. */
+	if (rc) {
+		RTE_LOG(ERR, VHOST_PORT, "Failed to init port[id %d, typ %d]\n",
+			port->port_id, port->type);
+	}
+
+	return rc;
+}
+
+/*
+ * This function learns the MAC address of the device and registers this along with a
+ * vlan tag to a VMDQ.
+ */
+static int
+link_vmdq(struct vswitch_port *vs_port, struct rte_mbuf *m)
+{
+	struct vhost_dev *vdev = vs_port->priv;
+	struct vmdq_switch_priv *priv = vs_port->vs_dev->priv;
+	struct vswitch_dev *vs_dev = vs_port->vs_dev;
+	struct vswitch_port *phys_port = priv->phys_port;
+	struct ether_hdr *pkt_hdr;
+	int i, ret;
+
+	/* Learn MAC address of guest device from packet */
+	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
+
+	if (find_vhost_dev(&pkt_hdr->s_addr)) {
+		RTE_LOG(ERR, VHOST_DATA,
+			"(%d) device is using a registered MAC!\n",
+			vdev->vid);
+		return -1;
+	}
+
+	for (i = 0; i < ETHER_ADDR_LEN; i++) {
+		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
+		vs_port->mac_addr.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
+	}
+
+	/* vlan_tag currently uses the device_id. */
+	vdev->vlan_tag = vlan_tags[vdev->vid];
+
+	/* Print out VMDQ registration info. */
+	RTE_LOG(INFO, VHOST_DATA,
+		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
+		vdev->vid,
+		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
+		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
+		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
+		vdev->vlan_tag);
+
+	/* Register the MAC address. */
+	ret = rte_eth_dev_mac_addr_add(phys_port->port_id, &vdev->mac_address,
+				(uint32_t)vdev->vid + priv->vmdq_pool_base);
+	if (ret)
+		RTE_LOG(ERR, VHOST_DATA,
+			"(%d) failed to add device MAC address to VMDQ\n",
+			vdev->vid);
+
+	/* Enable stripping of the vlan tag as we handle routing. */
+	if (vs_dev->conf_flags & VS_CNF_FLG_VLAN_STRIP_EN)
+		rte_eth_dev_set_vlan_strip_on_queue(phys_port->port_id,
+			(uint16_t)vs_port->phys_port_rxq, 1);
+
+	/* Set device as ready for RX. */
+	vdev->ready = DEVICE_RX;
+
+	return 0;
+}
+
+static int vmdq_learn_port (struct vswitch_port *vs_port,
+			    struct rte_mbuf **pkts,
+		     __attribute__((unused))uint16_t count)
+{
+	int rc  = 0;
+
+	if (vs_port->type == VSWITCH_PTYPE_VIRTIO)
+		rc = link_vmdq(vs_port, pkts[0]);
+
+	return rc;
+}
+
+/*
+ * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
+ * queue before disabling RX on the device.
+ */
+static void unlink_vmdq(struct vswitch_port *vs_port)
+{
+	struct vhost_dev *vdev = vs_port->priv;
+	struct vmdq_switch_priv *priv = vs_port->vs_dev->priv;
+	struct vswitch_port *phys_port = priv->phys_port;
+	unsigned i = 0;
+	unsigned rx_count;
+	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
+
+	if (vdev->ready == DEVICE_RX) {
+		/*clear MAC and VLAN settings*/
+		rte_eth_dev_mac_addr_remove(phys_port->port_id,
+					    &vdev->mac_address);
+		for (i = 0; i < 6; i++) {
+			vdev->mac_address.addr_bytes[i] = 0;
+			vs_port->mac_addr.addr_bytes[i] = 0;
+		}
+
+		vdev->vlan_tag = 0;
+
+		/*Clear out the receive buffers*/
+		rx_count = rte_eth_rx_burst(phys_port->port_id,
+					(uint16_t)vs_port->phys_port_rxq,
+					pkts_burst, MAX_PKT_BURST);
+
+		while (rx_count) {
+			for (i = 0; i < rx_count; i++)
+				rte_pktmbuf_free(pkts_burst[i]);
+
+			rx_count = rte_eth_rx_burst(phys_port->port_id,
+					(uint16_t)vs_port->phys_port_rxq, pkts_burst,
+					MAX_PKT_BURST);
+		}
+
+		vdev->ready = DEVICE_MAC_LEARNING;
+	}
+}
+
+static int vmdq_unlearn_port (struct vswitch_port *vs_port)
+{
+	int rc = 0;
+
+	if (vs_port->type == VSWITCH_PTYPE_VIRTIO)
+		unlink_vmdq(vs_port);
+
+	return rc;
+}
+
+/*
+ * Check if the destination MAC of a packet is one local VM,
+ * and get its vlan tag, and offset if it is.
+ */
+static inline int __attribute__((always_inline))
+find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
+	uint32_t *offset, uint16_t *vlan_tag)
+{
+	struct vhost_dev *dst_vdev;
+	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
+
+	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
+	if (!dst_vdev)
+		return 0;
+
+	if (vdev->vid == dst_vdev->vid) {
+		RTE_LOG(DEBUG, VHOST_DATA,
+			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
+			vdev->vid);
+		return -1;
+	}
+
+	/*
+	 * HW vlan strip will reduce the packet length
+	 * by minus length of vlan tag, so need restore
+	 * the packet length by plus it.
+	 */
+	*offset  = VLAN_HLEN;
+	*vlan_tag = vlan_tags[vdev->vid];
+
+	RTE_LOG(DEBUG, VHOST_DATA,
+		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
+		vdev->vid, dst_vdev->vid, *vlan_tag);
+
+	return 0;
+}
+
+static uint16_t
+get_psd_sum(void *l3_hdr, uint64_t ol_flags)
+{
+	if (ol_flags & PKT_TX_IPV4)
+		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
+	else /* assume ethertype == ETHER_TYPE_IPv6 */
+		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
+}
+
+static void virtio_tx_offload(struct rte_mbuf *m)
+{
+	void *l3_hdr;
+	struct ipv4_hdr *ipv4_hdr = NULL;
+	struct tcp_hdr *tcp_hdr = NULL;
+	struct ether_hdr *eth_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
+
+	l3_hdr = (char *)eth_hdr + m->l2_len;
+
+	if (m->ol_flags & PKT_TX_IPV4) {
+		ipv4_hdr = l3_hdr;
+		ipv4_hdr->hdr_checksum = 0;
+		m->ol_flags |= PKT_TX_IP_CKSUM;
+	}
+
+	tcp_hdr = (struct tcp_hdr *)((char *)l3_hdr + m->l3_len);
+	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
+}
+
+/*
+ * This function routes the TX packet to the correct interface. This
+ * may be a local device or the physical port.
+ */
+static inline void __attribute__((always_inline))
+virtio_tx_route(struct vswitch_port *vs_port,
+		struct rte_mbuf *m, uint16_t vlan_tag)
+{
+	struct vhost_dev *vdev = vs_port->priv;
+	struct vswitch_dev *vs_dev = vs_port->vs_dev;
+	struct mbuf_table *tx_q;
+	unsigned offset = 0;
+	const uint16_t lcore_id = rte_lcore_id();
+	struct ether_hdr *nh;
+	uint32_t tx_ptype_mask;
+
+	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
+	if (unlikely(is_broadcast_ether_addr(&nh->d_addr))) {
+
+		/* Broadcast to virtio ports only; for the physical port,
+		 * queue2nic will do it after adding the device's vlan tag.
+		 */
+		tx_ptype_mask = VS_PTYPE_MASK(VSWITCH_PTYPE_VIRTIO);
+		vs_do_broadcast_fwd(vs_port->vs_dev, vs_port,
+			tx_ptype_mask, m);
+
+		goto queue2nic;
+	}
+
+	/*check if destination is local VM*/
+	if ((vs_dev->conf_flags & VS_CNF_FLG_VM2VM_SOFTWARE)) {
+		if (!virtio_tx_local(vdev, m)) {
+			rte_pktmbuf_free(m);
+			return;
+		}
+	}
+
+	if (unlikely(vs_dev->conf_flags & VS_CNF_FLG_VM2VM_HARDWARE)) {
+		if (unlikely(find_local_dest(vdev, m, &offset,
+					     &vlan_tag) != 0)) {
+			rte_pktmbuf_free(m);
+			return;
+		}
+	}
+
+	RTE_LOG(DEBUG, VHOST_DATA,
+		"(%d) TX: MAC address is external\n", vdev->vid);
+
+queue2nic:
+
+	/*Add packet to the port tx queue*/
+	tx_q = vhost_switch_get_txq(lcore_id);
+
+	nh = rte_pktmbuf_mtod(m, struct ether_hdr *);
+	if (unlikely(nh->ether_type == rte_cpu_to_be_16(ETHER_TYPE_VLAN))) {
+		/* Guest has inserted the vlan tag. */
+		struct vlan_hdr *vh = (struct vlan_hdr *) (nh + 1);
+		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
+		if ((vs_dev->conf_flags & VS_CNF_FLG_VM2VM_HARDWARE) &&
+			(vh->vlan_tci != vlan_tag_be))
+			vh->vlan_tci = vlan_tag_be;
+	} else {
+		m->ol_flags |= PKT_TX_VLAN_PKT;
+
+		/*
+		 * Find the right seg to adjust the data len when offset is
+		 * bigger than tail room size.
+		 */
+		if (unlikely((vs_dev->conf_flags & VS_CNF_FLG_VM2VM_HARDWARE))){
+			if (likely(offset <= rte_pktmbuf_tailroom(m)))
+				m->data_len += offset;
+			else {
+				struct rte_mbuf *seg = m;
+
+				while ((seg->next != NULL) &&
+					(offset > rte_pktmbuf_tailroom(seg)))
+					seg = seg->next;
+
+				seg->data_len += offset;
+			}
+			m->pkt_len += offset;
+		}
+
+		m->vlan_tci = vlan_tag;
+	}
+
+	if (m->ol_flags & PKT_TX_TCP_SEG)
+		virtio_tx_offload(m);
+
+	tx_q->m_table[tx_q->len++] = m;
+	if (vs_dev->conf_flags & VS_CNF_FLG_STATS_EN) {
+		vdev->stats.tx_total++;
+		vdev->stats.tx++;
+	}
+
+	if (unlikely(tx_q->len == MAX_PKT_BURST))
+		do_drain_mbuf_table(tx_q);
+}
+
+static int vmdq_lookup_n_fwd_virtio(struct vswitch_port *vs_port,
+			struct rte_mbuf **pkts, uint16_t count,
+			__attribute__((unused)) uint16_t in_rxq)
+{
+	int i;
+	struct vhost_dev *vdev = vs_port->priv;
+
+	for (i = 0; i < count; ++i)
+		virtio_tx_route(vs_port, pkts[i], vlan_tags[vdev->vid]);
+
+	return 0;
+}
+
+static int vmdq_lookup_n_fwd_phys(struct vswitch_port *vs_port,
+			struct rte_mbuf **pkts, uint16_t count, uint16_t in_rxq)
+{
+	struct vswitch_port *dest_port;
+	struct vhost_dev *dest_vdev;
+	struct vmdq_switch_priv *priv = vs_port->vs_dev->priv;
+	uint16_t enqueue_count;
+
+	dest_port = priv->virtio_port_map[in_rxq];
+	dest_vdev = (struct vhost_dev *)dest_port->priv;
+	enqueue_count = dest_port->do_tx(dest_port, VIRTIO_RXQ,
+					 NULL, pkts, count);
+
+	rte_atomic64_add(&dest_vdev->stats.rx_atomic, enqueue_count);
+
+	return 0;
+}
+
+static int vmdq_lookup_n_fwd(struct vswitch_port *vs_port,
+		struct rte_mbuf **pkts, uint16_t count, uint16_t in_rxq)
+{
+	int rc;
+
+	switch(vs_port->type) {
+	case VSWITCH_PTYPE_VIRTIO:
+		rc = vmdq_lookup_n_fwd_virtio(vs_port, pkts, count, in_rxq);
+		break;
+	case VSWITCH_PTYPE_PHYS:
+		rc = vmdq_lookup_n_fwd_phys(vs_port, pkts, count, in_rxq);
+		break;
+	default:
+		rc = -EINVAL;
+		break;
+	}
+
+	return rc;
+}
+
+static struct vswitch_port *vmdq_sched_phys_port(struct vswitch_dev *vs_dev,
+			__attribute__((unused))enum vswitch_port_type ptype,
+			__attribute__((unused))uint16_t core_id)
+{
+	struct vmdq_switch_priv *priv = vs_dev->priv;
+
+	/*With VMDQ do rx/tx with the only one physical port (non virtio)*/
+
+	return priv->phys_port;
+}
+
+struct vswitch_ops vmdq_switch_ops = {
+	.add_port = vmdq_add_port,
+	.lookup_n_fwd = vmdq_lookup_n_fwd,
+	.port_start = vmdq_port_start,
+	.switch_init = vmdq_switch_init,
+	.learn_port = vmdq_learn_port,
+	.unlearn_port = vmdq_unlearn_port,
+	.sched_rx_port = vmdq_sched_phys_port,
+	.sched_tx_port = vmdq_sched_phys_port,
+	.get_max_vdevs = vmdq_get_max_vdevs,
+};
+
+void vmdq_switch_impl_init(void)
+{
+	vmdq_switch_dev_g = vs_register_switch("vmdq",
+			sizeof(struct vmdq_switch_priv), VMDQ_MAX_VIRTIO_PORTS,
+			&vmdq_switch_ops);
+	if (!vmdq_switch_dev_g) {
+		RTE_LOG(DEBUG, VHOST_CONFIG, "VMDQ switch registration failure\n");
+		goto out;
+	}
+
+out:
+	return;
+}
diff --git a/examples/vhost/vmdq.h b/examples/vhost/vmdq.h
new file mode 100644
index 0000000..bff8535
--- /dev/null
+++ b/examples/vhost/vmdq.h
@@ -0,0 +1,57 @@
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Freescale Semiconductor. All rights reserved.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Freescale Semiconductor nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __VHOST_VMDQ_H__
+#define __VHOST_VMDQ_H__
+
+#include "vswitch_common.h"
+
+#define VMDQ_MAX_PHYS_PORTS	1
+#define VMDQ_MAX_VIRTIO_PORTS	64
+
+struct vmdq_switch_priv {
+	struct vswitch_dev *vs_dev;
+	struct vswitch_port *phys_port;
+	struct vswitch_port *virtio_port_map[VMDQ_MAX_VIRTIO_PORTS];
+	int phys_port_count;
+	int num_devices;
+	uint16_t num_pf_queues;
+	uint16_t num_queues;
+	uint16_t num_vmdq_queues;
+	uint16_t vmdq_pool_base;
+	uint16_t vmdq_queue_base;
+	uint16_t queues_per_pool;
+	uint16_t conf_flags;
+};
+
+#endif
-- 
1.9.1


