[dpdk-dev] [PATCH 2/4] bond: added link bonding mode 6 implementation.

Michal Jastrzebski michalx.k.jastrzebski at intel.com
Fri Jan 30 11:57:42 CET 2015


This mode includes adaptive TLB and receive load balancing (RLB). In RLB
the bonding driver intercepts ARP replies sent by the local system and
overwrites their source MAC address, so that different peers send data to
the server on different slave interfaces. When the local system sends an
ARP request, IP information from it is saved. When the ARP reply from that
peer is received, its MAC is stored, one of the slave MACs is assigned and
an ARP reply is sent to that peer.

Signed-off-by: Maciej Gajdzica  <maciejx.t.gajdzica at intel.com>
---
 lib/librte_pmd_bond/Makefile               |    1 +
 lib/librte_pmd_bond/rte_eth_bond.h         |    9 +
 lib/librte_pmd_bond/rte_eth_bond_alb.c     |  251 ++++++++++++++++++++++++++++
 lib/librte_pmd_bond/rte_eth_bond_alb.h     |  109 ++++++++++++
 lib/librte_pmd_bond/rte_eth_bond_api.c     |    6 +
 lib/librte_pmd_bond/rte_eth_bond_args.c    |    1 +
 lib/librte_pmd_bond/rte_eth_bond_pmd.c     |  231 ++++++++++++++++++++++---
 lib/librte_pmd_bond/rte_eth_bond_private.h |    2 +
 8 files changed, 589 insertions(+), 21 deletions(-)
 create mode 100644 lib/librte_pmd_bond/rte_eth_bond_alb.c
 create mode 100644 lib/librte_pmd_bond/rte_eth_bond_alb.h

diff --git a/lib/librte_pmd_bond/Makefile b/lib/librte_pmd_bond/Makefile
index cdff126..d111f0c 100644
--- a/lib/librte_pmd_bond/Makefile
+++ b/lib/librte_pmd_bond/Makefile
@@ -46,6 +46,7 @@ SRCS-$(CONFIG_RTE_LIBRTE_PMD_BOND) += rte_eth_bond_api.c
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_BOND) += rte_eth_bond_pmd.c
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_BOND) += rte_eth_bond_args.c
 SRCS-$(CONFIG_RTE_LIBRTE_PMD_BOND) += rte_eth_bond_8023ad.c
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_BOND) += rte_eth_bond_alb.c
 
 ifeq ($(CONFIG_RTE_MBUF_REFCNT),n)
 $(info WARNING: Link Bonding Broadcast mode is disabled because it needs MBUF_REFCNT.)
diff --git a/lib/librte_pmd_bond/rte_eth_bond.h b/lib/librte_pmd_bond/rte_eth_bond.h
index 7177983..13581cb 100644
--- a/lib/librte_pmd_bond/rte_eth_bond.h
+++ b/lib/librte_pmd_bond/rte_eth_bond.h
@@ -101,6 +101,15 @@ extern "C" {
  * This mode provides an adaptive transmit load balancing. It dynamically
  * changes the transmitting slave, according to the computed load. Statistics
  * are collected in 100ms intervals and scheduled every 10ms */
+#define BONDING_MODE_ALB	(6)
+/**< Adaptive Load Balancing (Mode 6)
+ * This mode includes adaptive TLB and receive load balancing (RLB). In RLB the
+ * bonding driver intercepts ARP replies sent by the local system and rewrites
+ * their source MAC address, so that different peers send data to the server
+ * on different slave interfaces. When the local system sends an ARP request,
+ * IP information from it is saved. When the ARP reply from that peer arrives,
+ * its MAC is stored, a slave MAC is assigned and an ARP reply is sent to it.
+ */
 
 /* Balance Mode Transmit Policies */
 #define BALANCE_XMIT_POLICY_LAYER2		(0)
diff --git a/lib/librte_pmd_bond/rte_eth_bond_alb.c b/lib/librte_pmd_bond/rte_eth_bond_alb.c
new file mode 100644
index 0000000..449b2f8
--- /dev/null
+++ b/lib/librte_pmd_bond/rte_eth_bond_alb.c
@@ -0,0 +1,251 @@
+#include "rte_eth_bond_private.h"
+#include "rte_eth_bond_alb.h"
+
+/* XOR-fold hash_size bytes starting at hash_start into a single byte. */
+static inline uint8_t
+simple_hash(uint8_t *hash_start, int hash_size)
+{
+	uint8_t folded = 0;
+	uint8_t *cur = hash_start;
+	int remaining;
+
+	for (remaining = hash_size; remaining > 0; --remaining)
+		folded ^= *cur++;
+	return folded;
+}
+/* Active slave after mode6.last_slave (wrapping), or primary port if none */
+static uint8_t
+calculate_slave(struct bond_dev_private *internals)
+{
+	if (internals->active_slave_count == 0)	/* '%' would divide by zero */
+		return internals->current_primary_port;
+	return internals->active_slaves[(internals->mode6.last_slave + 1) %
+			internals->active_slave_count];
+}
+
+/* Reset ALB state and create the ARP update mempool if it does not exist.
+ * Returns 0 on success, -1 when the mempool cannot be created. */
+int
+bond_mode_alb_enable(struct rte_eth_dev *bond_dev)
+{
+	struct bond_dev_private *internals = bond_dev->data->dev_private;
+	struct client_data *hash_table = internals->mode6.client_table;
+	char mem_name[RTE_ETH_NAME_MAX_LEN];
+	int socket_id = bond_dev->pci_dev->numa_node;
+	uint16_t element_size;
+
+	/* Fill hash table with initial values */
+	memset(hash_table, 0, sizeof(struct client_data) * ALB_HASH_TABLE_SIZE);
+
+	/* The rx/tx paths take this lock around table updates; start unlocked */
+	rte_spinlock_init(&internals->mode6.lock);
+	internals->mode6.last_slave = ALB_NULL_INDEX;
+	internals->mode6.ntt = 0;
+
+	/* Initialize memory pool for ARP packets to send, only once */
+	if (internals->mode6.mempool != NULL)
+		return 0;
+
+	/*
+	 * 256 is size of ETH header, ARP header and nested VLAN headers.
+	 * The value is chosen to be cache aligned.
+	 */
+	element_size = 256 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM;
+	snprintf(mem_name, sizeof(mem_name), "%s_MODE6", bond_dev->data->name);
+	internals->mode6.mempool = rte_mempool_create(mem_name,
+			512 * RTE_MAX_ETHPORTS, element_size,
+			RTE_MEMPOOL_CACHE_MAX_SIZE >= 32 ?
+					32 : RTE_MEMPOOL_CACHE_MAX_SIZE,
+			sizeof(struct rte_pktmbuf_pool_private), rte_pktmbuf_pool_init,
+			NULL, rte_pktmbuf_init, NULL, socket_id, 0);
+	if (internals->mode6.mempool == NULL) {
+		/* Let the caller handle the failure instead of aborting the process */
+		RTE_LOG(ERR, PMD, "%s: Failed to initialize ALB mempool.\n",
+				bond_dev->data->name);
+		return -1;
+	}
+	return 0;
+}
+
+/* Learn a peer from a received ARP reply: (re)fill the client table entry
+ * hashed from the sender IP and set mode6.ntt so ARP updates get sent. */
+void
+bond_mode_alb_arp_recv(struct ether_hdr *eth_h, uint16_t offset,
+		struct bond_dev_private *internals)
+{
+	struct client_data *hash_table = internals->mode6.client_table;
+	struct client_data *client_info;
+	struct arp_hdr *arp;
+	uint16_t vlan_len = offset;
+	uint8_t hash_index;
+
+	arp = (struct arp_hdr *)((char *)(eth_h + 1) + offset);
+
+	/* ARP Requests are forwarded to the application with no changes */
+	if (arp->arp_op != rte_cpu_to_be_16(ARP_OP_REPLY))
+		return;
+
+	hash_index = simple_hash((uint8_t *)&arp->arp_data.arp_sip,
+			sizeof(uint32_t));
+	client_info = &hash_table[hash_index];
+
+	/* Deeper tag stacks must not overflow the fixed-size vlan[] storage */
+	if (vlan_len > sizeof(client_info->vlan))
+		vlan_len = sizeof(client_info->vlan);
+
+	/* Reply to an ARP request sent by the application: learn the peer */
+	rte_spinlock_lock(&internals->mode6.lock);
+	if (client_info->in_use == 0 ||
+		client_info->app_ip != arp->arp_data.arp_tip ||
+		client_info->cli_ip != arp->arp_data.arp_sip) {
+		client_info->in_use = 1;
+		client_info->app_ip = arp->arp_data.arp_tip;
+		client_info->cli_ip = arp->arp_data.arp_sip;
+		ether_addr_copy(&arp->arp_data.arp_sha, &client_info->cli_mac);
+		client_info->slave_idx = calculate_slave(internals);
+		internals->mode6.last_slave = client_info->slave_idx;
+		rte_eth_macaddr_get(client_info->slave_idx, &client_info->app_mac);
+		ether_addr_copy(&client_info->app_mac, &arp->arp_data.arp_tha);
+		memcpy(client_info->vlan, eth_h + 1, vlan_len);
+	} else if (!is_same_ether_addr(&client_info->cli_mac,
+			&arp->arp_data.arp_sha)) {
+		/* Same pair of IPs but a different peer MAC: update client MAC only */
+		ether_addr_copy(&arp->arp_data.arp_sha, &client_info->cli_mac);
+	}
+	internals->mode6.ntt = 1;
+	rte_spinlock_unlock(&internals->mode6.lock);
+}
+
+/*
+ * Pick the slave port for an outgoing ARP packet and patch its sender MAC;
+ * only replies sent from the bonding MAC get a client table entry.
+ */
+uint8_t
+bond_mode_alb_arp_xmit(struct ether_hdr *eth_h, uint16_t offset,
+		struct bond_dev_private *internals)
+{
+	struct client_data *hash_table = internals->mode6.client_table;
+	struct client_data *client_info;
+	struct ether_addr bonding_mac;
+	struct arp_hdr *arp;
+	uint16_t vlan_len = offset;
+	uint8_t hash_index;
+	uint8_t slave_idx;
+
+	arp = (struct arp_hdr *)((char *)(eth_h + 1) + offset);
+
+	/*
+	 * Traffic with src MAC other than bonding, and anything that is not an
+	 * ARP Reply, is sent on current primary port.
+	 */
+	rte_eth_macaddr_get(internals->port_id, &bonding_mac);
+	if (!is_same_ether_addr(&bonding_mac, &arp->arp_data.arp_sha) ||
+			arp->arp_op != rte_cpu_to_be_16(ARP_OP_REPLY)) {
+		rte_eth_macaddr_get(internals->current_primary_port,
+				&arp->arp_data.arp_sha);
+		return internals->current_primary_port;
+	}
+
+	hash_index = simple_hash((uint8_t *)&arp->arp_data.arp_tip,
+			sizeof(uint32_t));
+	client_info = &hash_table[hash_index];
+
+	/* Deeper tag stacks must not overflow the fixed-size vlan[] storage */
+	if (vlan_len > sizeof(client_info->vlan))
+		vlan_len = sizeof(client_info->vlan);
+
+	rte_spinlock_lock(&internals->mode6.lock);
+	if (client_info->in_use &&
+			client_info->app_ip == arp->arp_data.arp_sip &&
+			client_info->cli_ip == arp->arp_data.arp_tip) {
+		/* Entry is already assigned to this client */
+		if (!is_broadcast_ether_addr(&arp->arp_data.arp_tha)) {
+			ether_addr_copy(&arp->arp_data.arp_tha,
+					&client_info->cli_mac);
+		}
+	} else {
+		/* Assign new slave to this client and update src mac in ARP */
+		client_info->in_use = 1;
+		client_info->ntt = 0;
+		client_info->app_ip = arp->arp_data.arp_sip;
+		ether_addr_copy(&arp->arp_data.arp_tha, &client_info->cli_mac);
+		client_info->cli_ip = arp->arp_data.arp_tip;
+		client_info->slave_idx = calculate_slave(internals);
+		internals->mode6.last_slave = client_info->slave_idx;
+	}
+
+	/* Both cases answer with the MAC of the slave bound to this client */
+	rte_eth_macaddr_get(client_info->slave_idx, &client_info->app_mac);
+	ether_addr_copy(&client_info->app_mac, &arp->arp_data.arp_sha);
+	memcpy(client_info->vlan, eth_h + 1, vlan_len);
+
+	/* Read the port while the lock is still held */
+	slave_idx = client_info->slave_idx;
+	rte_spinlock_unlock(&internals->mode6.lock);
+
+	return slave_idx;
+}
+
+/* Fill 'pkt' with an ARP reply carrying the addresses stored in client_info;
+ * returns the slave port recorded for that client. */
+uint8_t
+bond_mode_alb_arp_upd(struct client_data *client_info,
+		struct rte_mbuf *pkt, struct bond_dev_private *internals)
+{
+	struct ether_hdr *eth_h;
+	struct arp_hdr *arp_h;
+	size_t vlan_len;
+	uint8_t slave_idx;
+
+	rte_spinlock_lock(&internals->mode6.lock);
+	eth_h = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+	vlan_len = client_info->vlan_count * sizeof(struct vlan_hdr);
+
+	ether_addr_copy(&client_info->app_mac, &eth_h->s_addr);
+	ether_addr_copy(&client_info->cli_mac, &eth_h->d_addr);
+	/* With stored tags the ETH header must announce VLAN, not ARP */
+	eth_h->ether_type = rte_cpu_to_be_16(client_info->vlan_count > 0 ?
+			ETHER_TYPE_VLAN : ETHER_TYPE_ARP);
+	memcpy(eth_h + 1, client_info->vlan, vlan_len);
+	arp_h = (struct arp_hdr *)((char *)(eth_h + 1) + vlan_len);
+
+	ether_addr_copy(&client_info->app_mac, &arp_h->arp_data.arp_sha);
+	arp_h->arp_data.arp_sip = client_info->app_ip;
+	ether_addr_copy(&client_info->cli_mac, &arp_h->arp_data.arp_tha);
+	arp_h->arp_data.arp_tip = client_info->cli_ip;
+
+	arp_h->arp_hrd = rte_cpu_to_be_16(ARP_HRD_ETHER);
+	arp_h->arp_pro = rte_cpu_to_be_16(ETHER_TYPE_IPv4);
+	arp_h->arp_hln = ETHER_ADDR_LEN;
+	arp_h->arp_pln = sizeof(uint32_t);
+	arp_h->arp_op = rte_cpu_to_be_16(ARP_OP_REPLY);
+	slave_idx = client_info->slave_idx;
+	rte_spinlock_unlock(&internals->mode6.lock);
+	return slave_idx;
+}
+
+/* Give every client table entry in use a slave from calculate_slave() and
+ * flag that ARP updates are needed. Does nothing without active slaves. */
+void
+bond_mode_alb_client_list_upd(struct rte_eth_dev *bond_dev)
+{
+	struct bond_dev_private *internals = bond_dev->data->dev_private;
+	struct client_data *entry = internals->mode6.client_table;
+	struct client_data *const table_end = entry + ALB_HASH_TABLE_SIZE;
+
+	/* If active slave count is 0, it's pointless to refresh alb table */
+	if (internals->active_slave_count <= 0)
+		return;
+
+	rte_spinlock_lock(&internals->mode6.lock);
+	internals->mode6.last_slave = ALB_NULL_INDEX;
+	for (; entry != table_end; entry++) {
+		if (!entry->in_use)
+			continue;
+		entry->slave_idx = calculate_slave(internals);
+		internals->mode6.last_slave = entry->slave_idx;
+		rte_eth_macaddr_get(entry->slave_idx, &entry->app_mac);
+		internals->mode6.ntt = 1;
+	}
+	rte_spinlock_unlock(&internals->mode6.lock);
+}
diff --git a/lib/librte_pmd_bond/rte_eth_bond_alb.h b/lib/librte_pmd_bond/rte_eth_bond_alb.h
new file mode 100644
index 0000000..0cfe942
--- /dev/null
+++ b/lib/librte_pmd_bond/rte_eth_bond_alb.h
@@ -0,0 +1,109 @@
+#ifndef RTE_ETH_BOND_ALB_H_
+#define RTE_ETH_BOND_ALB_H_
+
+#include <rte_ether.h>
+#include <rte_arp.h>
+
+#define ALB_HASH_TABLE_SIZE	256
+#define ALB_NULL_INDEX		0xFFFFFFFF
+
+struct client_data {
+	/* ARP data of single client; one entry of the ALB client table */
+	struct ether_addr app_mac;
+	/**< MAC address of application running DPDK */
+	uint32_t app_ip;
+	/**< IP address of application running DPDK, as read from ARP header */
+	struct ether_addr cli_mac;
+	/**< Client MAC address */
+	uint32_t cli_ip;
+	/**< Client IP address, as read from ARP header */
+
+	uint8_t slave_idx;
+	/**< Port id of the slave used to communicate with that client */
+	uint8_t in_use;
+	/**< Flag indicating if entry in client table is currently used */
+	uint8_t ntt;
+	/**< Flag indicating if we need to send update to this client on next tx */
+
+	struct vlan_hdr vlan[2];
+	/**< Raw VLAN headers copied from the ARP frame (room for two) */
+	uint8_t vlan_count;
+	/**< Number of nested vlan headers. NOTE(review): never assigned here */
+};
+
+struct mode_alb_private {
+	struct client_data client_table[ALB_HASH_TABLE_SIZE];
+	/**< Hash table storing ARP data of every client connected */
+	struct rte_mempool *mempool;
+	/**< Mempool for creating ARP update packets */
+	uint8_t ntt;
+	/**< Flag indicating if we need to send update to any client on next tx */
+	uint32_t last_slave;
+	/**< Slave most recently assigned to a client, ALB_NULL_INDEX if none */
+	rte_spinlock_t lock;	/**< Taken around client table updates */
+};
+
+/**
+ * ALB mode initialization: clears the client table and creates the mempool
+ * used for ARP update packets if it does not exist yet.
+ * @param bond_dev		Pointer to bonding device.
+ *
+ * @return
+ * Error code - 0 on success.
+ */
+int
+bond_mode_alb_enable(struct rte_eth_dev *bond_dev);
+
+/**
+ * Function handles ARP packet reception. A received ARP request is forwarded
+ * to the application without changes. On an ARP reply the client table
+ * is updated.
+ *
+ * @param eth_h			ETH header of received packet.
+ * @param offset		Size in bytes of the VLAN headers after eth_h.
+ * @param internals		Bonding data.
+ */
+void
+bond_mode_alb_arp_recv(struct ether_hdr *eth_h, uint16_t offset,
+		struct bond_dev_private *internals);
+
+/**
+ * Function handles ARP packet transmission. It also decides on which slave
+ * to send that packet. An ARP Request is sent on the primary slave.
+ * An ARP Reply is sent on the slave stored in the client table for that
+ * connection. On Reply function also updates data in client table.
+ *
+ * @param eth_h			ETH header of transmitted packet.
+ * @param offset		Size in bytes of the VLAN headers after eth_h.
+ * @param internals		Bonding data.
+ *
+ * @return
+ * Port id of slave on which packet should be sent.
+ */
+uint8_t
+bond_mode_alb_arp_xmit(struct ether_hdr *eth_h, uint16_t offset,
+		struct bond_dev_private *internals);
+
+/**
+ * Function fills packet with an ARP reply built from client_info.
+ *
+ * @param client_info	Data of client to which packet is sent.
+ * @param pkt			Pointer to packet which is sent.
+ * @param internals		Bonding data.
+ *
+ * @return
+ * Port id of slave on which packet should be sent.
+ */
+uint8_t
+bond_mode_alb_arp_upd(struct client_data *client_info,
+		struct rte_mbuf *pkt, struct bond_dev_private *internals);
+
+/**
+ * Function reassigns a slave to every client table entry that is in use.
+ *
+ * @param bond_dev		Pointer to bonded device struct.
+ */
+void
+bond_mode_alb_client_list_upd(struct rte_eth_dev *bond_dev);
+
+#endif /* RTE_ETH_BOND_ALB_H_ */
diff --git a/lib/librte_pmd_bond/rte_eth_bond_api.c b/lib/librte_pmd_bond/rte_eth_bond_api.c
index 4ab3267..92ef3ae 100644
--- a/lib/librte_pmd_bond/rte_eth_bond_api.c
+++ b/lib/librte_pmd_bond/rte_eth_bond_api.c
@@ -120,6 +120,9 @@ activate_slave(struct rte_eth_dev *eth_dev, uint8_t port_id)
 
 	internals->active_slaves[internals->active_slave_count] = port_id;
 	internals->active_slave_count++;
+
+	if (internals->mode == BONDING_MODE_ALB)
+		bond_mode_alb_client_list_upd(eth_dev);
 }
 
 void
@@ -152,6 +155,9 @@ deactivate_slave(struct rte_eth_dev *eth_dev, uint8_t port_id)
 
 	if (eth_dev->data->dev_started && internals->mode == BONDING_MODE_8023AD)
 		bond_mode_8023ad_start(eth_dev);
+
+	if (internals->mode == BONDING_MODE_ALB)
+		bond_mode_alb_client_list_upd(eth_dev);
 }
 
 uint8_t
diff --git a/lib/librte_pmd_bond/rte_eth_bond_args.c b/lib/librte_pmd_bond/rte_eth_bond_args.c
index ca4de38..a3f7f55 100644
--- a/lib/librte_pmd_bond/rte_eth_bond_args.c
+++ b/lib/librte_pmd_bond/rte_eth_bond_args.c
@@ -175,6 +175,7 @@ bond_ethdev_parse_slave_mode_kvarg(const char *key __rte_unused,
 #endif
 	case BONDING_MODE_8023AD:
 	case BONDING_MODE_ADAPTIVE_TRANSMIT_LOAD_BALANCING:
+	case BONDING_MODE_ALB:
 		return 0;
 	default:
 		RTE_BOND_LOG(ERR, "Invalid slave mode value (%s) specified", value);
diff --git a/lib/librte_pmd_bond/rte_eth_bond_pmd.c b/lib/librte_pmd_bond/rte_eth_bond_pmd.c
index 8b80297..b0525cc 100644
--- a/lib/librte_pmd_bond/rte_eth_bond_pmd.c
+++ b/lib/librte_pmd_bond/rte_eth_bond_pmd.c
@@ -56,6 +56,42 @@
 /* Table for statistics in mode 5 TLB */
 static uint64_t tlb_last_obytets[RTE_MAX_ETHPORTS];
 
+/* Return the total size in bytes of the VLAN headers that follow eth_hdr. */
+static inline size_t
+get_vlan_offset(struct ether_hdr *eth_hdr)
+{
+	const uint16_t vlan_type = rte_cpu_to_be_16(ETHER_TYPE_VLAN);
+	struct vlan_hdr *tag = (struct vlan_hdr *)(eth_hdr + 1);
+	size_t tag_count;
+
+	/* Untagged frame */
+	if (eth_hdr->ether_type != vlan_type)
+		return 0;
+
+	/* A tag announcing another VLAN ethertype is followed by one more tag */
+	for (tag_count = 1; tag->eth_proto == vlan_type; tag_count++)
+		tag++;
+
+	return tag_count * sizeof(struct vlan_hdr);
+}
+
+/* Return the ethertype, as stored in the frame, found after any VLAN tags. */
+static uint16_t
+get_vlan_ethertype(struct ether_hdr *eth_hdr)
+{
+	const uint16_t vlan_type = rte_cpu_to_be_16(ETHER_TYPE_VLAN);
+	struct vlan_hdr *tag = (struct vlan_hdr *)(eth_hdr + 1);
+
+	/* Untagged frame: the ETH header names the payload type itself */
+	if (eth_hdr->ether_type != vlan_type)
+		return eth_hdr->ether_type;
+
+	/* Walk nested tags until one names a non-VLAN protocol */
+	while (tag->eth_proto == vlan_type)
+		tag++;
+	return tag->eth_proto;
+}
+
 static uint16_t
 bond_ethdev_rx_burst(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 {
@@ -173,6 +209,34 @@ bond_ethdev_rx_burst_8023ad(void *queue, struct rte_mbuf **bufs,
 }
 
 static uint16_t
+bond_ethdev_rx_burst_alb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+	/* NOTE(review): rx callback reading its queue through the tx queue
+	 * type; confirm both queue structs place dev_private identically. */
+	struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue;
+	struct bond_dev_private *internals = bd_tx_q->dev_private;
+	uint16_t nb_recv_pkts;
+	int pkt;
+
+	/* Receive as usual, then let ALB inspect every ARP frame of the burst */
+	nb_recv_pkts = bond_ethdev_rx_burst(queue, bufs, nb_pkts);
+
+	for (pkt = 0; pkt < nb_recv_pkts; pkt++) {
+		struct ether_hdr *eth_h = rte_pktmbuf_mtod(bufs[pkt],
+				struct ether_hdr *);
+		uint16_t vlan_len = get_vlan_offset(eth_h);
+		uint16_t ether_type = get_vlan_ethertype(eth_h);
+
+		if (ether_type != rte_cpu_to_be_16(ETHER_TYPE_ARP))
+			continue;
+
+		bond_mode_alb_arp_recv(eth_h, vlan_len, internals);
+	}
+
+	return nb_recv_pkts;
+}
+
+static uint16_t
 bond_ethdev_tx_burst_round_robin(void *queue, struct rte_mbuf **bufs,
 		uint16_t nb_pkts)
 {
@@ -281,25 +345,6 @@ ipv6_hash(struct ipv6_hdr *ipv6_hdr)
 			(word_src_addr[3] ^ word_dst_addr[3]);
 }
 
-static inline size_t
-get_vlan_offset(struct ether_hdr *eth_hdr)
-{
-	size_t vlan_offset = 0;
-
-	/* Calculate VLAN offset */
-	if (rte_cpu_to_be_16(ETHER_TYPE_VLAN) == eth_hdr->ether_type) {
-		struct vlan_hdr *vlan_hdr = (struct vlan_hdr *)(eth_hdr + 1);
-		vlan_offset = sizeof(struct vlan_hdr);
-
-		while (rte_cpu_to_be_16(ETHER_TYPE_VLAN) ==
-				vlan_hdr->eth_proto) {
-			vlan_hdr = vlan_hdr + 1;
-			vlan_offset += sizeof(struct vlan_hdr);
-		}
-	}
-	return vlan_offset;
-}
-
 uint16_t
 xmit_l2_hash(const struct rte_mbuf *buf, uint8_t slave_count)
 {
@@ -525,6 +570,134 @@ bond_ethdev_tx_burst_tlb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
 }
 
 static uint16_t
+bond_ethdev_tx_burst_alb(void *queue, struct rte_mbuf **bufs, uint16_t nb_pkts)
+{
+	/*
+	 * ALB transmit: ARP packets leave on the slave chosen by
+	 * bond_mode_alb_arp_xmit(), all other packets follow the TLB policy and
+	 * pending ARP updates are sent on top. Returns how many packets of bufs
+	 * were sent; the unsent ones are moved to the end of bufs.
+	 */
+	struct bond_tx_queue *bd_tx_q = (struct bond_tx_queue *)queue;
+	struct bond_dev_private *internals = bd_tx_q->dev_private;
+	struct client_data *client_info;
+	struct ether_hdr *eth_h;
+	uint16_t ether_type, offset;
+
+	/*
+	 * We create transmit buffers for every slave and one additional to send
+	 * through tlb. In worst case every packet will be send on one port.
+	 */
+	struct rte_mbuf *slave_bufs[RTE_MAX_ETHPORTS + 1][nb_pkts];
+	uint16_t slave_bufs_pkts[RTE_MAX_ETHPORTS + 1] = { 0 };
+
+	/*
+	 * We create separate transmit buffers for update packets as they wont be
+	 * counted in num_tx_total.
+	 */
+	struct rte_mbuf *update_bufs[RTE_MAX_ETHPORTS][ALB_HASH_TABLE_SIZE];
+	uint16_t update_bufs_pkts[RTE_MAX_ETHPORTS] = { 0 };
+	struct rte_mbuf *upd_pkt;
+	size_t pkt_size;
+	uint16_t num_send, num_not_send = 0;
+	uint16_t num_tx_total = 0;
+	uint16_t tlb_pkts;
+	uint8_t slave_idx;
+	int i, j;
+
+	/* Search tx buffer for ARP packets and forward them to alb */
+	for (i = 0; i < nb_pkts; i++) {
+		eth_h = rte_pktmbuf_mtod(bufs[i], struct ether_hdr *);
+		offset = get_vlan_offset(eth_h);
+		ether_type = get_vlan_ethertype(eth_h);
+
+		if (ether_type == rte_cpu_to_be_16(ETHER_TYPE_ARP)) {
+			slave_idx = bond_mode_alb_arp_xmit(eth_h, offset, internals);
+			/* Change src mac in eth header */
+			rte_eth_macaddr_get(slave_idx, &eth_h->s_addr);
+			/* Add packet to slave tx buffer */
+			slave_bufs[slave_idx][slave_bufs_pkts[slave_idx]] = bufs[i];
+			slave_bufs_pkts[slave_idx]++;
+		} else {
+			/* If packet is not ARP, send it with TLB policy */
+			slave_bufs[RTE_MAX_ETHPORTS][slave_bufs_pkts[RTE_MAX_ETHPORTS]] =
+					bufs[i];
+			slave_bufs_pkts[RTE_MAX_ETHPORTS]++;
+		}
+	}
+
+	/* Update connected client ARP tables */
+	if (internals->mode6.ntt) {
+		for (i = 0; i < ALB_HASH_TABLE_SIZE; i++) {
+			client_info = &internals->mode6.client_table[i];
+
+			if (client_info->in_use) {
+				/* Allocate new packet to send ARP update on current slave */
+				upd_pkt = rte_pktmbuf_alloc(internals->mode6.mempool);
+				if (upd_pkt == NULL) {
+					RTE_LOG(ERR, PMD, "Failed to allocate ARP packet from pool\n");
+					continue;
+				}
+				pkt_size = sizeof(struct ether_hdr) + sizeof(struct arp_hdr);
+				upd_pkt->data_len = pkt_size;
+				upd_pkt->pkt_len = pkt_size;
+				slave_idx = bond_mode_alb_arp_upd(client_info, upd_pkt,
+						internals);
+
+				/* Add packet to update tx buffer */
+				update_bufs[slave_idx][update_bufs_pkts[slave_idx]] = upd_pkt;
+				update_bufs_pkts[slave_idx]++;
+			}
+		}
+		internals->mode6.ntt = 0;
+	}
+
+	/* Send ARP packets on proper slaves */
+	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
+		if (slave_bufs_pkts[i] > 0) {
+			num_send = rte_eth_tx_burst(i, bd_tx_q->queue_id,
+					slave_bufs[i], slave_bufs_pkts[i]);
+			/* Unsent packets are the tail of this slave's own buffer */
+			for (j = 0; j < slave_bufs_pkts[i] - num_send; j++) {
+				bufs[nb_pkts - 1 - num_not_send - j] =
+						slave_bufs[i][slave_bufs_pkts[i] - 1 - j];
+			}
+
+			num_tx_total += num_send;
+			num_not_send += slave_bufs_pkts[i] - num_send;
+		}
+	}
+
+	/* Send update packets on proper slaves */
+	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
+		if (update_bufs_pkts[i] > 0) {
+			num_send = rte_eth_tx_burst(i, bd_tx_q->queue_id, update_bufs[i],
+					update_bufs_pkts[i]);
+			for (j = num_send; j < update_bufs_pkts[i]; j++) {
+				rte_pktmbuf_free(update_bufs[i][j]);
+			}
+		}
+	}
+
+	/* Send non-ARP packets using tlb policy */
+	tlb_pkts = slave_bufs_pkts[RTE_MAX_ETHPORTS];
+	if (tlb_pkts > 0) {
+		num_send = bond_ethdev_tx_burst_tlb(queue,
+				slave_bufs[RTE_MAX_ETHPORTS], tlb_pkts);
+		/* Only tlb's unsent packets go back; assumes they sit at its tail */
+		for (j = 0; j < tlb_pkts - num_send; j++) {
+			bufs[nb_pkts - 1 - num_not_send - j] =
+					slave_bufs[RTE_MAX_ETHPORTS][tlb_pkts - 1 - j];
+		}
+
+		num_tx_total += num_send;
+		num_not_send += tlb_pkts - num_send;
+	}
+
+	return num_tx_total;
+}
+
+static uint16_t
 bond_ethdev_tx_burst_balance(void *queue, struct rte_mbuf **bufs,
 		uint16_t nb_pkts)
 {
@@ -852,6 +1025,7 @@ mac_address_slaves_update(struct rte_eth_dev *bonded_eth_dev)
 		break;
 	case BONDING_MODE_ACTIVE_BACKUP:
 	case BONDING_MODE_ADAPTIVE_TRANSMIT_LOAD_BALANCING:
+	case BONDING_MODE_ALB:
 	default:
 		for (i = 0; i < internals->slave_count; i++) {
 			if (internals->slaves[i].port_id ==
@@ -917,6 +1091,13 @@ bond_ethdev_mode_set(struct rte_eth_dev *eth_dev, int mode)
 		eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_tlb;
 		eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_active_backup;
 		break;
+	case BONDING_MODE_ALB:
+		if (bond_mode_alb_enable(eth_dev) != 0)
+			return -1;
+
+		eth_dev->tx_pkt_burst = bond_ethdev_tx_burst_alb;
+		eth_dev->rx_pkt_burst = bond_ethdev_rx_burst_alb;
+		break;
 	default:
 		return -1;
 	}
@@ -1132,7 +1313,8 @@ bond_ethdev_start(struct rte_eth_dev *eth_dev)
 	if (internals->mode == BONDING_MODE_8023AD)
 		bond_mode_8023ad_start(eth_dev);
 
-	if (internals->mode == BONDING_MODE_ADAPTIVE_TRANSMIT_LOAD_BALANCING)
+	if (internals->mode == BONDING_MODE_ADAPTIVE_TRANSMIT_LOAD_BALANCING ||
+			internals->mode == BONDING_MODE_ALB)
 		bond_ethdev_update_tlb_slave_cb(internals);
 
 	return 0;
@@ -1164,7 +1346,8 @@ bond_ethdev_stop(struct rte_eth_dev *eth_dev)
 		}
 	}
 
-	if (internals->mode == BONDING_MODE_ADAPTIVE_TRANSMIT_LOAD_BALANCING) {
+	if (internals->mode == BONDING_MODE_ADAPTIVE_TRANSMIT_LOAD_BALANCING ||
+			internals->mode == BONDING_MODE_ALB) {
 		rte_eal_alarm_cancel(bond_ethdev_update_tlb_slave_cb, internals);
 	}
 
@@ -1362,8 +1545,12 @@ bond_ethdev_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
 {
 	struct bond_dev_private *internals = dev->data->dev_private;
 	struct rte_eth_stats slave_stats;
+
 	int i;
 
+	/* clear bonded stats before populating from slaves */
+	memset(stats, 0, sizeof(*stats));
+
 	for (i = 0; i < internals->slave_count; i++) {
 		rte_eth_stats_get(internals->slaves[i].port_id, &slave_stats);
 
@@ -1418,6 +1605,7 @@ bond_ethdev_promiscuous_enable(struct rte_eth_dev *eth_dev)
 	/* Promiscuous mode is propagated only to primary slave */
 	case BONDING_MODE_ACTIVE_BACKUP:
 	case BONDING_MODE_ADAPTIVE_TRANSMIT_LOAD_BALANCING:
+	case BONDING_MODE_ALB:
 	default:
 		rte_eth_promiscuous_enable(internals->current_primary_port);
 	}
@@ -1447,6 +1635,7 @@ bond_ethdev_promiscuous_disable(struct rte_eth_dev *dev)
 	/* Promiscuous mode is propagated only to primary slave */
 	case BONDING_MODE_ACTIVE_BACKUP:
 	case BONDING_MODE_ADAPTIVE_TRANSMIT_LOAD_BALANCING:
+	case BONDING_MODE_ALB:
 	default:
 		rte_eth_promiscuous_disable(internals->current_primary_port);
 	}
diff --git a/lib/librte_pmd_bond/rte_eth_bond_private.h b/lib/librte_pmd_bond/rte_eth_bond_private.h
index e01e66b..e69e301 100644
--- a/lib/librte_pmd_bond/rte_eth_bond_private.h
+++ b/lib/librte_pmd_bond/rte_eth_bond_private.h
@@ -43,6 +43,7 @@ extern "C" {
 
 #include "rte_eth_bond.h"
 #include "rte_eth_bond_8023ad_private.h"
+#include "rte_eth_bond_alb.h"
 
 #define PMD_BOND_SLAVE_PORT_KVARG			("slave")
 #define PMD_BOND_PRIMARY_SLAVE_KVARG		("primary")
@@ -152,6 +153,7 @@ struct bond_dev_private {
 	/**< Arary of bonded slaves details */
 
 	struct mode8023ad_private mode4;
+	struct mode_alb_private mode6;
 
 	uint32_t rx_offload_capa;            /** Rx offload capability */
 	uint32_t tx_offload_capa;            /** Tx offload capability */
-- 
1.7.9.5



More information about the dev mailing list