[dpdk-dev] [PATCH] examples/l3fwd: em path performance fix

Xu, Qian Q qian.q.xu at intel.com
Mon Mar 7 07:19:43 CET 2016


Tested-by: Qian Xu <qian.q.xu at intel.com>

- Test Commit: 8f6f24342281f59de0df7bd976a32f714d39b9a9
- OS/Kernel: Fedora 21/4.1.13
- GCC: gcc (GCC) 4.9.2 20141101 (Red Hat 4.9.2-1)
- CPU: Intel(R) Xeon(R) CPU E5-2695 v4 @ 2.10
- NIC: Intel Corporation 82599ES 10-Gigabit SFI/SFP+ Network Connection (rev 01)
- Target: Intel Corporation 82599ES 10-Gigabit SFI/SFP+ Network Connection (rev 01)
- Total 1 cases, 1 passed, 0 failed. 

Test Case1: test_l3fwd_exact_match_perf 
==================================
Compare two sample performance report on the same platform, same traffic, the only difference is the l3fwd sample. 
1. Using the dpdk-2.2 l3fwd sample, compiled it and run the l3fwd as below:
 ./examples/l3fwd/build/l3fwd -c 0x3c0000 -n 4 -- -p 0xf --config '(0,0,18),(1,0,19),(2,0,20),(3,0,21)' --hash-entry-num 0x400000

2. Using the latest dpdk with the updated l3fwd sample, run it: 
 ./examples/l3fwd/build/l3fwd -c 0x3c0000 -n 4 -- -E -p 0xf --config '(0,0,18),(1,0,19),(2,0,20),(3,0,21)' --hash-entry-num 0x400000

The traffic is same for both samples: 
 The traffic I send to port0 is as below:
 DEST MAC= PORT0's MAC
 Protocol: IPV4
 TCP/IP
 DEST IP: 201.0.0.0     Mode: Continuous Increment Host, Mask: 255.240.0.0---  So about 0x100000 different DEST IP items. 

 Source IP: 200.20.0.1  Mode: Fixed
 SRC PORT: 12
 DEST PORT: 102
 Packet size=64B, sending at line rate,

Test results, the performance with the fix is similar as before, without fix there will be 11% performance drop. 


Thanks
Qian


-----Original Message-----
From: dev [mailto:dev-bounces at dpdk.org] On Behalf Of Tomasz Kulasek
Sent: Friday, March 04, 2016 1:23 AM
To: dev at dpdk.org
Subject: [dpdk-dev] [PATCH] examples/l3fwd: em path performance fix

It seems that for the most use cases, previous hash_multi_lookup provides better performance, and more, sequential lookup can cause significant performance drop.

This patch sets previously optional hash_multi_lookup method as default.
It also provides some minor optimizations such as queue drain only on used tx ports.

Fixes: 94c54b4158d5 ("examples/l3fwd: rework exact-match")

Reported-by: Qian Xu <qian.q.xu at intel.com>
Signed-off-by: Tomasz Kulasek <tomaszx.kulasek at intel.com>
---
 examples/l3fwd/l3fwd.h            |    2 ++
 examples/l3fwd/l3fwd_em.c         |    6 +++---
 examples/l3fwd/l3fwd_em_hlm_sse.h |   12 ++----------
 examples/l3fwd/l3fwd_em_sse.h     |    9 +++++++++
 examples/l3fwd/l3fwd_lpm.c        |    4 ++--
 examples/l3fwd/main.c             |    7 +++++++
 6 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/examples/l3fwd/l3fwd.h b/examples/l3fwd/l3fwd.h index da6d369..207a60a 100644
--- a/examples/l3fwd/l3fwd.h
+++ b/examples/l3fwd/l3fwd.h
@@ -84,6 +84,8 @@ struct lcore_rx_queue {  struct lcore_conf {
 	uint16_t n_rx_queue;
 	struct lcore_rx_queue rx_queue_list[MAX_RX_QUEUE_PER_LCORE];
+	uint16_t n_tx_port;
+	uint16_t tx_port_id[RTE_MAX_ETHPORTS];
 	uint16_t tx_queue_id[RTE_MAX_ETHPORTS];
 	struct mbuf_table tx_mbufs[RTE_MAX_ETHPORTS];
 	void *ipv4_lookup_struct;
diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c index f6a65d8..c8c781d 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -305,7 +305,7 @@ em_get_ipv6_dst_port(void *ipv6_hdr,  uint8_t portid, void *lookup_struct)
  * buffer optimization i.e. ENABLE_MULTI_BUFFER_OPTIMIZE=1.
  */
 #if defined(__SSE4_1__)
-#ifndef HASH_MULTI_LOOKUP
+#ifdef NO_HASH_MULTI_LOOKUP
 #include "l3fwd_em_sse.h"
 #else
 #include "l3fwd_em_hlm_sse.h"
@@ -552,8 +552,8 @@ em_main_loop(__attribute__((unused)) void *dummy)
 		diff_tsc = cur_tsc - prev_tsc;
 		if (unlikely(diff_tsc > drain_tsc)) {
 
-			for (i = 0; i < qconf->n_rx_queue; i++) {
-				portid = qconf->rx_queue_list[i].port_id;
+			for (i = 0; i < qconf->n_tx_port; ++i) {
+				portid = qconf->tx_port_id[i];
 				if (qconf->tx_mbufs[portid].len == 0)
 					continue;
 				send_burst(qconf,
diff --git a/examples/l3fwd/l3fwd_em_hlm_sse.h b/examples/l3fwd/l3fwd_em_hlm_sse.h
index d3388da..517815a 100644
--- a/examples/l3fwd/l3fwd_em_hlm_sse.h
+++ b/examples/l3fwd/l3fwd_em_hlm_sse.h
@@ -34,17 +34,9 @@
 #ifndef __L3FWD_EM_HLM_SSE_H__
 #define __L3FWD_EM_HLM_SSE_H__
 
-/**
- * @file
- * This is an optional implementation of packet classification in Exact-Match
- * path using rte_hash_lookup_multi method from previous implementation.
- * While sequential classification seems to be faster, it's disabled by default
- * and can be enabled with HASH_LOOKUP_MULTI global define in compilation time.
- */
-
 #include "l3fwd_sse.h"
 
-static inline void
+static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
 		uint8_t portid, uint16_t dst_port[8])  { @@ -168,7 +160,7 @@ get_ipv6_5tuple(struct rte_mbuf *m0, __m128i mask0,
 	key->xmm[2] = _mm_and_si128(tmpdata2, mask1);  }
 
-static inline void
+static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv6x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
 		uint8_t portid, uint16_t dst_port[8])  { diff --git a/examples/l3fwd/l3fwd_em_sse.h b/examples/l3fwd/l3fwd_em_sse.h index 4c6d14f..7f10af4 100644
--- a/examples/l3fwd/l3fwd_em_sse.h
+++ b/examples/l3fwd/l3fwd_em_sse.h
@@ -34,6 +34,15 @@
 #ifndef __L3FWD_EM_SSE_H__
 #define __L3FWD_EM_SSE_H__
 
+/**
+ * @file
+ * This is an optional implementation of packet classification in 
+Exact-Match
+ * path using sequential packet classification method.
+ * While hash lookup multi seems to provide better performance, it's 
+disabled
+ * by default and can be enabled with NO_HASH_LOOKUP_MULTI global 
+define in
+ * compilation time.
+ */
+
 #include "l3fwd_sse.h"
 
 static inline __attribute__((always_inline)) uint16_t diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c index e0ed3c4..8df762d 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -158,8 +158,8 @@ lpm_main_loop(__attribute__((unused)) void *dummy)
 		diff_tsc = cur_tsc - prev_tsc;
 		if (unlikely(diff_tsc > drain_tsc)) {
 
-			for (i = 0; i < qconf->n_rx_queue; i++) {
-				portid = qconf->rx_queue_list[i].port_id;
+			for (i = 0; i < qconf->n_tx_port; ++i) {
+				portid = qconf->tx_port_id[i];
 				if (qconf->tx_mbufs[portid].len == 0)
 					continue;
 				send_burst(qconf,
diff --git a/examples/l3fwd/main.c b/examples/l3fwd/main.c index 0e33039..130817c 100644
--- a/examples/l3fwd/main.c
+++ b/examples/l3fwd/main.c
@@ -791,6 +791,7 @@ main(int argc, char **argv)
 	unsigned lcore_id;
 	uint32_t n_tx_queue, nb_lcores;
 	uint8_t portid, nb_rx_queue, queue, socketid;
+	uint8_t nb_tx_port;
 
 	/* init EAL */
 	ret = rte_eal_init(argc, argv);
@@ -830,6 +831,7 @@ main(int argc, char **argv)
 		rte_exit(EXIT_FAILURE, "check_port_config failed\n");
 
 	nb_lcores = rte_lcore_count();
+	nb_tx_port = 0;
 
 	/* Setup function pointers for lookup method. */
 	setup_l3fwd_lookup_tables();
@@ -906,8 +908,13 @@ main(int argc, char **argv)
 			qconf = &lcore_conf[lcore_id];
 			qconf->tx_queue_id[portid] = queueid;
 			queueid++;
+
+			qconf->n_tx_port = nb_tx_port;
+			qconf->tx_port_id[qconf->n_tx_port] = portid;
 		}
 		printf("\n");
+
+		nb_tx_port++;
 	}
 
 	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
--
1.7.9.5



More information about the dev mailing list