[dpdk-dev,v2,5/7] examples/l3fwd: add neon support for l3fwd

Message ID 1494383419-9677-6-git-send-email-jianbo.liu@linaro.org (mailing list archive)
State Superseded, archived
Headers

Checks

Context Check Description
ci/checkpatch success coding style OK
ci/Intel-compilation success Compilation OK

Commit Message

Jianbo Liu May 10, 2017, 2:30 a.m. UTC
  Use ARM NEON intrinsics to accelerate l3 fowarding.

Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
---
 examples/l3fwd/l3fwd_em.c            |   4 +-
 examples/l3fwd/l3fwd_em_hlm.h        |  19 ++-
 examples/l3fwd/l3fwd_em_hlm_neon.h   |  74 ++++++++++
 examples/l3fwd/l3fwd_em_sequential.h |  20 ++-
 examples/l3fwd/l3fwd_lpm.c           |   4 +-
 examples/l3fwd/l3fwd_lpm_neon.h      | 165 ++++++++++++++++++++++
 examples/l3fwd/l3fwd_neon.h          | 259 +++++++++++++++++++++++++++++++++++
 7 files changed, 539 insertions(+), 6 deletions(-)
 create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h
 create mode 100644 examples/l3fwd/l3fwd_neon.h
  

Comments

Sekhar, Ashwin May 10, 2017, 3 p.m. UTC | #1
Hi Jianbo,

Thanks for version v2. Addition of the prefetch instructions is
definitely helping performance on ThunderX. But still performance is
slightly less than that of scalar.

I tried few small tweaks which helped improve performance on my
Thunderx setup. For details see comments inline.


On Wed, 2017-05-10 at 10:30 +0800, Jianbo Liu wrote:
> Use ARM NEON intrinsics to accelerate l3 fowarding.

> 

> Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>

> ---

>  examples/l3fwd/l3fwd_em.c            |   4 +-

>  examples/l3fwd/l3fwd_em_hlm.h        |  19 ++-

>  examples/l3fwd/l3fwd_em_hlm_neon.h   |  74 ++++++++++

>  examples/l3fwd/l3fwd_em_sequential.h |  20 ++-

>  examples/l3fwd/l3fwd_lpm.c           |   4 +-

>  examples/l3fwd/l3fwd_lpm_neon.h      | 165 ++++++++++++++++++++++

>  examples/l3fwd/l3fwd_neon.h          | 259

> +++++++++++++++++++++++++++++++++++

>  7 files changed, 539 insertions(+), 6 deletions(-)

>  create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h

>  create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h

>  create mode 100644 examples/l3fwd/l3fwd_neon.h

> 

> [...]

> diff --git a/examples/l3fwd/l3fwd_em_hlm.h

> b/examples/l3fwd/l3fwd_em_hlm.h

> index 636dea4..4ec600a 100644

> --- a/examples/l3fwd/l3fwd_em_hlm.h

> +++ b/examples/l3fwd/l3fwd_em_hlm.h

> @@ -35,8 +35,13 @@

>  #ifndef __L3FWD_EM_HLM_H__

>  #define __L3FWD_EM_HLM_H__

>  

> +#if defined(__SSE4_1__)

>  #include "l3fwd_sse.h"

>  #include "l3fwd_em_hlm_sse.h"

> +#elif defined(RTE_MACHINE_CPUFLAG_NEON)

> +#include "l3fwd_neon.h"

> +#include "l3fwd_em_hlm_neon.h"

> +#endif

>  

>  static inline __attribute__((always_inline)) void

>  em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf

> *m[8],

> @@ -238,7 +243,7 @@ static inline __attribute__((always_inline))

> uint16_t

>  l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,

>  		uint8_t portid, struct lcore_conf *qconf)

>  {

> -	int32_t j;

> +	int32_t i, j, pos;

>  	uint16_t dst_port[MAX_PKT_BURST];

>  

>  	/*

> @@ -247,6 +252,12 @@ static inline __attribute__((always_inline))

> uint16_t

>  	 */

>  	int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);

>  

> +	for (j = 0; j < 8 && j < nb_rx; j++) {

> +		rte_prefetch0(pkts_burst[j]);

The above prefetch of rte_mbuf struct is unnecessary. With this we wont
see any performance improvement as the contents of rte_mbuf (buf_addr
and data_off) is used in right next instruction. Removing the above
prefetch and similar prefetches at multiple places was improving
performance on my ThunderX setup.

> +		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],

> +					       struct ether_hdr *) + 

> 1);

Better to prefetch at eth_hdr itself and not at eth_hdr + 1. In
process_packet in l3fwd_neon.h, eth_header is accessed.

> +	}

> +

>  	for (j = 0; j < n; j += 8) {

>  

>  		uint32_t pkt_type =

> @@ -263,6 +274,12 @@ static inline __attribute__((always_inline))

> uint16_t

>  		uint32_t tcp_or_udp = pkt_type &

>  			(RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);

>  

> +		for (i = 0, pos = j + 8; i < 8 && pos < nb_rx; i++, 

> pos++) {

> +			rte_prefetch0(pkts_burst[pos]);

The above prefetch of rte_mbuf struct is unnecessary.

> +			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[po

> s],

> +						       struct

> ether_hdr *) + 1);

Better to prefetch at eth_hdr itself and not at eth_hdr + 1

> +		}

> +

>  		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {

>  

>  			em_get_dst_port_ipv4x8(qconf,

> &pkts_burst[j], portid,

> 

> [...]


> diff --git a/examples/l3fwd/l3fwd_em_sequential.h

> b/examples/l3fwd/l3fwd_em_sequential.h

> index c0a9725..c3df473 100644

> --- a/examples/l3fwd/l3fwd_em_sequential.h

> +++ b/examples/l3fwd/l3fwd_em_sequential.h

> @@ -43,7 +43,11 @@

>   * compilation time.

>   */

>  

> +#if defined(__SSE4_1__)

>  #include "l3fwd_sse.h"

> +#elif defined(RTE_MACHINE_CPUFLAG_NEON)

> +#include "l3fwd_neon.h"

> +#endif

>  

>  static inline __attribute__((always_inline)) uint16_t

>  em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf

> *pkt,

> @@ -101,11 +105,23 @@ static inline __attribute__((always_inline))

> uint16_t

>  l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,

>  			uint8_t portid, struct lcore_conf *qconf)

>  {

> -	int32_t j;

> +	int32_t i, j;

>  	uint16_t dst_port[MAX_PKT_BURST];

>  

> -	for (j = 0; j < nb_rx; j++)

> +	if (nb_rx > 0) {

> +		rte_prefetch0(pkts_burst[0]);

The above prefetch of rte_mbuf struct is unnecessary.

> +		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[0],

> +					       struct ether_hdr *) +

> 1);

Better to prefetch at eth_hdr itself and not at eth_hdr + 1

> +	}

> +

> +	for (i = 1, j = 0; j < nb_rx; i++, j++) {

> +		if (i < nb_rx) {

> +			rte_prefetch0(pkts_burst[i]);

The above prefetch of rte_mbuf struct is unnecessary.

> +			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i]

> ,

> +						       struct

> ether_hdr *) + 1);

Better to prefetch at eth_hdr itself and not at eth_hdr + 1

> +		}

>  		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j],

> portid);

> +	}

>  

>  	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);

>  }

> [...]


> diff --git a/examples/l3fwd/l3fwd_lpm_neon.h

> b/examples/l3fwd/l3fwd_lpm_neon.h

> new file mode 100644

> index 0000000..2f047b3

> --- /dev/null

> +++ b/examples/l3fwd/l3fwd_lpm_neon.h

> 

> [...]


> +/*

> + * Buffer optimized handling of packets, invoked

> + * from main_loop.

> + */

> +static inline void

> +l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,

> +			uint8_t portid, struct lcore_conf *qconf)

> +{

> +	int32_t i, j, pos;

> +	uint16_t dst_port[MAX_PKT_BURST];

> +	int32x4_t dip[MAX_PKT_BURST / FWDSTEP];

If you see carefully, we dont need an array of dip. We just need a
single element. dip value is calculated in processx4_step1 and consumed
in processx4_step2, and thats it. No need to save it in an array.

> +	uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];

Same as dip. We dont need an array of ipv4_flag.

> +	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);

> +

> +	for (j = 0; j < FWDSTEP && j < nb_rx; j++) {

> +		rte_prefetch0(pkts_burst[j]);

The above prefetch of rte_mbuf struct is unnecessary.

> +		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],

> +					       struct ether_hdr *) +

> 1);

Better to prefetch at eth_hdr itself and not at eth_hdr + 1

> +	}

> +

> +	for (j = 0; j != k; j += FWDSTEP) {

> +		for (i = 0, pos = j + FWDSTEP; i < FWDSTEP && pos <

> nb_rx;

> +		     i++, pos++) {

> +			rte_prefetch0(pkts_burst[pos]);

The above prefetch of rte_mbuf struct is unnecessary.

> +			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[po

> s],

> +						       struct

> ether_hdr *) + 1);

Better to prefetch at eth_hdr itself and not at eth_hdr + 1

> +		}

> +		processx4_step1(&pkts_burst[j], &dip[j / FWDSTEP],

> +				&ipv4_flag[j / FWDSTEP]);

> +

> +		processx4_step2(qconf, dip[j / FWDSTEP],

> +				ipv4_flag[j / FWDSTEP], portid,

> &pkts_burst[j],

> +				&dst_port[j]);

> +	}

> +

> +	/* Classify last up to 3 packets one by one */

> +	switch (nb_rx % FWDSTEP) {

> +	case 3:

> +		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],

> portid);

> +		j++;

> +		/* fallthrough */

> +	case 2:

> +		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],

> portid);

> +		j++;

> +		/* fallthrough */

> +	case 1:

> +		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],

> portid);

> +		j++;

> +	}

> +

> +	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);

> +}

> +

> +#endif /* __L3FWD_LPM_NEON_H__ */

> diff --git a/examples/l3fwd/l3fwd_neon.h

> b/examples/l3fwd/l3fwd_neon.h

> new file mode 100644

> index 0000000..75c8976

> --- /dev/null

> +++ b/examples/l3fwd/l3fwd_neon.h

> [...]


> +

> +/**

> + * Process one packet:

> + * Update source and destination MAC addresses in the ethernet

> header.

> + * Perform RFC1812 checks and updates for IPV4 packets.

> + */

> +static inline void

> +process_packet(struct rte_mbuf *pkt, uint16_t *dst_port)

> +{

> +	struct ether_hdr *eth_hdr;

> +	uint32x4_t te, ve;

> +

> +	eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);

eth_hdr accessed here. Hence the earlier comments about prefetching at
eth header.

> +

> +	te = vld1q_u32((uint32_t *)eth_hdr);

> +	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);

> +

> +

> +	rfc1812_process((struct ipv4_hdr *)(eth_hdr + 1), dst_port,

> +			pkt->packet_type);

> +

> +	ve = vsetq_lane_u32(vgetq_lane_u32(te, 3), ve, 3);

Use vcopyq_laneq_u32 for easily doing the above.

> +	vst1q_u32((uint32_t *)eth_hdr, ve);

> +}

> +

> [...]

> +#endif /* _L3FWD_NEON_H_ */


Combining all the above comments, I made some changes on top of your
patch. These changes are giving 3-4% improvement over your version.

You may find the changes at
https://gist.github.com/ashwinyes/34cbdd999784402c859c71613587fafc

Please check it out and let me know your comments.

Thanks
Ashwin
  
Jianbo Liu May 11, 2017, 3:16 a.m. UTC | #2
Hi Ashwin,

On 10 May 2017 at 23:00, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com> wrote:
> Hi Jianbo,
>
> Thanks for version v2. Addition of the prefetch instructions is
> definitely helping performance on ThunderX. But still performance is
> slightly less than that of scalar.
>
> I tried few small tweaks which helped improve performance on my
> Thunderx setup. For details see comments inline.
>
>
> On Wed, 2017-05-10 at 10:30 +0800, Jianbo Liu wrote:
>> Use ARM NEON intrinsics to accelerate l3 fowarding.
>>
>> Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>
>> ---
>>  examples/l3fwd/l3fwd_em.c            |   4 +-
>>  examples/l3fwd/l3fwd_em_hlm.h        |  19 ++-
>>  examples/l3fwd/l3fwd_em_hlm_neon.h   |  74 ++++++++++
>>  examples/l3fwd/l3fwd_em_sequential.h |  20 ++-
>>  examples/l3fwd/l3fwd_lpm.c           |   4 +-
>>  examples/l3fwd/l3fwd_lpm_neon.h      | 165 ++++++++++++++++++++++
>>  examples/l3fwd/l3fwd_neon.h          | 259
>> +++++++++++++++++++++++++++++++++++
>>  7 files changed, 539 insertions(+), 6 deletions(-)
>>  create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h
>>  create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h
>>  create mode 100644 examples/l3fwd/l3fwd_neon.h
>>
>> [...]
>> diff --git a/examples/l3fwd/l3fwd_em_hlm.h
>> b/examples/l3fwd/l3fwd_em_hlm.h
>> index 636dea4..4ec600a 100644
>> --- a/examples/l3fwd/l3fwd_em_hlm.h
>> +++ b/examples/l3fwd/l3fwd_em_hlm.h
>> @@ -35,8 +35,13 @@
>>  #ifndef __L3FWD_EM_HLM_H__
>>  #define __L3FWD_EM_HLM_H__
>>
>> +#if defined(__SSE4_1__)
>>  #include "l3fwd_sse.h"
>>  #include "l3fwd_em_hlm_sse.h"
>> +#elif defined(RTE_MACHINE_CPUFLAG_NEON)
>> +#include "l3fwd_neon.h"
>> +#include "l3fwd_em_hlm_neon.h"
>> +#endif
>>
>>  static inline __attribute__((always_inline)) void
>>  em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf
>> *m[8],
>> @@ -238,7 +243,7 @@ static inline __attribute__((always_inline))
>> uint16_t
>>  l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
>>               uint8_t portid, struct lcore_conf *qconf)
>>  {
>> -     int32_t j;
>> +     int32_t i, j, pos;
>>       uint16_t dst_port[MAX_PKT_BURST];
>>
>>       /*
>> @@ -247,6 +252,12 @@ static inline __attribute__((always_inline))
>> uint16_t
>>        */
>>       int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
>>
>> +     for (j = 0; j < 8 && j < nb_rx; j++) {
>> +             rte_prefetch0(pkts_burst[j]);
> The above prefetch of rte_mbuf struct is unnecessary. With this we wont
> see any performance improvement as the contents of rte_mbuf (buf_addr
> and data_off) is used in right next instruction. Removing the above
> prefetch and similar prefetches at multiple places was improving
> performance on my ThunderX setup.

Yes, will remove them.

>
>> +             rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
>> +                                            struct ether_hdr *) +
>> 1);
> Better to prefetch at eth_hdr itself and not at eth_hdr + 1. In
> process_packet in l3fwd_neon.h, eth_header is accessed in
>

But ip headers are used right in each 8/FWDSTEP loop.
Since ip headers are accessed first, we should prefetch eth_hdr + 1 first.
After all nb_rx packets are handled in above small loop, their
eth_header are then accessed in processx4_step3 over again.
I'm not sure prefretching eth_hdr still works if we prefetch eth_hdr
in first step,  as cache may be already filled with new data at that
time.

>> +     }
>> +
>>       for (j = 0; j < n; j += 8) {
>>
>>               uint32_t pkt_type =
>> @@ -263,6 +274,12 @@ static inline __attribute__((always_inline))
>> uint16_t
>>               uint32_t tcp_or_udp = pkt_type &
>>                       (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
>>
>> +             for (i = 0, pos = j + 8; i < 8 && pos < nb_rx; i++,
>> pos++) {
>> +                     rte_prefetch0(pkts_burst[pos]);
> The above prefetch of rte_mbuf struct is unnecessary.
>
>> +                     rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[po
>> s],
>> +                                                    struct
>> ether_hdr *) + 1);
> Better to prefetch at eth_hdr itself and not at eth_hdr + 1
>
>> +             }
>> +
>>               if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
>>
>>                       em_get_dst_port_ipv4x8(qconf,
>> &pkts_burst[j], portid,
>>
>> [...]
>

....

>> diff --git a/examples/l3fwd/l3fwd_lpm_neon.h
>> b/examples/l3fwd/l3fwd_lpm_neon.h
>> new file mode 100644
>> index 0000000..2f047b3
>> --- /dev/null
>> +++ b/examples/l3fwd/l3fwd_lpm_neon.h
>>
>> [...]
>
>> +/*
>> + * Buffer optimized handling of packets, invoked
>> + * from main_loop.
>> + */
>> +static inline void
>> +l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
>> +                     uint8_t portid, struct lcore_conf *qconf)
>> +{
>> +     int32_t i, j, pos;
>> +     uint16_t dst_port[MAX_PKT_BURST];
>> +     int32x4_t dip[MAX_PKT_BURST / FWDSTEP];
> If you see carefully, we dont need an array of dip. We just need a
> single element. dip value is calculated in processx4_step1 and consumed
> in processx4_step2, and thats it. No need to save it in an array.
>

Will change, thanks!

>> +     uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
> Same as dip. We dont need an array of ipv4_flag.
>
>> +     const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
>> +
>> +     for (j = 0; j < FWDSTEP && j < nb_rx; j++) {
>> +             rte_prefetch0(pkts_burst[j]);
> The above prefetch of rte_mbuf struct is unnecessary.
>
>> +             rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
>> +                                            struct ether_hdr *) +
>> 1);
> Better to prefetch at eth_hdr itself and not at eth_hdr + 1
>
>> +     }
>> +
>> +     for (j = 0; j != k; j += FWDSTEP) {
>> +             for (i = 0, pos = j + FWDSTEP; i < FWDSTEP && pos <
>> nb_rx;
>> +                  i++, pos++) {
>> +                     rte_prefetch0(pkts_burst[pos]);
> The above prefetch of rte_mbuf struct is unnecessary.
>
>> +                     rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[po
>> s],
>> +                                                    struct
>> ether_hdr *) + 1);
> Better to prefetch at eth_hdr itself and not at eth_hdr + 1
>
>> +             }
>> +             processx4_step1(&pkts_burst[j], &dip[j / FWDSTEP],
>> +                             &ipv4_flag[j / FWDSTEP]);
>> +
>> +             processx4_step2(qconf, dip[j / FWDSTEP],
>> +                             ipv4_flag[j / FWDSTEP], portid,
>> &pkts_burst[j],
>> +                             &dst_port[j]);
>> +     }
>> +
>> +     /* Classify last up to 3 packets one by one */
>> +     switch (nb_rx % FWDSTEP) {
>> +     case 3:
>> +             dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
>> portid);
>> +             j++;
>> +             /* fallthrough */
>> +     case 2:
>> +             dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
>> portid);
>> +             j++;
>> +             /* fallthrough */
>> +     case 1:
>> +             dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j],
>> portid);
>> +             j++;
>> +     }
>> +
>> +     send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
>> +}
>> +
>> +#endif /* __L3FWD_LPM_NEON_H__ */
>> diff --git a/examples/l3fwd/l3fwd_neon.h
>> b/examples/l3fwd/l3fwd_neon.h
>> new file mode 100644
>> index 0000000..75c8976
>> --- /dev/null
>> +++ b/examples/l3fwd/l3fwd_neon.h
>> [...]
>
>> +
>> +/**
>> + * Process one packet:
>> + * Update source and destination MAC addresses in the ethernet
>> header.
>> + * Perform RFC1812 checks and updates for IPV4 packets.
>> + */
>> +static inline void
>> +process_packet(struct rte_mbuf *pkt, uint16_t *dst_port)
>> +{
>> +     struct ether_hdr *eth_hdr;
>> +     uint32x4_t te, ve;
>> +
>> +     eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
> eth_hdr accessed here. Hence the earlier comments about prefetching at
> eth header.
>

process_packet is called only for the last 1-3 packets, most are
handled in processx4_step3.
As these 2 functions access packets from the first one once again, the
prefetch may not work.
Please see my explanation in the above...

>> +
>> +     te = vld1q_u32((uint32_t *)eth_hdr);
>> +     ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
>> +
>> +
>> +     rfc1812_process((struct ipv4_hdr *)(eth_hdr + 1), dst_port,
>> +                     pkt->packet_type);
>> +
>> +     ve = vsetq_lane_u32(vgetq_lane_u32(te, 3), ve, 3);
> Use vcopyq_laneq_u32 for easily doing the above.
>

Will change. Thanks!

>> +     vst1q_u32((uint32_t *)eth_hdr, ve);
>> +}
>> +
>> [...]
>> +#endif /* _L3FWD_NEON_H_ */
>
> Combining all the above comments, I made some changes on top of your
> patch. These changes are giving 3-4% improvement over your version.
>
> You may find the changes at
> https://gist.github.com/ashwinyes/34cbdd999784402c859c71613587fafc
>

Is the correct in Line 103/104, you only process one packets in the
last FWDSTEP packets?
Actually, I don't like your change in l3fwd_lpm_send_packets, making
the simple logic complicated. And I don't think it can help to improve
performance. :-)

> Please check it out and let me know your comments.
>
> Thanks
> Ashwin
  
Sekhar, Ashwin May 11, 2017, 4:14 a.m. UTC | #3
On Thu, 2017-05-11 at 11:16 +0800, Jianbo Liu wrote:
> Hi Ashwin,

> 

> On 10 May 2017 at 23:00, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com>

> wrote:

> > 

> > Hi Jianbo,

> > 

> > Thanks for version v2. Addition of the prefetch instructions is

> > definitely helping performance on ThunderX. But still performance

> > is

> > slightly less than that of scalar.

> > 

> > I tried few small tweaks which helped improve performance on my

> > Thunderx setup. For details see comments inline.

> > 

> > 

> > On Wed, 2017-05-10 at 10:30 +0800, Jianbo Liu wrote:

> > > 

> > > Use ARM NEON intrinsics to accelerate l3 fowarding.

> > > 

> > > Signed-off-by: Jianbo Liu <jianbo.liu@linaro.org>

> > > ---

> > >  examples/l3fwd/l3fwd_em.c            |   4 +-

> > >  examples/l3fwd/l3fwd_em_hlm.h        |  19 ++-

> > >  examples/l3fwd/l3fwd_em_hlm_neon.h   |  74 ++++++++++

> > >  examples/l3fwd/l3fwd_em_sequential.h |  20 ++-

> > >  examples/l3fwd/l3fwd_lpm.c           |   4 +-

> > >  examples/l3fwd/l3fwd_lpm_neon.h      | 165

> > > ++++++++++++++++++++++

> > >  examples/l3fwd/l3fwd_neon.h          | 259

> > > +++++++++++++++++++++++++++++++++++

> > >  7 files changed, 539 insertions(+), 6 deletions(-)

> > >  create mode 100644 examples/l3fwd/l3fwd_em_hlm_neon.h

> > >  create mode 100644 examples/l3fwd/l3fwd_lpm_neon.h

> > >  create mode 100644 examples/l3fwd/l3fwd_neon.h

> > > 

> > > [...]

> > > diff --git a/examples/l3fwd/l3fwd_em_hlm.h

> > > b/examples/l3fwd/l3fwd_em_hlm.h

> > > index 636dea4..4ec600a 100644

> > > --- a/examples/l3fwd/l3fwd_em_hlm.h

> > > +++ b/examples/l3fwd/l3fwd_em_hlm.h

> > > @@ -35,8 +35,13 @@

> > >  #ifndef __L3FWD_EM_HLM_H__

> > >  #define __L3FWD_EM_HLM_H__

> > > 

> > > +#if defined(__SSE4_1__)

> > >  #include "l3fwd_sse.h"

> > >  #include "l3fwd_em_hlm_sse.h"

> > > +#elif defined(RTE_MACHINE_CPUFLAG_NEON)

> > > +#include "l3fwd_neon.h"

> > > +#include "l3fwd_em_hlm_neon.h"

> > > +#endif

> > > 

> > >  static inline __attribute__((always_inline)) void

> > >  em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf

> > > *m[8],

> > > @@ -238,7 +243,7 @@ static inline __attribute__((always_inline))

> > > uint16_t

> > >  l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,

> > >               uint8_t portid, struct lcore_conf *qconf)

> > >  {

> > > -     int32_t j;

> > > +     int32_t i, j, pos;

> > >       uint16_t dst_port[MAX_PKT_BURST];

> > > 

> > >       /*

> > > @@ -247,6 +252,12 @@ static inline __attribute__((always_inline))

> > > uint16_t

> > >        */

> > >       int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);

> > > 

> > > +     for (j = 0; j < 8 && j < nb_rx; j++) {

> > > +             rte_prefetch0(pkts_burst[j]);

> > The above prefetch of rte_mbuf struct is unnecessary. With this we

> > wont

> > see any performance improvement as the contents of rte_mbuf

> > (buf_addr

> > and data_off) is used in right next instruction. Removing the above

> > prefetch and similar prefetches at multiple places was improving

> > performance on my ThunderX setup.

> Yes, will remove them.

> 

> > 

> > 

> > > 

> > > +             rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],

> > > +                                            struct ether_hdr *)

> > > +

> > > 1);

> > Better to prefetch at eth_hdr itself and not at eth_hdr + 1. In

> > process_packet in l3fwd_neon.h, eth_header is accessed in

> > 

> But ip headers are used right in each 8/FWDSTEP loop.

> Since ip headers are accessed first, we should prefetch eth_hdr + 1

> first.

> After all nb_rx packets are handled in above small loop, their

> eth_header are then accessed in processx4_step3 over again.

> I'm not sure prefretching eth_hdr still works if we prefetch eth_hdr

> in first step,  as cache may be already filled with new data at that

> time.

> 

Okay. 
Also, I guess if the ethernet header and ip header falls in the same
cache line (which I think would be the case mostly as I hope the packet
data will be cache aligned), it doesn't make much of a  difference
whether you prefetch at ethernet header address or ip header address.
> > 

> > > 

> > > +     }

> > > +

> > >       for (j = 0; j < n; j += 8) {

> > > 

> > >               uint32_t pkt_type =

> > > @@ -263,6 +274,12 @@ static inline __attribute__((always_inline))

> > > uint16_t

> > >               uint32_t tcp_or_udp = pkt_type &

> > >                       (RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);

> > > 

> > > +             for (i = 0, pos = j + 8; i < 8 && pos < nb_rx; i++,

> > > pos++) {

> > > +                     rte_prefetch0(pkts_burst[pos]);

> > The above prefetch of rte_mbuf struct is unnecessary.

> > 

> > > 

> > > +                     rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[p

> > > o

> > > s],

> > > +                                                    struct

> > > ether_hdr *) + 1);

> > Better to prefetch at eth_hdr itself and not at eth_hdr + 1

> > 

> > > 

> > > +             }

> > > +

> > >               if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {

> > > 

> > >                       em_get_dst_port_ipv4x8(qconf,

> > > &pkts_burst[j], portid,

> > > 

> > > [...]

> ....

> 

> > 

> > > 

> > > diff --git a/examples/l3fwd/l3fwd_lpm_neon.h

> > > b/examples/l3fwd/l3fwd_lpm_neon.h

> > > new file mode 100644

> > > index 0000000..2f047b3

> > > --- /dev/null

> > > +++ b/examples/l3fwd/l3fwd_lpm_neon.h

> > > 

> > > [...]

> > > 

> > > +/*

> > > + * Buffer optimized handling of packets, invoked

> > > + * from main_loop.

> > > + */

> > > +static inline void

> > > +l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,

> > > +                     uint8_t portid, struct lcore_conf *qconf)

> > > +{

> > > +     int32_t i, j, pos;

> > > +     uint16_t dst_port[MAX_PKT_BURST];

> > > +     int32x4_t dip[MAX_PKT_BURST / FWDSTEP];

> > If you see carefully, we dont need an array of dip. We just need a

> > single element. dip value is calculated in processx4_step1 and

> > consumed

> > in processx4_step2, and thats it. No need to save it in an array.

> > 

> Will change, thanks!

> 

> > 

> > > 

> > > +     uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];

> > Same as dip. We dont need an array of ipv4_flag.

> > 

> > > 

> > > +     const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);

> > > +

> > > +     for (j = 0; j < FWDSTEP && j < nb_rx; j++) {

> > > +             rte_prefetch0(pkts_burst[j]);

> > The above prefetch of rte_mbuf struct is unnecessary.

> > 

> > > 

> > > +             rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],

> > > +                                            struct ether_hdr *)

> > > +

> > > 1);

> > Better to prefetch at eth_hdr itself and not at eth_hdr + 1

> > 

> > > 

> > > +     }

> > > +

> > > +     for (j = 0; j != k; j += FWDSTEP) {

> > > +             for (i = 0, pos = j + FWDSTEP; i < FWDSTEP && pos <

> > > nb_rx;

> > > +                  i++, pos++) {

> > > +                     rte_prefetch0(pkts_burst[pos]);

> > The above prefetch of rte_mbuf struct is unnecessary.

> > 

> > > 

> > > +                     rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[p

> > > o

> > > s],

> > > +                                                    struct

> > > ether_hdr *) + 1);

> > Better to prefetch at eth_hdr itself and not at eth_hdr + 1

> > 

> > > 

> > > +             }

> > > +             processx4_step1(&pkts_burst[j], &dip[j / FWDSTEP],

> > > +                             &ipv4_flag[j / FWDSTEP]);

> > > +

> > > +             processx4_step2(qconf, dip[j / FWDSTEP],

> > > +                             ipv4_flag[j / FWDSTEP], portid,

> > > &pkts_burst[j],

> > > +                             &dst_port[j]);

> > > +     }

> > > +

> > > +     /* Classify last up to 3 packets one by one */

> > > +     switch (nb_rx % FWDSTEP) {

> > > +     case 3:

> > > +             dst_port[j] = lpm_get_dst_port(qconf,

> > > pkts_burst[j],

> > > portid);

> > > +             j++;

> > > +             /* fallthrough */

> > > +     case 2:

> > > +             dst_port[j] = lpm_get_dst_port(qconf,

> > > pkts_burst[j],

> > > portid);

> > > +             j++;

> > > +             /* fallthrough */

> > > +     case 1:

> > > +             dst_port[j] = lpm_get_dst_port(qconf,

> > > pkts_burst[j],

> > > portid);

> > > +             j++;

> > > +     }

> > > +

> > > +     send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);

> > > +}

> > > +

> > > +#endif /* __L3FWD_LPM_NEON_H__ */

> > > diff --git a/examples/l3fwd/l3fwd_neon.h

> > > b/examples/l3fwd/l3fwd_neon.h

> > > new file mode 100644

> > > index 0000000..75c8976

> > > --- /dev/null

> > > +++ b/examples/l3fwd/l3fwd_neon.h

> > > [...]

> > > 

> > > +

> > > +/**

> > > + * Process one packet:

> > > + * Update source and destination MAC addresses in the ethernet

> > > header.

> > > + * Perform RFC1812 checks and updates for IPV4 packets.

> > > + */

> > > +static inline void

> > > +process_packet(struct rte_mbuf *pkt, uint16_t *dst_port)

> > > +{

> > > +     struct ether_hdr *eth_hdr;

> > > +     uint32x4_t te, ve;

> > > +

> > > +     eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);

> > eth_hdr accessed here. Hence the earlier comments about prefetching

> > at

> > eth header.

> > 

> process_packet is called only for the last 1-3 packets, most are

> handled in processx4_step3.

> As these 2 functions access packets from the first one once again,

> the

> prefetch may not work.

> Please see my explanation in the above...

> 

Okay.
> > 

> > > 

> > > +

> > > +     te = vld1q_u32((uint32_t *)eth_hdr);

> > > +     ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);

> > > +

> > > +

> > > +     rfc1812_process((struct ipv4_hdr *)(eth_hdr + 1), dst_port,

> > > +                     pkt->packet_type);

> > > +

> > > +     ve = vsetq_lane_u32(vgetq_lane_u32(te, 3), ve, 3);

> > Use vcopyq_laneq_u32 for easily doing the above.

> > 

> Will change. Thanks!

> 

> > 

> > > 

> > > +     vst1q_u32((uint32_t *)eth_hdr, ve);

> > > +}

> > > +

> > > [...]

> > > +#endif /* _L3FWD_NEON_H_ */

> > Combining all the above comments, I made some changes on top of

> > your

> > patch. These changes are giving 3-4% improvement over your version.

> > 

> > You may find the changes at

> > https://gist.github.com/ashwinyes/34cbdd999784402c859c71613587fafc

> > 

> Is the correct in Line 103/104, you only process one packets in the

> last FWDSTEP packets?

Its doing processx4_* there. So its processing 4 packets.

> Actually, I don't like your change in l3fwd_lpm_send_packets, making

> the simple logic complicated. And I don't think it can help to

> improve

> performance. :-)

Its not making it complicated. The number of lines of code may be
higher by may be 10 lines, but the conditions of the loops are
simplified which reduces the number of branch instructions and helps
the processor to go through them faster.

If possible, please try it out on your machine.
> 

> > 

> > Please check it out and let me know your comments.

> > 

> > Thanks

> > Ashwin
  
Sekhar, Ashwin May 11, 2017, 4:27 a.m. UTC | #4
On Thu, 2017-05-11 at 04:14 +0000, Sekhar, Ashwin wrote:
...
> > > Combining all the above comments, I made some changes on top of

> > > your

> > > patch. These changes are giving 3-4% improvement over your

> > > version.

> > > 

> > > You may find the changes at

> > > https://gist.github.com/ashwinyes/34cbdd999784402c859c71613587faf

> > > c

> > > 

> > Is the correct in Line 103/104, you only process one packets in the

> > last FWDSTEP packets?

> Its doing processx4_* there. So its processing 4 packets.

> 

> > 

> > Actually, I don't like your change in l3fwd_lpm_send_packets,

> > making

> > the simple logic complicated. And I don't think it can help to

> > improve

> > performance. :-)

> Its not making it complicated. The number of lines of code may be

> higher by may be 10 lines, but the conditions of the loops are

> simplified which reduces the number of branch instructions and helps

> the processor to go through them faster.

> 

> If possible, please try it out on your machine.


Missed out one point.
Since 2 loops are form "for (i = 0; i < FWDSTEP; i++)" i.e. looping for
constant number of iterations, compiler will easily unroll them.

Thanks
Ashwin
> > 

> > 

> > > 

> > > 

> > > Please check it out and let me know your comments.

> > > 

> > > Thanks

> > > Ashwin
  
Jianbo Liu May 11, 2017, 6:11 a.m. UTC | #5
On 11 May 2017 at 12:27, Sekhar, Ashwin <Ashwin.Sekhar@cavium.com> wrote:
>
> On Thu, 2017-05-11 at 04:14 +0000, Sekhar, Ashwin wrote:
> ...
>> > > Combining all the above comments, I made some changes on top of
>> > > your
>> > > patch. These changes are giving 3-4% improvement over your
>> > > version.
>> > >
>> > > You may find the changes at
>> > > https://gist.github.com/ashwinyes/34cbdd999784402c859c71613587faf
>> > > c
>> > >
>> > Is the correct in Line 103/104, you only process one packets in the
>> > last FWDSTEP packets?
>> Its doing processx4_* there. So its processing 4 packets.
>>
>> >
>> > Actually, I don't like your change in l3fwd_lpm_send_packets,
>> > making
>> > the simple logic complicated. And I don't think it can help to
>> > improve
>> > performance. :-)
>> Its not making it complicated. The number of lines of code may be
>> higher by may be 10 lines, but the conditions of the loops are
>> simplified which reduces the number of branch instructions and helps
>> the processor to go through them faster.

I suspected not much improvement we can get.

>>
>> If possible, please try it out on your machine.

OK, I'll test. If no performance regression, I'll adopt your suggestion in v3.

>
> Missed out one point.
> Since 2 loops are form "for (i = 0; i < FWDSTEP; i++)" i.e. looping for
> constant number of iterations, compiler will easily unroll them.
>
> Thanks
> Ashwin
>> >
>> >
>> > >
>> > >
>> > > Please check it out and let me know your comments.
>> > >
>> > > Thanks
>> > > Ashwin
  

Patch

diff --git a/examples/l3fwd/l3fwd_em.c b/examples/l3fwd/l3fwd_em.c
index ba844b2..da96cfd 100644
--- a/examples/l3fwd/l3fwd_em.c
+++ b/examples/l3fwd/l3fwd_em.c
@@ -328,7 +328,7 @@  struct ipv6_l3fwd_em_route {
 	return (uint8_t)((ret < 0) ? portid : ipv6_l3fwd_out_if[ret]);
 }
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
 #if defined(NO_HASH_MULTI_LOOKUP)
 #include "l3fwd_em_sequential.h"
 #else
@@ -709,7 +709,7 @@  struct ipv6_l3fwd_em_route {
 			if (nb_rx == 0)
 				continue;
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
 			l3fwd_em_send_packets(nb_rx, pkts_burst,
 							portid, qconf);
 #else
diff --git a/examples/l3fwd/l3fwd_em_hlm.h b/examples/l3fwd/l3fwd_em_hlm.h
index 636dea4..4ec600a 100644
--- a/examples/l3fwd/l3fwd_em_hlm.h
+++ b/examples/l3fwd/l3fwd_em_hlm.h
@@ -35,8 +35,13 @@ 
 #ifndef __L3FWD_EM_HLM_H__
 #define __L3FWD_EM_HLM_H__
 
+#if defined(__SSE4_1__)
 #include "l3fwd_sse.h"
 #include "l3fwd_em_hlm_sse.h"
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
+#include "l3fwd_neon.h"
+#include "l3fwd_em_hlm_neon.h"
+#endif
 
 static inline __attribute__((always_inline)) void
 em_get_dst_port_ipv4x8(struct lcore_conf *qconf, struct rte_mbuf *m[8],
@@ -238,7 +243,7 @@  static inline __attribute__((always_inline)) uint16_t
 l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 		uint8_t portid, struct lcore_conf *qconf)
 {
-	int32_t j;
+	int32_t i, j, pos;
 	uint16_t dst_port[MAX_PKT_BURST];
 
 	/*
@@ -247,6 +252,12 @@  static inline __attribute__((always_inline)) uint16_t
 	 */
 	int32_t n = RTE_ALIGN_FLOOR(nb_rx, 8);
 
+	for (j = 0; j < 8 && j < nb_rx; j++) {
+		rte_prefetch0(pkts_burst[j]);
+		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
+					       struct ether_hdr *) + 1);
+	}
+
 	for (j = 0; j < n; j += 8) {
 
 		uint32_t pkt_type =
@@ -263,6 +274,12 @@  static inline __attribute__((always_inline)) uint16_t
 		uint32_t tcp_or_udp = pkt_type &
 			(RTE_PTYPE_L4_TCP | RTE_PTYPE_L4_UDP);
 
+		for (i = 0, pos = j + 8; i < 8 && pos < nb_rx; i++, pos++) {
+			rte_prefetch0(pkts_burst[pos]);
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[pos],
+						       struct ether_hdr *) + 1);
+		}
+
 		if (tcp_or_udp && (l3_type == RTE_PTYPE_L3_IPV4)) {
 
 			em_get_dst_port_ipv4x8(qconf, &pkts_burst[j], portid,
diff --git a/examples/l3fwd/l3fwd_em_hlm_neon.h b/examples/l3fwd/l3fwd_em_hlm_neon.h
new file mode 100644
index 0000000..dae1acf
--- /dev/null
+++ b/examples/l3fwd/l3fwd_em_hlm_neon.h
@@ -0,0 +1,74 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __L3FWD_EM_HLM_NEON_H__
+#define __L3FWD_EM_HLM_NEON_H__
+
+#include <arm_neon.h>
+
+static inline void
+get_ipv4_5tuple(struct rte_mbuf *m0, int32x4_t mask0,
+		union ipv4_5tuple_host *key)
+{
+	int32x4_t tmpdata0 = vld1q_s32(rte_pktmbuf_mtod_offset(m0, int32_t *,
+				sizeof(struct ether_hdr) +
+				offsetof(struct ipv4_hdr, time_to_live)));
+
+	key->xmm = vandq_s32(tmpdata0, mask0);
+}
+
+static inline void
+get_ipv6_5tuple(struct rte_mbuf *m0, int32x4_t mask0,
+		int32x4_t mask1, union ipv6_5tuple_host *key)
+{
+	int32x4_t tmpdata0 = vld1q_s32(
+			rte_pktmbuf_mtod_offset(m0, int *,
+				sizeof(struct ether_hdr) +
+				offsetof(struct ipv6_hdr, payload_len)));
+
+	int32x4_t tmpdata1 = vld1q_s32(
+			rte_pktmbuf_mtod_offset(m0, int *,
+				sizeof(struct ether_hdr) +
+				offsetof(struct ipv6_hdr, payload_len) + 8));
+
+	int32x4_t tmpdata2 = vld1q_s32(
+			rte_pktmbuf_mtod_offset(m0, int *,
+				sizeof(struct ether_hdr) +
+				offsetof(struct ipv6_hdr, payload_len) + 16));
+
+	key->xmm[0] = vandq_s32(tmpdata0, mask0);
+	key->xmm[1] = tmpdata1;
+	key->xmm[2] = vandq_s32(tmpdata2, mask1);
+}
+#endif /* __L3FWD_EM_HLM_NEON_H__ */
diff --git a/examples/l3fwd/l3fwd_em_sequential.h b/examples/l3fwd/l3fwd_em_sequential.h
index c0a9725..c3df473 100644
--- a/examples/l3fwd/l3fwd_em_sequential.h
+++ b/examples/l3fwd/l3fwd_em_sequential.h
@@ -43,7 +43,11 @@ 
  * compilation time.
  */
 
+#if defined(__SSE4_1__)
 #include "l3fwd_sse.h"
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
+#include "l3fwd_neon.h"
+#endif
 
 static inline __attribute__((always_inline)) uint16_t
 em_get_dst_port(const struct lcore_conf *qconf, struct rte_mbuf *pkt,
@@ -101,11 +105,23 @@  static inline __attribute__((always_inline)) uint16_t
 l3fwd_em_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
 			uint8_t portid, struct lcore_conf *qconf)
 {
-	int32_t j;
+	int32_t i, j;
 	uint16_t dst_port[MAX_PKT_BURST];
 
-	for (j = 0; j < nb_rx; j++)
+	if (nb_rx > 0) {
+		rte_prefetch0(pkts_burst[0]);
+		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[0],
+					       struct ether_hdr *) + 1);
+	}
+
+	for (i = 1, j = 0; j < nb_rx; i++, j++) {
+		if (i < nb_rx) {
+			rte_prefetch0(pkts_burst[i]);
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[i],
+						       struct ether_hdr *) + 1);
+		}
 		dst_port[j] = em_get_dst_port(qconf, pkts_burst[j], portid);
+	}
 
 	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
 }
diff --git a/examples/l3fwd/l3fwd_lpm.c b/examples/l3fwd/l3fwd_lpm.c
index fc554fc..ddef250 100644
--- a/examples/l3fwd/l3fwd_lpm.c
+++ b/examples/l3fwd/l3fwd_lpm.c
@@ -189,6 +189,8 @@  static inline __attribute__((always_inline)) uint16_t
 
 #if defined(__SSE4_1__)
 #include "l3fwd_lpm_sse.h"
+#elif defined(RTE_MACHINE_CPUFLAG_NEON)
+#include "l3fwd_lpm_neon.h"
 #else
 #include "l3fwd_lpm.h"
 #endif
@@ -261,7 +263,7 @@  static inline __attribute__((always_inline)) uint16_t
 			if (nb_rx == 0)
 				continue;
 
-#if defined(__SSE4_1__)
+#if defined(__SSE4_1__) || defined(RTE_MACHINE_CPUFLAG_NEON)
 			l3fwd_lpm_send_packets(nb_rx, pkts_burst,
 						portid, qconf);
 #else
diff --git a/examples/l3fwd/l3fwd_lpm_neon.h b/examples/l3fwd/l3fwd_lpm_neon.h
new file mode 100644
index 0000000..2f047b3
--- /dev/null
+++ b/examples/l3fwd/l3fwd_lpm_neon.h
@@ -0,0 +1,165 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef __L3FWD_LPM_NEON_H__
+#define __L3FWD_LPM_NEON_H__
+
+#include <arm_neon.h>
+
+#include "l3fwd_neon.h"
+
+/*
+ * Read packet_type and destination IPV4 addresses from 4 mbufs.
+ */
+static inline void
+processx4_step1(struct rte_mbuf *pkt[FWDSTEP],
+		int32x4_t *dip,
+		uint32_t *ipv4_flag)
+{
+	struct ipv4_hdr *ipv4_hdr;
+	struct ether_hdr *eth_hdr;
+	int32_t dst[FWDSTEP];
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[0], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	dst[0] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] = pkt[0]->packet_type & RTE_PTYPE_L3_IPV4;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[1], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	dst[1] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] &= pkt[1]->packet_type;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[2], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	dst[2] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] &= pkt[2]->packet_type;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt[3], struct ether_hdr *);
+	ipv4_hdr = (struct ipv4_hdr *)(eth_hdr + 1);
+	dst[3] = ipv4_hdr->dst_addr;
+	ipv4_flag[0] &= pkt[3]->packet_type;
+
+	dip[0] = vld1q_s32(dst);
+}
+
+/*
+ * Lookup into LPM for destination port.
+ * If lookup fails, use incoming port (portid) as destination port.
+ */
+static inline void
+processx4_step2(const struct lcore_conf *qconf,
+		int32x4_t dip,
+		uint32_t ipv4_flag,
+		uint8_t portid,
+		struct rte_mbuf *pkt[FWDSTEP],
+		uint16_t dprt[FWDSTEP])
+{
+	rte_xmm_t dst;
+
+	dip = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(dip)));
+
+	/* if all 4 packets are IPV4. */
+	if (likely(ipv4_flag)) {
+		rte_lpm_lookupx4(qconf->ipv4_lookup_struct, dip, dst.u32,
+			portid);
+		/* get rid of unused upper 16 bit for each dport. */
+		vst1_s16((int16_t *)dprt, vqmovn_s32(dst.x));
+	} else {
+		dst.x = dip;
+		dprt[0] = lpm_get_dst_port_with_ipv4(qconf, pkt[0],
+						     dst.u32[0], portid);
+		dprt[1] = lpm_get_dst_port_with_ipv4(qconf, pkt[1],
+						     dst.u32[1], portid);
+		dprt[2] = lpm_get_dst_port_with_ipv4(qconf, pkt[2],
+						     dst.u32[2], portid);
+		dprt[3] = lpm_get_dst_port_with_ipv4(qconf, pkt[3],
+						     dst.u32[3], portid);
+	}
+}
+
+/*
+ * Buffer optimized handling of packets, invoked
+ * from main_loop.
+ */
+static inline void
+l3fwd_lpm_send_packets(int nb_rx, struct rte_mbuf **pkts_burst,
+			uint8_t portid, struct lcore_conf *qconf)
+{
+	int32_t i, j, pos;
+	uint16_t dst_port[MAX_PKT_BURST];
+	int32x4_t dip[MAX_PKT_BURST / FWDSTEP];
+	uint32_t ipv4_flag[MAX_PKT_BURST / FWDSTEP];
+	const int32_t k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+
+	for (j = 0; j < FWDSTEP && j < nb_rx; j++) {
+		rte_prefetch0(pkts_burst[j]);
+		rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[j],
+					       struct ether_hdr *) + 1);
+	}
+
+	for (j = 0; j != k; j += FWDSTEP) {
+		for (i = 0, pos = j + FWDSTEP; i < FWDSTEP && pos < nb_rx;
+		     i++, pos++) {
+			rte_prefetch0(pkts_burst[pos]);
+			rte_prefetch0(rte_pktmbuf_mtod(pkts_burst[pos],
+						       struct ether_hdr *) + 1);
+		}
+		processx4_step1(&pkts_burst[j], &dip[j / FWDSTEP],
+				&ipv4_flag[j / FWDSTEP]);
+
+		processx4_step2(qconf, dip[j / FWDSTEP],
+				ipv4_flag[j / FWDSTEP], portid, &pkts_burst[j],
+				&dst_port[j]);
+	}
+
+	/* Classify last up to 3 packets one by one */
+	switch (nb_rx % FWDSTEP) {
+	case 3:
+		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		j++;
+		/* fallthrough */
+	case 2:
+		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		j++;
+		/* fallthrough */
+	case 1:
+		dst_port[j] = lpm_get_dst_port(qconf, pkts_burst[j], portid);
+		j++;
+	}
+
+	send_packets_multi(qconf, pkts_burst, dst_port, nb_rx);
+}
+
+#endif /* __L3FWD_LPM_NEON_H__ */
diff --git a/examples/l3fwd/l3fwd_neon.h b/examples/l3fwd/l3fwd_neon.h
new file mode 100644
index 0000000..75c8976
--- /dev/null
+++ b/examples/l3fwd/l3fwd_neon.h
@@ -0,0 +1,259 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016 Intel Corporation. All rights reserved.
+ *   Copyright(c) 2017, Linaro Limited
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+
+#ifndef _L3FWD_NEON_H_
+#define _L3FWD_NEON_H_
+
+#include "l3fwd.h"
+#include "l3fwd_common.h"
+
+/*
+ * Update source and destination MAC addresses in the ethernet header.
+ * Perform RFC1812 checks and updates for IPV4 packets.
+ */
+static inline void
+processx4_step3(struct rte_mbuf *pkt[FWDSTEP], uint16_t dst_port[FWDSTEP])
+{
+	uint32x4_t te[FWDSTEP];
+	uint32x4_t ve[FWDSTEP];
+	uint32_t *p[FWDSTEP];
+
+	p[0] = rte_pktmbuf_mtod(pkt[0], uint32_t *);
+	p[1] = rte_pktmbuf_mtod(pkt[1], uint32_t *);
+	p[2] = rte_pktmbuf_mtod(pkt[2], uint32_t *);
+	p[3] = rte_pktmbuf_mtod(pkt[3], uint32_t *);
+
+	ve[0] = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
+	te[0] = vld1q_u32(p[0]);
+
+	ve[1] = vreinterpretq_u32_s32(val_eth[dst_port[1]]);
+	te[1] = vld1q_u32(p[1]);
+
+	ve[2] = vreinterpretq_u32_s32(val_eth[dst_port[2]]);
+	te[2] = vld1q_u32(p[2]);
+
+	ve[3] = vreinterpretq_u32_s32(val_eth[dst_port[3]]);
+	te[3] = vld1q_u32(p[3]);
+
+	/* Update last 4 bytes */
+	ve[0] = vsetq_lane_u32(vgetq_lane_u32(te[0], 3), ve[0], 3);
+	ve[1] = vsetq_lane_u32(vgetq_lane_u32(te[1], 3), ve[1], 3);
+	ve[2] = vsetq_lane_u32(vgetq_lane_u32(te[2], 3), ve[2], 3);
+	ve[3] = vsetq_lane_u32(vgetq_lane_u32(te[3], 3), ve[3], 3);
+
+	vst1q_u32(p[0], ve[0]);
+	vst1q_u32(p[1], ve[1]);
+	vst1q_u32(p[2], ve[2]);
+	vst1q_u32(p[3], ve[3]);
+
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[0] + 1),
+		&dst_port[0], pkt[0]->packet_type);
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[1] + 1),
+		&dst_port[1], pkt[1]->packet_type);
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[2] + 1),
+		&dst_port[2], pkt[2]->packet_type);
+	rfc1812_process((struct ipv4_hdr *)((struct ether_hdr *)p[3] + 1),
+		&dst_port[3], pkt[3]->packet_type);
+}
+
+/*
+ * Group consecutive packets with the same destination port in bursts of 4.
+ * Suppose we have array of destionation ports:
+ * dst_port[] = {a, b, c, d,, e, ... }
+ * dp1 should contain: <a, b, c, d>, dp2: <b, c, d, e>.
+ * We doing 4 comparisions at once and the result is 4 bit mask.
+ * This mask is used as an index into prebuild array of pnum values.
+ */
+static inline uint16_t *
+port_groupx4(uint16_t pn[FWDSTEP + 1], uint16_t *lp, uint16x8_t dp1,
+	     uint16x8_t dp2)
+{
+	union {
+		uint16_t u16[FWDSTEP + 1];
+		uint64_t u64;
+	} *pnum = (void *)pn;
+
+	int32_t v;
+	uint16x8_t mask = {1, 2, 4, 8, 0, 0, 0, 0};
+
+	dp1 = vceqq_u16(dp1, dp2);
+	dp1 = vandq_u16(dp1, mask);
+	v = vaddvq_u16(dp1);
+
+	/* update last port counter. */
+	lp[0] += gptbl[v].lpv;
+
+	/* if dest port value has changed. */
+	if (v != GRPMSK) {
+		pnum->u64 = gptbl[v].pnum;
+		pnum->u16[FWDSTEP] = 1;
+		lp = pnum->u16 + gptbl[v].idx;
+	}
+
+	return lp;
+}
+
+/**
+ * Process one packet:
+ * Update source and destination MAC addresses in the ethernet header.
+ * Perform RFC1812 checks and updates for IPV4 packets.
+ */
+static inline void
+process_packet(struct rte_mbuf *pkt, uint16_t *dst_port)
+{
+	struct ether_hdr *eth_hdr;
+	uint32x4_t te, ve;
+
+	eth_hdr = rte_pktmbuf_mtod(pkt, struct ether_hdr *);
+
+	te = vld1q_u32((uint32_t *)eth_hdr);
+	ve = vreinterpretq_u32_s32(val_eth[dst_port[0]]);
+
+
+	rfc1812_process((struct ipv4_hdr *)(eth_hdr + 1), dst_port,
+			pkt->packet_type);
+
+	ve = vsetq_lane_u32(vgetq_lane_u32(te, 3), ve, 3);
+	vst1q_u32((uint32_t *)eth_hdr, ve);
+}
+
+/**
+ * Send packets burst from pkts_burst to the ports in dst_port array
+ */
+static inline __attribute__((always_inline)) void
+send_packets_multi(struct lcore_conf *qconf, struct rte_mbuf **pkts_burst,
+		uint16_t dst_port[MAX_PKT_BURST], int nb_rx)
+{
+	int32_t k;
+	int j = 0;
+	uint16_t dlp;
+	uint16_t *lp;
+	uint16_t pnum[MAX_PKT_BURST + 1];
+
+	/*
+	 * Finish packet processing and group consecutive
+	 * packets with the same destination port.
+	 */
+	k = RTE_ALIGN_FLOOR(nb_rx, FWDSTEP);
+	if (k != 0) {
+		uint16x8_t dp1, dp2;
+
+		lp = pnum;
+		lp[0] = 1;
+
+		processx4_step3(pkts_burst, dst_port);
+
+		/* dp1: <d[0], d[1], d[2], d[3], ... > */
+		dp1 = vld1q_u16(dst_port);
+
+		for (j = FWDSTEP; j != k; j += FWDSTEP) {
+			processx4_step3(&pkts_burst[j], &dst_port[j]);
+
+			/*
+			 * dp2:
+			 * <d[j-3], d[j-2], d[j-1], d[j], ... >
+			 */
+			dp2 = vld1q_u16(&dst_port[j - FWDSTEP + 1]);
+			lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
+
+			/*
+			 * dp1:
+			 * <d[j], d[j+1], d[j+2], d[j+3], ... >
+			 */
+			dp1 = vextq_u16(dp1, dp1, FWDSTEP - 1);
+		}
+
+		/*
+		 * dp2: <d[j-3], d[j-2], d[j-1], d[j-1], ... >
+		 */
+		dp2 = vextq_u16(dp1, dp1, 1);
+		dp2 = vsetq_lane_u16(vgetq_lane_u16(dp2, 2), dp2, 3);
+		lp  = port_groupx4(&pnum[j - FWDSTEP], lp, dp1, dp2);
+
+		/*
+		 * remove values added by the last repeated
+		 * dst port.
+		 */
+		lp[0]--;
+		dlp = dst_port[j - 1];
+	} else {
+		/* set dlp and lp to the never used values. */
+		dlp = BAD_PORT - 1;
+		lp = pnum + MAX_PKT_BURST;
+	}
+
+	/* Process up to last 3 packets one by one. */
+	switch (nb_rx % FWDSTEP) {
+	case 3:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+		/* fallthrough */
+	case 2:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+		/* fallthrough */
+	case 1:
+		process_packet(pkts_burst[j], dst_port + j);
+		GROUP_PORT_STEP(dlp, dst_port, lp, pnum, j);
+		j++;
+	}
+
+	/*
+	 * Send packets out, through destination port.
+	 * Consecutive packets with the same destination port
+	 * are already grouped together.
+	 * If destination port for the packet equals BAD_PORT,
+	 * then free the packet without sending it out.
+	 */
+	for (j = 0; j < nb_rx; j += k) {
+
+		int32_t m;
+		uint16_t pn;
+
+		pn = dst_port[j];
+		k = pnum[j];
+
+		if (likely(pn != BAD_PORT))
+			send_packetsx4(qconf, pn, pkts_burst + j, k);
+		else
+			for (m = j; m != j + k; m++)
+				rte_pktmbuf_free(pkts_burst[m]);
+
+	}
+}
+
+#endif /* _L3FWD_NEON_H_ */