diff mbox series

[v3,18/18] lpm: choose vector path at runtime

Message ID	20200930130415.11211-19-ciara.power@intel.com (mailing list archive)
State	Superseded, archived
Delegated to:	David Marchand
Headers	IronPort-SDR: FV+b+4P7kMAuRCOiVVKgUFcw1+WSZogbbXOeRQJmkh0af7lDTioAra7QBjh4mVDQnRH++BK7we CxocF9TM+Mjg== IronPort-SDR: GUzG9J+IIRnW/o3gdcIxhUIFJBYvgysAv/oJSiBiQaTNYyC/tO4ejejVoxTaAmdsVCfx2K2hED AxDIjaY6zCCw== From: Ciara Power <ciara.power@intel.com> To: dev@dpdk.org Cc: Ciara Power <ciara.power@intel.com>, Bruce Richardson <bruce.richardson@intel.com>, Vladimir Medvedkin <vladimir.medvedkin@intel.com>, Jerin Jacob <jerinj@marvell.com>, Ruifeng Wang <ruifeng.wang@arm.com> Date: Wed, 30 Sep 2020 14:04:14 +0100 Message-Id: <20200930130415.11211-19-ciara.power@intel.com> In-Reply-To: <20200930130415.11211-1-ciara.power@intel.com> References: <20200807155859.63888-1-ciara.power@intel.com> <20200930130415.11211-1-ciara.power@intel.com> Subject: [dpdk-dev] [PATCH v3 18/18] lpm: choose vector path at runtime Precedence: list Errors-To: dev-bounces@dpdk.org Sender: "dev" <dev-bounces@dpdk.org>
Series	add max SIMD bitwidth to EAL \| [v3,00/18] add max SIMD bitwidth to EAL [v3,01/18] eal: add max SIMD bitwidth [v3,02/18] eal: add default SIMD bitwidth values [v3,03/18] doc: add detail on using max SIMD bitwidth [v3,04/18] net/i40e: add checks for max SIMD bitwidth [v3,05/18] net/axgbe: add checks for max SIMD bitwidth [v3,06/18] net/bnxt: add checks for max SIMD bitwidth [v3,07/18] net/enic: add checks for max SIMD bitwidth [v3,08/18] net/fm10k: add checks for max SIMD bitwidth [v3,09/18] net/iavf: add checks for max SIMD bitwidth [v3,10/18] net/ice: add checks for max SIMD bitwidth [v3,11/18] net/ixgbe: add checks for max SIMD bitwidth [v3,12/18] net/mlx5: add checks for max SIMD bitwidth [v3,13/18] net/virtio: add checks for max SIMD bitwidth [v3,14/18] distributor: add checks for max SIMD bitwidth [v3,15/18] member: add checks for max SIMD bitwidth [v3,16/18] efd: add checks for max SIMD bitwidth [v3,17/18] net: add checks for max SIMD bitwidth [v3,18/18] lpm: choose vector path at runtime

Checks

Context	Check	Description
ci/checkpatch	success	coding style OK
ci/Intel-compilation	success	Compilation OK
ci/travis-robot	success	Travis build: passed

Commit Message

Power, Ciara Sept. 30, 2020, 1:04 p.m. UTC

  When choosing the vector path, max SIMD bitwidth is now checked to
ensure a vector path is allowable. To do this, rather than the vector
lookup functions being called directly from apps, a generic lookup
function is called which will call the vector functions if suitable.

Signed-off-by: Ciara Power <ciara.power@intel.com>
---
 lib/librte_lpm/rte_lpm.h         | 57 ++++++++++++++++++++++++++------
 lib/librte_lpm/rte_lpm_altivec.h |  2 +-
 lib/librte_lpm/rte_lpm_neon.h    |  2 +-
 lib/librte_lpm/rte_lpm_sse.h     |  2 +-
 4 files changed, 50 insertions(+), 13 deletions(-)

Comments

Vladimir Medvedkin Sept. 30, 2020, 1:54 p.m. UTC | #1

Hi Ciara,


On 30/09/2020 14:04, Ciara Power wrote:
> When choosing the vector path, max SIMD bitwidth is now checked to
> ensure a vector path is allowable. To do this, rather than the vector
> lookup functions being called directly from apps, a generic lookup
> function is called which will call the vector functions if suitable.
> 
> Signed-off-by: Ciara Power <ciara.power@intel.com>
> ---
>   lib/librte_lpm/rte_lpm.h         | 57 ++++++++++++++++++++++++++------
>   lib/librte_lpm/rte_lpm_altivec.h |  2 +-
>   lib/librte_lpm/rte_lpm_neon.h    |  2 +-
>   lib/librte_lpm/rte_lpm_sse.h     |  2 +-
>   4 files changed, 50 insertions(+), 13 deletions(-)
> 
> diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
> index 03da2d37e0..edba7cafd5 100644
> --- a/lib/librte_lpm/rte_lpm.h
> +++ b/lib/librte_lpm/rte_lpm.h
> @@ -397,8 +397,18 @@ rte_lpm_lookup_bulk_func(const struct rte_lpm *lpm, const uint32_t *ips,
>   /* Mask four results. */
>   #define	 RTE_LPM_MASKX4_RES	UINT64_C(0x00ffffff00ffffff)
>   
> +#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
> +#include "rte_lpm_neon.h"
> +#elif defined(RTE_ARCH_PPC_64)
> +#include "rte_lpm_altivec.h"
> +#else
> +#include "rte_lpm_sse.h"
> +#endif
> +
>   /**
> - * Lookup four IP addresses in an LPM table.
> + * Lookup four IP addresses in an LPM table individually by calling the
> + * lookup function for each ip. This is used when lookupx4 is called but
> + * the vector path is not suitable.
>    *
>    * @param lpm
>    *   LPM object handle
> @@ -417,16 +427,43 @@ rte_lpm_lookup_bulk_func(const struct rte_lpm *lpm, const uint32_t *ips,
>    *   if lookup would fail.
>    */
>   static inline void
> -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> -	uint32_t defv);
> +rte_lpm_lookupx4_scalar(struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> +	uint32_t defv)
> +{
> +	int i;
> +	for (i = 0; i < 4; i++)
> +		if (rte_lpm_lookup(lpm, ((uint32_t *) &ip)[i], &hop[i]) < 0)
> +			hop[i] = defv; /* lookupx4 expected to set on failure */
> +}
>   
> -#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
> -#include "rte_lpm_neon.h"
> -#elif defined(RTE_ARCH_PPC_64)
> -#include "rte_lpm_altivec.h"
> -#else
> -#include "rte_lpm_sse.h"
> -#endif
> +/**
> + * Lookup four IP addresses in an LPM table.
> + *
> + * @param lpm
> + *   LPM object handle
> + * @param ip
> + *   Four IPs to be looked up in the LPM table
> + * @param hop
> + *   Next hop of the most specific rule found for IP (valid on lookup hit only).
> + *   This is an 4 elements array of two byte values.
> + *   If the lookup was successful for the given IP, then least significant byte
> + *   of the corresponding element is the  actual next hop and the most
> + *   significant byte is zero.
> + *   If the lookup for the given IP failed, then corresponding element would
> + *   contain default value, see description of then next parameter.
> + * @param defv
> + *   Default value to populate into corresponding element of hop[] array,
> + *   if lookup would fail.
> + */
> +static inline void
> +rte_lpm_lookupx4(struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> +	uint32_t defv)
> +{
> +	if (rte_get_max_simd_bitwidth() >= RTE_MAX_128_SIMD)
> +		rte_lpm_lookupx4_vec(lpm, ip, hop, defv);
> +	else
> +		rte_lpm_lookupx4_scalar(lpm, ip, hop, defv);
> +}

I'm afraid this will lead to a drop in performance. rte_lpm_lookupx4 is 
used in the hot path, and a bulk size is too small to amortize the cost 
of adding this extra logic.

>   
>   #ifdef __cplusplus
>   }
> diff --git a/lib/librte_lpm/rte_lpm_altivec.h b/lib/librte_lpm/rte_lpm_altivec.h
> index 228c41b38e..82142d3351 100644
> --- a/lib/librte_lpm/rte_lpm_altivec.h
> +++ b/lib/librte_lpm/rte_lpm_altivec.h
> @@ -16,7 +16,7 @@ extern "C" {
>   #endif
>   
>   static inline void
> -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> +rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
>   	uint32_t defv)
>   {
>   	vector signed int i24;
> diff --git a/lib/librte_lpm/rte_lpm_neon.h b/lib/librte_lpm/rte_lpm_neon.h
> index 6c131d3125..14b184515d 100644
> --- a/lib/librte_lpm/rte_lpm_neon.h
> +++ b/lib/librte_lpm/rte_lpm_neon.h
> @@ -16,7 +16,7 @@ extern "C" {
>   #endif
>   
>   static inline void
> -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> +rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
>   	uint32_t defv)
>   {
>   	uint32x4_t i24;
> diff --git a/lib/librte_lpm/rte_lpm_sse.h b/lib/librte_lpm/rte_lpm_sse.h
> index 44770b6ff8..cb5477c6cf 100644
> --- a/lib/librte_lpm/rte_lpm_sse.h
> +++ b/lib/librte_lpm/rte_lpm_sse.h
> @@ -15,7 +15,7 @@ extern "C" {
>   #endif
>   
>   static inline void
> -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> +rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
>   	uint32_t defv)
>   {
>   	__m128i i24;
>

Ananyev, Konstantin Oct. 8, 2020, 2:40 p.m. UTC | #2

> 
> Hi Ciara,
> 
> 
> On 30/09/2020 14:04, Ciara Power wrote:
> > When choosing the vector path, max SIMD bitwidth is now checked to
> > ensure a vector path is allowable. To do this, rather than the vector
> > lookup functions being called directly from apps, a generic lookup
> > function is called which will call the vector functions if suitable.
> >
> > Signed-off-by: Ciara Power <ciara.power@intel.com>
> > ---
> >   lib/librte_lpm/rte_lpm.h         | 57 ++++++++++++++++++++++++++------
> >   lib/librte_lpm/rte_lpm_altivec.h |  2 +-
> >   lib/librte_lpm/rte_lpm_neon.h    |  2 +-
> >   lib/librte_lpm/rte_lpm_sse.h     |  2 +-
> >   4 files changed, 50 insertions(+), 13 deletions(-)
> >
> > diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
> > index 03da2d37e0..edba7cafd5 100644
> > --- a/lib/librte_lpm/rte_lpm.h
> > +++ b/lib/librte_lpm/rte_lpm.h
> > @@ -397,8 +397,18 @@ rte_lpm_lookup_bulk_func(const struct rte_lpm *lpm, const uint32_t *ips,
> >   /* Mask four results. */
> >   #define	 RTE_LPM_MASKX4_RES	UINT64_C(0x00ffffff00ffffff)
> >
> > +#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
> > +#include "rte_lpm_neon.h"
> > +#elif defined(RTE_ARCH_PPC_64)
> > +#include "rte_lpm_altivec.h"
> > +#else
> > +#include "rte_lpm_sse.h"
> > +#endif
> > +
> >   /**
> > - * Lookup four IP addresses in an LPM table.
> > + * Lookup four IP addresses in an LPM table individually by calling the
> > + * lookup function for each ip. This is used when lookupx4 is called but
> > + * the vector path is not suitable.
> >    *
> >    * @param lpm
> >    *   LPM object handle
> > @@ -417,16 +427,43 @@ rte_lpm_lookup_bulk_func(const struct rte_lpm *lpm, const uint32_t *ips,
> >    *   if lookup would fail.
> >    */
> >   static inline void
> > -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> > -	uint32_t defv);
> > +rte_lpm_lookupx4_scalar(struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> > +	uint32_t defv)
> > +{
> > +	int i;
> > +	for (i = 0; i < 4; i++)
> > +		if (rte_lpm_lookup(lpm, ((uint32_t *) &ip)[i], &hop[i]) < 0)
> > +			hop[i] = defv; /* lookupx4 expected to set on failure */
> > +}
> >
> > -#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
> > -#include "rte_lpm_neon.h"
> > -#elif defined(RTE_ARCH_PPC_64)
> > -#include "rte_lpm_altivec.h"
> > -#else
> > -#include "rte_lpm_sse.h"
> > -#endif
> > +/**
> > + * Lookup four IP addresses in an LPM table.
> > + *
> > + * @param lpm
> > + *   LPM object handle
> > + * @param ip
> > + *   Four IPs to be looked up in the LPM table
> > + * @param hop
> > + *   Next hop of the most specific rule found for IP (valid on lookup hit only).
> > + *   This is an 4 elements array of two byte values.
> > + *   If the lookup was successful for the given IP, then least significant byte
> > + *   of the corresponding element is the  actual next hop and the most
> > + *   significant byte is zero.
> > + *   If the lookup for the given IP failed, then corresponding element would
> > + *   contain default value, see description of then next parameter.
> > + * @param defv
> > + *   Default value to populate into corresponding element of hop[] array,
> > + *   if lookup would fail.
> > + */
> > +static inline void
> > +rte_lpm_lookupx4(struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> > +	uint32_t defv)
> > +{
> > +	if (rte_get_max_simd_bitwidth() >= RTE_MAX_128_SIMD)
> > +		rte_lpm_lookupx4_vec(lpm, ip, hop, defv);
> > +	else
> > +		rte_lpm_lookupx4_scalar(lpm, ip, hop, defv);
> > +}
> 
> I'm afraid this will lead to a drop in performance. rte_lpm_lookupx4 is
> used in the hot path, and a bulk size is too small to amortize the cost
> of adding this extra logic.

I do share Vladimir's concern regarding performance here.
As I said in other mail - it seems not much point to insert
these checks into inline SSE specific function, as SSE is enabled
by default for all x86 builds. 

As another more generic thought - might be better to avoid
these checks in other public SIMD-specific inline functions (if any).
If such function get called from some .c, then at least such SIMD
ISA is already enabled for that .c file and I think this check should be
left for caller to do.     
 
> >
> >   #ifdef __cplusplus
> >   }
> > diff --git a/lib/librte_lpm/rte_lpm_altivec.h b/lib/librte_lpm/rte_lpm_altivec.h
> > index 228c41b38e..82142d3351 100644
> > --- a/lib/librte_lpm/rte_lpm_altivec.h
> > +++ b/lib/librte_lpm/rte_lpm_altivec.h
> > @@ -16,7 +16,7 @@ extern "C" {
> >   #endif
> >
> >   static inline void
> > -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> > +rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> >   	uint32_t defv)
> >   {
> >   	vector signed int i24;
> > diff --git a/lib/librte_lpm/rte_lpm_neon.h b/lib/librte_lpm/rte_lpm_neon.h
> > index 6c131d3125..14b184515d 100644
> > --- a/lib/librte_lpm/rte_lpm_neon.h
> > +++ b/lib/librte_lpm/rte_lpm_neon.h
> > @@ -16,7 +16,7 @@ extern "C" {
> >   #endif
> >
> >   static inline void
> > -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> > +rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> >   	uint32_t defv)
> >   {
> >   	uint32x4_t i24;
> > diff --git a/lib/librte_lpm/rte_lpm_sse.h b/lib/librte_lpm/rte_lpm_sse.h
> > index 44770b6ff8..cb5477c6cf 100644
> > --- a/lib/librte_lpm/rte_lpm_sse.h
> > +++ b/lib/librte_lpm/rte_lpm_sse.h
> > @@ -15,7 +15,7 @@ extern "C" {
> >   #endif
> >
> >   static inline void
> > -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> > +rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> >   	uint32_t defv)
> >   {
> >   	__m128i i24;
> >
> 
> --
> Regards,
> Vladimir

David Marchand Oct. 8, 2020, 3:19 p.m. UTC | #3

On Wed, Sep 30, 2020 at 3:14 PM Ciara Power <ciara.power@intel.com> wrote:
>
> When choosing the vector path, max SIMD bitwidth is now checked to
> ensure a vector path is allowable. To do this, rather than the vector
> lookup functions being called directly from apps, a generic lookup
> function is called which will call the vector functions if suitable.
>
> Signed-off-by: Ciara Power <ciara.power@intel.com>

Got a build error on this patch with ./devtools/test-meson-builds.sh
("gcc-shared" target):

[2/3] Compiling C object
'examples/c590b3c@@dpdk-l3fwd-thread@exe/performance-thread_l3fwd-thread_main.c.o'.
FAILED: examples/c590b3c@@dpdk-l3fwd-thread@exe/performance-thread_l3fwd-thread_main.c.o
ccache gcc -Iexamples/c590b3c@@dpdk-l3fwd-thread@exe -Iexamples
-I../../dpdk/examples -Iexamples/performance-thread/l3fwd-thread
-I../../dpdk/examples/performance-thread/l3fwd-thread
-I../../dpdk/examples/performance-thread/l3fwd-thread/../common
-I../../dpdk/examples/performance-thread/l3fwd-thread/../common/arch/x86
-I. -I../../dpdk/ -Iconfig -I../../dpdk/config
-Ilib/librte_eal/include -I../../dpdk/lib/librte_eal/include
-Ilib/librte_eal/linux/include
-I../../dpdk/lib/librte_eal/linux/include -Ilib/librte_eal/x86/include
-I../../dpdk/lib/librte_eal/x86/include -Ilib/librte_eal/common
-I../../dpdk/lib/librte_eal/common -Ilib/librte_eal
-I../../dpdk/lib/librte_eal -Ilib/librte_kvargs
-I../../dpdk/lib/librte_kvargs
-Ilib/librte_telemetry/../librte_metrics
-I../../dpdk/lib/librte_telemetry/../librte_metrics
-Ilib/librte_telemetry -I../../dpdk/lib/librte_telemetry
-Ilib/librte_mempool -I../../dpdk/lib/librte_mempool -Ilib/librte_ring
-I../../dpdk/lib/librte_ring -Ilib/librte_net
-I../../dpdk/lib/librte_net -Ilib/librte_mbuf
-I../../dpdk/lib/librte_mbuf -Ilib/librte_ethdev
-I../../dpdk/lib/librte_ethdev -Ilib/librte_meter
-I../../dpdk/lib/librte_meter -Ilib/librte_cmdline
-I../../dpdk/lib/librte_cmdline -Ilib/librte_timer
-I../../dpdk/lib/librte_timer -Ilib/librte_lpm
-I../../dpdk/lib/librte_lpm -Ilib/librte_hash
-I../../dpdk/lib/librte_hash -Ilib/librte_rcu
-I../../dpdk/lib/librte_rcu
-I/home/dmarchan/intel-ipsec-mb/install/include
-fdiagnostics-color=always -pipe -D_FILE_OFFSET_BITS=64 -Wall
-Winvalid-pch -Werror -O2 -g -include rte_config.h -Wextra -Wcast-qual
-Wdeprecated -Wformat-nonliteral -Wformat-security
-Wmissing-declarations -Wmissing-prototypes -Wnested-externs
-Wold-style-definition -Wpointer-arith -Wsign-compare
-Wstrict-prototypes -Wundef -Wwrite-strings
-Wno-address-of-packed-member -Wno-packed-not-aligned
-Wno-missing-field-initializers -D_GNU_SOURCE -march=native
-Wno-format-truncation -DALLOW_EXPERIMENTAL_API -MD -MQ
'examples/c590b3c@@dpdk-l3fwd-thread@exe/performance-thread_l3fwd-thread_main.c.o'
-MF 'examples/c590b3c@@dpdk-l3fwd-thread@exe/performance-thread_l3fwd-thread_main.c.o.d'
-o 'examples/c590b3c@@dpdk-l3fwd-thread@exe/performance-thread_l3fwd-thread_main.c.o'
-c ../../dpdk/examples/performance-thread/l3fwd-thread/main.c
#‘target_mem_ref’ not supported by expression#’In file included from
../../dpdk/examples/performance-thread/l3fwd-thread/main.c:133:
../../dpdk/examples/performance-thread/l3fwd-thread/main.c: In
function ‘process_burst’:
../../dpdk/lib/librte_lpm/rte_lpm.h:435:7: error:  may be used
uninitialized in this function [-Werror=maybe-uninitialized]
  435 |   if (rte_lpm_lookup(lpm, ((uint32_t *) &ip)[i], &hop[i]) < 0)
      |       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
cc1: all warnings being treated as errors
ninja: build stopped: subcommand failed.

gcc (GCC) 9.3.1 20200408 (Red Hat 9.3.1-2)

David Marchand Oct. 9, 2020, 12:37 p.m. UTC | #4

On Thu, Oct 8, 2020 at 5:19 PM David Marchand <david.marchand@redhat.com> wrote:
>
> On Wed, Sep 30, 2020 at 3:14 PM Ciara Power <ciara.power@intel.com> wrote:
> >
> > When choosing the vector path, max SIMD bitwidth is now checked to
> > ensure a vector path is allowable. To do this, rather than the vector
> > lookup functions being called directly from apps, a generic lookup
> > function is called which will call the vector functions if suitable.
> >
> > Signed-off-by: Ciara Power <ciara.power@intel.com>
>
> Got a build error on this patch with ./devtools/test-meson-builds.sh
> ("gcc-shared" target):
>
> [2/3] Compiling C object
> 'examples/c590b3c@@dpdk-l3fwd-thread@exe/performance-thread_l3fwd-thread_main.c.o'.
> FAILED: examples/c590b3c@@dpdk-l3fwd-thread@exe/performance-thread_l3fwd-thread_main.c.o
> ccache gcc -Iexamples/c590b3c@@dpdk-l3fwd-thread@exe -Iexamples
> -I../../dpdk/examples -Iexamples/performance-thread/l3fwd-thread
> -I../../dpdk/examples/performance-thread/l3fwd-thread
> -I../../dpdk/examples/performance-thread/l3fwd-thread/../common
> -I../../dpdk/examples/performance-thread/l3fwd-thread/../common/arch/x86
> -I. -I../../dpdk/ -Iconfig -I../../dpdk/config
> -Ilib/librte_eal/include -I../../dpdk/lib/librte_eal/include
> -Ilib/librte_eal/linux/include
> -I../../dpdk/lib/librte_eal/linux/include -Ilib/librte_eal/x86/include
> -I../../dpdk/lib/librte_eal/x86/include -Ilib/librte_eal/common
> -I../../dpdk/lib/librte_eal/common -Ilib/librte_eal
> -I../../dpdk/lib/librte_eal -Ilib/librte_kvargs
> -I../../dpdk/lib/librte_kvargs
> -Ilib/librte_telemetry/../librte_metrics
> -I../../dpdk/lib/librte_telemetry/../librte_metrics
> -Ilib/librte_telemetry -I../../dpdk/lib/librte_telemetry
> -Ilib/librte_mempool -I../../dpdk/lib/librte_mempool -Ilib/librte_ring
> -I../../dpdk/lib/librte_ring -Ilib/librte_net
> -I../../dpdk/lib/librte_net -Ilib/librte_mbuf
> -I../../dpdk/lib/librte_mbuf -Ilib/librte_ethdev
> -I../../dpdk/lib/librte_ethdev -Ilib/librte_meter
> -I../../dpdk/lib/librte_meter -Ilib/librte_cmdline
> -I../../dpdk/lib/librte_cmdline -Ilib/librte_timer
> -I../../dpdk/lib/librte_timer -Ilib/librte_lpm
> -I../../dpdk/lib/librte_lpm -Ilib/librte_hash
> -I../../dpdk/lib/librte_hash -Ilib/librte_rcu
> -I../../dpdk/lib/librte_rcu
> -I/home/dmarchan/intel-ipsec-mb/install/include
> -fdiagnostics-color=always -pipe -D_FILE_OFFSET_BITS=64 -Wall
> -Winvalid-pch -Werror -O2 -g -include rte_config.h -Wextra -Wcast-qual
> -Wdeprecated -Wformat-nonliteral -Wformat-security
> -Wmissing-declarations -Wmissing-prototypes -Wnested-externs
> -Wold-style-definition -Wpointer-arith -Wsign-compare
> -Wstrict-prototypes -Wundef -Wwrite-strings
> -Wno-address-of-packed-member -Wno-packed-not-aligned
> -Wno-missing-field-initializers -D_GNU_SOURCE -march=native
> -Wno-format-truncation -DALLOW_EXPERIMENTAL_API -MD -MQ
> 'examples/c590b3c@@dpdk-l3fwd-thread@exe/performance-thread_l3fwd-thread_main.c.o'
> -MF 'examples/c590b3c@@dpdk-l3fwd-thread@exe/performance-thread_l3fwd-thread_main.c.o.d'
> -o 'examples/c590b3c@@dpdk-l3fwd-thread@exe/performance-thread_l3fwd-thread_main.c.o'
> -c ../../dpdk/examples/performance-thread/l3fwd-thread/main.c
> #‘target_mem_ref’ not supported by expression#’In file included from
> ../../dpdk/examples/performance-thread/l3fwd-thread/main.c:133:
> ../../dpdk/examples/performance-thread/l3fwd-thread/main.c: In
> function ‘process_burst’:
> ../../dpdk/lib/librte_lpm/rte_lpm.h:435:7: error:  may be used
> uninitialized in this function [-Werror=maybe-uninitialized]
>   435 |   if (rte_lpm_lookup(lpm, ((uint32_t *) &ip)[i], &hop[i]) < 0)
>       |       ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> cc1: all warnings being treated as errors
> ninja: build stopped: subcommand failed.
>
> gcc (GCC) 9.3.1 20200408 (Red Hat 9.3.1-2)

On the build issue, I guess you can use a rte_xmm_t passe-plat.

diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
index edba7cafd5..43db784a76 100644
--- a/lib/librte_lpm/rte_lpm.h
+++ b/lib/librte_lpm/rte_lpm.h
@@ -430,10 +430,14 @@ static inline void
 rte_lpm_lookupx4_scalar(struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
        uint32_t defv)
 {
-       int i;
-       for (i = 0; i < 4; i++)
-               if (rte_lpm_lookup(lpm, ((uint32_t *) &ip)[i], &hop[i]) < 0)
+       unsigned int i;
+       rte_xmm_t _ip;
+
+       _ip.x = ip;
+       for (i = 0; i < RTE_DIM(_ip.u32); i++) {
+               if (rte_lpm_lookup(lpm, _ip.u32[i], &hop[i]) < 0)
                        hop[i] = defv; /* lookupx4 expected to set on failure */
+       }
 }

 /**

Power, Ciara Oct. 9, 2020, 2:31 p.m. UTC | #5

Hi Konstantin,


>-----Original Message-----
>From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
>Sent: Thursday 8 October 2020 15:40
>To: Medvedkin, Vladimir <vladimir.medvedkin@intel.com>; Power, Ciara
><ciara.power@intel.com>; dev@dpdk.org
>Cc: Richardson, Bruce <bruce.richardson@intel.com>; Jerin Jacob
><jerinj@marvell.com>; Ruifeng Wang <ruifeng.wang@arm.com>
>Subject: RE: [dpdk-dev] [PATCH v3 18/18] lpm: choose vector path at runtime
>
>>
>> Hi Ciara,
>>
>>
>> On 30/09/2020 14:04, Ciara Power wrote:
>> > When choosing the vector path, max SIMD bitwidth is now checked to
>> > ensure a vector path is allowable. To do this, rather than the
>> > vector lookup functions being called directly from apps, a generic
>> > lookup function is called which will call the vector functions if suitable.
>> >
>> > Signed-off-by: Ciara Power <ciara.power@intel.com>
>> > ---
>> >   lib/librte_lpm/rte_lpm.h         | 57 ++++++++++++++++++++++++++------
>> >   lib/librte_lpm/rte_lpm_altivec.h |  2 +-
>> >   lib/librte_lpm/rte_lpm_neon.h    |  2 +-
>> >   lib/librte_lpm/rte_lpm_sse.h     |  2 +-
>> >   4 files changed, 50 insertions(+), 13 deletions(-)
>> >
>> > diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
>> > index 03da2d37e0..edba7cafd5 100644
>> > --- a/lib/librte_lpm/rte_lpm.h
>> > +++ b/lib/librte_lpm/rte_lpm.h
>> > @@ -397,8 +397,18 @@ rte_lpm_lookup_bulk_func(const struct rte_lpm
>*lpm, const uint32_t *ips,
>> >   /* Mask four results. */
>> >   #define	 RTE_LPM_MASKX4_RES	UINT64_C(0x00ffffff00ffffff)
>> >
>> > +#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64) #include
>> > +"rte_lpm_neon.h"
>> > +#elif defined(RTE_ARCH_PPC_64)
>> > +#include "rte_lpm_altivec.h"
>> > +#else
>> > +#include "rte_lpm_sse.h"
>> > +#endif
>> > +
>> >   /**
>> > - * Lookup four IP addresses in an LPM table.
>> > + * Lookup four IP addresses in an LPM table individually by calling
>> > + the
>> > + * lookup function for each ip. This is used when lookupx4 is
>> > + called but
>> > + * the vector path is not suitable.
>> >    *
>> >    * @param lpm
>> >    *   LPM object handle
>> > @@ -417,16 +427,43 @@ rte_lpm_lookup_bulk_func(const struct
>rte_lpm *lpm, const uint32_t *ips,
>> >    *   if lookup would fail.
>> >    */
>> >   static inline void
>> > -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
>> > -	uint32_t defv);
>> > +rte_lpm_lookupx4_scalar(struct rte_lpm *lpm, xmm_t ip, uint32_t
>hop[4],
>> > +	uint32_t defv)
>> > +{
>> > +	int i;
>> > +	for (i = 0; i < 4; i++)
>> > +		if (rte_lpm_lookup(lpm, ((uint32_t *) &ip)[i], &hop[i]) < 0)
>> > +			hop[i] = defv; /* lookupx4 expected to set on failure
>*/ }
>> >
>> > -#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64) -#include
>> > "rte_lpm_neon.h"
>> > -#elif defined(RTE_ARCH_PPC_64)
>> > -#include "rte_lpm_altivec.h"
>> > -#else
>> > -#include "rte_lpm_sse.h"
>> > -#endif
>> > +/**
>> > + * Lookup four IP addresses in an LPM table.
>> > + *
>> > + * @param lpm
>> > + *   LPM object handle
>> > + * @param ip
>> > + *   Four IPs to be looked up in the LPM table
>> > + * @param hop
>> > + *   Next hop of the most specific rule found for IP (valid on lookup hit
>only).
>> > + *   This is an 4 elements array of two byte values.
>> > + *   If the lookup was successful for the given IP, then least significant
>byte
>> > + *   of the corresponding element is the  actual next hop and the most
>> > + *   significant byte is zero.
>> > + *   If the lookup for the given IP failed, then corresponding element
>would
>> > + *   contain default value, see description of then next parameter.
>> > + * @param defv
>> > + *   Default value to populate into corresponding element of hop[] array,
>> > + *   if lookup would fail.
>> > + */
>> > +static inline void
>> > +rte_lpm_lookupx4(struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
>> > +	uint32_t defv)
>> > +{
>> > +	if (rte_get_max_simd_bitwidth() >= RTE_MAX_128_SIMD)
>> > +		rte_lpm_lookupx4_vec(lpm, ip, hop, defv);
>> > +	else
>> > +		rte_lpm_lookupx4_scalar(lpm, ip, hop, defv); }
>>
>> I'm afraid this will lead to a drop in performance. rte_lpm_lookupx4
>> is used in the hot path, and a bulk size is too small to amortize the
>> cost of adding this extra logic.
>
>I do share Vladimir's concern regarding performance here.
>As I said in other mail - it seems not much point to insert these checks into
>inline SSE specific function, as SSE is enabled by default for all x86 builds.
>

The performance impact is quite small, thanks Vladimir for providing these results:

before patches:
	LPM LookupX4: 25.1 cycles (fails = 12.5%)
	LPM LookupX4: 25.2 cycles (fails = 12.5%)
	LPM LookupX4: 25.2 cycles (fails = 12.5%)
 
v3:
	LPM LookupX4: 26.2 cycles (fails = 12.5%)
	LPM LookupX4: 26.2 cycles (fails = 12.5%)
	LPM LookupX4: 26.2 cycles (fails = 12.5%)
 
v4: 
Note: I haven't sent this publicly yet, modified v3 slightly to check the bitwidth
in LPM create and set a flag that is used in lookupx4 to choose either vector or scalar function.  
	LPM LookupX4: 25.5 cycles (fails = 12.5%)
	LPM LookupX4: 25.5 cycles (fails = 12.5%)
	LPM LookupX4: 25.5 cycles (fails = 12.5%)


Thanks,
Ciara

>As another more generic thought - might be better to avoid these checks in
>other public SIMD-specific inline functions (if any).
>If such function get called from some .c, then at least such SIMD ISA is
>already enabled for that .c file and I think this check should be
>left for caller to do.
>
>> >
>> >   #ifdef __cplusplus
>> >   }
>> > diff --git a/lib/librte_lpm/rte_lpm_altivec.h
>> > b/lib/librte_lpm/rte_lpm_altivec.h
>> > index 228c41b38e..82142d3351 100644
>> > --- a/lib/librte_lpm/rte_lpm_altivec.h
>> > +++ b/lib/librte_lpm/rte_lpm_altivec.h
>> > @@ -16,7 +16,7 @@ extern "C" {
>> >   #endif
>> >
>> >   static inline void
>> > -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t
>> > hop[4],
>> > +rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t
>> > +hop[4],
>> >   	uint32_t defv)
>> >   {
>> >   	vector signed int i24;
>> > diff --git a/lib/librte_lpm/rte_lpm_neon.h
>> > b/lib/librte_lpm/rte_lpm_neon.h index 6c131d3125..14b184515d 100644
>> > --- a/lib/librte_lpm/rte_lpm_neon.h
>> > +++ b/lib/librte_lpm/rte_lpm_neon.h
>> > @@ -16,7 +16,7 @@ extern "C" {
>> >   #endif
>> >
>> >   static inline void
>> > -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t
>> > hop[4],
>> > +rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t
>> > +hop[4],
>> >   	uint32_t defv)
>> >   {
>> >   	uint32x4_t i24;
>> > diff --git a/lib/librte_lpm/rte_lpm_sse.h
>> > b/lib/librte_lpm/rte_lpm_sse.h index 44770b6ff8..cb5477c6cf 100644
>> > --- a/lib/librte_lpm/rte_lpm_sse.h
>> > +++ b/lib/librte_lpm/rte_lpm_sse.h
>> > @@ -15,7 +15,7 @@ extern "C" {
>> >   #endif
>> >
>> >   static inline void
>> > -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t
>> > hop[4],
>> > +rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t
>> > +hop[4],
>> >   	uint32_t defv)
>> >   {
>> >   	__m128i i24;
>> >
>>
>> --
>> Regards,
>> Vladimir

Ananyev, Konstantin Oct. 11, 2020, 10:49 p.m. UTC | #6

Hi Ciara,

> Hi Konstantin,
> 
> 
> >-----Original Message-----
> >From: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> >Sent: Thursday 8 October 2020 15:40
> >To: Medvedkin, Vladimir <vladimir.medvedkin@intel.com>; Power, Ciara
> ><ciara.power@intel.com>; dev@dpdk.org
> >Cc: Richardson, Bruce <bruce.richardson@intel.com>; Jerin Jacob
> ><jerinj@marvell.com>; Ruifeng Wang <ruifeng.wang@arm.com>
> >Subject: RE: [dpdk-dev] [PATCH v3 18/18] lpm: choose vector path at runtime
> >
> >>
> >> Hi Ciara,
> >>
> >>
> >> On 30/09/2020 14:04, Ciara Power wrote:
> >> > When choosing the vector path, max SIMD bitwidth is now checked to
> >> > ensure a vector path is allowable. To do this, rather than the
> >> > vector lookup functions being called directly from apps, a generic
> >> > lookup function is called which will call the vector functions if suitable.
> >> >
> >> > Signed-off-by: Ciara Power <ciara.power@intel.com>
> >> > ---
> >> >   lib/librte_lpm/rte_lpm.h         | 57 ++++++++++++++++++++++++++------
> >> >   lib/librte_lpm/rte_lpm_altivec.h |  2 +-
> >> >   lib/librte_lpm/rte_lpm_neon.h    |  2 +-
> >> >   lib/librte_lpm/rte_lpm_sse.h     |  2 +-
> >> >   4 files changed, 50 insertions(+), 13 deletions(-)
> >> >
> >> > diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
> >> > index 03da2d37e0..edba7cafd5 100644
> >> > --- a/lib/librte_lpm/rte_lpm.h
> >> > +++ b/lib/librte_lpm/rte_lpm.h
> >> > @@ -397,8 +397,18 @@ rte_lpm_lookup_bulk_func(const struct rte_lpm
> >*lpm, const uint32_t *ips,
> >> >   /* Mask four results. */
> >> >   #define	 RTE_LPM_MASKX4_RES	UINT64_C(0x00ffffff00ffffff)
> >> >
> >> > +#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64) #include
> >> > +"rte_lpm_neon.h"
> >> > +#elif defined(RTE_ARCH_PPC_64)
> >> > +#include "rte_lpm_altivec.h"
> >> > +#else
> >> > +#include "rte_lpm_sse.h"
> >> > +#endif
> >> > +
> >> >   /**
> >> > - * Lookup four IP addresses in an LPM table.
> >> > + * Lookup four IP addresses in an LPM table individually by calling
> >> > + the
> >> > + * lookup function for each ip. This is used when lookupx4 is
> >> > + called but
> >> > + * the vector path is not suitable.
> >> >    *
> >> >    * @param lpm
> >> >    *   LPM object handle
> >> > @@ -417,16 +427,43 @@ rte_lpm_lookup_bulk_func(const struct
> >rte_lpm *lpm, const uint32_t *ips,
> >> >    *   if lookup would fail.
> >> >    */
> >> >   static inline void
> >> > -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> >> > -	uint32_t defv);
> >> > +rte_lpm_lookupx4_scalar(struct rte_lpm *lpm, xmm_t ip, uint32_t
> >hop[4],
> >> > +	uint32_t defv)
> >> > +{
> >> > +	int i;
> >> > +	for (i = 0; i < 4; i++)
> >> > +		if (rte_lpm_lookup(lpm, ((uint32_t *) &ip)[i], &hop[i]) < 0)
> >> > +			hop[i] = defv; /* lookupx4 expected to set on failure
> >*/ }
> >> >
> >> > -#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64) -#include
> >> > "rte_lpm_neon.h"
> >> > -#elif defined(RTE_ARCH_PPC_64)
> >> > -#include "rte_lpm_altivec.h"
> >> > -#else
> >> > -#include "rte_lpm_sse.h"
> >> > -#endif
> >> > +/**
> >> > + * Lookup four IP addresses in an LPM table.
> >> > + *
> >> > + * @param lpm
> >> > + *   LPM object handle
> >> > + * @param ip
> >> > + *   Four IPs to be looked up in the LPM table
> >> > + * @param hop
> >> > + *   Next hop of the most specific rule found for IP (valid on lookup hit
> >only).
> >> > + *   This is an 4 elements array of two byte values.
> >> > + *   If the lookup was successful for the given IP, then least significant
> >byte
> >> > + *   of the corresponding element is the  actual next hop and the most
> >> > + *   significant byte is zero.
> >> > + *   If the lookup for the given IP failed, then corresponding element
> >would
> >> > + *   contain default value, see description of then next parameter.
> >> > + * @param defv
> >> > + *   Default value to populate into corresponding element of hop[] array,
> >> > + *   if lookup would fail.
> >> > + */
> >> > +static inline void
> >> > +rte_lpm_lookupx4(struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
> >> > +	uint32_t defv)
> >> > +{
> >> > +	if (rte_get_max_simd_bitwidth() >= RTE_MAX_128_SIMD)
> >> > +		rte_lpm_lookupx4_vec(lpm, ip, hop, defv);
> >> > +	else
> >> > +		rte_lpm_lookupx4_scalar(lpm, ip, hop, defv); }
> >>
> >> I'm afraid this will lead to a drop in performance. rte_lpm_lookupx4
> >> is used in the hot path, and a bulk size is too small to amortize the
> >> cost of adding this extra logic.
> >
> >I do share Vladimir's concern regarding performance here.
> >As I said in other mail - it seems not much point to insert these checks into
> >inline SSE specific function, as SSE is enabled by default for all x86 builds.
> >
> 
> The performance impact is quite small, thanks Vladimir for providing these results:
> 
> before patches:
> 	LPM LookupX4: 25.1 cycles (fails = 12.5%)
> 	LPM LookupX4: 25.2 cycles (fails = 12.5%)
> 	LPM LookupX4: 25.2 cycles (fails = 12.5%)
> 
> v3:
> 	LPM LookupX4: 26.2 cycles (fails = 12.5%)
> 	LPM LookupX4: 26.2 cycles (fails = 12.5%)
> 	LPM LookupX4: 26.2 cycles (fails = 12.5%) 

Yes, perf difference is surprisingly small...
Wonder what tests did you use for that?
I'd expect that on l3fwd it would be more noticeable,
especially on machines with low-end cpus. 

> v4:
> Note: I haven't sent this publicly yet, modified v3 slightly to check the bitwidth
> in LPM create and set a flag that is used in lookupx4 to choose either vector or scalar function.

Yes, avoiding function call will definitely help here.
Though I am sill not convinced we have to make such checks in that function at all
(and other inline functions).
Inline functions will be compiled within user code and their behaviour should be controlled
together with the rest of user code.
Let say  in l3fwd for IA rte_lpm_lookupx4 is called from /l3fwd_lpm_sse.h,
which as name implies is supposed to be build and used with SSE enabled.
If we'd like l3fwd to obey 'max-simd-width' parameter, then it needs to be done
somewhere at startup, when behaviour is selected, not inside every possible inline function
that does use SSE instrincts.

> 	LPM LookupX4: 25.5 cycles (fails = 12.5%)
> 	LPM LookupX4: 25.5 cycles (fails = 12.5%)
> 	LPM LookupX4: 25.5 cycles (fails = 12.5%)
> 
> 
> Thanks,
> Ciara
> 
> >As another more generic thought - might be better to avoid these checks in
> >other public SIMD-specific inline functions (if any).
> >If such function get called from some .c, then at least such SIMD ISA is
> >already enabled for that .c file and I think this check should be
> >left for caller to do.
> >
> >> >
> >> >   #ifdef __cplusplus
> >> >   }
> >> > diff --git a/lib/librte_lpm/rte_lpm_altivec.h
> >> > b/lib/librte_lpm/rte_lpm_altivec.h
> >> > index 228c41b38e..82142d3351 100644
> >> > --- a/lib/librte_lpm/rte_lpm_altivec.h
> >> > +++ b/lib/librte_lpm/rte_lpm_altivec.h
> >> > @@ -16,7 +16,7 @@ extern "C" {
> >> >   #endif
> >> >
> >> >   static inline void
> >> > -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t
> >> > hop[4],
> >> > +rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t
> >> > +hop[4],
> >> >   	uint32_t defv)
> >> >   {
> >> >   	vector signed int i24;
> >> > diff --git a/lib/librte_lpm/rte_lpm_neon.h
> >> > b/lib/librte_lpm/rte_lpm_neon.h index 6c131d3125..14b184515d 100644
> >> > --- a/lib/librte_lpm/rte_lpm_neon.h
> >> > +++ b/lib/librte_lpm/rte_lpm_neon.h
> >> > @@ -16,7 +16,7 @@ extern "C" {
> >> >   #endif
> >> >
> >> >   static inline void
> >> > -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t
> >> > hop[4],
> >> > +rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t
> >> > +hop[4],
> >> >   	uint32_t defv)
> >> >   {
> >> >   	uint32x4_t i24;
> >> > diff --git a/lib/librte_lpm/rte_lpm_sse.h
> >> > b/lib/librte_lpm/rte_lpm_sse.h index 44770b6ff8..cb5477c6cf 100644
> >> > --- a/lib/librte_lpm/rte_lpm_sse.h
> >> > +++ b/lib/librte_lpm/rte_lpm_sse.h
> >> > @@ -15,7 +15,7 @@ extern "C" {
> >> >   #endif
> >> >
> >> >   static inline void
> >> > -rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t
> >> > hop[4],
> >> > +rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t
> >> > +hop[4],
> >> >   	uint32_t defv)
> >> >   {
> >> >   	__m128i i24;
> >> >
> >>
> >> --
> >> Regards,
> >> Vladimir

diff mbox series

Patch

diff --git a/lib/librte_lpm/rte_lpm.h b/lib/librte_lpm/rte_lpm.h
index 03da2d37e0..edba7cafd5 100644
--- a/lib/librte_lpm/rte_lpm.h
+++ b/lib/librte_lpm/rte_lpm.h
@@ -397,8 +397,18 @@  rte_lpm_lookup_bulk_func(const struct rte_lpm *lpm, const uint32_t *ips,
 /* Mask four results. */
 #define	 RTE_LPM_MASKX4_RES	UINT64_C(0x00ffffff00ffffff)
 
+#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
+#include "rte_lpm_neon.h"
+#elif defined(RTE_ARCH_PPC_64)
+#include "rte_lpm_altivec.h"
+#else
+#include "rte_lpm_sse.h"
+#endif
+
 /**
- * Lookup four IP addresses in an LPM table.
+ * Lookup four IP addresses in an LPM table individually by calling the
+ * lookup function for each ip. This is used when lookupx4 is called but
+ * the vector path is not suitable.
  *
  * @param lpm
  *   LPM object handle
@@ -417,16 +427,43 @@  rte_lpm_lookup_bulk_func(const struct rte_lpm *lpm, const uint32_t *ips,
  *   if lookup would fail.
  */
 static inline void
-rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
-	uint32_t defv);
+rte_lpm_lookupx4_scalar(struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
+	uint32_t defv)
+{
+	int i;
+	for (i = 0; i < 4; i++)
+		if (rte_lpm_lookup(lpm, ((uint32_t *) &ip)[i], &hop[i]) < 0)
+			hop[i] = defv; /* lookupx4 expected to set on failure */
+}
 
-#if defined(RTE_ARCH_ARM) || defined(RTE_ARCH_ARM64)
-#include "rte_lpm_neon.h"
-#elif defined(RTE_ARCH_PPC_64)
-#include "rte_lpm_altivec.h"
-#else
-#include "rte_lpm_sse.h"
-#endif
+/**
+ * Lookup four IP addresses in an LPM table.
+ *
+ * @param lpm
+ *   LPM object handle
+ * @param ip
+ *   Four IPs to be looked up in the LPM table
+ * @param hop
+ *   Next hop of the most specific rule found for IP (valid on lookup hit only).
+ *   This is an 4 elements array of two byte values.
+ *   If the lookup was successful for the given IP, then least significant byte
+ *   of the corresponding element is the  actual next hop and the most
+ *   significant byte is zero.
+ *   If the lookup for the given IP failed, then corresponding element would
+ *   contain default value, see description of then next parameter.
+ * @param defv
+ *   Default value to populate into corresponding element of hop[] array,
+ *   if lookup would fail.
+ */
+static inline void
+rte_lpm_lookupx4(struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
+	uint32_t defv)
+{
+	if (rte_get_max_simd_bitwidth() >= RTE_MAX_128_SIMD)
+		rte_lpm_lookupx4_vec(lpm, ip, hop, defv);
+	else
+		rte_lpm_lookupx4_scalar(lpm, ip, hop, defv);
+}
 
 #ifdef __cplusplus
 }
diff --git a/lib/librte_lpm/rte_lpm_altivec.h b/lib/librte_lpm/rte_lpm_altivec.h
index 228c41b38e..82142d3351 100644
--- a/lib/librte_lpm/rte_lpm_altivec.h
+++ b/lib/librte_lpm/rte_lpm_altivec.h
@@ -16,7 +16,7 @@  extern "C" {
 #endif
 
 static inline void
-rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
+rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
 	uint32_t defv)
 {
 	vector signed int i24;
diff --git a/lib/librte_lpm/rte_lpm_neon.h b/lib/librte_lpm/rte_lpm_neon.h
index 6c131d3125..14b184515d 100644
--- a/lib/librte_lpm/rte_lpm_neon.h
+++ b/lib/librte_lpm/rte_lpm_neon.h
@@ -16,7 +16,7 @@  extern "C" {
 #endif
 
 static inline void
-rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
+rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
 	uint32_t defv)
 {
 	uint32x4_t i24;
diff --git a/lib/librte_lpm/rte_lpm_sse.h b/lib/librte_lpm/rte_lpm_sse.h
index 44770b6ff8..cb5477c6cf 100644
--- a/lib/librte_lpm/rte_lpm_sse.h
+++ b/lib/librte_lpm/rte_lpm_sse.h
@@ -15,7 +15,7 @@  extern "C" {
 #endif
 
 static inline void
-rte_lpm_lookupx4(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
+rte_lpm_lookupx4_vec(const struct rte_lpm *lpm, xmm_t ip, uint32_t hop[4],
 	uint32_t defv)
 {
 	__m128i i24;