[v5,16/17] net: add checks for max SIMD bitwidth

Message ID 20201013110437.309110-17-ciara.power@intel.com (mailing list archive)
State Superseded, archived
Delegated to: David Marchand
Headers
Series add max SIMD bitwidth to EAL |

Checks

Context Check Description
ci/checkpatch success coding style OK

Commit Message

Power, Ciara Oct. 13, 2020, 11:04 a.m. UTC
  When choosing a vector path to take, an extra condition must be
satisfied to ensure the max SIMD bitwidth allows for the CPU enabled
path.

The vector path was initially chosen in RTE_INIT, however this is no
longer suitable as we cannot check the max SIMD bitwidth at that time.
Default handlers are now chosen in RTE_INIT, these default handlers
are used the first time the crc calc is called, and they set the suitable
handlers to be used going forward.

Suggested-by: Jasvinder Singh <jasvinder.singh@intel.com>
Suggested-by: Olivier Matz <olivier.matz@6wind.com>

Signed-off-by: Ciara Power <ciara.power@intel.com>

---
v4:
  - Added default handlers to be set at RTE_INIT time, rather than
    choosing scalar handlers.
  - Modified logging.
  - Updated enum name.
v3:
  - Moved choosing vector paths out of RTE_INIT.
  - Moved checking max_simd_bitwidth into the set_alg function.
---
 lib/librte_net/rte_net_crc.c | 75 ++++++++++++++++++++++++++++++------
 lib/librte_net/rte_net_crc.h |  8 ++++
 2 files changed, 72 insertions(+), 11 deletions(-)
  

Comments

Olivier Matz Oct. 13, 2020, 11:32 a.m. UTC | #1
Hi Ciara,

On Tue, Oct 13, 2020 at 12:04:36PM +0100, Ciara Power wrote:
> When choosing a vector path to take, an extra condition must be
> satisfied to ensure the max SIMD bitwidth allows for the CPU enabled
> path.
> 
> The vector path was initially chosen in RTE_INIT, however this is no
> longer suitable as we cannot check the max SIMD bitwidth at that time.
> Default handlers are now chosen in RTE_INIT, these default handlers
> are used the first time the crc calc is called, and they set the suitable
> handlers to be used going forward.
> 
> Suggested-by: Jasvinder Singh <jasvinder.singh@intel.com>
> Suggested-by: Olivier Matz <olivier.matz@6wind.com>
> 
> Signed-off-by: Ciara Power <ciara.power@intel.com>
> 

[...]

> --- a/lib/librte_net/rte_net_crc.h
> +++ b/lib/librte_net/rte_net_crc.h
> @@ -7,6 +7,8 @@
>  
>  #include <stdint.h>
>  
> +#include <rte_log.h>
> +
>  #ifdef __cplusplus
>  extern "C" {
>  #endif
> @@ -25,6 +27,12 @@ enum rte_net_crc_alg {
>  	RTE_NET_CRC_NEON,
>  };
>  
> +extern int libnet_logtype;
> +
> +#define NET_LOG(level, fmt, args...)					\
> +	rte_log(RTE_LOG_ ## level, libnet_logtype, "%s(): " fmt "\n",	\
> +		__func__, ## args)
> +
>  /**
>   * This API set the CRC computation algorithm (i.e. scalar version,
>   * x86 64-bit sse4.2 intrinsic version, etc.) and internal data

We should expose this log macro and log type in a public header file. It
can stay in the .c file. In the future, we may want to expose it in a
private header, but not in a public API.

Regards,
Olivier
  
Ananyev, Konstantin Oct. 13, 2020, 1:07 p.m. UTC | #2
> When choosing a vector path to take, an extra condition must be
> satisfied to ensure the max SIMD bitwidth allows for the CPU enabled
> path.
> 
> The vector path was initially chosen in RTE_INIT, however this is no
> longer suitable as we cannot check the max SIMD bitwidth at that time.
> Default handlers are now chosen in RTE_INIT, these default handlers
> are used the first time the crc calc is called, and they set the suitable
> handlers to be used going forward.
> 
> Suggested-by: Jasvinder Singh <jasvinder.singh@intel.com>
> Suggested-by: Olivier Matz <olivier.matz@6wind.com>
> 
> Signed-off-by: Ciara Power <ciara.power@intel.com>
> 
> ---
> v4:
>   - Added default handlers to be set at RTE_INIT time, rather than
>     choosing scalar handlers.
>   - Modified logging.
>   - Updated enum name.
> v3:
>   - Moved choosing vector paths out of RTE_INIT.
>   - Moved checking max_simd_bitwidth into the set_alg function.
> ---
>  lib/librte_net/rte_net_crc.c | 75 ++++++++++++++++++++++++++++++------
>  lib/librte_net/rte_net_crc.h |  8 ++++
>  2 files changed, 72 insertions(+), 11 deletions(-)
> 
> diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c
> index 4f5b9e8286..11d0161a32 100644
> --- a/lib/librte_net/rte_net_crc.c
> +++ b/lib/librte_net/rte_net_crc.c
> @@ -9,6 +9,7 @@
>  #include <rte_cpuflags.h>
>  #include <rte_common.h>
>  #include <rte_net_crc.h>
> +#include <rte_eal.h>
> 
>  #if defined(RTE_ARCH_X86_64) && defined(__PCLMUL__)
>  #define X86_64_SSE42_PCLMULQDQ     1
> @@ -32,6 +33,12 @@
>  static uint32_t crc32_eth_lut[CRC_LUT_SIZE];
>  static uint32_t crc16_ccitt_lut[CRC_LUT_SIZE];
> 
> +static uint32_t
> +rte_crc16_ccitt_default_handler(const uint8_t *data, uint32_t data_len);
> +
> +static uint32_t
> +rte_crc32_eth_default_handler(const uint8_t *data, uint32_t data_len);
> +
>  static uint32_t
>  rte_crc16_ccitt_handler(const uint8_t *data, uint32_t data_len);
> 
> @@ -41,7 +48,12 @@ rte_crc32_eth_handler(const uint8_t *data, uint32_t data_len);
>  typedef uint32_t
>  (*rte_net_crc_handler)(const uint8_t *data, uint32_t data_len);
> 
> -static rte_net_crc_handler *handlers;
> +static rte_net_crc_handler handlers_default[] = {
> +	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_default_handler,
> +	[RTE_NET_CRC32_ETH] = rte_crc32_eth_default_handler,
> +};
> +
> +static rte_net_crc_handler *handlers = handlers_default;
> 
>  static rte_net_crc_handler handlers_scalar[] = {
>  	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_handler,
> @@ -60,6 +72,9 @@ static rte_net_crc_handler handlers_neon[] = {
>  };
>  #endif
> 
> +static uint16_t max_simd_bitwidth;
> +RTE_LOG_REGISTER(libnet_logtype, lib.net, INFO);
> +
>  /**
>   * Reflect the bits about the middle
>   *
> @@ -112,6 +127,42 @@ crc32_eth_calc_lut(const uint8_t *data,
>  	return crc;
>  }
> 
> +static uint32_t
> +rte_crc16_ccitt_default_handler(const uint8_t *data, uint32_t data_len)
> +{
> +	if (max_simd_bitwidth == 0)
> +		max_simd_bitwidth = rte_get_max_simd_bitwidth();
> +	handlers = handlers_scalar;
> +#ifdef X86_64_SSE42_PCLMULQDQ
> +	if (max_simd_bitwidth >= RTE_SIMD_128)
> +		handlers = handlers_sse42;
> +#endif
> +#ifdef ARM64_NEON_PMULL
> +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL) &&
> +			max_simd_bitwidth >= RTE_SIMD_128) {
> +		handlers = handlers_neon;
> +#endif

You probably don't want to make all these checks for *every* invocation
of that function. I think it would be better:
if (ma_simd_bitwidth == 0) {....}
return handlers[..](...);

BTW, while it allows us to use best possible handler,
such approach means extra indirect call(/jump) anyway.
Hard to say off-hand would it affect performance,
and if yes how significantly.
Couldn't find any perf tests in our UT for it...

> +	return handlers[RTE_NET_CRC16_CCITT](data, data_len);
> +}
> +
> +static uint32_t
> +rte_crc32_eth_default_handler(const uint8_t *data, uint32_t data_len)
> +{
> +	if (max_simd_bitwidth == 0)
> +		max_simd_bitwidth = rte_get_max_simd_bitwidth();
> +	handlers = handlers_scalar;
> +#ifdef X86_64_SSE42_PCLMULQDQ
> +	if (max_simd_bitwidth >= RTE_SIMD_128)
> +		handlers = handlers_sse42;
> +#endif
> +#ifdef ARM64_NEON_PMULL
> +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL) &&
> +			max_simd_bitwidth >= RTE_SIMD_128) {
> +		handlers = handlers_neon;
> +#endif
> +	return handlers[RTE_NET_CRC32_ETH](data, data_len);
> +}
> +
>  static void
>  rte_net_crc_scalar_init(void)
>  {
> @@ -145,18 +196,26 @@ rte_crc32_eth_handler(const uint8_t *data, uint32_t data_len)
>  void
>  rte_net_crc_set_alg(enum rte_net_crc_alg alg)
>  {
> +	if (max_simd_bitwidth == 0)
> +		max_simd_bitwidth = rte_get_max_simd_bitwidth();
> +
>  	switch (alg) {
>  #ifdef X86_64_SSE42_PCLMULQDQ
>  	case RTE_NET_CRC_SSE42:
> -		handlers = handlers_sse42;
> -		break;
> +		if (max_simd_bitwidth >= RTE_SIMD_128) {
> +			handlers = handlers_sse42;
> +			return;
> +		}
> +		NET_LOG(INFO, "Max SIMD Bitwidth too low, can't use SSE\n");
>  #elif defined ARM64_NEON_PMULL
>  		/* fall-through */
>  	case RTE_NET_CRC_NEON:
> -		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
> +		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL) &&
> +				max_simd_bitwidth >= RTE_SIMD_128) {
>  			handlers = handlers_neon;
> -			break;
> +			return;
>  		}
> +		NET_LOG(INFO, "Max SIMD Bitwidth too low or CPU flag not enabled, can't use NEON\n");
>  #endif
>  		/* fall-through */
>  	case RTE_NET_CRC_SCALAR:
> @@ -184,19 +243,13 @@ rte_net_crc_calc(const void *data,
>  /* Select highest available crc algorithm as default one */
>  RTE_INIT(rte_net_crc_init)
>  {
> -	enum rte_net_crc_alg alg = RTE_NET_CRC_SCALAR;
> -
>  	rte_net_crc_scalar_init();
> 
>  #ifdef X86_64_SSE42_PCLMULQDQ
> -	alg = RTE_NET_CRC_SSE42;
>  	rte_net_crc_sse42_init();
>  #elif defined ARM64_NEON_PMULL
>  	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
> -		alg = RTE_NET_CRC_NEON;
>  		rte_net_crc_neon_init();
>  	}
>  #endif
> -
> -	rte_net_crc_set_alg(alg);
>  }
> diff --git a/lib/librte_net/rte_net_crc.h b/lib/librte_net/rte_net_crc.h
> index 16e85ca970..c942865ecf 100644
> --- a/lib/librte_net/rte_net_crc.h
> +++ b/lib/librte_net/rte_net_crc.h
> @@ -7,6 +7,8 @@
> 
>  #include <stdint.h>
> 
> +#include <rte_log.h>
> +
>  #ifdef __cplusplus
>  extern "C" {
>  #endif
> @@ -25,6 +27,12 @@ enum rte_net_crc_alg {
>  	RTE_NET_CRC_NEON,
>  };
> 
> +extern int libnet_logtype;
> +
> +#define NET_LOG(level, fmt, args...)					\
> +	rte_log(RTE_LOG_ ## level, libnet_logtype, "%s(): " fmt "\n",	\
> +		__func__, ## args)
> +
>  /**
>   * This API set the CRC computation algorithm (i.e. scalar version,
>   * x86 64-bit sse4.2 intrinsic version, etc.) and internal data
> --
> 2.22.0
  
Ananyev, Konstantin Oct. 13, 2020, 1:25 p.m. UTC | #3
> 
> > When choosing a vector path to take, an extra condition must be
> > satisfied to ensure the max SIMD bitwidth allows for the CPU enabled
> > path.
> >
> > The vector path was initially chosen in RTE_INIT, however this is no
> > longer suitable as we cannot check the max SIMD bitwidth at that time.
> > Default handlers are now chosen in RTE_INIT, these default handlers
> > are used the first time the crc calc is called, and they set the suitable
> > handlers to be used going forward.
> >
> > Suggested-by: Jasvinder Singh <jasvinder.singh@intel.com>
> > Suggested-by: Olivier Matz <olivier.matz@6wind.com>
> >
> > Signed-off-by: Ciara Power <ciara.power@intel.com>
> >
> > ---
> > v4:
> >   - Added default handlers to be set at RTE_INIT time, rather than
> >     choosing scalar handlers.
> >   - Modified logging.
> >   - Updated enum name.
> > v3:
> >   - Moved choosing vector paths out of RTE_INIT.
> >   - Moved checking max_simd_bitwidth into the set_alg function.
> > ---
> >  lib/librte_net/rte_net_crc.c | 75 ++++++++++++++++++++++++++++++------
> >  lib/librte_net/rte_net_crc.h |  8 ++++
> >  2 files changed, 72 insertions(+), 11 deletions(-)
> >
> > diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c
> > index 4f5b9e8286..11d0161a32 100644
> > --- a/lib/librte_net/rte_net_crc.c
> > +++ b/lib/librte_net/rte_net_crc.c
> > @@ -9,6 +9,7 @@
> >  #include <rte_cpuflags.h>
> >  #include <rte_common.h>
> >  #include <rte_net_crc.h>
> > +#include <rte_eal.h>
> >
> >  #if defined(RTE_ARCH_X86_64) && defined(__PCLMUL__)
> >  #define X86_64_SSE42_PCLMULQDQ     1
> > @@ -32,6 +33,12 @@
> >  static uint32_t crc32_eth_lut[CRC_LUT_SIZE];
> >  static uint32_t crc16_ccitt_lut[CRC_LUT_SIZE];
> >
> > +static uint32_t
> > +rte_crc16_ccitt_default_handler(const uint8_t *data, uint32_t data_len);
> > +
> > +static uint32_t
> > +rte_crc32_eth_default_handler(const uint8_t *data, uint32_t data_len);
> > +
> >  static uint32_t
> >  rte_crc16_ccitt_handler(const uint8_t *data, uint32_t data_len);
> >
> > @@ -41,7 +48,12 @@ rte_crc32_eth_handler(const uint8_t *data, uint32_t data_len);
> >  typedef uint32_t
> >  (*rte_net_crc_handler)(const uint8_t *data, uint32_t data_len);
> >
> > -static rte_net_crc_handler *handlers;
> > +static rte_net_crc_handler handlers_default[] = {
> > +	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_default_handler,
> > +	[RTE_NET_CRC32_ETH] = rte_crc32_eth_default_handler,
> > +};
> > +
> > +static rte_net_crc_handler *handlers = handlers_default;
> >
> >  static rte_net_crc_handler handlers_scalar[] = {
> >  	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_handler,
> > @@ -60,6 +72,9 @@ static rte_net_crc_handler handlers_neon[] = {
> >  };
> >  #endif
> >
> > +static uint16_t max_simd_bitwidth;
> > +RTE_LOG_REGISTER(libnet_logtype, lib.net, INFO);
> > +
> >  /**
> >   * Reflect the bits about the middle
> >   *
> > @@ -112,6 +127,42 @@ crc32_eth_calc_lut(const uint8_t *data,
> >  	return crc;
> >  }
> >
> > +static uint32_t
> > +rte_crc16_ccitt_default_handler(const uint8_t *data, uint32_t data_len)
> > +{
> > +	if (max_simd_bitwidth == 0)
> > +		max_simd_bitwidth = rte_get_max_simd_bitwidth();
> > +	handlers = handlers_scalar;
> > +#ifdef X86_64_SSE42_PCLMULQDQ
> > +	if (max_simd_bitwidth >= RTE_SIMD_128)
> > +		handlers = handlers_sse42;
> > +#endif
> > +#ifdef ARM64_NEON_PMULL
> > +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL) &&
> > +			max_simd_bitwidth >= RTE_SIMD_128) {
> > +		handlers = handlers_neon;
> > +#endif
> 
> You probably don't want to make all these checks for *every* invocation
> of that function. I think it would be better:
> if (ma_simd_bitwidth == 0) {....}
> return handlers[..](...);

As another thougth - it is probably a bit safer to update max_simd_bitwidht
after handler value update.

handler = ...; rte_smp_wmb(); max_simd_width = ...;

> 
> BTW, while it allows us to use best possible handler,
> such approach means extra indirect call(/jump) anyway.
> Hard to say off-hand would it affect performance,
> and if yes how significantly.
> Couldn't find any perf tests in our UT for it...
> 
> > +	return handlers[RTE_NET_CRC16_CCITT](data, data_len);
> > +}
> > +
> > +static uint32_t
> > +rte_crc32_eth_default_handler(const uint8_t *data, uint32_t data_len)
> > +{
> > +	if (max_simd_bitwidth == 0)
> > +		max_simd_bitwidth = rte_get_max_simd_bitwidth();
> > +	handlers = handlers_scalar;
> > +#ifdef X86_64_SSE42_PCLMULQDQ
> > +	if (max_simd_bitwidth >= RTE_SIMD_128)
> > +		handlers = handlers_sse42;
> > +#endif
> > +#ifdef ARM64_NEON_PMULL
> > +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL) &&
> > +			max_simd_bitwidth >= RTE_SIMD_128) {
> > +		handlers = handlers_neon;
> > +#endif
> > +	return handlers[RTE_NET_CRC32_ETH](data, data_len);
> > +}
> > +
> >  static void
> >  rte_net_crc_scalar_init(void)
> >  {
> > @@ -145,18 +196,26 @@ rte_crc32_eth_handler(const uint8_t *data, uint32_t data_len)
> >  void
> >  rte_net_crc_set_alg(enum rte_net_crc_alg alg)
> >  {
> > +	if (max_simd_bitwidth == 0)
> > +		max_simd_bitwidth = rte_get_max_simd_bitwidth();
> > +
> >  	switch (alg) {
> >  #ifdef X86_64_SSE42_PCLMULQDQ
> >  	case RTE_NET_CRC_SSE42:
> > -		handlers = handlers_sse42;
> > -		break;
> > +		if (max_simd_bitwidth >= RTE_SIMD_128) {
> > +			handlers = handlers_sse42;
> > +			return;
> > +		}
> > +		NET_LOG(INFO, "Max SIMD Bitwidth too low, can't use SSE\n");
> >  #elif defined ARM64_NEON_PMULL
> >  		/* fall-through */
> >  	case RTE_NET_CRC_NEON:
> > -		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
> > +		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL) &&
> > +				max_simd_bitwidth >= RTE_SIMD_128) {
> >  			handlers = handlers_neon;
> > -			break;
> > +			return;
> >  		}
> > +		NET_LOG(INFO, "Max SIMD Bitwidth too low or CPU flag not enabled, can't use NEON\n");
> >  #endif
> >  		/* fall-through */
> >  	case RTE_NET_CRC_SCALAR:
> > @@ -184,19 +243,13 @@ rte_net_crc_calc(const void *data,
> >  /* Select highest available crc algorithm as default one */
> >  RTE_INIT(rte_net_crc_init)
> >  {
> > -	enum rte_net_crc_alg alg = RTE_NET_CRC_SCALAR;
> > -
> >  	rte_net_crc_scalar_init();
> >
> >  #ifdef X86_64_SSE42_PCLMULQDQ
> > -	alg = RTE_NET_CRC_SSE42;
> >  	rte_net_crc_sse42_init();
> >  #elif defined ARM64_NEON_PMULL
> >  	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
> > -		alg = RTE_NET_CRC_NEON;
> >  		rte_net_crc_neon_init();
> >  	}
> >  #endif
> > -
> > -	rte_net_crc_set_alg(alg);
> >  }
> > diff --git a/lib/librte_net/rte_net_crc.h b/lib/librte_net/rte_net_crc.h
> > index 16e85ca970..c942865ecf 100644
> > --- a/lib/librte_net/rte_net_crc.h
> > +++ b/lib/librte_net/rte_net_crc.h
> > @@ -7,6 +7,8 @@
> >
> >  #include <stdint.h>
> >
> > +#include <rte_log.h>
> > +
> >  #ifdef __cplusplus
> >  extern "C" {
> >  #endif
> > @@ -25,6 +27,12 @@ enum rte_net_crc_alg {
> >  	RTE_NET_CRC_NEON,
> >  };
> >
> > +extern int libnet_logtype;
> > +
> > +#define NET_LOG(level, fmt, args...)					\
> > +	rte_log(RTE_LOG_ ## level, libnet_logtype, "%s(): " fmt "\n",	\
> > +		__func__, ## args)
> > +
> >  /**
> >   * This API set the CRC computation algorithm (i.e. scalar version,
> >   * x86 64-bit sse4.2 intrinsic version, etc.) and internal data
> > --
> > 2.22.0
  
Ananyev, Konstantin Oct. 13, 2020, 1:57 p.m. UTC | #4
> >
> > > When choosing a vector path to take, an extra condition must be
> > > satisfied to ensure the max SIMD bitwidth allows for the CPU enabled
> > > path.
> > >
> > > The vector path was initially chosen in RTE_INIT, however this is no
> > > longer suitable as we cannot check the max SIMD bitwidth at that time.
> > > Default handlers are now chosen in RTE_INIT, these default handlers
> > > are used the first time the crc calc is called, and they set the suitable
> > > handlers to be used going forward.
> > >
> > > Suggested-by: Jasvinder Singh <jasvinder.singh@intel.com>
> > > Suggested-by: Olivier Matz <olivier.matz@6wind.com>
> > >
> > > Signed-off-by: Ciara Power <ciara.power@intel.com>
> > >
> > > ---
> > > v4:
> > >   - Added default handlers to be set at RTE_INIT time, rather than
> > >     choosing scalar handlers.
> > >   - Modified logging.
> > >   - Updated enum name.
> > > v3:
> > >   - Moved choosing vector paths out of RTE_INIT.
> > >   - Moved checking max_simd_bitwidth into the set_alg function.
> > > ---
> > >  lib/librte_net/rte_net_crc.c | 75 ++++++++++++++++++++++++++++++------
> > >  lib/librte_net/rte_net_crc.h |  8 ++++
> > >  2 files changed, 72 insertions(+), 11 deletions(-)
> > >
> > > diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c
> > > index 4f5b9e8286..11d0161a32 100644
> > > --- a/lib/librte_net/rte_net_crc.c
> > > +++ b/lib/librte_net/rte_net_crc.c
> > > @@ -9,6 +9,7 @@
> > >  #include <rte_cpuflags.h>
> > >  #include <rte_common.h>
> > >  #include <rte_net_crc.h>
> > > +#include <rte_eal.h>
> > >
> > >  #if defined(RTE_ARCH_X86_64) && defined(__PCLMUL__)
> > >  #define X86_64_SSE42_PCLMULQDQ     1
> > > @@ -32,6 +33,12 @@
> > >  static uint32_t crc32_eth_lut[CRC_LUT_SIZE];
> > >  static uint32_t crc16_ccitt_lut[CRC_LUT_SIZE];
> > >
> > > +static uint32_t
> > > +rte_crc16_ccitt_default_handler(const uint8_t *data, uint32_t data_len);
> > > +
> > > +static uint32_t
> > > +rte_crc32_eth_default_handler(const uint8_t *data, uint32_t data_len);
> > > +
> > >  static uint32_t
> > >  rte_crc16_ccitt_handler(const uint8_t *data, uint32_t data_len);
> > >
> > > @@ -41,7 +48,12 @@ rte_crc32_eth_handler(const uint8_t *data, uint32_t data_len);
> > >  typedef uint32_t
> > >  (*rte_net_crc_handler)(const uint8_t *data, uint32_t data_len);
> > >
> > > -static rte_net_crc_handler *handlers;
> > > +static rte_net_crc_handler handlers_default[] = {
> > > +	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_default_handler,
> > > +	[RTE_NET_CRC32_ETH] = rte_crc32_eth_default_handler,
> > > +};
> > > +
> > > +static rte_net_crc_handler *handlers = handlers_default;
> > >
> > >  static rte_net_crc_handler handlers_scalar[] = {
> > >  	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_handler,
> > > @@ -60,6 +72,9 @@ static rte_net_crc_handler handlers_neon[] = {
> > >  };
> > >  #endif
> > >
> > > +static uint16_t max_simd_bitwidth;
> > > +RTE_LOG_REGISTER(libnet_logtype, lib.net, INFO);
> > > +
> > >  /**
> > >   * Reflect the bits about the middle
> > >   *
> > > @@ -112,6 +127,42 @@ crc32_eth_calc_lut(const uint8_t *data,
> > >  	return crc;
> > >  }
> > >
> > > +static uint32_t
> > > +rte_crc16_ccitt_default_handler(const uint8_t *data, uint32_t data_len)
> > > +{
> > > +	if (max_simd_bitwidth == 0)
> > > +		max_simd_bitwidth = rte_get_max_simd_bitwidth();
> > > +	handlers = handlers_scalar;
> > > +#ifdef X86_64_SSE42_PCLMULQDQ
> > > +	if (max_simd_bitwidth >= RTE_SIMD_128)
> > > +		handlers = handlers_sse42;
> > > +#endif
> > > +#ifdef ARM64_NEON_PMULL
> > > +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL) &&
> > > +			max_simd_bitwidth >= RTE_SIMD_128) {
> > > +		handlers = handlers_neon;
> > > +#endif
> >
> > You probably don't want to make all these checks for *every* invocation
> > of that function. I think it would be better:
> > if (ma_simd_bitwidth == 0) {....}
> > return handlers[..](...);
> 
> As another thougth - it is probably a bit safer to update max_simd_bitwidht
> after handler value update.
> 
> handler = ...; rte_smp_wmb(); max_simd_width = ...;
> 
> >
> > BTW, while it allows us to use best possible handler,
> > such approach means extra indirect call(/jump) anyway.

Ah sorry, that would only happen once, please ignore that one. 

> > Hard to say off-hand would it affect performance,
> > and if yes how significantly.
> > Couldn't find any perf tests in our UT for it...
> >
> > > +	return handlers[RTE_NET_CRC16_CCITT](data, data_len);
> > > +}
> > > +
> > > +static uint32_t
> > > +rte_crc32_eth_default_handler(const uint8_t *data, uint32_t data_len)
> > > +{
> > > +	if (max_simd_bitwidth == 0)
> > > +		max_simd_bitwidth = rte_get_max_simd_bitwidth();
> > > +	handlers = handlers_scalar;
> > > +#ifdef X86_64_SSE42_PCLMULQDQ
> > > +	if (max_simd_bitwidth >= RTE_SIMD_128)
> > > +		handlers = handlers_sse42;
> > > +#endif
> > > +#ifdef ARM64_NEON_PMULL
> > > +	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL) &&
> > > +			max_simd_bitwidth >= RTE_SIMD_128) {
> > > +		handlers = handlers_neon;
> > > +#endif
> > > +	return handlers[RTE_NET_CRC32_ETH](data, data_len);
> > > +}
> > > +
> > >  static void
> > >  rte_net_crc_scalar_init(void)
> > >  {
> > > @@ -145,18 +196,26 @@ rte_crc32_eth_handler(const uint8_t *data, uint32_t data_len)
> > >  void
> > >  rte_net_crc_set_alg(enum rte_net_crc_alg alg)
> > >  {
> > > +	if (max_simd_bitwidth == 0)
> > > +		max_simd_bitwidth = rte_get_max_simd_bitwidth();
> > > +
> > >  	switch (alg) {
> > >  #ifdef X86_64_SSE42_PCLMULQDQ
> > >  	case RTE_NET_CRC_SSE42:
> > > -		handlers = handlers_sse42;
> > > -		break;
> > > +		if (max_simd_bitwidth >= RTE_SIMD_128) {
> > > +			handlers = handlers_sse42;
> > > +			return;
> > > +		}
> > > +		NET_LOG(INFO, "Max SIMD Bitwidth too low, can't use SSE\n");
> > >  #elif defined ARM64_NEON_PMULL
> > >  		/* fall-through */
> > >  	case RTE_NET_CRC_NEON:
> > > -		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
> > > +		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL) &&
> > > +				max_simd_bitwidth >= RTE_SIMD_128) {
> > >  			handlers = handlers_neon;
> > > -			break;
> > > +			return;
> > >  		}
> > > +		NET_LOG(INFO, "Max SIMD Bitwidth too low or CPU flag not enabled, can't use NEON\n");
> > >  #endif
> > >  		/* fall-through */
> > >  	case RTE_NET_CRC_SCALAR:
> > > @@ -184,19 +243,13 @@ rte_net_crc_calc(const void *data,
> > >  /* Select highest available crc algorithm as default one */
> > >  RTE_INIT(rte_net_crc_init)
> > >  {
> > > -	enum rte_net_crc_alg alg = RTE_NET_CRC_SCALAR;
> > > -
> > >  	rte_net_crc_scalar_init();
> > >
> > >  #ifdef X86_64_SSE42_PCLMULQDQ
> > > -	alg = RTE_NET_CRC_SSE42;
> > >  	rte_net_crc_sse42_init();
> > >  #elif defined ARM64_NEON_PMULL
> > >  	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
> > > -		alg = RTE_NET_CRC_NEON;
> > >  		rte_net_crc_neon_init();
> > >  	}
> > >  #endif
> > > -
> > > -	rte_net_crc_set_alg(alg);
> > >  }
> > > diff --git a/lib/librte_net/rte_net_crc.h b/lib/librte_net/rte_net_crc.h
> > > index 16e85ca970..c942865ecf 100644
> > > --- a/lib/librte_net/rte_net_crc.h
> > > +++ b/lib/librte_net/rte_net_crc.h
> > > @@ -7,6 +7,8 @@
> > >
> > >  #include <stdint.h>
> > >
> > > +#include <rte_log.h>
> > > +
> > >  #ifdef __cplusplus
> > >  extern "C" {
> > >  #endif
> > > @@ -25,6 +27,12 @@ enum rte_net_crc_alg {
> > >  	RTE_NET_CRC_NEON,
> > >  };
> > >
> > > +extern int libnet_logtype;
> > > +
> > > +#define NET_LOG(level, fmt, args...)					\
> > > +	rte_log(RTE_LOG_ ## level, libnet_logtype, "%s(): " fmt "\n",	\
> > > +		__func__, ## args)
> > > +
> > >  /**
> > >   * This API set the CRC computation algorithm (i.e. scalar version,
> > >   * x86 64-bit sse4.2 intrinsic version, etc.) and internal data
> > > --
> > > 2.22.0
  

Patch

diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c
index 4f5b9e8286..11d0161a32 100644
--- a/lib/librte_net/rte_net_crc.c
+++ b/lib/librte_net/rte_net_crc.c
@@ -9,6 +9,7 @@ 
 #include <rte_cpuflags.h>
 #include <rte_common.h>
 #include <rte_net_crc.h>
+#include <rte_eal.h>
 
 #if defined(RTE_ARCH_X86_64) && defined(__PCLMUL__)
 #define X86_64_SSE42_PCLMULQDQ     1
@@ -32,6 +33,12 @@ 
 static uint32_t crc32_eth_lut[CRC_LUT_SIZE];
 static uint32_t crc16_ccitt_lut[CRC_LUT_SIZE];
 
+static uint32_t
+rte_crc16_ccitt_default_handler(const uint8_t *data, uint32_t data_len);
+
+static uint32_t
+rte_crc32_eth_default_handler(const uint8_t *data, uint32_t data_len);
+
 static uint32_t
 rte_crc16_ccitt_handler(const uint8_t *data, uint32_t data_len);
 
@@ -41,7 +48,12 @@  rte_crc32_eth_handler(const uint8_t *data, uint32_t data_len);
 typedef uint32_t
 (*rte_net_crc_handler)(const uint8_t *data, uint32_t data_len);
 
-static rte_net_crc_handler *handlers;
+static rte_net_crc_handler handlers_default[] = {
+	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_default_handler,
+	[RTE_NET_CRC32_ETH] = rte_crc32_eth_default_handler,
+};
+
+static rte_net_crc_handler *handlers = handlers_default;
 
 static rte_net_crc_handler handlers_scalar[] = {
 	[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_handler,
@@ -60,6 +72,9 @@  static rte_net_crc_handler handlers_neon[] = {
 };
 #endif
 
+static uint16_t max_simd_bitwidth;
+RTE_LOG_REGISTER(libnet_logtype, lib.net, INFO);
+
 /**
  * Reflect the bits about the middle
  *
@@ -112,6 +127,42 @@  crc32_eth_calc_lut(const uint8_t *data,
 	return crc;
 }
 
+static uint32_t
+rte_crc16_ccitt_default_handler(const uint8_t *data, uint32_t data_len)
+{
+	if (max_simd_bitwidth == 0)
+		max_simd_bitwidth = rte_get_max_simd_bitwidth();
+	handlers = handlers_scalar;
+#ifdef X86_64_SSE42_PCLMULQDQ
+	if (max_simd_bitwidth >= RTE_SIMD_128)
+		handlers = handlers_sse42;
+#endif
+#ifdef ARM64_NEON_PMULL
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL) &&
+			max_simd_bitwidth >= RTE_SIMD_128) {
+		handlers = handlers_neon;
+#endif
+	return handlers[RTE_NET_CRC16_CCITT](data, data_len);
+}
+
+static uint32_t
+rte_crc32_eth_default_handler(const uint8_t *data, uint32_t data_len)
+{
+	if (max_simd_bitwidth == 0)
+		max_simd_bitwidth = rte_get_max_simd_bitwidth();
+	handlers = handlers_scalar;
+#ifdef X86_64_SSE42_PCLMULQDQ
+	if (max_simd_bitwidth >= RTE_SIMD_128)
+		handlers = handlers_sse42;
+#endif
+#ifdef ARM64_NEON_PMULL
+	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL) &&
+			max_simd_bitwidth >= RTE_SIMD_128) {
+		handlers = handlers_neon;
+#endif
+	return handlers[RTE_NET_CRC32_ETH](data, data_len);
+}
+
 static void
 rte_net_crc_scalar_init(void)
 {
@@ -145,18 +196,26 @@  rte_crc32_eth_handler(const uint8_t *data, uint32_t data_len)
 void
 rte_net_crc_set_alg(enum rte_net_crc_alg alg)
 {
+	if (max_simd_bitwidth == 0)
+		max_simd_bitwidth = rte_get_max_simd_bitwidth();
+
 	switch (alg) {
 #ifdef X86_64_SSE42_PCLMULQDQ
 	case RTE_NET_CRC_SSE42:
-		handlers = handlers_sse42;
-		break;
+		if (max_simd_bitwidth >= RTE_SIMD_128) {
+			handlers = handlers_sse42;
+			return;
+		}
+		NET_LOG(INFO, "Max SIMD Bitwidth too low, can't use SSE\n");
 #elif defined ARM64_NEON_PMULL
 		/* fall-through */
 	case RTE_NET_CRC_NEON:
-		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+		if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL) &&
+				max_simd_bitwidth >= RTE_SIMD_128) {
 			handlers = handlers_neon;
-			break;
+			return;
 		}
+		NET_LOG(INFO, "Max SIMD Bitwidth too low or CPU flag not enabled, can't use NEON\n");
 #endif
 		/* fall-through */
 	case RTE_NET_CRC_SCALAR:
@@ -184,19 +243,13 @@  rte_net_crc_calc(const void *data,
 /* Select highest available crc algorithm as default one */
 RTE_INIT(rte_net_crc_init)
 {
-	enum rte_net_crc_alg alg = RTE_NET_CRC_SCALAR;
-
 	rte_net_crc_scalar_init();
 
 #ifdef X86_64_SSE42_PCLMULQDQ
-	alg = RTE_NET_CRC_SSE42;
 	rte_net_crc_sse42_init();
 #elif defined ARM64_NEON_PMULL
 	if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
-		alg = RTE_NET_CRC_NEON;
 		rte_net_crc_neon_init();
 	}
 #endif
-
-	rte_net_crc_set_alg(alg);
 }
diff --git a/lib/librte_net/rte_net_crc.h b/lib/librte_net/rte_net_crc.h
index 16e85ca970..c942865ecf 100644
--- a/lib/librte_net/rte_net_crc.h
+++ b/lib/librte_net/rte_net_crc.h
@@ -7,6 +7,8 @@ 
 
 #include <stdint.h>
 
+#include <rte_log.h>
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -25,6 +27,12 @@  enum rte_net_crc_alg {
 	RTE_NET_CRC_NEON,
 };
 
+extern int libnet_logtype;
+
+#define NET_LOG(level, fmt, args...)					\
+	rte_log(RTE_LOG_ ## level, libnet_logtype, "%s(): " fmt "\n",	\
+		__func__, ## args)
+
 /**
  * This API set the CRC computation algorithm (i.e. scalar version,
  * x86 64-bit sse4.2 intrinsic version, etc.) and internal data