[v5,16/17] net: add checks for max SIMD bitwidth
Checks
Commit Message
When choosing a vector path to take, an extra condition must be
satisfied to ensure the max SIMD bitwidth allows for the CPU enabled
path.
The vector path was initially chosen in RTE_INIT, however this is no
longer suitable as we cannot check the max SIMD bitwidth at that time.
Default handlers are now chosen in RTE_INIT, these default handlers
are used the first time the crc calc is called, and they set the suitable
handlers to be used going forward.
Suggested-by: Jasvinder Singh <jasvinder.singh@intel.com>
Suggested-by: Olivier Matz <olivier.matz@6wind.com>
Signed-off-by: Ciara Power <ciara.power@intel.com>
---
v4:
- Added default handlers to be set at RTE_INIT time, rather than
choosing scalar handlers.
- Modified logging.
- Updated enum name.
v3:
- Moved choosing vector paths out of RTE_INIT.
- Moved checking max_simd_bitwidth into the set_alg function.
---
lib/librte_net/rte_net_crc.c | 75 ++++++++++++++++++++++++++++++------
lib/librte_net/rte_net_crc.h | 8 ++++
2 files changed, 72 insertions(+), 11 deletions(-)
Comments
Hi Ciara,
On Tue, Oct 13, 2020 at 12:04:36PM +0100, Ciara Power wrote:
> When choosing a vector path to take, an extra condition must be
> satisfied to ensure the max SIMD bitwidth allows for the CPU enabled
> path.
>
> The vector path was initially chosen in RTE_INIT, however this is no
> longer suitable as we cannot check the max SIMD bitwidth at that time.
> Default handlers are now chosen in RTE_INIT, these default handlers
> are used the first time the crc calc is called, and they set the suitable
> handlers to be used going forward.
>
> Suggested-by: Jasvinder Singh <jasvinder.singh@intel.com>
> Suggested-by: Olivier Matz <olivier.matz@6wind.com>
>
> Signed-off-by: Ciara Power <ciara.power@intel.com>
>
[...]
> --- a/lib/librte_net/rte_net_crc.h
> +++ b/lib/librte_net/rte_net_crc.h
> @@ -7,6 +7,8 @@
>
> #include <stdint.h>
>
> +#include <rte_log.h>
> +
> #ifdef __cplusplus
> extern "C" {
> #endif
> @@ -25,6 +27,12 @@ enum rte_net_crc_alg {
> RTE_NET_CRC_NEON,
> };
>
> +extern int libnet_logtype;
> +
> +#define NET_LOG(level, fmt, args...) \
> + rte_log(RTE_LOG_ ## level, libnet_logtype, "%s(): " fmt "\n", \
> + __func__, ## args)
> +
> /**
> * This API set the CRC computation algorithm (i.e. scalar version,
> * x86 64-bit sse4.2 intrinsic version, etc.) and internal data
We should expose this log macro and log type in a public header file. It
can stay in the .c file. In the future, we may want to expose it in a
private header, but not in a public API.
Regards,
Olivier
> When choosing a vector path to take, an extra condition must be
> satisfied to ensure the max SIMD bitwidth allows for the CPU enabled
> path.
>
> The vector path was initially chosen in RTE_INIT, however this is no
> longer suitable as we cannot check the max SIMD bitwidth at that time.
> Default handlers are now chosen in RTE_INIT, these default handlers
> are used the first time the crc calc is called, and they set the suitable
> handlers to be used going forward.
>
> Suggested-by: Jasvinder Singh <jasvinder.singh@intel.com>
> Suggested-by: Olivier Matz <olivier.matz@6wind.com>
>
> Signed-off-by: Ciara Power <ciara.power@intel.com>
>
> ---
> v4:
> - Added default handlers to be set at RTE_INIT time, rather than
> choosing scalar handlers.
> - Modified logging.
> - Updated enum name.
> v3:
> - Moved choosing vector paths out of RTE_INIT.
> - Moved checking max_simd_bitwidth into the set_alg function.
> ---
> lib/librte_net/rte_net_crc.c | 75 ++++++++++++++++++++++++++++++------
> lib/librte_net/rte_net_crc.h | 8 ++++
> 2 files changed, 72 insertions(+), 11 deletions(-)
>
> diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c
> index 4f5b9e8286..11d0161a32 100644
> --- a/lib/librte_net/rte_net_crc.c
> +++ b/lib/librte_net/rte_net_crc.c
> @@ -9,6 +9,7 @@
> #include <rte_cpuflags.h>
> #include <rte_common.h>
> #include <rte_net_crc.h>
> +#include <rte_eal.h>
>
> #if defined(RTE_ARCH_X86_64) && defined(__PCLMUL__)
> #define X86_64_SSE42_PCLMULQDQ 1
> @@ -32,6 +33,12 @@
> static uint32_t crc32_eth_lut[CRC_LUT_SIZE];
> static uint32_t crc16_ccitt_lut[CRC_LUT_SIZE];
>
> +static uint32_t
> +rte_crc16_ccitt_default_handler(const uint8_t *data, uint32_t data_len);
> +
> +static uint32_t
> +rte_crc32_eth_default_handler(const uint8_t *data, uint32_t data_len);
> +
> static uint32_t
> rte_crc16_ccitt_handler(const uint8_t *data, uint32_t data_len);
>
> @@ -41,7 +48,12 @@ rte_crc32_eth_handler(const uint8_t *data, uint32_t data_len);
> typedef uint32_t
> (*rte_net_crc_handler)(const uint8_t *data, uint32_t data_len);
>
> -static rte_net_crc_handler *handlers;
> +static rte_net_crc_handler handlers_default[] = {
> + [RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_default_handler,
> + [RTE_NET_CRC32_ETH] = rte_crc32_eth_default_handler,
> +};
> +
> +static rte_net_crc_handler *handlers = handlers_default;
>
> static rte_net_crc_handler handlers_scalar[] = {
> [RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_handler,
> @@ -60,6 +72,9 @@ static rte_net_crc_handler handlers_neon[] = {
> };
> #endif
>
> +static uint16_t max_simd_bitwidth;
> +RTE_LOG_REGISTER(libnet_logtype, lib.net, INFO);
> +
> /**
> * Reflect the bits about the middle
> *
> @@ -112,6 +127,42 @@ crc32_eth_calc_lut(const uint8_t *data,
> return crc;
> }
>
> +static uint32_t
> +rte_crc16_ccitt_default_handler(const uint8_t *data, uint32_t data_len)
> +{
> + if (max_simd_bitwidth == 0)
> + max_simd_bitwidth = rte_get_max_simd_bitwidth();
> + handlers = handlers_scalar;
> +#ifdef X86_64_SSE42_PCLMULQDQ
> + if (max_simd_bitwidth >= RTE_SIMD_128)
> + handlers = handlers_sse42;
> +#endif
> +#ifdef ARM64_NEON_PMULL
> + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL) &&
> + max_simd_bitwidth >= RTE_SIMD_128) {
> + handlers = handlers_neon;
> +#endif
You probably don't want to make all these checks for *every* invocation
of that function. I think it would be better:
if (ma_simd_bitwidth == 0) {....}
return handlers[..](...);
BTW, while it allows us to use best possible handler,
such approach means extra indirect call(/jump) anyway.
Hard to say off-hand would it affect performance,
and if yes how significantly.
Couldn't find any perf tests in our UT for it...
> + return handlers[RTE_NET_CRC16_CCITT](data, data_len);
> +}
> +
> +static uint32_t
> +rte_crc32_eth_default_handler(const uint8_t *data, uint32_t data_len)
> +{
> + if (max_simd_bitwidth == 0)
> + max_simd_bitwidth = rte_get_max_simd_bitwidth();
> + handlers = handlers_scalar;
> +#ifdef X86_64_SSE42_PCLMULQDQ
> + if (max_simd_bitwidth >= RTE_SIMD_128)
> + handlers = handlers_sse42;
> +#endif
> +#ifdef ARM64_NEON_PMULL
> + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL) &&
> + max_simd_bitwidth >= RTE_SIMD_128) {
> + handlers = handlers_neon;
> +#endif
> + return handlers[RTE_NET_CRC32_ETH](data, data_len);
> +}
> +
> static void
> rte_net_crc_scalar_init(void)
> {
> @@ -145,18 +196,26 @@ rte_crc32_eth_handler(const uint8_t *data, uint32_t data_len)
> void
> rte_net_crc_set_alg(enum rte_net_crc_alg alg)
> {
> + if (max_simd_bitwidth == 0)
> + max_simd_bitwidth = rte_get_max_simd_bitwidth();
> +
> switch (alg) {
> #ifdef X86_64_SSE42_PCLMULQDQ
> case RTE_NET_CRC_SSE42:
> - handlers = handlers_sse42;
> - break;
> + if (max_simd_bitwidth >= RTE_SIMD_128) {
> + handlers = handlers_sse42;
> + return;
> + }
> + NET_LOG(INFO, "Max SIMD Bitwidth too low, can't use SSE\n");
> #elif defined ARM64_NEON_PMULL
> /* fall-through */
> case RTE_NET_CRC_NEON:
> - if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
> + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL) &&
> + max_simd_bitwidth >= RTE_SIMD_128) {
> handlers = handlers_neon;
> - break;
> + return;
> }
> + NET_LOG(INFO, "Max SIMD Bitwidth too low or CPU flag not enabled, can't use NEON\n");
> #endif
> /* fall-through */
> case RTE_NET_CRC_SCALAR:
> @@ -184,19 +243,13 @@ rte_net_crc_calc(const void *data,
> /* Select highest available crc algorithm as default one */
> RTE_INIT(rte_net_crc_init)
> {
> - enum rte_net_crc_alg alg = RTE_NET_CRC_SCALAR;
> -
> rte_net_crc_scalar_init();
>
> #ifdef X86_64_SSE42_PCLMULQDQ
> - alg = RTE_NET_CRC_SSE42;
> rte_net_crc_sse42_init();
> #elif defined ARM64_NEON_PMULL
> if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
> - alg = RTE_NET_CRC_NEON;
> rte_net_crc_neon_init();
> }
> #endif
> -
> - rte_net_crc_set_alg(alg);
> }
> diff --git a/lib/librte_net/rte_net_crc.h b/lib/librte_net/rte_net_crc.h
> index 16e85ca970..c942865ecf 100644
> --- a/lib/librte_net/rte_net_crc.h
> +++ b/lib/librte_net/rte_net_crc.h
> @@ -7,6 +7,8 @@
>
> #include <stdint.h>
>
> +#include <rte_log.h>
> +
> #ifdef __cplusplus
> extern "C" {
> #endif
> @@ -25,6 +27,12 @@ enum rte_net_crc_alg {
> RTE_NET_CRC_NEON,
> };
>
> +extern int libnet_logtype;
> +
> +#define NET_LOG(level, fmt, args...) \
> + rte_log(RTE_LOG_ ## level, libnet_logtype, "%s(): " fmt "\n", \
> + __func__, ## args)
> +
> /**
> * This API set the CRC computation algorithm (i.e. scalar version,
> * x86 64-bit sse4.2 intrinsic version, etc.) and internal data
> --
> 2.22.0
>
> > When choosing a vector path to take, an extra condition must be
> > satisfied to ensure the max SIMD bitwidth allows for the CPU enabled
> > path.
> >
> > The vector path was initially chosen in RTE_INIT, however this is no
> > longer suitable as we cannot check the max SIMD bitwidth at that time.
> > Default handlers are now chosen in RTE_INIT, these default handlers
> > are used the first time the crc calc is called, and they set the suitable
> > handlers to be used going forward.
> >
> > Suggested-by: Jasvinder Singh <jasvinder.singh@intel.com>
> > Suggested-by: Olivier Matz <olivier.matz@6wind.com>
> >
> > Signed-off-by: Ciara Power <ciara.power@intel.com>
> >
> > ---
> > v4:
> > - Added default handlers to be set at RTE_INIT time, rather than
> > choosing scalar handlers.
> > - Modified logging.
> > - Updated enum name.
> > v3:
> > - Moved choosing vector paths out of RTE_INIT.
> > - Moved checking max_simd_bitwidth into the set_alg function.
> > ---
> > lib/librte_net/rte_net_crc.c | 75 ++++++++++++++++++++++++++++++------
> > lib/librte_net/rte_net_crc.h | 8 ++++
> > 2 files changed, 72 insertions(+), 11 deletions(-)
> >
> > diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c
> > index 4f5b9e8286..11d0161a32 100644
> > --- a/lib/librte_net/rte_net_crc.c
> > +++ b/lib/librte_net/rte_net_crc.c
> > @@ -9,6 +9,7 @@
> > #include <rte_cpuflags.h>
> > #include <rte_common.h>
> > #include <rte_net_crc.h>
> > +#include <rte_eal.h>
> >
> > #if defined(RTE_ARCH_X86_64) && defined(__PCLMUL__)
> > #define X86_64_SSE42_PCLMULQDQ 1
> > @@ -32,6 +33,12 @@
> > static uint32_t crc32_eth_lut[CRC_LUT_SIZE];
> > static uint32_t crc16_ccitt_lut[CRC_LUT_SIZE];
> >
> > +static uint32_t
> > +rte_crc16_ccitt_default_handler(const uint8_t *data, uint32_t data_len);
> > +
> > +static uint32_t
> > +rte_crc32_eth_default_handler(const uint8_t *data, uint32_t data_len);
> > +
> > static uint32_t
> > rte_crc16_ccitt_handler(const uint8_t *data, uint32_t data_len);
> >
> > @@ -41,7 +48,12 @@ rte_crc32_eth_handler(const uint8_t *data, uint32_t data_len);
> > typedef uint32_t
> > (*rte_net_crc_handler)(const uint8_t *data, uint32_t data_len);
> >
> > -static rte_net_crc_handler *handlers;
> > +static rte_net_crc_handler handlers_default[] = {
> > + [RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_default_handler,
> > + [RTE_NET_CRC32_ETH] = rte_crc32_eth_default_handler,
> > +};
> > +
> > +static rte_net_crc_handler *handlers = handlers_default;
> >
> > static rte_net_crc_handler handlers_scalar[] = {
> > [RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_handler,
> > @@ -60,6 +72,9 @@ static rte_net_crc_handler handlers_neon[] = {
> > };
> > #endif
> >
> > +static uint16_t max_simd_bitwidth;
> > +RTE_LOG_REGISTER(libnet_logtype, lib.net, INFO);
> > +
> > /**
> > * Reflect the bits about the middle
> > *
> > @@ -112,6 +127,42 @@ crc32_eth_calc_lut(const uint8_t *data,
> > return crc;
> > }
> >
> > +static uint32_t
> > +rte_crc16_ccitt_default_handler(const uint8_t *data, uint32_t data_len)
> > +{
> > + if (max_simd_bitwidth == 0)
> > + max_simd_bitwidth = rte_get_max_simd_bitwidth();
> > + handlers = handlers_scalar;
> > +#ifdef X86_64_SSE42_PCLMULQDQ
> > + if (max_simd_bitwidth >= RTE_SIMD_128)
> > + handlers = handlers_sse42;
> > +#endif
> > +#ifdef ARM64_NEON_PMULL
> > + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL) &&
> > + max_simd_bitwidth >= RTE_SIMD_128) {
> > + handlers = handlers_neon;
> > +#endif
>
> You probably don't want to make all these checks for *every* invocation
> of that function. I think it would be better:
> if (ma_simd_bitwidth == 0) {....}
> return handlers[..](...);
As another thougth - it is probably a bit safer to update max_simd_bitwidht
after handler value update.
handler = ...; rte_smp_wmb(); max_simd_width = ...;
>
> BTW, while it allows us to use best possible handler,
> such approach means extra indirect call(/jump) anyway.
> Hard to say off-hand would it affect performance,
> and if yes how significantly.
> Couldn't find any perf tests in our UT for it...
>
> > + return handlers[RTE_NET_CRC16_CCITT](data, data_len);
> > +}
> > +
> > +static uint32_t
> > +rte_crc32_eth_default_handler(const uint8_t *data, uint32_t data_len)
> > +{
> > + if (max_simd_bitwidth == 0)
> > + max_simd_bitwidth = rte_get_max_simd_bitwidth();
> > + handlers = handlers_scalar;
> > +#ifdef X86_64_SSE42_PCLMULQDQ
> > + if (max_simd_bitwidth >= RTE_SIMD_128)
> > + handlers = handlers_sse42;
> > +#endif
> > +#ifdef ARM64_NEON_PMULL
> > + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL) &&
> > + max_simd_bitwidth >= RTE_SIMD_128) {
> > + handlers = handlers_neon;
> > +#endif
> > + return handlers[RTE_NET_CRC32_ETH](data, data_len);
> > +}
> > +
> > static void
> > rte_net_crc_scalar_init(void)
> > {
> > @@ -145,18 +196,26 @@ rte_crc32_eth_handler(const uint8_t *data, uint32_t data_len)
> > void
> > rte_net_crc_set_alg(enum rte_net_crc_alg alg)
> > {
> > + if (max_simd_bitwidth == 0)
> > + max_simd_bitwidth = rte_get_max_simd_bitwidth();
> > +
> > switch (alg) {
> > #ifdef X86_64_SSE42_PCLMULQDQ
> > case RTE_NET_CRC_SSE42:
> > - handlers = handlers_sse42;
> > - break;
> > + if (max_simd_bitwidth >= RTE_SIMD_128) {
> > + handlers = handlers_sse42;
> > + return;
> > + }
> > + NET_LOG(INFO, "Max SIMD Bitwidth too low, can't use SSE\n");
> > #elif defined ARM64_NEON_PMULL
> > /* fall-through */
> > case RTE_NET_CRC_NEON:
> > - if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
> > + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL) &&
> > + max_simd_bitwidth >= RTE_SIMD_128) {
> > handlers = handlers_neon;
> > - break;
> > + return;
> > }
> > + NET_LOG(INFO, "Max SIMD Bitwidth too low or CPU flag not enabled, can't use NEON\n");
> > #endif
> > /* fall-through */
> > case RTE_NET_CRC_SCALAR:
> > @@ -184,19 +243,13 @@ rte_net_crc_calc(const void *data,
> > /* Select highest available crc algorithm as default one */
> > RTE_INIT(rte_net_crc_init)
> > {
> > - enum rte_net_crc_alg alg = RTE_NET_CRC_SCALAR;
> > -
> > rte_net_crc_scalar_init();
> >
> > #ifdef X86_64_SSE42_PCLMULQDQ
> > - alg = RTE_NET_CRC_SSE42;
> > rte_net_crc_sse42_init();
> > #elif defined ARM64_NEON_PMULL
> > if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
> > - alg = RTE_NET_CRC_NEON;
> > rte_net_crc_neon_init();
> > }
> > #endif
> > -
> > - rte_net_crc_set_alg(alg);
> > }
> > diff --git a/lib/librte_net/rte_net_crc.h b/lib/librte_net/rte_net_crc.h
> > index 16e85ca970..c942865ecf 100644
> > --- a/lib/librte_net/rte_net_crc.h
> > +++ b/lib/librte_net/rte_net_crc.h
> > @@ -7,6 +7,8 @@
> >
> > #include <stdint.h>
> >
> > +#include <rte_log.h>
> > +
> > #ifdef __cplusplus
> > extern "C" {
> > #endif
> > @@ -25,6 +27,12 @@ enum rte_net_crc_alg {
> > RTE_NET_CRC_NEON,
> > };
> >
> > +extern int libnet_logtype;
> > +
> > +#define NET_LOG(level, fmt, args...) \
> > + rte_log(RTE_LOG_ ## level, libnet_logtype, "%s(): " fmt "\n", \
> > + __func__, ## args)
> > +
> > /**
> > * This API set the CRC computation algorithm (i.e. scalar version,
> > * x86 64-bit sse4.2 intrinsic version, etc.) and internal data
> > --
> > 2.22.0
> >
> > > When choosing a vector path to take, an extra condition must be
> > > satisfied to ensure the max SIMD bitwidth allows for the CPU enabled
> > > path.
> > >
> > > The vector path was initially chosen in RTE_INIT, however this is no
> > > longer suitable as we cannot check the max SIMD bitwidth at that time.
> > > Default handlers are now chosen in RTE_INIT, these default handlers
> > > are used the first time the crc calc is called, and they set the suitable
> > > handlers to be used going forward.
> > >
> > > Suggested-by: Jasvinder Singh <jasvinder.singh@intel.com>
> > > Suggested-by: Olivier Matz <olivier.matz@6wind.com>
> > >
> > > Signed-off-by: Ciara Power <ciara.power@intel.com>
> > >
> > > ---
> > > v4:
> > > - Added default handlers to be set at RTE_INIT time, rather than
> > > choosing scalar handlers.
> > > - Modified logging.
> > > - Updated enum name.
> > > v3:
> > > - Moved choosing vector paths out of RTE_INIT.
> > > - Moved checking max_simd_bitwidth into the set_alg function.
> > > ---
> > > lib/librte_net/rte_net_crc.c | 75 ++++++++++++++++++++++++++++++------
> > > lib/librte_net/rte_net_crc.h | 8 ++++
> > > 2 files changed, 72 insertions(+), 11 deletions(-)
> > >
> > > diff --git a/lib/librte_net/rte_net_crc.c b/lib/librte_net/rte_net_crc.c
> > > index 4f5b9e8286..11d0161a32 100644
> > > --- a/lib/librte_net/rte_net_crc.c
> > > +++ b/lib/librte_net/rte_net_crc.c
> > > @@ -9,6 +9,7 @@
> > > #include <rte_cpuflags.h>
> > > #include <rte_common.h>
> > > #include <rte_net_crc.h>
> > > +#include <rte_eal.h>
> > >
> > > #if defined(RTE_ARCH_X86_64) && defined(__PCLMUL__)
> > > #define X86_64_SSE42_PCLMULQDQ 1
> > > @@ -32,6 +33,12 @@
> > > static uint32_t crc32_eth_lut[CRC_LUT_SIZE];
> > > static uint32_t crc16_ccitt_lut[CRC_LUT_SIZE];
> > >
> > > +static uint32_t
> > > +rte_crc16_ccitt_default_handler(const uint8_t *data, uint32_t data_len);
> > > +
> > > +static uint32_t
> > > +rte_crc32_eth_default_handler(const uint8_t *data, uint32_t data_len);
> > > +
> > > static uint32_t
> > > rte_crc16_ccitt_handler(const uint8_t *data, uint32_t data_len);
> > >
> > > @@ -41,7 +48,12 @@ rte_crc32_eth_handler(const uint8_t *data, uint32_t data_len);
> > > typedef uint32_t
> > > (*rte_net_crc_handler)(const uint8_t *data, uint32_t data_len);
> > >
> > > -static rte_net_crc_handler *handlers;
> > > +static rte_net_crc_handler handlers_default[] = {
> > > + [RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_default_handler,
> > > + [RTE_NET_CRC32_ETH] = rte_crc32_eth_default_handler,
> > > +};
> > > +
> > > +static rte_net_crc_handler *handlers = handlers_default;
> > >
> > > static rte_net_crc_handler handlers_scalar[] = {
> > > [RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_handler,
> > > @@ -60,6 +72,9 @@ static rte_net_crc_handler handlers_neon[] = {
> > > };
> > > #endif
> > >
> > > +static uint16_t max_simd_bitwidth;
> > > +RTE_LOG_REGISTER(libnet_logtype, lib.net, INFO);
> > > +
> > > /**
> > > * Reflect the bits about the middle
> > > *
> > > @@ -112,6 +127,42 @@ crc32_eth_calc_lut(const uint8_t *data,
> > > return crc;
> > > }
> > >
> > > +static uint32_t
> > > +rte_crc16_ccitt_default_handler(const uint8_t *data, uint32_t data_len)
> > > +{
> > > + if (max_simd_bitwidth == 0)
> > > + max_simd_bitwidth = rte_get_max_simd_bitwidth();
> > > + handlers = handlers_scalar;
> > > +#ifdef X86_64_SSE42_PCLMULQDQ
> > > + if (max_simd_bitwidth >= RTE_SIMD_128)
> > > + handlers = handlers_sse42;
> > > +#endif
> > > +#ifdef ARM64_NEON_PMULL
> > > + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL) &&
> > > + max_simd_bitwidth >= RTE_SIMD_128) {
> > > + handlers = handlers_neon;
> > > +#endif
> >
> > You probably don't want to make all these checks for *every* invocation
> > of that function. I think it would be better:
> > if (ma_simd_bitwidth == 0) {....}
> > return handlers[..](...);
>
> As another thougth - it is probably a bit safer to update max_simd_bitwidht
> after handler value update.
>
> handler = ...; rte_smp_wmb(); max_simd_width = ...;
>
> >
> > BTW, while it allows us to use best possible handler,
> > such approach means extra indirect call(/jump) anyway.
Ah sorry, that would only happen once, please ignore that one.
> > Hard to say off-hand would it affect performance,
> > and if yes how significantly.
> > Couldn't find any perf tests in our UT for it...
> >
> > > + return handlers[RTE_NET_CRC16_CCITT](data, data_len);
> > > +}
> > > +
> > > +static uint32_t
> > > +rte_crc32_eth_default_handler(const uint8_t *data, uint32_t data_len)
> > > +{
> > > + if (max_simd_bitwidth == 0)
> > > + max_simd_bitwidth = rte_get_max_simd_bitwidth();
> > > + handlers = handlers_scalar;
> > > +#ifdef X86_64_SSE42_PCLMULQDQ
> > > + if (max_simd_bitwidth >= RTE_SIMD_128)
> > > + handlers = handlers_sse42;
> > > +#endif
> > > +#ifdef ARM64_NEON_PMULL
> > > + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL) &&
> > > + max_simd_bitwidth >= RTE_SIMD_128) {
> > > + handlers = handlers_neon;
> > > +#endif
> > > + return handlers[RTE_NET_CRC32_ETH](data, data_len);
> > > +}
> > > +
> > > static void
> > > rte_net_crc_scalar_init(void)
> > > {
> > > @@ -145,18 +196,26 @@ rte_crc32_eth_handler(const uint8_t *data, uint32_t data_len)
> > > void
> > > rte_net_crc_set_alg(enum rte_net_crc_alg alg)
> > > {
> > > + if (max_simd_bitwidth == 0)
> > > + max_simd_bitwidth = rte_get_max_simd_bitwidth();
> > > +
> > > switch (alg) {
> > > #ifdef X86_64_SSE42_PCLMULQDQ
> > > case RTE_NET_CRC_SSE42:
> > > - handlers = handlers_sse42;
> > > - break;
> > > + if (max_simd_bitwidth >= RTE_SIMD_128) {
> > > + handlers = handlers_sse42;
> > > + return;
> > > + }
> > > + NET_LOG(INFO, "Max SIMD Bitwidth too low, can't use SSE\n");
> > > #elif defined ARM64_NEON_PMULL
> > > /* fall-through */
> > > case RTE_NET_CRC_NEON:
> > > - if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
> > > + if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL) &&
> > > + max_simd_bitwidth >= RTE_SIMD_128) {
> > > handlers = handlers_neon;
> > > - break;
> > > + return;
> > > }
> > > + NET_LOG(INFO, "Max SIMD Bitwidth too low or CPU flag not enabled, can't use NEON\n");
> > > #endif
> > > /* fall-through */
> > > case RTE_NET_CRC_SCALAR:
> > > @@ -184,19 +243,13 @@ rte_net_crc_calc(const void *data,
> > > /* Select highest available crc algorithm as default one */
> > > RTE_INIT(rte_net_crc_init)
> > > {
> > > - enum rte_net_crc_alg alg = RTE_NET_CRC_SCALAR;
> > > -
> > > rte_net_crc_scalar_init();
> > >
> > > #ifdef X86_64_SSE42_PCLMULQDQ
> > > - alg = RTE_NET_CRC_SSE42;
> > > rte_net_crc_sse42_init();
> > > #elif defined ARM64_NEON_PMULL
> > > if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
> > > - alg = RTE_NET_CRC_NEON;
> > > rte_net_crc_neon_init();
> > > }
> > > #endif
> > > -
> > > - rte_net_crc_set_alg(alg);
> > > }
> > > diff --git a/lib/librte_net/rte_net_crc.h b/lib/librte_net/rte_net_crc.h
> > > index 16e85ca970..c942865ecf 100644
> > > --- a/lib/librte_net/rte_net_crc.h
> > > +++ b/lib/librte_net/rte_net_crc.h
> > > @@ -7,6 +7,8 @@
> > >
> > > #include <stdint.h>
> > >
> > > +#include <rte_log.h>
> > > +
> > > #ifdef __cplusplus
> > > extern "C" {
> > > #endif
> > > @@ -25,6 +27,12 @@ enum rte_net_crc_alg {
> > > RTE_NET_CRC_NEON,
> > > };
> > >
> > > +extern int libnet_logtype;
> > > +
> > > +#define NET_LOG(level, fmt, args...) \
> > > + rte_log(RTE_LOG_ ## level, libnet_logtype, "%s(): " fmt "\n", \
> > > + __func__, ## args)
> > > +
> > > /**
> > > * This API set the CRC computation algorithm (i.e. scalar version,
> > > * x86 64-bit sse4.2 intrinsic version, etc.) and internal data
> > > --
> > > 2.22.0
@@ -9,6 +9,7 @@
#include <rte_cpuflags.h>
#include <rte_common.h>
#include <rte_net_crc.h>
+#include <rte_eal.h>
#if defined(RTE_ARCH_X86_64) && defined(__PCLMUL__)
#define X86_64_SSE42_PCLMULQDQ 1
@@ -32,6 +33,12 @@
static uint32_t crc32_eth_lut[CRC_LUT_SIZE];
static uint32_t crc16_ccitt_lut[CRC_LUT_SIZE];
+static uint32_t
+rte_crc16_ccitt_default_handler(const uint8_t *data, uint32_t data_len);
+
+static uint32_t
+rte_crc32_eth_default_handler(const uint8_t *data, uint32_t data_len);
+
static uint32_t
rte_crc16_ccitt_handler(const uint8_t *data, uint32_t data_len);
@@ -41,7 +48,12 @@ rte_crc32_eth_handler(const uint8_t *data, uint32_t data_len);
typedef uint32_t
(*rte_net_crc_handler)(const uint8_t *data, uint32_t data_len);
-static rte_net_crc_handler *handlers;
+static rte_net_crc_handler handlers_default[] = {
+ [RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_default_handler,
+ [RTE_NET_CRC32_ETH] = rte_crc32_eth_default_handler,
+};
+
+static rte_net_crc_handler *handlers = handlers_default;
static rte_net_crc_handler handlers_scalar[] = {
[RTE_NET_CRC16_CCITT] = rte_crc16_ccitt_handler,
@@ -60,6 +72,9 @@ static rte_net_crc_handler handlers_neon[] = {
};
#endif
+static uint16_t max_simd_bitwidth;
+RTE_LOG_REGISTER(libnet_logtype, lib.net, INFO);
+
/**
* Reflect the bits about the middle
*
@@ -112,6 +127,42 @@ crc32_eth_calc_lut(const uint8_t *data,
return crc;
}
+static uint32_t
+rte_crc16_ccitt_default_handler(const uint8_t *data, uint32_t data_len)
+{
+ if (max_simd_bitwidth == 0)
+ max_simd_bitwidth = rte_get_max_simd_bitwidth();
+ handlers = handlers_scalar;
+#ifdef X86_64_SSE42_PCLMULQDQ
+ if (max_simd_bitwidth >= RTE_SIMD_128)
+ handlers = handlers_sse42;
+#endif
+#ifdef ARM64_NEON_PMULL
+ if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL) &&
+ max_simd_bitwidth >= RTE_SIMD_128) {
+ handlers = handlers_neon;
+#endif
+ return handlers[RTE_NET_CRC16_CCITT](data, data_len);
+}
+
+static uint32_t
+rte_crc32_eth_default_handler(const uint8_t *data, uint32_t data_len)
+{
+ if (max_simd_bitwidth == 0)
+ max_simd_bitwidth = rte_get_max_simd_bitwidth();
+ handlers = handlers_scalar;
+#ifdef X86_64_SSE42_PCLMULQDQ
+ if (max_simd_bitwidth >= RTE_SIMD_128)
+ handlers = handlers_sse42;
+#endif
+#ifdef ARM64_NEON_PMULL
+ if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL) &&
+ max_simd_bitwidth >= RTE_SIMD_128) {
+ handlers = handlers_neon;
+#endif
+ return handlers[RTE_NET_CRC32_ETH](data, data_len);
+}
+
static void
rte_net_crc_scalar_init(void)
{
@@ -145,18 +196,26 @@ rte_crc32_eth_handler(const uint8_t *data, uint32_t data_len)
void
rte_net_crc_set_alg(enum rte_net_crc_alg alg)
{
+ if (max_simd_bitwidth == 0)
+ max_simd_bitwidth = rte_get_max_simd_bitwidth();
+
switch (alg) {
#ifdef X86_64_SSE42_PCLMULQDQ
case RTE_NET_CRC_SSE42:
- handlers = handlers_sse42;
- break;
+ if (max_simd_bitwidth >= RTE_SIMD_128) {
+ handlers = handlers_sse42;
+ return;
+ }
+ NET_LOG(INFO, "Max SIMD Bitwidth too low, can't use SSE\n");
#elif defined ARM64_NEON_PMULL
/* fall-through */
case RTE_NET_CRC_NEON:
- if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
+ if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL) &&
+ max_simd_bitwidth >= RTE_SIMD_128) {
handlers = handlers_neon;
- break;
+ return;
}
+ NET_LOG(INFO, "Max SIMD Bitwidth too low or CPU flag not enabled, can't use NEON\n");
#endif
/* fall-through */
case RTE_NET_CRC_SCALAR:
@@ -184,19 +243,13 @@ rte_net_crc_calc(const void *data,
/* Select highest available crc algorithm as default one */
RTE_INIT(rte_net_crc_init)
{
- enum rte_net_crc_alg alg = RTE_NET_CRC_SCALAR;
-
rte_net_crc_scalar_init();
#ifdef X86_64_SSE42_PCLMULQDQ
- alg = RTE_NET_CRC_SSE42;
rte_net_crc_sse42_init();
#elif defined ARM64_NEON_PMULL
if (rte_cpu_get_flag_enabled(RTE_CPUFLAG_PMULL)) {
- alg = RTE_NET_CRC_NEON;
rte_net_crc_neon_init();
}
#endif
-
- rte_net_crc_set_alg(alg);
}
@@ -7,6 +7,8 @@
#include <stdint.h>
+#include <rte_log.h>
+
#ifdef __cplusplus
extern "C" {
#endif
@@ -25,6 +27,12 @@ enum rte_net_crc_alg {
RTE_NET_CRC_NEON,
};
+extern int libnet_logtype;
+
+#define NET_LOG(level, fmt, args...) \
+ rte_log(RTE_LOG_ ## level, libnet_logtype, "%s(): " fmt "\n", \
+ __func__, ## args)
+
/**
* This API set the CRC computation algorithm (i.e. scalar version,
* x86 64-bit sse4.2 intrinsic version, etc.) and internal data