[v2] build: disable compiler AVX512F support

Message ID 20181102210348.28148-1-yskoh@mellanox.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Headers
Series [v2] build: disable compiler AVX512F support |

Checks

Context Check Description
ci/Intel-compilation success Compilation OK

Commit Message

Yongseok Koh Nov. 2, 2018, 9:04 p.m. UTC
  This is a workaround to prevent a crash, which might be caused by
optimization of newer gcc (7.3.0) on Intel Skylake.

Bugzilla ID: 97

Cc: stable@dpdk.org

Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
---

v2:
* disable the flag only in case of gcc

 config/x86/meson.build | 5 +++++
 mk/rte.cpuflags.mk     | 7 +++++++
 2 files changed, 12 insertions(+)
  

Comments

Wiles, Keith Nov. 5, 2018, 2:06 p.m. UTC | #1
> On Nov 2, 2018, at 9:04 PM, Yongseok Koh <yskoh@mellanox.com> wrote:
> 
> This is a workaround to prevent a crash, which might be caused by
> optimization of newer gcc (7.3.0) on Intel Skylake.

Should the code below not also test for the gcc version and the Sky Lake processor, maybe I am wrong but it seems it is turning AVX512 for all GCC builds

Also bug 97 seems a bit obscure reference, maybe you know the bug report, but more details would be good?
> 
> Bugzilla ID: 97
> 
> Cc: stable@dpdk.org
> 
> Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
> ---
> 
> v2:
> * disable the flag only in case of gcc
> 
> config/x86/meson.build | 5 +++++
> mk/rte.cpuflags.mk     | 7 +++++++
> 2 files changed, 12 insertions(+)
> 
> diff --git a/config/x86/meson.build b/config/x86/meson.build
> index 33efb5e547..8ddca0ea9f 100644
> --- a/config/x86/meson.build
> +++ b/config/x86/meson.build
> @@ -47,6 +47,11 @@ endif
> if cc.get_define('__AVX512F__', args: march_opt) != ''
> 	dpdk_conf.set('RTE_MACHINE_CPUFLAG_AVX512F', 1)
> 	compile_time_cpuflags += ['RTE_CPUFLAG_AVX512F']
> +else
> +# disable AVX512F support of gcc as a workaround for Bug 97
> +	if cc.get_id() == 'gcc' and cc.has_argument('-mavx512f')
> +		machine_args += '-mno-avx512f'
> +	endif
> endif
> 
> dpdk_conf.set('RTE_CACHE_LINE_SIZE', 64)
> diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
> index 43ed84155b..a8c26fb011 100644
> --- a/mk/rte.cpuflags.mk
> +++ b/mk/rte.cpuflags.mk
> @@ -68,6 +68,13 @@ endif
> ifneq ($(filter $(AUTO_CPUFLAGS),__AVX512F__),)
> ifeq ($(CONFIG_RTE_ENABLE_AVX512),y)
> CPUFLAGS += AVX512F
> +else
> +# disable AVX512F support of gcc as a workaround for Bug 97
> +ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y)
> +	ifeq ($(shell $(CC) --target-help | grep -q mavx512f && echo 1), 1)
> +		MACHINE_CFLAGS += -mno-avx512f
> +	endif
> +endif
> endif
> endif
> 
> -- 
> 2.11.0
> 

Regards,
Keith
  
Yongseok Koh Nov. 6, 2018, 9:30 p.m. UTC | #2
> On Nov 5, 2018, at 6:06 AM, Wiles, Keith <keith.wiles@intel.com> wrote:
> 
> 
> 
>> On Nov 2, 2018, at 9:04 PM, Yongseok Koh <yskoh@mellanox.com> wrote:
>> 
>> This is a workaround to prevent a crash, which might be caused by
>> optimization of newer gcc (7.3.0) on Intel Skylake.
> 
> Should the code below not also test for the gcc version and the Sky Lake processor, maybe I am wrong but it seems it is turning AVX512 for all GCC builds

I didn't want to check gcc version as 7.3.0 is very new. Only gcc 8 is newly up since then (gcc 8.2).
Also, I wasn't able to test every gcc versions and I wanted to be a bit conservative for this crash.
Performance drop (if any) by disabling a new (experimental) feature would be less risky than unaccountable crash.
And, it does disable the feature only if CONFIG_RTE_ENABLE_AVX512=n. Please refer to v3.

> 
> Also bug 97 seems a bit obscure reference, maybe you know the bug report, but more details would be good?

I sent out the report to dev list two month ago. And I created the Bug 97 in order to reference it in the commit message.
I didn't want to repeat same message here and there, but it would've been better to have some sort of summary of the Bug, although v3 has a few more words. However, v3 has been merged.

Thanks,
Yongseok

>> 
>> Bugzilla ID: 97
>> 
>> Cc: stable@dpdk.org
>> 
>> Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
>> ---
>> 
>> v2:
>> * disable the flag only in case of gcc
>> 
>> config/x86/meson.build | 5 +++++
>> mk/rte.cpuflags.mk     | 7 +++++++
>> 2 files changed, 12 insertions(+)
>> 
>> diff --git a/config/x86/meson.build b/config/x86/meson.build
>> index 33efb5e547..8ddca0ea9f 100644
>> --- a/config/x86/meson.build
>> +++ b/config/x86/meson.build
>> @@ -47,6 +47,11 @@ endif
>> if cc.get_define('__AVX512F__', args: march_opt) != ''
>> 	dpdk_conf.set('RTE_MACHINE_CPUFLAG_AVX512F', 1)
>> 	compile_time_cpuflags += ['RTE_CPUFLAG_AVX512F']
>> +else
>> +# disable AVX512F support of gcc as a workaround for Bug 97
>> +	if cc.get_id() == 'gcc' and cc.has_argument('-mavx512f')
>> +		machine_args += '-mno-avx512f'
>> +	endif
>> endif
>> 
>> dpdk_conf.set('RTE_CACHE_LINE_SIZE', 64)
>> diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
>> index 43ed84155b..a8c26fb011 100644
>> --- a/mk/rte.cpuflags.mk
>> +++ b/mk/rte.cpuflags.mk
>> @@ -68,6 +68,13 @@ endif
>> ifneq ($(filter $(AUTO_CPUFLAGS),__AVX512F__),)
>> ifeq ($(CONFIG_RTE_ENABLE_AVX512),y)
>> CPUFLAGS += AVX512F
>> +else
>> +# disable AVX512F support of gcc as a workaround for Bug 97
>> +ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y)
>> +	ifeq ($(shell $(CC) --target-help | grep -q mavx512f && echo 1), 1)
>> +		MACHINE_CFLAGS += -mno-avx512f
>> +	endif
>> +endif
>> endif
>> endif
>> 
>> -- 
>> 2.11.0
>> 
> 
> Regards,
> Keith
  
Wiles, Keith Nov. 7, 2018, 9:04 a.m. UTC | #3
> On Nov 6, 2018, at 9:30 PM, Yongseok Koh <yskoh@mellanox.com> wrote:
> 
> 
>> On Nov 5, 2018, at 6:06 AM, Wiles, Keith <keith.wiles@intel.com> wrote:
>> 
>> 
>> 
>>> On Nov 2, 2018, at 9:04 PM, Yongseok Koh <yskoh@mellanox.com> wrote:
>>> 
>>> This is a workaround to prevent a crash, which might be caused by
>>> optimization of newer gcc (7.3.0) on Intel Skylake.
>> 
>> Should the code below not also test for the gcc version and the Sky Lake processor, maybe I am wrong but it seems it is turning AVX512 for all GCC builds
> 
> I didn't want to check gcc version as 7.3.0 is very new. Only gcc 8 is newly up since then (gcc 8.2).
> Also, I wasn't able to test every gcc versions and I wanted to be a bit conservative for this crash.
> Performance drop (if any) by disabling a new (experimental) feature would be less risky than unaccountable crash.
> And, it does disable the feature only if CONFIG_RTE_ENABLE_AVX512=n. Please refer to v3.

Are you not turning off all of the GCC versions for AVX512. And you can test for range or greater then GCC version and it just seems like we are turning off every gcc version, is that true?
> 
>> 
>> Also bug 97 seems a bit obscure reference, maybe you know the bug report, but more details would be good?
> 
> I sent out the report to dev list two month ago. And I created the Bug 97 in order to reference it in the commit message.
> I didn't want to repeat same message here and there, but it would've been better to have some sort of summary of the Bug, although v3 has a few more words. However, v3 has been merged.

Still this is too obscure if nothing else give a link to a specific bug not just 97.
> 
> Thanks,
> Yongseok
> 
>>> 
>>> Bugzilla ID: 97
>>> 
>>> Cc: stable@dpdk.org
>>> 
>>> Signed-off-by: Yongseok Koh <yskoh@mellanox.com>
>>> ---
>>> 
>>> v2:
>>> * disable the flag only in case of gcc
>>> 
>>> config/x86/meson.build | 5 +++++
>>> mk/rte.cpuflags.mk     | 7 +++++++
>>> 2 files changed, 12 insertions(+)
>>> 
>>> diff --git a/config/x86/meson.build b/config/x86/meson.build
>>> index 33efb5e547..8ddca0ea9f 100644
>>> --- a/config/x86/meson.build
>>> +++ b/config/x86/meson.build
>>> @@ -47,6 +47,11 @@ endif
>>> if cc.get_define('__AVX512F__', args: march_opt) != ''
>>> 	dpdk_conf.set('RTE_MACHINE_CPUFLAG_AVX512F', 1)
>>> 	compile_time_cpuflags += ['RTE_CPUFLAG_AVX512F']
>>> +else
>>> +# disable AVX512F support of gcc as a workaround for Bug 97
>>> +	if cc.get_id() == 'gcc' and cc.has_argument('-mavx512f')
>>> +		machine_args += '-mno-avx512f'
>>> +	endif
>>> endif
>>> 
>>> dpdk_conf.set('RTE_CACHE_LINE_SIZE', 64)
>>> diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
>>> index 43ed84155b..a8c26fb011 100644
>>> --- a/mk/rte.cpuflags.mk
>>> +++ b/mk/rte.cpuflags.mk
>>> @@ -68,6 +68,13 @@ endif
>>> ifneq ($(filter $(AUTO_CPUFLAGS),__AVX512F__),)
>>> ifeq ($(CONFIG_RTE_ENABLE_AVX512),y)
>>> CPUFLAGS += AVX512F
>>> +else
>>> +# disable AVX512F support of gcc as a workaround for Bug 97
>>> +ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y)
>>> +	ifeq ($(shell $(CC) --target-help | grep -q mavx512f && echo 1), 1)
>>> +		MACHINE_CFLAGS += -mno-avx512f
>>> +	endif
>>> +endif
>>> endif
>>> endif
>>> 
>>> -- 
>>> 2.11.0
>>> 
>> 
>> Regards,
>> Keith
> 

Regards,
Keith
  
Thomas Monjalon Nov. 8, 2018, 3:59 p.m. UTC | #4
Hi,

We need to gather more information about this bug.
More below.

07/11/2018 10:04, Wiles, Keith:
> > On Nov 6, 2018, at 9:30 PM, Yongseok Koh <yskoh@mellanox.com> wrote:
> >> On Nov 5, 2018, at 6:06 AM, Wiles, Keith <keith.wiles@intel.com> wrote:
> >>> On Nov 2, 2018, at 9:04 PM, Yongseok Koh <yskoh@mellanox.com> wrote:
> >>> 
> >>> This is a workaround to prevent a crash, which might be caused by
> >>> optimization of newer gcc (7.3.0) on Intel Skylake.
> >> 
> >> Should the code below not also test for the gcc version and
> >> the Sky Lake processor, maybe I am wrong but it seems it is
> >> turning AVX512 for all GCC builds
> > 
> > I didn't want to check gcc version as 7.3.0 is very new. Only gcc 8 is newly up since then (gcc 8.2).
> > Also, I wasn't able to test every gcc versions and I wanted to be a bit conservative for this crash.
> > Performance drop (if any) by disabling a new (experimental) feature would be less risky than unaccountable crash.
> > And, it does disable the feature only if CONFIG_RTE_ENABLE_AVX512=n. Please refer to v3.
> 
> Are you not turning off all of the GCC versions for AVX512.
> And you can test for range or greater then GCC version and
> it just seems like we are turning off every gcc version, is that true?

Do we know exactly which GCC versions are affected?

> >> Also bug 97 seems a bit obscure reference, maybe you know
> >> the bug report, but more details would be good?
> > 
> > I sent out the report to dev list two month ago.
> > And I created the Bug 97 in order to reference it
> > in the commit message.
> > I didn't want to repeat same message here and there,
> > but it would've been better to have some sort of summary
> > of the Bug, although v3 has a few more words.
> > However, v3 has been merged.
> 
> Still this is too obscure if nothing else give a link to
> a specific bug not just 97.

The URL is
	https://bugs.dpdk.org/show_bug.cgi?id=97
The bug is also pointing to an email:
	https://mails.dpdk.org/archives/dev/2018-September/111522.html

Summary:
	- CPU: Intel Skylake
	- Linux environment: Ubuntu 18.04
	- Compiler: gcc-7.3 (Ubuntu 7.3.0-16ubuntu3)
	- Scenario: testpmd crashes when it starts forwarding
	- Behaviour: AVX2 version of rte_memcpy() optimized with 512b instructions
	- Fix: disable AVX512 optimization with -mno-avx512f

It seems to have been reproduced only when using mlx5 PMD so far.
Any other experience?
  
Ferruh Yigit Nov. 8, 2018, 5:21 p.m. UTC | #5
On 11/8/2018 3:59 PM, Thomas Monjalon wrote:
> Hi,
> 
> We need to gather more information about this bug.
> More below.
> 
> 07/11/2018 10:04, Wiles, Keith:
>>> On Nov 6, 2018, at 9:30 PM, Yongseok Koh <yskoh@mellanox.com> wrote:
>>>> On Nov 5, 2018, at 6:06 AM, Wiles, Keith <keith.wiles@intel.com> wrote:
>>>>> On Nov 2, 2018, at 9:04 PM, Yongseok Koh <yskoh@mellanox.com> wrote:
>>>>>
>>>>> This is a workaround to prevent a crash, which might be caused by
>>>>> optimization of newer gcc (7.3.0) on Intel Skylake.
>>>>
>>>> Should the code below not also test for the gcc version and
>>>> the Sky Lake processor, maybe I am wrong but it seems it is
>>>> turning AVX512 for all GCC builds
>>>
>>> I didn't want to check gcc version as 7.3.0 is very new. Only gcc 8 is newly up since then (gcc 8.2).
>>> Also, I wasn't able to test every gcc versions and I wanted to be a bit conservative for this crash.
>>> Performance drop (if any) by disabling a new (experimental) feature would be less risky than unaccountable crash.
>>> And, it does disable the feature only if CONFIG_RTE_ENABLE_AVX512=n. Please refer to v3.
>>
>> Are you not turning off all of the GCC versions for AVX512.
>> And you can test for range or greater then GCC version and
>> it just seems like we are turning off every gcc version, is that true?
> 
> Do we know exactly which GCC versions are affected?
> 
>>>> Also bug 97 seems a bit obscure reference, maybe you know
>>>> the bug report, but more details would be good?
>>>
>>> I sent out the report to dev list two month ago.
>>> And I created the Bug 97 in order to reference it
>>> in the commit message.
>>> I didn't want to repeat same message here and there,
>>> but it would've been better to have some sort of summary
>>> of the Bug, although v3 has a few more words.
>>> However, v3 has been merged.
>>
>> Still this is too obscure if nothing else give a link to
>> a specific bug not just 97.
> 
> The URL is
> 	https://bugs.dpdk.org/show_bug.cgi?id=97
> The bug is also pointing to an email:
> 	https://mails.dpdk.org/archives/dev/2018-September/111522.html
> 
> Summary:
> 	- CPU: Intel Skylake
> 	- Linux environment: Ubuntu 18.04
> 	- Compiler: gcc-7.3 (Ubuntu 7.3.0-16ubuntu3)

Is it possible to test a few other gcc versions to check if the issue is
specific to this compiler version?

> 	- Scenario: testpmd crashes when it starts forwarding
> 	- Behaviour: AVX2 version of rte_memcpy() optimized with 512b instructions
> 	- Fix: disable AVX512 optimization with -mno-avx512f
> 
> It seems to have been reproduced only when using mlx5 PMD so far.
> Any other experience?
> 
>
  
Yongseok Koh Nov. 8, 2018, 11:01 p.m. UTC | #6
> On Nov 8, 2018, at 9:21 AM, Ferruh Yigit <ferruh.yigit@intel.com> wrote:
> 
> On 11/8/2018 3:59 PM, Thomas Monjalon wrote:
>> Hi,
>> 
>> We need to gather more information about this bug.
>> More below.
>> 
>> 07/11/2018 10:04, Wiles, Keith:
>>>> On Nov 6, 2018, at 9:30 PM, Yongseok Koh <yskoh@mellanox.com> wrote:
>>>>> On Nov 5, 2018, at 6:06 AM, Wiles, Keith <keith.wiles@intel.com> wrote:
>>>>>> On Nov 2, 2018, at 9:04 PM, Yongseok Koh <yskoh@mellanox.com> wrote:
>>>>>> 
>>>>>> This is a workaround to prevent a crash, which might be caused by
>>>>>> optimization of newer gcc (7.3.0) on Intel Skylake.
>>>>> 
>>>>> Should the code below not also test for the gcc version and
>>>>> the Sky Lake processor, maybe I am wrong but it seems it is
>>>>> turning AVX512 for all GCC builds
>>>> 
>>>> I didn't want to check gcc version as 7.3.0 is very new. Only gcc 8 is newly up since then (gcc 8.2).
>>>> Also, I wasn't able to test every gcc versions and I wanted to be a bit conservative for this crash.
>>>> Performance drop (if any) by disabling a new (experimental) feature would be less risky than unaccountable crash.
>>>> And, it does disable the feature only if CONFIG_RTE_ENABLE_AVX512=n. Please refer to v3.
>>> 
>>> Are you not turning off all of the GCC versions for AVX512.
>>> And you can test for range or greater then GCC version and
>>> it just seems like we are turning off every gcc version, is that true?
>> 
>> Do we know exactly which GCC versions are affected?
>> 
>>>>> Also bug 97 seems a bit obscure reference, maybe you know
>>>>> the bug report, but more details would be good?
>>>> 
>>>> I sent out the report to dev list two month ago.
>>>> And I created the Bug 97 in order to reference it
>>>> in the commit message.
>>>> I didn't want to repeat same message here and there,
>>>> but it would've been better to have some sort of summary
>>>> of the Bug, although v3 has a few more words.
>>>> However, v3 has been merged.
>>> 
>>> Still this is too obscure if nothing else give a link to
>>> a specific bug not just 97.
>> 
>> The URL is
>> 	https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fbugs.dpdk.org%2Fshow_bug.cgi%3Fid%3D97&amp;data=02%7C01%7Cyskoh%40mellanox.com%7C90ff6c361faf422b976108d6459eb490%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636772945282345908&amp;sdata=2o%2Fg203aWrKCYg16S6oI4BcS41igpLu1DloS%2FrRnknc%3D&amp;reserved=0
>> The bug is also pointing to an email:
>> 	https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmails.dpdk.org%2Farchives%2Fdev%2F2018-September%2F111522.html&amp;data=02%7C01%7Cyskoh%40mellanox.com%7C90ff6c361faf422b976108d6459eb490%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636772945282345908&amp;sdata=NCFKxaREd69iZ8eyFKg%2FWBP73CLTXkxrNQQeii%2Bbsao%3D&amp;reserved=0
>> 
>> Summary:
>> 	- CPU: Intel Skylake
>> 	- Linux environment: Ubuntu 18.04
>> 	- Compiler: gcc-7.3 (Ubuntu 7.3.0-16ubuntu3)
> 
> Is it possible to test a few other gcc versions to check if the issue is
> specific to this compiler version?

Nothing's impossible but even with my quick search in gcc.gnu.org,
I could find the following documents mention mavx512f support:

GCC 4.9.0
April 22, 2014 (changes, documentation)
 
GCC 5.1
April 22, 2015 (changes, documentation)
 
GCC 6.4
July 4, 2017 (changes, documentation)
 
GCC 7.1
May 2, 2017 (changes, documentation)
 
GCC 8.1
May 2, 2018 (changes, documentation)

We altogether have to put quite large resource to verify all of the versions.
 
I assumed older than gcc 7 would have the same issue. I know it was a speculation
but like I mentioned I wanted to be more conservative. I didn't mean this is a permanent fix.
For two months, we couldn't have any tangible solution (actually nobody cared including myself),
so I submitted the patch to temporarily disable mavx512f.

I'm still not sure what the best option is...

Thanks,
Yongseok

> 
>> 	- Scenario: testpmd crashes when it starts forwarding
>> 	- Behaviour: AVX2 version of rte_memcpy() optimized with 512b instructions
>> 	- Fix: disable AVX512 optimization with -mno-avx512f
>> 
>> It seems to have been reproduced only when using mlx5 PMD so far.
>> Any other experience?
  
Christian Ehrhardt Nov. 9, 2018, 6:27 a.m. UTC | #7
On Fri, Nov 9, 2018 at 12:01 AM Yongseok Koh <yskoh@mellanox.com> wrote:
>
>
> > On Nov 8, 2018, at 9:21 AM, Ferruh Yigit <ferruh.yigit@intel.com> wrote:
> >
> > On 11/8/2018 3:59 PM, Thomas Monjalon wrote:
> >> Hi,
> >>
> >> We need to gather more information about this bug.
> >> More below.
> >>

Thanks Thomas for looping us in!

> >> 07/11/2018 10:04, Wiles, Keith:
> >>>> On Nov 6, 2018, at 9:30 PM, Yongseok Koh <yskoh@mellanox.com> wrote:
> >>>>> On Nov 5, 2018, at 6:06 AM, Wiles, Keith <keith.wiles@intel.com> wrote:
> >>>>>> On Nov 2, 2018, at 9:04 PM, Yongseok Koh <yskoh@mellanox.com> wrote:
> >>>>>>
> >>>>>> This is a workaround to prevent a crash, which might be caused by
> >>>>>> optimization of newer gcc (7.3.0) on Intel Skylake.
> >>>>>
> >>>>> Should the code below not also test for the gcc version and
> >>>>> the Sky Lake processor, maybe I am wrong but it seems it is
> >>>>> turning AVX512 for all GCC builds
> >>>>
> >>>> I didn't want to check gcc version as 7.3.0 is very new. Only gcc 8 is newly up since then (gcc 8.2).
> >>>> Also, I wasn't able to test every gcc versions and I wanted to be a bit conservative for this crash.
> >>>> Performance drop (if any) by disabling a new (experimental) feature would be less risky than unaccountable crash.
> >>>> And, it does disable the feature only if CONFIG_RTE_ENABLE_AVX512=n. Please refer to v3.
> >>>
> >>> Are you not turning off all of the GCC versions for AVX512.
> >>> And you can test for range or greater then GCC version and
> >>> it just seems like we are turning off every gcc version, is that true?
> >>
> >> Do we know exactly which GCC versions are affected?
> >>
> >>>>> Also bug 97 seems a bit obscure reference, maybe you know
> >>>>> the bug report, but more details would be good?
> >>>>
> >>>> I sent out the report to dev list two month ago.
> >>>> And I created the Bug 97 in order to reference it
> >>>> in the commit message.
> >>>> I didn't want to repeat same message here and there,
> >>>> but it would've been better to have some sort of summary
> >>>> of the Bug, although v3 has a few more words.
> >>>> However, v3 has been merged.
> >>>
> >>> Still this is too obscure if nothing else give a link to
> >>> a specific bug not just 97.
> >>
> >> The URL is
> >>      https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fbugs.dpdk.org%2Fshow_bug.cgi%3Fid%3D97&amp;data=02%7C01%7Cyskoh%40mellanox.com%7C90ff6c361faf422b976108d6459eb490%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636772945282345908&amp;sdata=2o%2Fg203aWrKCYg16S6oI4BcS41igpLu1DloS%2FrRnknc%3D&amp;reserved=0
> >> The bug is also pointing to an email:
> >>      https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmails.dpdk.org%2Farchives%2Fdev%2F2018-September%2F111522.html&amp;data=02%7C01%7Cyskoh%40mellanox.com%7C90ff6c361faf422b976108d6459eb490%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636772945282345908&amp;sdata=NCFKxaREd69iZ8eyFKg%2FWBP73CLTXkxrNQQeii%2Bbsao%3D&amp;reserved=0
> >>
> >> Summary:
> >>      - CPU: Intel Skylake
> >>      - Linux environment: Ubuntu 18.04
> >>      - Compiler: gcc-7.3 (Ubuntu 7.3.0-16ubuntu3)
> >
> > Is it possible to test a few other gcc versions to check if the issue is
> > specific to this compiler version?
>
> Nothing's impossible but even with my quick search in gcc.gnu.org,
> I could find the following documents mention mavx512f support:
>
> GCC 4.9.0
> April 22, 2014 (changes, documentation)
>
> GCC 5.1
> April 22, 2015 (changes, documentation)
>
> GCC 6.4
> July 4, 2017 (changes, documentation)
>
> GCC 7.1
> May 2, 2017 (changes, documentation)
>
> GCC 8.1
> May 2, 2018 (changes, documentation)
>
> We altogether have to put quite large resource to verify all of the versions.
>
> I assumed older than gcc 7 would have the same issue. I know it was a speculation
> but like I mentioned I wanted to be more conservative. I didn't mean this is a permanent fix.
> For two months, we couldn't have any tangible solution (actually nobody cared including myself),
> so I submitted the patch to temporarily disable mavx512f.
>
> I'm still not sure what the best option is...
>

What I wonder in all of this as I don't understand that part of it yet is this.
I assume you are building on Ubuntu as that is your gcc reference.
FYI: as people asked for bug references, there also is [1] which seems
pretty much the same issue.

It builds with mostly defaults, that means per
mk/machine/default/rte.vars.mk and similar it sets -march=corei7

But when I look at what that implies all avx512 is disabled
$ gcc -Q --help=target -m64 -march=corei7 | grep avx512f
 -mavx512f                             [disabled]

So I wonder what/why -mno-avx512f should help at all.
I used the full list of gcc args we have for the build (e.g. [2] of a
18.05 build), but that doesn't change that (mostly -W, -I and -D).
So I wonder, did people do a custom build and bump up march or enable
-mavx512f on their own to hit that?
Or are we facing a real gcc issue where " -mavx512f [disabled]" is not
the same as -mno-avx512f ?
Maybe someone who hit the bug could clarify that please?

BTW: per reports I've seen it also seems to apply to the latest
compiler update of the same series - at least it was said to be fully
updated, that would be 7.3.0-27ubuntu1~18.04
But this is 2nd grade information as I don't have a system with the
right combo MLX5+Skylake available atm, so I can't confirm for sure
:-/

[1]: https://bugs.launchpad.net/ubuntu/+source/dpdk/+bug/1799397
[2]: https://launchpadlibrarian.net/373589345/buildlog_ubuntu-bionic-amd64.dpdk_18.05-1~ubuntu0.18.04.1_BUILDING.txt.gz
  
Ferruh Yigit Nov. 9, 2018, 9:49 a.m. UTC | #8
On 11/9/2018 6:27 AM, Christian Ehrhardt wrote:
> On Fri, Nov 9, 2018 at 12:01 AM Yongseok Koh <yskoh@mellanox.com> wrote:
>>
>>
>>> On Nov 8, 2018, at 9:21 AM, Ferruh Yigit <ferruh.yigit@intel.com> wrote:
>>>
>>> On 11/8/2018 3:59 PM, Thomas Monjalon wrote:
>>>> Hi,
>>>>
>>>> We need to gather more information about this bug.
>>>> More below.
>>>>
> 
> Thanks Thomas for looping us in!
> 
>>>> 07/11/2018 10:04, Wiles, Keith:
>>>>>> On Nov 6, 2018, at 9:30 PM, Yongseok Koh <yskoh@mellanox.com> wrote:
>>>>>>> On Nov 5, 2018, at 6:06 AM, Wiles, Keith <keith.wiles@intel.com> wrote:
>>>>>>>> On Nov 2, 2018, at 9:04 PM, Yongseok Koh <yskoh@mellanox.com> wrote:
>>>>>>>>
>>>>>>>> This is a workaround to prevent a crash, which might be caused by
>>>>>>>> optimization of newer gcc (7.3.0) on Intel Skylake.
>>>>>>>
>>>>>>> Should the code below not also test for the gcc version and
>>>>>>> the Sky Lake processor, maybe I am wrong but it seems it is
>>>>>>> turning AVX512 for all GCC builds
>>>>>>
>>>>>> I didn't want to check gcc version as 7.3.0 is very new. Only gcc 8 is newly up since then (gcc 8.2).
>>>>>> Also, I wasn't able to test every gcc versions and I wanted to be a bit conservative for this crash.
>>>>>> Performance drop (if any) by disabling a new (experimental) feature would be less risky than unaccountable crash.
>>>>>> And, it does disable the feature only if CONFIG_RTE_ENABLE_AVX512=n. Please refer to v3.
>>>>>
>>>>> Are you not turning off all of the GCC versions for AVX512.
>>>>> And you can test for range or greater then GCC version and
>>>>> it just seems like we are turning off every gcc version, is that true?
>>>>
>>>> Do we know exactly which GCC versions are affected?
>>>>
>>>>>>> Also bug 97 seems a bit obscure reference, maybe you know
>>>>>>> the bug report, but more details would be good?
>>>>>>
>>>>>> I sent out the report to dev list two month ago.
>>>>>> And I created the Bug 97 in order to reference it
>>>>>> in the commit message.
>>>>>> I didn't want to repeat same message here and there,
>>>>>> but it would've been better to have some sort of summary
>>>>>> of the Bug, although v3 has a few more words.
>>>>>> However, v3 has been merged.
>>>>>
>>>>> Still this is too obscure if nothing else give a link to
>>>>> a specific bug not just 97.
>>>>
>>>> The URL is
>>>>      https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fbugs.dpdk.org%2Fshow_bug.cgi%3Fid%3D97&amp;data=02%7C01%7Cyskoh%40mellanox.com%7C90ff6c361faf422b976108d6459eb490%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636772945282345908&amp;sdata=2o%2Fg203aWrKCYg16S6oI4BcS41igpLu1DloS%2FrRnknc%3D&amp;reserved=0
>>>> The bug is also pointing to an email:
>>>>      https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmails.dpdk.org%2Farchives%2Fdev%2F2018-September%2F111522.html&amp;data=02%7C01%7Cyskoh%40mellanox.com%7C90ff6c361faf422b976108d6459eb490%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636772945282345908&amp;sdata=NCFKxaREd69iZ8eyFKg%2FWBP73CLTXkxrNQQeii%2Bbsao%3D&amp;reserved=0
>>>>
>>>> Summary:
>>>>      - CPU: Intel Skylake
>>>>      - Linux environment: Ubuntu 18.04
>>>>      - Compiler: gcc-7.3 (Ubuntu 7.3.0-16ubuntu3)
>>>
>>> Is it possible to test a few other gcc versions to check if the issue is
>>> specific to this compiler version?
>>
>> Nothing's impossible but even with my quick search in gcc.gnu.org,
>> I could find the following documents mention mavx512f support:
>>
>> GCC 4.9.0
>> April 22, 2014 (changes, documentation)
>>
>> GCC 5.1
>> April 22, 2015 (changes, documentation)
>>
>> GCC 6.4
>> July 4, 2017 (changes, documentation)
>>
>> GCC 7.1
>> May 2, 2017 (changes, documentation)
>>
>> GCC 8.1
>> May 2, 2018 (changes, documentation)
>>
>> We altogether have to put quite large resource to verify all of the versions.
>>
>> I assumed older than gcc 7 would have the same issue. I know it was a speculation
>> but like I mentioned I wanted to be more conservative. I didn't mean this is a permanent fix.
>> For two months, we couldn't have any tangible solution (actually nobody cared including myself),
>> so I submitted the patch to temporarily disable mavx512f.
>>
>> I'm still not sure what the best option is...
>>
> 
> What I wonder in all of this as I don't understand that part of it yet is this.
> I assume you are building on Ubuntu as that is your gcc reference.
> FYI: as people asked for bug references, there also is [1] which seems
> pretty much the same issue.
> 
> It builds with mostly defaults, that means per
> mk/machine/default/rte.vars.mk and similar it sets -march=corei7
> 
> But when I look at what that implies all avx512 is disabled
> $ gcc -Q --help=target -m64 -march=corei7 | grep avx512f
>  -mavx512f                             [disabled]

This is output is the "corei7" architecture. Defect is for "native" build on
Skylake.

For "native" arch:
gcc -Q --help=target -m64 -march=native | grep avx512f
  -mavx512f                             [enabled]


Also I can confirm from disassembly output that avx512 instructions are used
without '-mno-avx512f' provided.

> 
> So I wonder what/why -mno-avx512f should help at all.
> I used the full list of gcc args we have for the build (e.g. [2] of a
> 18.05 build), but that doesn't change that (mostly -W, -I and -D).
> So I wonder, did people do a custom build and bump up march or enable
> -mavx512f on their own to hit that?
> Or are we facing a real gcc issue where " -mavx512f [disabled]" is not
> the same as -mno-avx512f ?
> Maybe someone who hit the bug could clarify that please?
> 
> BTW: per reports I've seen it also seems to apply to the latest
> compiler update of the same series - at least it was said to be fully
> updated, that would be 7.3.0-27ubuntu1~18.04
> But this is 2nd grade information as I don't have a system with the
> right combo MLX5+Skylake available atm, so I can't confirm for sure
> :-/
> 
> [1]: https://bugs.launchpad.net/ubuntu/+source/dpdk/+bug/1799397
> [2]: https://launchpadlibrarian.net/373589345/buildlog_ubuntu-bionic-amd64.dpdk_18.05-1~ubuntu0.18.04.1_BUILDING.txt.gz
>
  
Ferruh Yigit Nov. 9, 2018, 10:03 a.m. UTC | #9
On 11/8/2018 11:01 PM, Yongseok Koh wrote:
> 
>> On Nov 8, 2018, at 9:21 AM, Ferruh Yigit <ferruh.yigit@intel.com> wrote:
>>
>> On 11/8/2018 3:59 PM, Thomas Monjalon wrote:
>>> Hi,
>>>
>>> We need to gather more information about this bug.
>>> More below.
>>>
>>> 07/11/2018 10:04, Wiles, Keith:
>>>>> On Nov 6, 2018, at 9:30 PM, Yongseok Koh <yskoh@mellanox.com> wrote:
>>>>>> On Nov 5, 2018, at 6:06 AM, Wiles, Keith <keith.wiles@intel.com> wrote:
>>>>>>> On Nov 2, 2018, at 9:04 PM, Yongseok Koh <yskoh@mellanox.com> wrote:
>>>>>>>
>>>>>>> This is a workaround to prevent a crash, which might be caused by
>>>>>>> optimization of newer gcc (7.3.0) on Intel Skylake.
>>>>>>
>>>>>> Should the code below not also test for the gcc version and
>>>>>> the Sky Lake processor, maybe I am wrong but it seems it is
>>>>>> turning AVX512 for all GCC builds
>>>>>
>>>>> I didn't want to check gcc version as 7.3.0 is very new. Only gcc 8 is newly up since then (gcc 8.2).
>>>>> Also, I wasn't able to test every gcc versions and I wanted to be a bit conservative for this crash.
>>>>> Performance drop (if any) by disabling a new (experimental) feature would be less risky than unaccountable crash.
>>>>> And, it does disable the feature only if CONFIG_RTE_ENABLE_AVX512=n. Please refer to v3.
>>>>
>>>> Are you not turning off all of the GCC versions for AVX512.
>>>> And you can test for range or greater then GCC version and
>>>> it just seems like we are turning off every gcc version, is that true?
>>>
>>> Do we know exactly which GCC versions are affected?
>>>
>>>>>> Also bug 97 seems a bit obscure reference, maybe you know
>>>>>> the bug report, but more details would be good?
>>>>>
>>>>> I sent out the report to dev list two month ago.
>>>>> And I created the Bug 97 in order to reference it
>>>>> in the commit message.
>>>>> I didn't want to repeat same message here and there,
>>>>> but it would've been better to have some sort of summary
>>>>> of the Bug, although v3 has a few more words.
>>>>> However, v3 has been merged.
>>>>
>>>> Still this is too obscure if nothing else give a link to
>>>> a specific bug not just 97.
>>>
>>> The URL is
>>> 	https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fbugs.dpdk.org%2Fshow_bug.cgi%3Fid%3D97&amp;data=02%7C01%7Cyskoh%40mellanox.com%7C90ff6c361faf422b976108d6459eb490%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636772945282345908&amp;sdata=2o%2Fg203aWrKCYg16S6oI4BcS41igpLu1DloS%2FrRnknc%3D&amp;reserved=0
>>> The bug is also pointing to an email:
>>> 	https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmails.dpdk.org%2Farchives%2Fdev%2F2018-September%2F111522.html&amp;data=02%7C01%7Cyskoh%40mellanox.com%7C90ff6c361faf422b976108d6459eb490%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636772945282345908&amp;sdata=NCFKxaREd69iZ8eyFKg%2FWBP73CLTXkxrNQQeii%2Bbsao%3D&amp;reserved=0
>>>
>>> Summary:
>>> 	- CPU: Intel Skylake
>>> 	- Linux environment: Ubuntu 18.04
>>> 	- Compiler: gcc-7.3 (Ubuntu 7.3.0-16ubuntu3)
>>
>> Is it possible to test a few other gcc versions to check if the issue is
>> specific to this compiler version?
> 
> Nothing's impossible but even with my quick search in gcc.gnu.org,
> I could find the following documents mention mavx512f support:
> 
> GCC 4.9.0
> April 22, 2014 (changes, documentation)
>  
> GCC 5.1
> April 22, 2015 (changes, documentation)
>  
> GCC 6.4
> July 4, 2017 (changes, documentation)
>  
> GCC 7.1
> May 2, 2017 (changes, documentation)
>  
> GCC 8.1
> May 2, 2018 (changes, documentation)
> 
> We altogether have to put quite large resource to verify all of the versions.
>  
> I assumed older than gcc 7 would have the same issue. I know it was a speculation
> but like I mentioned I wanted to be more conservative. I didn't mean this is a permanent fix.
> For two months, we couldn't have any tangible solution (actually nobody cared including myself),
> so I submitted the patch to temporarily disable mavx512f.
> 
> I'm still not sure what the best option is...

For permanent fix we need more information, currently we can't re-produce this
defect. Since you can reproduce it we need your support.

Right now we don't know if this is compiler issue or code defect in rte_memcpy()
or something else.

It is easy to disable mavx512f as temporarily solution but it is coming with the
cost of the performance drop, also without knowing the actual root cause I
wouldn't say this is being conservative, actual issue may be just hidden with
this change.

I think as first thing we need to find a way to reproduce this issue in any
other way than using mlx5 PMD. So that we can put more organized effort to fix this.
I attached a simple unit test for rte_memcpy(), if this is a rte_memcpy() with
avx512f defect as claimed, you should be able to see the issue with that, right?
Did you able to find a chance to test it? Do you observer any crash there?

> 
> Thanks,
> Yongseok
> 
>>
>>> 	- Scenario: testpmd crashes when it starts forwarding
>>> 	- Behaviour: AVX2 version of rte_memcpy() optimized with 512b instructions
>>> 	- Fix: disable AVX512 optimization with -mno-avx512f
>>>
>>> It seems to have been reproduced only when using mlx5 PMD so far.
>>> Any other experience?
>
  
Thomas Monjalon Nov. 9, 2018, 11:35 a.m. UTC | #10
09/11/2018 10:49, Ferruh Yigit:
> On 11/9/2018 6:27 AM, Christian Ehrhardt wrote:
> > On Fri, Nov 9, 2018 at 12:01 AM Yongseok Koh <yskoh@mellanox.com> wrote:
> >>
> >>
> >>> On Nov 8, 2018, at 9:21 AM, Ferruh Yigit <ferruh.yigit@intel.com> wrote:
> >>>
> >>> On 11/8/2018 3:59 PM, Thomas Monjalon wrote:
> >>>> Hi,
> >>>>
> >>>> We need to gather more information about this bug.
> >>>> More below.
> >>>>
> > 
> > Thanks Thomas for looping us in!
> > 
> >>>> 07/11/2018 10:04, Wiles, Keith:
> >>>>>> On Nov 6, 2018, at 9:30 PM, Yongseok Koh <yskoh@mellanox.com> wrote:
> >>>>>>> On Nov 5, 2018, at 6:06 AM, Wiles, Keith <keith.wiles@intel.com> wrote:
> >>>>>>>> On Nov 2, 2018, at 9:04 PM, Yongseok Koh <yskoh@mellanox.com> wrote:
> >>>>>>>>
> >>>>>>>> This is a workaround to prevent a crash, which might be caused by
> >>>>>>>> optimization of newer gcc (7.3.0) on Intel Skylake.
> >>>>>>>
> >>>>>>> Should the code below not also test for the gcc version and
> >>>>>>> the Sky Lake processor, maybe I am wrong but it seems it is
> >>>>>>> turning AVX512 for all GCC builds
> >>>>>>
> >>>>>> I didn't want to check gcc version as 7.3.0 is very new. Only gcc 8 is newly up since then (gcc 8.2).
> >>>>>> Also, I wasn't able to test every gcc versions and I wanted to be a bit conservative for this crash.
> >>>>>> Performance drop (if any) by disabling a new (experimental) feature would be less risky than unaccountable crash.
> >>>>>> And, it does disable the feature only if CONFIG_RTE_ENABLE_AVX512=n. Please refer to v3.
> >>>>>
> >>>>> Are you not turning off all of the GCC versions for AVX512.
> >>>>> And you can test for range or greater then GCC version and
> >>>>> it just seems like we are turning off every gcc version, is that true?
> >>>>
> >>>> Do we know exactly which GCC versions are affected?
> >>>>
> >>>>>>> Also bug 97 seems a bit obscure reference, maybe you know
> >>>>>>> the bug report, but more details would be good?
> >>>>>>
> >>>>>> I sent out the report to dev list two month ago.
> >>>>>> And I created the Bug 97 in order to reference it
> >>>>>> in the commit message.
> >>>>>> I didn't want to repeat same message here and there,
> >>>>>> but it would've been better to have some sort of summary
> >>>>>> of the Bug, although v3 has a few more words.
> >>>>>> However, v3 has been merged.
> >>>>>
> >>>>> Still this is too obscure if nothing else give a link to
> >>>>> a specific bug not just 97.
> >>>>
> >>>> The URL is
> >>>>      https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fbugs.dpdk.org%2Fshow_bug.cgi%3Fid%3D97&amp;data=02%7C01%7Cyskoh%40mellanox.com%7C90ff6c361faf422b976108d6459eb490%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636772945282345908&amp;sdata=2o%2Fg203aWrKCYg16S6oI4BcS41igpLu1DloS%2FrRnknc%3D&amp;reserved=0
> >>>> The bug is also pointing to an email:
> >>>>      https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmails.dpdk.org%2Farchives%2Fdev%2F2018-September%2F111522.html&amp;data=02%7C01%7Cyskoh%40mellanox.com%7C90ff6c361faf422b976108d6459eb490%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636772945282345908&amp;sdata=NCFKxaREd69iZ8eyFKg%2FWBP73CLTXkxrNQQeii%2Bbsao%3D&amp;reserved=0
> >>>>
> >>>> Summary:
> >>>>      - CPU: Intel Skylake
> >>>>      - Linux environment: Ubuntu 18.04
> >>>>      - Compiler: gcc-7.3 (Ubuntu 7.3.0-16ubuntu3)
> >>>
> >>> Is it possible to test a few other gcc versions to check if the issue is
> >>> specific to this compiler version?
> >>
> >> Nothing's impossible but even with my quick search in gcc.gnu.org,
> >> I could find the following documents mention mavx512f support:
> >>
> >> GCC 4.9.0
> >> April 22, 2014 (changes, documentation)
> >>
> >> GCC 5.1
> >> April 22, 2015 (changes, documentation)
> >>
> >> GCC 6.4
> >> July 4, 2017 (changes, documentation)
> >>
> >> GCC 7.1
> >> May 2, 2017 (changes, documentation)
> >>
> >> GCC 8.1
> >> May 2, 2018 (changes, documentation)
> >>
> >> We altogether have to put quite large resource to verify all of the versions.
> >>
> >> I assumed older than gcc 7 would have the same issue. I know it was a speculation
> >> but like I mentioned I wanted to be more conservative. I didn't mean this is a permanent fix.
> >> For two months, we couldn't have any tangible solution (actually nobody cared including myself),
> >> so I submitted the patch to temporarily disable mavx512f.
> >>
> >> I'm still not sure what the best option is...
> > 
> > What I wonder in all of this as I don't understand that part of it yet is this.
> > I assume you are building on Ubuntu as that is your gcc reference.
> > FYI: as people asked for bug references, there also is [1] which seems
> > pretty much the same issue.
> > 
> > It builds with mostly defaults, that means per
> > mk/machine/default/rte.vars.mk and similar it sets -march=corei7
> > 
> > But when I look at what that implies all avx512 is disabled
> > $ gcc -Q --help=target -m64 -march=corei7 | grep avx512f
> >  -mavx512f                             [disabled]
> 
> This is output is the "corei7" architecture. Defect is for "native" build on
> Skylake.
> 
> For "native" arch:
> gcc -Q --help=target -m64 -march=native | grep avx512f
>   -mavx512f                             [enabled]

I confirm.


> Also I can confirm from disassembly output that avx512 instructions are used
> without '-mno-avx512f' provided.
> 
> > So I wonder what/why -mno-avx512f should help at all.
> > I used the full list of gcc args we have for the build (e.g. [2] of a
> > 18.05 build), but that doesn't change that (mostly -W, -I and -D).
> > So I wonder, did people do a custom build and bump up march or enable
> > -mavx512f on their own to hit that?
> > Or are we facing a real gcc issue where " -mavx512f [disabled]" is not
> > the same as -mno-avx512f ?
> > Maybe someone who hit the bug could clarify that please?
> > 
> > BTW: per reports I've seen it also seems to apply to the latest
> > compiler update of the same series - at least it was said to be fully
> > updated, that would be 7.3.0-27ubuntu1~18.04
> > But this is 2nd grade information as I don't have a system with the
> > right combo MLX5+Skylake available atm, so I can't confirm for sure
> > :-/

I tried latest gcc 7.3.0-27ubuntu1~18.04, and the bug is still there.
  
Thomas Monjalon Nov. 9, 2018, 1:17 p.m. UTC | #11
09/11/2018 11:03, Ferruh Yigit:
> On 11/8/2018 11:01 PM, Yongseok Koh wrote:
> > 
> >> On Nov 8, 2018, at 9:21 AM, Ferruh Yigit <ferruh.yigit@intel.com> wrote:
> >>
> >> On 11/8/2018 3:59 PM, Thomas Monjalon wrote:
> >>> Hi,
> >>>
> >>> We need to gather more information about this bug.
> >>> More below.
> >>>
> >>> 07/11/2018 10:04, Wiles, Keith:
> >>>>> On Nov 6, 2018, at 9:30 PM, Yongseok Koh <yskoh@mellanox.com> wrote:
> >>>>>> On Nov 5, 2018, at 6:06 AM, Wiles, Keith <keith.wiles@intel.com> wrote:
> >>>>>>> On Nov 2, 2018, at 9:04 PM, Yongseok Koh <yskoh@mellanox.com> wrote:
> >>>>>>>
> >>>>>>> This is a workaround to prevent a crash, which might be caused by
> >>>>>>> optimization of newer gcc (7.3.0) on Intel Skylake.
> >>>>>>
> >>>>>> Should the code below not also test for the gcc version and
> >>>>>> the Sky Lake processor, maybe I am wrong but it seems it is
> >>>>>> turning AVX512 for all GCC builds
> >>>>>
> >>>>> I didn't want to check gcc version as 7.3.0 is very new. Only gcc 8 is newly up since then (gcc 8.2).
> >>>>> Also, I wasn't able to test every gcc versions and I wanted to be a bit conservative for this crash.
> >>>>> Performance drop (if any) by disabling a new (experimental) feature would be less risky than unaccountable crash.
> >>>>> And, it does disable the feature only if CONFIG_RTE_ENABLE_AVX512=n. Please refer to v3.
> >>>>
> >>>> Are you not turning off all of the GCC versions for AVX512.
> >>>> And you can test for range or greater then GCC version and
> >>>> it just seems like we are turning off every gcc version, is that true?
> >>>
> >>> Do we know exactly which GCC versions are affected?
> >>>
> >>>>>> Also bug 97 seems a bit obscure reference, maybe you know
> >>>>>> the bug report, but more details would be good?
> >>>>>
> >>>>> I sent out the report to dev list two month ago.
> >>>>> And I created the Bug 97 in order to reference it
> >>>>> in the commit message.
> >>>>> I didn't want to repeat same message here and there,
> >>>>> but it would've been better to have some sort of summary
> >>>>> of the Bug, although v3 has a few more words.
> >>>>> However, v3 has been merged.
> >>>>
> >>>> Still this is too obscure if nothing else give a link to
> >>>> a specific bug not just 97.
> >>>
> >>> The URL is
> >>> 	https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fbugs.dpdk.org%2Fshow_bug.cgi%3Fid%3D97&amp;data=02%7C01%7Cyskoh%40mellanox.com%7C90ff6c361faf422b976108d6459eb490%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636772945282345908&amp;sdata=2o%2Fg203aWrKCYg16S6oI4BcS41igpLu1DloS%2FrRnknc%3D&amp;reserved=0
> >>> The bug is also pointing to an email:
> >>> 	https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmails.dpdk.org%2Farchives%2Fdev%2F2018-September%2F111522.html&amp;data=02%7C01%7Cyskoh%40mellanox.com%7C90ff6c361faf422b976108d6459eb490%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636772945282345908&amp;sdata=NCFKxaREd69iZ8eyFKg%2FWBP73CLTXkxrNQQeii%2Bbsao%3D&amp;reserved=0
> >>>
> >>> Summary:
> >>> 	- CPU: Intel Skylake
> >>> 	- Linux environment: Ubuntu 18.04
> >>> 	- Compiler: gcc-7.3 (Ubuntu 7.3.0-16ubuntu3)
> >>
> >> Is it possible to test a few other gcc versions to check if the issue is
> >> specific to this compiler version?
> > 
> > Nothing's impossible but even with my quick search in gcc.gnu.org,
> > I could find the following documents mention mavx512f support:
> > 
> > GCC 4.9.0
> > April 22, 2014 (changes, documentation)
> >  
> > GCC 5.1
> > April 22, 2015 (changes, documentation)
> >  
> > GCC 6.4
> > July 4, 2017 (changes, documentation)
> >  
> > GCC 7.1
> > May 2, 2017 (changes, documentation)
> >  
> > GCC 8.1
> > May 2, 2018 (changes, documentation)
> > 
> > We altogether have to put quite large resource to verify all of the versions.
> >  
> > I assumed older than gcc 7 would have the same issue. I know it was a speculation
> > but like I mentioned I wanted to be more conservative. I didn't mean this is a permanent fix.
> > For two months, we couldn't have any tangible solution (actually nobody cared including myself),
> > so I submitted the patch to temporarily disable mavx512f.
> > 
> > I'm still not sure what the best option is...
> 
> For permanent fix we need more information, currently we can't re-produce this
> defect. Since you can reproduce it we need your support.
> 
> Right now we don't know if this is compiler issue or code defect in rte_memcpy()
> or something else.
> 
> It is easy to disable mavx512f as temporarily solution but it is coming with the
> cost of the performance drop, also without knowing the actual root cause I
> wouldn't say this is being conservative, actual issue may be just hidden with
> this change.
> 
> I think as first thing we need to find a way to reproduce this issue in any
> other way than using mlx5 PMD. So that we can put more organized effort to fix this.
> I attached a simple unit test for rte_memcpy(), if this is a rte_memcpy() with
> avx512f defect as claimed, you should be able to see the issue with that, right?
> Did you able to find a chance to test it? Do you observer any crash there?

I am able to connect to a machine where the issue is reproduced.
So I have tested replacing rte_memcpy with memcpy,
and the crash disappears when using memcpy.
So it confirms that the issue is in rte_memcpy.

About the unit test you attached in bugzilla:
	https://bugs.dpdk.org/attachment.cgi?id=15
It does not reproduce the bug:
	RTE>>rte_memcpy_autotest
	.................................................................
	Test OK
  
Thomas Monjalon Nov. 9, 2018, 2:27 p.m. UTC | #12
09/11/2018 14:17, Thomas Monjalon:
> 09/11/2018 11:03, Ferruh Yigit:
> > On 11/8/2018 11:01 PM, Yongseok Koh wrote:
> > > 
> > >> On Nov 8, 2018, at 9:21 AM, Ferruh Yigit <ferruh.yigit@intel.com> wrote:
> > >>
> > >> On 11/8/2018 3:59 PM, Thomas Monjalon wrote:
> > >>> Hi,
> > >>>
> > >>> We need to gather more information about this bug.
> > >>> More below.
> > >>>
> > >>> 07/11/2018 10:04, Wiles, Keith:
> > >>>>> On Nov 6, 2018, at 9:30 PM, Yongseok Koh <yskoh@mellanox.com> wrote:
> > >>>>>> On Nov 5, 2018, at 6:06 AM, Wiles, Keith <keith.wiles@intel.com> wrote:
> > >>>>>>> On Nov 2, 2018, at 9:04 PM, Yongseok Koh <yskoh@mellanox.com> wrote:
> > >>>>>>>
> > >>>>>>> This is a workaround to prevent a crash, which might be caused by
> > >>>>>>> optimization of newer gcc (7.3.0) on Intel Skylake.
> > >>>>>>
> > >>>>>> Should the code below not also test for the gcc version and
> > >>>>>> the Sky Lake processor, maybe I am wrong but it seems it is
> > >>>>>> turning AVX512 for all GCC builds
> > >>>>>
> > >>>>> I didn't want to check gcc version as 7.3.0 is very new. Only gcc 8 is newly up since then (gcc 8.2).
> > >>>>> Also, I wasn't able to test every gcc versions and I wanted to be a bit conservative for this crash.
> > >>>>> Performance drop (if any) by disabling a new (experimental) feature would be less risky than unaccountable crash.
> > >>>>> And, it does disable the feature only if CONFIG_RTE_ENABLE_AVX512=n. Please refer to v3.
> > >>>>
> > >>>> Are you not turning off all of the GCC versions for AVX512.
> > >>>> And you can test for range or greater then GCC version and
> > >>>> it just seems like we are turning off every gcc version, is that true?
> > >>>
> > >>> Do we know exactly which GCC versions are affected?
> > >>>
> > >>>>>> Also bug 97 seems a bit obscure reference, maybe you know
> > >>>>>> the bug report, but more details would be good?
> > >>>>>
> > >>>>> I sent out the report to dev list two month ago.
> > >>>>> And I created the Bug 97 in order to reference it
> > >>>>> in the commit message.
> > >>>>> I didn't want to repeat same message here and there,
> > >>>>> but it would've been better to have some sort of summary
> > >>>>> of the Bug, although v3 has a few more words.
> > >>>>> However, v3 has been merged.
> > >>>>
> > >>>> Still this is too obscure if nothing else give a link to
> > >>>> a specific bug not just 97.
> > >>>
> > >>> The URL is
> > >>> 	https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fbugs.dpdk.org%2Fshow_bug.cgi%3Fid%3D97&amp;data=02%7C01%7Cyskoh%40mellanox.com%7C90ff6c361faf422b976108d6459eb490%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636772945282345908&amp;sdata=2o%2Fg203aWrKCYg16S6oI4BcS41igpLu1DloS%2FrRnknc%3D&amp;reserved=0
> > >>> The bug is also pointing to an email:
> > >>> 	https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmails.dpdk.org%2Farchives%2Fdev%2F2018-September%2F111522.html&amp;data=02%7C01%7Cyskoh%40mellanox.com%7C90ff6c361faf422b976108d6459eb490%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636772945282345908&amp;sdata=NCFKxaREd69iZ8eyFKg%2FWBP73CLTXkxrNQQeii%2Bbsao%3D&amp;reserved=0
> > >>>
> > >>> Summary:
> > >>> 	- CPU: Intel Skylake
> > >>> 	- Linux environment: Ubuntu 18.04
> > >>> 	- Compiler: gcc-7.3 (Ubuntu 7.3.0-16ubuntu3)
> > >>
> > >> Is it possible to test a few other gcc versions to check if the issue is
> > >> specific to this compiler version?
> > > 
> > > Nothing's impossible but even with my quick search in gcc.gnu.org,
> > > I could find the following documents mention mavx512f support:
> > > 
> > > GCC 4.9.0
> > > April 22, 2014 (changes, documentation)
> > >  
> > > GCC 5.1
> > > April 22, 2015 (changes, documentation)
> > >  
> > > GCC 6.4
> > > July 4, 2017 (changes, documentation)
> > >  
> > > GCC 7.1
> > > May 2, 2017 (changes, documentation)
> > >  
> > > GCC 8.1
> > > May 2, 2018 (changes, documentation)
> > > 
> > > We altogether have to put quite large resource to verify all of the versions.
> > >  
> > > I assumed older than gcc 7 would have the same issue. I know it was a speculation
> > > but like I mentioned I wanted to be more conservative. I didn't mean this is a permanent fix.
> > > For two months, we couldn't have any tangible solution (actually nobody cared including myself),
> > > so I submitted the patch to temporarily disable mavx512f.
> > > 
> > > I'm still not sure what the best option is...
> > 
> > For permanent fix we need more information, currently we can't re-produce this
> > defect. Since you can reproduce it we need your support.
> > 
> > Right now we don't know if this is compiler issue or code defect in rte_memcpy()
> > or something else.
> > 
> > It is easy to disable mavx512f as temporarily solution but it is coming with the
> > cost of the performance drop, also without knowing the actual root cause I
> > wouldn't say this is being conservative, actual issue may be just hidden with
> > this change.
> > 
> > I think as first thing we need to find a way to reproduce this issue in any
> > other way than using mlx5 PMD. So that we can put more organized effort to fix this.
> > I attached a simple unit test for rte_memcpy(), if this is a rte_memcpy() with
> > avx512f defect as claimed, you should be able to see the issue with that, right?
> > Did you able to find a chance to test it? Do you observer any crash there?
> 
> I am able to connect to a machine where the issue is reproduced.
> So I have tested replacing rte_memcpy with memcpy,
> and the crash disappears when using memcpy.
> So it confirms that the issue is in rte_memcpy.

One workaround is to disable CONFIG_RTE_ENABLE_AVX,
but it is disabling AVX and AVX2 for all DPDK code.

A more limited fix (tested) can be to disable AVX2 version of rte_memcpy
and rely on the AVX version (which is not crashing):

--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
-#elif defined RTE_MACHINE_CPUFLAG_AVX2
+#elif defined RTE_MACHINE_CPUFLAG_AVX2_disable
  
Stephen Hemminger Nov. 9, 2018, 6:46 p.m. UTC | #13
On Thu, 08 Nov 2018 16:59:22 +0100
Thomas Monjalon <thomas@monjalon.net> wrote:

> Hi,
> 
> We need to gather more information about this bug.
> More below.
> 
> 07/11/2018 10:04, Wiles, Keith:
> > > On Nov 6, 2018, at 9:30 PM, Yongseok Koh <yskoh@mellanox.com> wrote:  
> > >> On Nov 5, 2018, at 6:06 AM, Wiles, Keith <keith.wiles@intel.com> wrote:  
> > >>> On Nov 2, 2018, at 9:04 PM, Yongseok Koh <yskoh@mellanox.com> wrote:
> > >>> 
> > >>> This is a workaround to prevent a crash, which might be caused by
> > >>> optimization of newer gcc (7.3.0) on Intel Skylake.  
> > >> 
> > >> Should the code below not also test for the gcc version and
> > >> the Sky Lake processor, maybe I am wrong but it seems it is
> > >> turning AVX512 for all GCC builds  
> > > 
> > > I didn't want to check gcc version as 7.3.0 is very new. Only gcc 8 is newly up since then (gcc 8.2).
> > > Also, I wasn't able to test every gcc versions and I wanted to be a bit conservative for this crash.
> > > Performance drop (if any) by disabling a new (experimental) feature would be less risky than unaccountable crash.
> > > And, it does disable the feature only if CONFIG_RTE_ENABLE_AVX512=n. Please refer to v3.  
> > 
> > Are you not turning off all of the GCC versions for AVX512.
> > And you can test for range or greater then GCC version and
> > it just seems like we are turning off every gcc version, is that true?  
> 
> Do we know exactly which GCC versions are affected?
> 
> > >> Also bug 97 seems a bit obscure reference, maybe you know
> > >> the bug report, but more details would be good?  
> > > 
> > > I sent out the report to dev list two month ago.
> > > And I created the Bug 97 in order to reference it
> > > in the commit message.
> > > I didn't want to repeat same message here and there,
> > > but it would've been better to have some sort of summary
> > > of the Bug, although v3 has a few more words.
> > > However, v3 has been merged.  
> > 
> > Still this is too obscure if nothing else give a link to
> > a specific bug not just 97.  
> 
> The URL is
> 	https://bugs.dpdk.org/show_bug.cgi?id=97
> The bug is also pointing to an email:
> 	https://mails.dpdk.org/archives/dev/2018-September/111522.html
> 
> Summary:
> 	- CPU: Intel Skylake
> 	- Linux environment: Ubuntu 18.04
> 	- Compiler: gcc-7.3 (Ubuntu 7.3.0-16ubuntu3)
> 	- Scenario: testpmd crashes when it starts forwarding
> 	- Behaviour: AVX2 version of rte_memcpy() optimized with 512b instructions
> 	- Fix: disable AVX512 optimization with -mno-avx512f
> 
> It seems to have been reproduced only when using mlx5 PMD so far.
> Any other experience?
> 
> 

I did a little checking, there are only a few machine types on Azure that
are Skylake.   These are obviously the high end lateset ones...
 https://azure.microsoft.com/en-us/blog/fv2-vms-are-now-available-the-fastest-vms-on-azure/
  
Ferruh Yigit Nov. 9, 2018, 8:06 p.m. UTC | #14
On 11/9/2018 2:27 PM, Thomas Monjalon wrote:
> 09/11/2018 14:17, Thomas Monjalon:
>> 09/11/2018 11:03, Ferruh Yigit:
>>> On 11/8/2018 11:01 PM, Yongseok Koh wrote:
>>>>
>>>>> On Nov 8, 2018, at 9:21 AM, Ferruh Yigit <ferruh.yigit@intel.com> wrote:
>>>>>
>>>>> On 11/8/2018 3:59 PM, Thomas Monjalon wrote:
>>>>>> Hi,
>>>>>>
>>>>>> We need to gather more information about this bug.
>>>>>> More below.
>>>>>>
>>>>>> 07/11/2018 10:04, Wiles, Keith:
>>>>>>>> On Nov 6, 2018, at 9:30 PM, Yongseok Koh <yskoh@mellanox.com> wrote:
>>>>>>>>> On Nov 5, 2018, at 6:06 AM, Wiles, Keith <keith.wiles@intel.com> wrote:
>>>>>>>>>> On Nov 2, 2018, at 9:04 PM, Yongseok Koh <yskoh@mellanox.com> wrote:
>>>>>>>>>>
>>>>>>>>>> This is a workaround to prevent a crash, which might be caused by
>>>>>>>>>> optimization of newer gcc (7.3.0) on Intel Skylake.
>>>>>>>>>
>>>>>>>>> Should the code below not also test for the gcc version and
>>>>>>>>> the Sky Lake processor, maybe I am wrong but it seems it is
>>>>>>>>> turning AVX512 for all GCC builds
>>>>>>>>
>>>>>>>> I didn't want to check gcc version as 7.3.0 is very new. Only gcc 8 is newly up since then (gcc 8.2).
>>>>>>>> Also, I wasn't able to test every gcc versions and I wanted to be a bit conservative for this crash.
>>>>>>>> Performance drop (if any) by disabling a new (experimental) feature would be less risky than unaccountable crash.
>>>>>>>> And, it does disable the feature only if CONFIG_RTE_ENABLE_AVX512=n. Please refer to v3.
>>>>>>>
>>>>>>> Are you not turning off all of the GCC versions for AVX512.
>>>>>>> And you can test for range or greater then GCC version and
>>>>>>> it just seems like we are turning off every gcc version, is that true?
>>>>>>
>>>>>> Do we know exactly which GCC versions are affected?
>>>>>>
>>>>>>>>> Also bug 97 seems a bit obscure reference, maybe you know
>>>>>>>>> the bug report, but more details would be good?
>>>>>>>>
>>>>>>>> I sent out the report to dev list two month ago.
>>>>>>>> And I created the Bug 97 in order to reference it
>>>>>>>> in the commit message.
>>>>>>>> I didn't want to repeat same message here and there,
>>>>>>>> but it would've been better to have some sort of summary
>>>>>>>> of the Bug, although v3 has a few more words.
>>>>>>>> However, v3 has been merged.
>>>>>>>
>>>>>>> Still this is too obscure if nothing else give a link to
>>>>>>> a specific bug not just 97.
>>>>>>
>>>>>> The URL is
>>>>>> 	https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fbugs.dpdk.org%2Fshow_bug.cgi%3Fid%3D97&amp;data=02%7C01%7Cyskoh%40mellanox.com%7C90ff6c361faf422b976108d6459eb490%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636772945282345908&amp;sdata=2o%2Fg203aWrKCYg16S6oI4BcS41igpLu1DloS%2FrRnknc%3D&amp;reserved=0
>>>>>> The bug is also pointing to an email:
>>>>>> 	https://emea01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmails.dpdk.org%2Farchives%2Fdev%2F2018-September%2F111522.html&amp;data=02%7C01%7Cyskoh%40mellanox.com%7C90ff6c361faf422b976108d6459eb490%7Ca652971c7d2e4d9ba6a4d149256f461b%7C0%7C0%7C636772945282345908&amp;sdata=NCFKxaREd69iZ8eyFKg%2FWBP73CLTXkxrNQQeii%2Bbsao%3D&amp;reserved=0
>>>>>>
>>>>>> Summary:
>>>>>> 	- CPU: Intel Skylake
>>>>>> 	- Linux environment: Ubuntu 18.04
>>>>>> 	- Compiler: gcc-7.3 (Ubuntu 7.3.0-16ubuntu3)
>>>>>
>>>>> Is it possible to test a few other gcc versions to check if the issue is
>>>>> specific to this compiler version?
>>>>
>>>> Nothing's impossible but even with my quick search in gcc.gnu.org,
>>>> I could find the following documents mention mavx512f support:
>>>>
>>>> GCC 4.9.0
>>>> April 22, 2014 (changes, documentation)
>>>>  
>>>> GCC 5.1
>>>> April 22, 2015 (changes, documentation)
>>>>  
>>>> GCC 6.4
>>>> July 4, 2017 (changes, documentation)
>>>>  
>>>> GCC 7.1
>>>> May 2, 2017 (changes, documentation)
>>>>  
>>>> GCC 8.1
>>>> May 2, 2018 (changes, documentation)
>>>>
>>>> We altogether have to put quite large resource to verify all of the versions.
>>>>  
>>>> I assumed older than gcc 7 would have the same issue. I know it was a speculation
>>>> but like I mentioned I wanted to be more conservative. I didn't mean this is a permanent fix.
>>>> For two months, we couldn't have any tangible solution (actually nobody cared including myself),
>>>> so I submitted the patch to temporarily disable mavx512f.
>>>>
>>>> I'm still not sure what the best option is...
>>>
>>> For permanent fix we need more information, currently we can't re-produce this
>>> defect. Since you can reproduce it we need your support.
>>>
>>> Right now we don't know if this is compiler issue or code defect in rte_memcpy()
>>> or something else.
>>>
>>> It is easy to disable mavx512f as temporarily solution but it is coming with the
>>> cost of the performance drop, also without knowing the actual root cause I
>>> wouldn't say this is being conservative, actual issue may be just hidden with
>>> this change.
>>>
>>> I think as first thing we need to find a way to reproduce this issue in any
>>> other way than using mlx5 PMD. So that we can put more organized effort to fix this.
>>> I attached a simple unit test for rte_memcpy(), if this is a rte_memcpy() with
>>> avx512f defect as claimed, you should be able to see the issue with that, right?
>>> Did you able to find a chance to test it? Do you observer any crash there?
>>
>> I am able to connect to a machine where the issue is reproduced.
>> So I have tested replacing rte_memcpy with memcpy,
>> and the crash disappears when using memcpy.
>> So it confirms that the issue is in rte_memcpy.
> 
> One workaround is to disable CONFIG_RTE_ENABLE_AVX,
> but it is disabling AVX and AVX2 for all DPDK code.
> 
> A more limited fix (tested) can be to disable AVX2 version of rte_memcpy
> and rely on the AVX version (which is not crashing):
> 
> --- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> +++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
> -#elif defined RTE_MACHINE_CPUFLAG_AVX2
> +#elif defined RTE_MACHINE_CPUFLAG_AVX2_disable

I put a patch into bugzilla:
https://bugs.dpdk.org/attachment.cgi?id=18&action=diff

Can you please check if this workaround prevents the crash without performance drop.

Also there is another suggestion from Yongseok, that looks simpler, but not
covering CONFIG_RTE_ENABLE_AVX512=y case.
  
Thomas Monjalon Nov. 10, 2018, 2:13 a.m. UTC | #15
Below is my conclusion for this bug.
An expert of x86 is required to follow-up.

Summary:
	- CPU: Intel Skylake
	- Linux environment: Ubuntu 18.04
	- Compiler: GCC 7 or 8
	- Scenario: testpmd crashes when it starts forwarding
	- Behaviour: AVX2 version of rte_memcpy() fails if optimized for AVX512
	- Context: inline rte_memcpy() is called from
			inline rte_mempool_put_bulk(), called from
			mlx5_tx_complete() (inline or not)
	- Analysis: AVX512 optimization changes vmovdqu to vmovdqu8

Latest status can be found in Bugzilla:
	https://bugs.dpdk.org/show_bug.cgi?id=97#c35
  
Ananyev, Konstantin Nov. 11, 2018, 2:15 p.m. UTC | #16
Hi Thomas,

> 
> Below is my conclusion for this bug.
> An expert of x86 is required to follow-up.
> 
> Summary:
> 	- CPU: Intel Skylake
> 	- Linux environment: Ubuntu 18.04
> 	- Compiler: GCC 7 or 8
> 	- Scenario: testpmd crashes when it starts forwarding
> 	- Behaviour: AVX2 version of rte_memcpy() fails if optimized for AVX512
> 	- Context: inline rte_memcpy() is called from
> 			inline rte_mempool_put_bulk(), called from
> 			mlx5_tx_complete() (inline or not)
> 	- Analysis: AVX512 optimization changes vmovdqu to vmovdqu8
> 
> Latest status can be found in Bugzilla:
> 	https://bugs.dpdk.org/show_bug.cgi?id=97#c35


Looking at dissamled output from the bug report, it seems that the
problem is not in vmovdqu8 instruction itself, but in the wrong offsets
generated by the compiler:

   vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x2]
   vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x30],0x1
    vmovups XMMWORD PTR [rsi+0x20],xmm0
    vextracti128 XMMWORD PTR [rsi+0x30],ymm0,0x1
    vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x4]
    vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x50],0x1
    vmovups XMMWORD PTR [rsi+0x40],xmm0
    vextracti128 XMMWORD PTR [rsi+0x50],ymm0,0x1
    vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x6]

Should be:
vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x20]
I think.

Same for next two offsets: 0x4 and 0x6 respectively should be 0x40 and 0x60.

Not sure what causing compiler behaves that way.
BTW, looking though testpmd objdump output - it seems that only mlx5 driver
exhibits such problem (I didn't enable mlx4 actually, probably same problem here).
Which looks a bit weird to me.
Konstantin
  
Thomas Monjalon Nov. 11, 2018, 6:15 p.m. UTC | #17
11/11/2018 15:15, Ananyev, Konstantin:
> Hi Thomas,
> 
> > Below is my conclusion for this bug.
> > An expert of x86 is required to follow-up.
> > 
> > Summary:
> > 	- CPU: Intel Skylake
> > 	- Linux environment: Ubuntu 18.04
> > 	- Compiler: GCC 7 or 8
> > 	- Scenario: testpmd crashes when it starts forwarding
> > 	- Behaviour: AVX2 version of rte_memcpy() fails if optimized for AVX512
> > 	- Context: inline rte_memcpy() is called from
> > 			inline rte_mempool_put_bulk(), called from
> > 			mlx5_tx_complete() (inline or not)
> > 	- Analysis: AVX512 optimization changes vmovdqu to vmovdqu8
> > 
> > Latest status can be found in Bugzilla:
> > 	https://bugs.dpdk.org/show_bug.cgi?id=97#c35
> 
> 
> Looking at dissamled output from the bug report, it seems that the
> problem is not in vmovdqu8 instruction itself, but in the wrong offsets
> generated by the compiler:
> 
>    vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x2]
>    vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x30],0x1
>     vmovups XMMWORD PTR [rsi+0x20],xmm0
>     vextracti128 XMMWORD PTR [rsi+0x30],ymm0,0x1
>     vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x4]
>     vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x50],0x1
>     vmovups XMMWORD PTR [rsi+0x40],xmm0
>     vextracti128 XMMWORD PTR [rsi+0x50],ymm0,0x1
>     vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x6]
> 
> Should be:
> vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x20]
> I think.
> 
> Same for next two offsets: 0x4 and 0x6 respectively should be 0x40 and 0x60.

Yes, you're right, I missed it, thank you!

The full diff is below:

--- bad-avx512-enabled
+++ good-avx512-disabled
-    vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x0]
+    vmovdqu xmm0,XMMWORD PTR [rax*8+0x0]
     vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x10],0x1
     vmovups XMMWORD PTR [rsi],xmm0
     vextracti128 XMMWORD PTR [rsi+0x10],ymm0,0x1
-    vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x2]
+    vmovdqu xmm0,XMMWORD PTR [rax*8+0x20]
     vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x30],0x1
     vmovups XMMWORD PTR [rsi+0x20],xmm0
     vextracti128 XMMWORD PTR [rsi+0x30],ymm0,0x1
-    vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x4]
+    vmovdqu xmm0,XMMWORD PTR [rax*8+0x40]
     vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x50],0x1
     vmovups XMMWORD PTR [rsi+0x40],xmm0
     vextracti128 XMMWORD PTR [rsi+0x50],ymm0,0x1
-    vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x6]
+    vmovdqu xmm0,XMMWORD PTR [rax*8+0x60]
     vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x70],0x1
     vmovups XMMWORD PTR [rsi+0x60],xmm0
     vextracti128 XMMWORD PTR [rsi+0x70],ymm0,0x1

> Not sure what causing compiler behaves that way.
> BTW, looking though testpmd objdump output - it seems that only mlx5 driver
> exhibits such problem (I didn't enable mlx4 actually, probably same problem here).
> Which looks a bit weird to me.

Yes it's weird. I don't see how the mlx5 code could influence
the compiler to generate this bad code in AVX512 mode.
  
Christian Ehrhardt Nov. 12, 2018, 9:09 a.m. UTC | #18
> -    vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x6]
> +    vmovdqu xmm0,XMMWORD PTR [rax*8+0x60]
>      vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x70],0x1
>      vmovups XMMWORD PTR [rsi+0x60],xmm0
>      vextracti128 XMMWORD PTR [rsi+0x70],ymm0,0x1
>
> > Not sure what causing compiler behaves that way.
> > BTW, looking though testpmd objdump output - it seems that only mlx5 driver
> > exhibits such problem (I didn't enable mlx4 actually, probably same problem here).
> > Which looks a bit weird to me.
>
> Yes it's weird. I don't see how the mlx5 code could influence
> the compiler to generate this bad code in AVX512 mode.

Thomas you have all this set up, do you have any chance to test this
on the GCC's in Ubuntu 18.10 and 19.04
If easy I'd love to see results wit hgcc-7 & gcc-8 as in Ubuntu 19.04
(current -dev).
If the above is too hard, at least could you try the gcc-8 in Bionic
is 8.2.0-1ubuntu2~18.04 that is rather close.

If you could share the simplified build options that you need to
reproduce that would help (at least me) - thanks in advance
  
Thomas Monjalon Nov. 12, 2018, 9:21 a.m. UTC | #19
12/11/2018 10:09, Christian Ehrhardt:
> > -    vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x6]
> > +    vmovdqu xmm0,XMMWORD PTR [rax*8+0x60]
> >      vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x70],0x1
> >      vmovups XMMWORD PTR [rsi+0x60],xmm0
> >      vextracti128 XMMWORD PTR [rsi+0x70],ymm0,0x1
> >
> > > Not sure what causing compiler behaves that way.
> > > BTW, looking though testpmd objdump output - it seems that only mlx5 driver
> > > exhibits such problem (I didn't enable mlx4 actually, probably same problem here).
> > > Which looks a bit weird to me.
> >
> > Yes it's weird. I don't see how the mlx5 code could influence
> > the compiler to generate this bad code in AVX512 mode.
> 
> Thomas you have all this set up, do you have any chance to test this
> on the GCC's in Ubuntu 18.10 and 19.04
> If easy I'd love to see results wit hgcc-7 & gcc-8 as in Ubuntu 19.04
> (current -dev).
> If the above is too hard, at least could you try the gcc-8 in Bionic
> is 8.2.0-1ubuntu2~18.04 that is rather close.

I already tested updated GCC 7 and 8 (it is the same result):
	https://bugs.dpdk.org/show_bug.cgi?id=97#c18

Versions are:
	gcc-7 (Ubuntu 7.3.0-27ubuntu1~18.04) 7.3.0
	gcc-8 (Ubuntu 8.2.0-1ubuntu2~18.04) 8.2.0

> If you could share the simplified build options that you need to
> reproduce that would help (at least me) - thanks in advance

You just need to compile mlx5, nothing special.
  
Ananyev, Konstantin Nov. 12, 2018, 9:26 a.m. UTC | #20
> 11/11/2018 15:15, Ananyev, Konstantin:
> > Hi Thomas,
> >
> > > Below is my conclusion for this bug.
> > > An expert of x86 is required to follow-up.
> > >
> > > Summary:
> > > 	- CPU: Intel Skylake
> > > 	- Linux environment: Ubuntu 18.04
> > > 	- Compiler: GCC 7 or 8
> > > 	- Scenario: testpmd crashes when it starts forwarding
> > > 	- Behaviour: AVX2 version of rte_memcpy() fails if optimized for AVX512
> > > 	- Context: inline rte_memcpy() is called from
> > > 			inline rte_mempool_put_bulk(), called from
> > > 			mlx5_tx_complete() (inline or not)
> > > 	- Analysis: AVX512 optimization changes vmovdqu to vmovdqu8
> > >
> > > Latest status can be found in Bugzilla:
> > > 	https://bugs.dpdk.org/show_bug.cgi?id=97#c35
> >
> >
> > Looking at dissamled output from the bug report, it seems that the
> > problem is not in vmovdqu8 instruction itself, but in the wrong offsets
> > generated by the compiler:
> >
> >    vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x2]
> >    vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x30],0x1
> >     vmovups XMMWORD PTR [rsi+0x20],xmm0
> >     vextracti128 XMMWORD PTR [rsi+0x30],ymm0,0x1
> >     vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x4]
> >     vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x50],0x1
> >     vmovups XMMWORD PTR [rsi+0x40],xmm0
> >     vextracti128 XMMWORD PTR [rsi+0x50],ymm0,0x1
> >     vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x6]
> >
> > Should be:
> > vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x20]
> > I think.
> >
> > Same for next two offsets: 0x4 and 0x6 respectively should be 0x40 and 0x60.
> 
> Yes, you're right, I missed it, thank you!
> 
> The full diff is below:
> 
> --- bad-avx512-enabled
> +++ good-avx512-disabled
> -    vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x0]
> +    vmovdqu xmm0,XMMWORD PTR [rax*8+0x0]
>      vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x10],0x1
>      vmovups XMMWORD PTR [rsi],xmm0
>      vextracti128 XMMWORD PTR [rsi+0x10],ymm0,0x1
> -    vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x2]
> +    vmovdqu xmm0,XMMWORD PTR [rax*8+0x20]
>      vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x30],0x1
>      vmovups XMMWORD PTR [rsi+0x20],xmm0
>      vextracti128 XMMWORD PTR [rsi+0x30],ymm0,0x1
> -    vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x4]
> +    vmovdqu xmm0,XMMWORD PTR [rax*8+0x40]
>      vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x50],0x1
>      vmovups XMMWORD PTR [rsi+0x40],xmm0
>      vextracti128 XMMWORD PTR [rsi+0x50],ymm0,0x1
> -    vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x6]
> +    vmovdqu xmm0,XMMWORD PTR [rax*8+0x60]
>      vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x70],0x1
>      vmovups XMMWORD PTR [rsi+0x60],xmm0
>      vextracti128 XMMWORD PTR [rsi+0x70],ymm0,0x1
> 
> > Not sure what causing compiler behaves that way.
> > BTW, looking though testpmd objdump output - it seems that only mlx5 driver
> > exhibits such problem (I didn't enable mlx4 actually, probably same problem here).
> > Which looks a bit weird to me.
> 
> Yes it's weird. I don't see how the mlx5 code could influence
> the compiler to generate this bad code in AVX512 mode.

Same here, looked through mlx5_rxtx code, it is unclear to me
what triggers the issue.
So far looks like gcc bug to me.
Konstantin
  

Patch

diff --git a/config/x86/meson.build b/config/x86/meson.build
index 33efb5e547..8ddca0ea9f 100644
--- a/config/x86/meson.build
+++ b/config/x86/meson.build
@@ -47,6 +47,11 @@  endif
 if cc.get_define('__AVX512F__', args: march_opt) != ''
 	dpdk_conf.set('RTE_MACHINE_CPUFLAG_AVX512F', 1)
 	compile_time_cpuflags += ['RTE_CPUFLAG_AVX512F']
+else
+# disable AVX512F support of gcc as a workaround for Bug 97
+	if cc.get_id() == 'gcc' and cc.has_argument('-mavx512f')
+		machine_args += '-mno-avx512f'
+	endif
 endif
 
 dpdk_conf.set('RTE_CACHE_LINE_SIZE', 64)
diff --git a/mk/rte.cpuflags.mk b/mk/rte.cpuflags.mk
index 43ed84155b..a8c26fb011 100644
--- a/mk/rte.cpuflags.mk
+++ b/mk/rte.cpuflags.mk
@@ -68,6 +68,13 @@  endif
 ifneq ($(filter $(AUTO_CPUFLAGS),__AVX512F__),)
 ifeq ($(CONFIG_RTE_ENABLE_AVX512),y)
 CPUFLAGS += AVX512F
+else
+# disable AVX512F support of gcc as a workaround for Bug 97
+ifeq ($(CONFIG_RTE_TOOLCHAIN_GCC),y)
+	ifeq ($(shell $(CC) --target-help | grep -q mavx512f && echo 1), 1)
+		MACHINE_CFLAGS += -mno-avx512f
+	endif
+endif
 endif
 endif