[v2,2/3] kni: fix kni fifo synchronization

Message ID 1537364560-4124-2-git-send-email-phil.yang@arm.com (mailing list archive)
State Superseded, archived
Delegated to: Thomas Monjalon
Series [v2,1/3] config: use one single config option for C11 memory model

Checks

Context                Check     Description
ci/checkpatch          warning   coding style issues
ci/Intel-compilation   success   Compilation OK

Commit Message

Phil Yang Sept. 19, 2018, 1:42 p.m. UTC
  With the existing code in kni_fifo_put, the rx_q buffer entries are
not guaranteed to be updated before fifo_write is updated. When
kni_net_rx_normal reads rx_q on another core, this causes a
synchronization issue. The same situation happens in kni_fifo_get as
well.

Fix this by adding C11 atomic memory barriers, making sure the values
are synchronized before fifo_write and fifo_read are updated.

Fixes: 3fc5ca2 ("kni: initial import")
Signed-off-by: Phil Yang <phil.yang@arm.com>
Reviewed-by: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
Reviewed-by: Gavin Hu <Gavin.Hu@arm.com>
---
 .../linuxapp/eal/include/exec-env/rte_kni_common.h |  5 ++++
 lib/librte_kni/rte_kni_fifo.h                      | 30 +++++++++++++++++++++-
 2 files changed, 34 insertions(+), 1 deletion(-)
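
For illustration, a minimal sketch of the reordering hazard being fixed
(hypothetical fragment, not part of the patch):

/* Producer core (cf. kni_fifo_put): */
fifo->buffer[fifo_write] = mbuf;   /* 1. store the mbuf pointer */
fifo->write = new_write;           /* 2. publish the new write index;
                                    *    without a write barrier or a
                                    *    release store, 2 may become
                                    *    visible before 1 */

/* Consumer core (cf. kni_net_rx_normal): */
if (fifo->read != fifo->write)              /* 3. observes the new index */
        mbuf = fifo->buffer[fifo->read];    /* 4. may read a stale slot */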
  

Comments

Jerin Jacob Sept. 20, 2018, 8:28 a.m. UTC | #1
-----Original Message-----
> Date: Wed, 19 Sep 2018 21:42:39 +0800
> From: Phil Yang <phil.yang@arm.com>
> To: dev@dpdk.org
> CC: nd@arm.com, jerin.jacob@caviumnetworks.com,
>  kkokkilagadda@caviumnetworks.com, Honnappa.Nagarahalli@arm.com,
>  Gavin.Hu@arm.com
> Subject: [PATCH v2 2/3] kni: fix kni fifo synchronization
> X-Mailer: git-send-email 2.7.4
> 

+ Ferruh Yigit <ferruh.yigit@intel.com>

> 
> With existing code in kni_fifo_put, rx_q values are not being updated
> before updating fifo_write. While reading rx_q in kni_net_rx_normal,
> This is causing the sync issue on other core. The same situation happens
> in kni_fifo_get as well.
> 
> So syncing the values by adding C11 atomic memory barriers to make sure
> the values being synced before updating fifo_write and fifo_read.
> 
> Fixes: 3fc5ca2 ("kni: initial import")
> Signed-off-by: Phil Yang <phil.yang@arm.com>
> Reviewed-by: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> Reviewed-by: Gavin Hu <Gavin.Hu@arm.com>
> ---
>  .../linuxapp/eal/include/exec-env/rte_kni_common.h |  5 ++++
>  lib/librte_kni/rte_kni_fifo.h                      | 30 +++++++++++++++++++++-
>  2 files changed, 34 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
> index cfa9448..1fd713b 100644
> --- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
> +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
> @@ -54,8 +54,13 @@ struct rte_kni_request {
>   * Writing should never overwrite the read position
>   */
>  struct rte_kni_fifo {
> +#ifndef RTE_USE_C11_MEM_MODEL
>         volatile unsigned write;     /**< Next position to be written*/
>         volatile unsigned read;      /**< Next position to be read */
> +#else
> +       unsigned write;              /**< Next position to be written*/
> +       unsigned read;               /**< Next position to be read */
> +#endif
>         unsigned len;                /**< Circular buffer length */
>         unsigned elem_size;          /**< Pointer size - for 32/64 bit OS */
>         void *volatile buffer[];     /**< The buffer contains mbuf pointers */
> diff --git a/lib/librte_kni/rte_kni_fifo.h b/lib/librte_kni/rte_kni_fifo.h
> index ac26a8c..f4171a1 100644
> --- a/lib/librte_kni/rte_kni_fifo.h
> +++ b/lib/librte_kni/rte_kni_fifo.h
> @@ -28,8 +28,13 @@ kni_fifo_put(struct rte_kni_fifo *fifo, void **data, unsigned num)
>  {
>         unsigned i = 0;
>         unsigned fifo_write = fifo->write;
> -       unsigned fifo_read = fifo->read;
>         unsigned new_write = fifo_write;
> +#ifdef RTE_USE_C11_MEM_MODEL
> +       unsigned fifo_read = __atomic_load_n(&fifo->read,
> +                                                __ATOMIC_ACQUIRE);
> +#else
> +       unsigned fifo_read = fifo->read;
> +#endif

Correct.


> 
>         for (i = 0; i < num; i++) {
>                 new_write = (new_write + 1) & (fifo->len - 1);
> @@ -39,7 +44,12 @@ kni_fifo_put(struct rte_kni_fifo *fifo, void **data, unsigned num)
>                 fifo->buffer[fifo_write] = data[i];
>                 fifo_write = new_write;
>         }
> +#ifdef RTE_USE_C11_MEM_MODEL
> +       __atomic_store_n(&fifo->write, fifo_write, __ATOMIC_RELEASE);
> +#else
> +       rte_smp_wmb();
>         fifo->write = fifo_write;
> +#endif

Correct.
>         return i;
>  }
> 
> @@ -51,7 +61,12 @@ kni_fifo_get(struct rte_kni_fifo *fifo, void **data, unsigned num)
>  {
>         unsigned i = 0;
>         unsigned new_read = fifo->read;
> +#ifdef RTE_USE_C11_MEM_MODEL
> +       unsigned fifo_write = __atomic_load_n(&fifo->write, __ATOMIC_ACQUIRE);
> +#else
>         unsigned fifo_write = fifo->write;
> +#endif

Correct.

> +
>         for (i = 0; i < num; i++) {
>                 if (new_read == fifo_write)
>                         break;
> @@ -59,7 +74,12 @@ kni_fifo_get(struct rte_kni_fifo *fifo, void **data, unsigned num)
>                 data[i] = fifo->buffer[new_read];
>                 new_read = (new_read + 1) & (fifo->len - 1);
>         }
> +#ifdef RTE_USE_C11_MEM_MODEL
> +       __atomic_store_n(&fifo->read, new_read, __ATOMIC_RELEASE);
> +#else
> +       rte_smp_wmb();
>         fifo->read = new_read;
> +#endif

Correct.

>         return i;
>  }
> 
> @@ -69,5 +89,13 @@ kni_fifo_get(struct rte_kni_fifo *fifo, void **data, unsigned num)
>  static inline uint32_t
>  kni_fifo_count(struct rte_kni_fifo *fifo)
>  {
> +#ifdef RTE_USE_C11_MEM_MODEL
> +       unsigned fifo_write = __atomic_load_n(&fifo->write,
> +                                                 __ATOMIC_ACQUIRE);
> +       unsigned fifo_read = __atomic_load_n(&fifo->read,
> +                                                __ATOMIC_ACQUIRE);

Isn't it too heavy to have two __ATOMIC_ACQUIREs? A simple rte_smp_rmb()
would be enough here, right?
Or do we need __ATOMIC_ACQUIRE for the fifo_write case?


Other than that, I prefer to avoid the ifdef clutter by introducing two
separate files, just like the ring C11 implementation.

I don't have a strong opinion on this part; I'll let the KNI MAINTAINER
decide how to accommodate this change.
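
A sketch of the rte_smp_rmb() variant suggested above (keeping the
existing volatile fields; not part of the posted patch):

static inline uint32_t
kni_fifo_count(struct rte_kni_fifo *fifo)
{
	unsigned fifo_write = fifo->write;	/* volatile loads */
	unsigned fifo_read = fifo->read;
	/* a single barrier orders both loads before later accesses */
	rte_smp_rmb();
	return (fifo->len + fifo_write - fifo_read) & (fifo->len - 1);
}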

> +       return (fifo->len + fifo_write - fifo_read) & (fifo->len - 1);
> +#else
>         return (fifo->len + fifo->write - fifo->read) & (fifo->len - 1);
> +#endif
>  }
> --
> 2.7.4
>
  
Honnappa Nagarahalli Sept. 20, 2018, 3:20 p.m. UTC | #2
> -----Original Message-----
> > Date: Wed, 19 Sep 2018 21:42:39 +0800
> > From: Phil Yang <phil.yang@arm.com>
> > To: dev@dpdk.org
> > CC: nd@arm.com, jerin.jacob@caviumnetworks.com,
> > kkokkilagadda@caviumnetworks.com, Honnappa.Nagarahalli@arm.com,
> > Gavin.Hu@arm.com
> > Subject: [PATCH v2 2/3] kni: fix kni fifo synchronization
> > X-Mailer: git-send-email 2.7.4
> >
> 
> + Ferruh Yigit <ferruh.yigit@intel.com>
> 
> >
> > With existing code in kni_fifo_put, rx_q values are not being updated
> > before updating fifo_write. While reading rx_q in kni_net_rx_normal,
> > This is causing the sync issue on other core. The same situation
> > happens in kni_fifo_get as well.
> >
> > So syncing the values by adding C11 atomic memory barriers to make
> > sure the values being synced before updating fifo_write and fifo_read.
> >
> > Fixes: 3fc5ca2 ("kni: initial import")
> > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > Reviewed-by: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> > Reviewed-by: Gavin Hu <Gavin.Hu@arm.com>
> > ---
> >  .../linuxapp/eal/include/exec-env/rte_kni_common.h |  5 ++++
> >  lib/librte_kni/rte_kni_fifo.h                      | 30 +++++++++++++++++++++-
> >  2 files changed, 34 insertions(+), 1 deletion(-)
> >
> > diff --git
> > a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
> > b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
> > index cfa9448..1fd713b 100644
> > --- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
> > +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
> > @@ -54,8 +54,13 @@ struct rte_kni_request {
> >   * Writing should never overwrite the read position
> >   */
> >  struct rte_kni_fifo {
> > +#ifndef RTE_USE_C11_MEM_MODEL
> >         volatile unsigned write;     /**< Next position to be written*/
> >         volatile unsigned read;      /**< Next position to be read */
> > +#else
> > +       unsigned write;              /**< Next position to be written*/
> > +       unsigned read;               /**< Next position to be read */
> > +#endif
> >         unsigned len;                /**< Circular buffer length */
> >         unsigned elem_size;          /**< Pointer size - for 32/64 bit OS */
> >         void *volatile buffer[];     /**< The buffer contains mbuf pointers */
> > diff --git a/lib/librte_kni/rte_kni_fifo.h
> > b/lib/librte_kni/rte_kni_fifo.h index ac26a8c..f4171a1 100644
> > --- a/lib/librte_kni/rte_kni_fifo.h
> > +++ b/lib/librte_kni/rte_kni_fifo.h
> > @@ -28,8 +28,13 @@ kni_fifo_put(struct rte_kni_fifo *fifo, void
> > **data, unsigned num)  {
> >         unsigned i = 0;
> >         unsigned fifo_write = fifo->write;
> > -       unsigned fifo_read = fifo->read;
> >         unsigned new_write = fifo_write;
> > +#ifdef RTE_USE_C11_MEM_MODEL
> > +       unsigned fifo_read = __atomic_load_n(&fifo->read,
> > +                                                __ATOMIC_ACQUIRE);
> > +#else
> > +       unsigned fifo_read = fifo->read; #endif
> 
> Correct.

My apologies, I did not follow your comment here. Do you want us to correct anything? The '#endif' is not appearing on the correct line in the email, but it shows up fine on patchwork.

> 
> 
> >
> >         for (i = 0; i < num; i++) {
> >                 new_write = (new_write + 1) & (fifo->len - 1); @@
> > -39,7 +44,12 @@ kni_fifo_put(struct rte_kni_fifo *fifo, void **data,
> unsigned num)
> >                 fifo->buffer[fifo_write] = data[i];
> >                 fifo_write = new_write;
> >         }
> > +#ifdef RTE_USE_C11_MEM_MODEL
> > +       __atomic_store_n(&fifo->write, fifo_write, __ATOMIC_RELEASE);
> > +#else
> > +       rte_smp_wmb();
> >         fifo->write = fifo_write;
> > +#endif
> 
> Correct.
> >         return i;
> >  }
> >
> > @@ -51,7 +61,12 @@ kni_fifo_get(struct rte_kni_fifo *fifo, void
> > **data, unsigned num)  {
> >         unsigned i = 0;
> >         unsigned new_read = fifo->read;
> > +#ifdef RTE_USE_C11_MEM_MODEL
> > +       unsigned fifo_write = __atomic_load_n(&fifo->write,
> > +__ATOMIC_ACQUIRE); #else
> >         unsigned fifo_write = fifo->write;
> > +#endif
> 
> Correct.
> 
> > +
> >         for (i = 0; i < num; i++) {
> >                 if (new_read == fifo_write)
> >                         break;
> > @@ -59,7 +74,12 @@ kni_fifo_get(struct rte_kni_fifo *fifo, void **data,
> unsigned num)
> >                 data[i] = fifo->buffer[new_read];
> >                 new_read = (new_read + 1) & (fifo->len - 1);
> >         }
> > +#ifdef RTE_USE_C11_MEM_MODEL
> > +       __atomic_store_n(&fifo->read, new_read, __ATOMIC_RELEASE);
> > +#else
> > +       rte_smp_wmb();
> >         fifo->read = new_read;
> > +#endif
> 
> Correct.
> 
> >         return i;
> >  }
> >
> > @@ -69,5 +89,13 @@ kni_fifo_get(struct rte_kni_fifo *fifo, void
> > **data, unsigned num)  static inline uint32_t  kni_fifo_count(struct
> > rte_kni_fifo *fifo)  {
> > +#ifdef RTE_USE_C11_MEM_MODEL
> > +       unsigned fifo_write = __atomic_load_n(&fifo->write,
> > +                                                 __ATOMIC_ACQUIRE);
> > +       unsigned fifo_read = __atomic_load_n(&fifo->read,
> > +                                                __ATOMIC_ACQUIRE);
> 
> Isn't too  heavy to have two __ATOMIC_ACQUIREs? a simple rte_smp_rmb()
> would be enough here. Right?
> or
> Do we need __ATOMIC_ACQUIRE for fifo_write case?
> 
We also had some amount of debate internally on this:
1) We do not want to use rte_smp_rmb() as we want to keep the memory models separated (for example: while using C11, use C11 everywhere). It is also not sufficient, please see 3) below.
2) This API can get called from the writer or the reader, so both loads have to be __ATOMIC_ACQUIRE.
3) The other option is to use __ATOMIC_RELAXED. That would allow loads/stores around this API to get reordered, especially since this is an inline function (see the fragment below). This would put the burden on the application to manage the ordering depending on its usage. It would also require the application to understand the implementation of this API.
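
To illustrate 3), once inlined into a caller, relaxed loads would give
the surrounding accesses no ordering (illustrative fragment only):

	if (kni_fifo_count(fifo) > 0)		/* relaxed loads inside */
		pkt = fifo->buffer[fifo->read];	/* on a weakly ordered CPU,
						 * may be speculated early
						 * and read a stale slot */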

> 
> Other than that, I prefer to avoid ifdef clutter by introducing two separate file
> just like ring C11 implementation.
> 
> I don't have strong opinion on this this part, I let KNI MAINTAINER to decide
> on how to accommodate this change.

I prefer to change this as well, I am open for suggestions.
Introducing two separate files would be too much for this library. A better way would be to have something similar to 'smp_store_release' provided by the kernel. i.e. create #defines for loads/stores. Hide the clutter behind the #defines.
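
A rough sketch of that approach (hypothetical macro names, loosely
modelled on the kernel's smp_load_acquire()/smp_store_release(); not
part of the posted patch):

#ifdef RTE_USE_C11_MEM_MODEL
#define kni_fifo_load_acquire(p)	__atomic_load_n((p), __ATOMIC_ACQUIRE)
#define kni_fifo_store_release(p, v) \
	__atomic_store_n((p), (v), __ATOMIC_RELEASE)
#else
#define kni_fifo_load_acquire(p)	(*(p))	/* plain volatile read */
#define kni_fifo_store_release(p, v) \
	do { rte_smp_wmb(); *(p) = (v); } while (0)
#endif

kni_fifo_put() would then read "unsigned fifo_read =
kni_fifo_load_acquire(&fifo->read);" and end with
"kni_fifo_store_release(&fifo->write, fifo_write);", with no #ifdefs in
the function body.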

> 
> > +       return (fifo->len + fifo_write - fifo_read) & (fifo->len - 1);
> > +#else
> >         return (fifo->len + fifo->write - fifo->read) & (fifo->len -
> > 1);
> > +#endif
> >  }
> > --
> > 2.7.4
> >
  
Jerin Jacob Sept. 20, 2018, 3:37 p.m. UTC | #3
-----Original Message-----
> Date: Thu, 20 Sep 2018 15:20:53 +0000
> From: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> To: Jerin Jacob <jerin.jacob@caviumnetworks.com>, "Phil Yang (Arm
>  Technology China)" <Phil.Yang@arm.com>
> CC: "dev@dpdk.org" <dev@dpdk.org>, nd <nd@arm.com>,
>  "kkokkilagadda@caviumnetworks.com" <kkokkilagadda@caviumnetworks.com>,
>  "Gavin Hu (Arm Technology China)" <Gavin.Hu@arm.com>,
>  "ferruh.yigit@intel.com" <ferruh.yigit@intel.com>
> Subject: RE: [PATCH v2 2/3] kni: fix kni fifo synchronization
> 
> 
> > -----Original Message-----
> > > Date: Wed, 19 Sep 2018 21:42:39 +0800
> > > From: Phil Yang <phil.yang@arm.com>
> > > To: dev@dpdk.org
> > > CC: nd@arm.com, jerin.jacob@caviumnetworks.com,
> > > kkokkilagadda@caviumnetworks.com, Honnappa.Nagarahalli@arm.com,
> > > Gavin.Hu@arm.com
> > > Subject: [PATCH v2 2/3] kni: fix kni fifo synchronization
> > > X-Mailer: git-send-email 2.7.4
> > >
> >
> > + Ferruh Yigit <ferruh.yigit@intel.com>
> >
> > >
> > > With existing code in kni_fifo_put, rx_q values are not being updated
> > > before updating fifo_write. While reading rx_q in kni_net_rx_normal,
> > > This is causing the sync issue on other core. The same situation
> > > happens in kni_fifo_get as well.
> > >
> > > So syncing the values by adding C11 atomic memory barriers to make
> > > sure the values being synced before updating fifo_write and fifo_read.
> > >
> > > Fixes: 3fc5ca2 ("kni: initial import")
> > > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > > Reviewed-by: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> > > Reviewed-by: Gavin Hu <Gavin.Hu@arm.com>
> > > ---
> > >  .../linuxapp/eal/include/exec-env/rte_kni_common.h |  5 ++++
> > >  lib/librte_kni/rte_kni_fifo.h                      | 30 +++++++++++++++++++++-
> > >  2 files changed, 34 insertions(+), 1 deletion(-)
> > >
> > > diff --git
> > > a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
> > > b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
> > > index cfa9448..1fd713b 100644
> > > --- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
> > > +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
> > > @@ -54,8 +54,13 @@ struct rte_kni_request {
> > >   * Writing should never overwrite the read position
> > >   */
> > >  struct rte_kni_fifo {
> > > +#ifndef RTE_USE_C11_MEM_MODEL
> > >         volatile unsigned write;     /**< Next position to be written*/
> > >         volatile unsigned read;      /**< Next position to be read */
> > > +#else
> > > +       unsigned write;              /**< Next position to be written*/
> > > +       unsigned read;               /**< Next position to be read */
> > > +#endif
> > >         unsigned len;                /**< Circular buffer length */
> > >         unsigned elem_size;          /**< Pointer size - for 32/64 bit OS */
> > >         void *volatile buffer[];     /**< The buffer contains mbuf pointers */
> > > diff --git a/lib/librte_kni/rte_kni_fifo.h
> > > b/lib/librte_kni/rte_kni_fifo.h index ac26a8c..f4171a1 100644
> > > --- a/lib/librte_kni/rte_kni_fifo.h
> > > +++ b/lib/librte_kni/rte_kni_fifo.h
> > > @@ -28,8 +28,13 @@ kni_fifo_put(struct rte_kni_fifo *fifo, void
> > > **data, unsigned num)  {
> > >         unsigned i = 0;
> > >         unsigned fifo_write = fifo->write;
> > > -       unsigned fifo_read = fifo->read;
> > >         unsigned new_write = fifo_write;
> > > +#ifdef RTE_USE_C11_MEM_MODEL
> > > +       unsigned fifo_read = __atomic_load_n(&fifo->read,
> > > +                                                __ATOMIC_ACQUIRE);
> > > +#else
> > > +       unsigned fifo_read = fifo->read; #endif
> >
> > Correct.
> 
> My apologies, did not follow your comment here. Do you want us to correct anything here? '#endif' is not appearing on the correct line in the email, but it shows up fine on the patch work.

No. What I meant is, the code is correct.

> 
> >
> >
> > >
> > >         for (i = 0; i < num; i++) {
> > >                 new_write = (new_write + 1) & (fifo->len - 1); @@
> > > -39,7 +44,12 @@ kni_fifo_put(struct rte_kni_fifo *fifo, void **data,
> > unsigned num)
> > >                 fifo->buffer[fifo_write] = data[i];
> > >                 fifo_write = new_write;
> > >         }
> > > +#ifdef RTE_USE_C11_MEM_MODEL
> > > +       __atomic_store_n(&fifo->write, fifo_write, __ATOMIC_RELEASE);
> > > +#else
> > > +       rte_smp_wmb();
> > >         fifo->write = fifo_write;
> > > +#endif
> >
> > Correct.
> > >         return i;
> > >  }
> > >
> > > @@ -51,7 +61,12 @@ kni_fifo_get(struct rte_kni_fifo *fifo, void
> > > **data, unsigned num)  {
> > >         unsigned i = 0;
> > >         unsigned new_read = fifo->read;
> > > +#ifdef RTE_USE_C11_MEM_MODEL
> > > +       unsigned fifo_write = __atomic_load_n(&fifo->write,
> > > +__ATOMIC_ACQUIRE); #else
> > >         unsigned fifo_write = fifo->write;
> > > +#endif
> >
> > Correct.
> >
> > > +
> > >         for (i = 0; i < num; i++) {
> > >                 if (new_read == fifo_write)
> > >                         break;
> > > @@ -59,7 +74,12 @@ kni_fifo_get(struct rte_kni_fifo *fifo, void **data,
> > unsigned num)
> > >                 data[i] = fifo->buffer[new_read];
> > >                 new_read = (new_read + 1) & (fifo->len - 1);
> > >         }
> > > +#ifdef RTE_USE_C11_MEM_MODEL
> > > +       __atomic_store_n(&fifo->read, new_read, __ATOMIC_RELEASE);
> > > +#else
> > > +       rte_smp_wmb();
> > >         fifo->read = new_read;
> > > +#endif
> >
> > Correct.
> >
> > >         return i;
> > >  }
> > >
> > > @@ -69,5 +89,13 @@ kni_fifo_get(struct rte_kni_fifo *fifo, void
> > > **data, unsigned num)  static inline uint32_t  kni_fifo_count(struct
> > > rte_kni_fifo *fifo)  {
> > > +#ifdef RTE_USE_C11_MEM_MODEL
> > > +       unsigned fifo_write = __atomic_load_n(&fifo->write,
> > > +                                                 __ATOMIC_ACQUIRE);
> > > +       unsigned fifo_read = __atomic_load_n(&fifo->read,
> > > +                                                __ATOMIC_ACQUIRE);
> >
> > Isn't too  heavy to have two __ATOMIC_ACQUIREs? a simple rte_smp_rmb()
> > would be enough here. Right?
> > or
> > Do we need __ATOMIC_ACQUIRE for fifo_write case?
> >
> We also had some amount of debate internally on this:
> 1) We do not want to use rte_smp_rmb() as we want to keep the memory models separated (for ex: while using C11, use C11 everywhere). It is also not sufficient, please see 3) below.

But there is nothing technically wrong in using rte_smp_rmb() here, in
terms of both functionality and the code generated by the compiler.

> 2) This API can get called from writer or reader, so both the loads have to be __ATOMIC_ACQUIRE
> 3) Other option is to use __ATOMIC_RELAXED. That would allow any loads/stores around of this API to get reordered, especially since this is an inline function. This would put burden on the application to manage the ordering depending on its usage. It will also require the application to understand the implementation of this API.

__ATOMIC_RELAXED may be fine too for the _count() case, as it may not be
very important to get the exact count at that very moment; the
application can retry.

I am in favor of a performance-effective implementation.
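
What the relaxed option might look like for _count() (a sketch, not the
patch; the loads are cheap but give no ordering against surrounding
accesses, so the returned count can be momentarily stale):

static inline uint32_t
kni_fifo_count(struct rte_kni_fifo *fifo)
{
	unsigned fifo_write = __atomic_load_n(&fifo->write, __ATOMIC_RELAXED);
	unsigned fifo_read = __atomic_load_n(&fifo->read, __ATOMIC_RELAXED);
	return (fifo->len + fifo_write - fifo_read) & (fifo->len - 1);
}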

> 
> >
> > Other than that, I prefer to avoid ifdef clutter by introducing two separate file
> > just like ring C11 implementation.
> >
> > I don't have strong opinion on this this part, I let KNI MAINTAINER to decide
> > on how to accommodate this change.
> 
> I prefer to change this as well, I am open for suggestions.
> Introducing two separate files would be too much for this library. A better way would be to have something similar to 'smp_store_release' provided by the kernel. i.e. create #defines for loads/stores. Hide the clutter behind the #defines.

No strong opinion on this; leaving it to the KNI Maintainer.

This patch needs to be split in two:
a) fixes for the non-C11 implementation (i.e. the new addition of rte_smp_wmb())
b) adding support for the C11 implementation.

> 
> >
> > > +       return (fifo->len + fifo_write - fifo_read) & (fifo->len - 1);
> > > +#else
> > >         return (fifo->len + fifo->write - fifo->read) & (fifo->len -
> > > 1);
> > > +#endif
> > >  }
> > > --
> > > 2.7.4
> > >
  
Honnappa Nagarahalli Sept. 21, 2018, 5:48 a.m. UTC | #4
> > > -----Original Message-----
> > > > Date: Wed, 19 Sep 2018 21:42:39 +0800
> > > > From: Phil Yang <phil.yang@arm.com>
> > > > To: dev@dpdk.org
> > > > CC: nd@arm.com, jerin.jacob@caviumnetworks.com,
> > > > kkokkilagadda@caviumnetworks.com, Honnappa.Nagarahalli@arm.com,
> > > > Gavin.Hu@arm.com
> > > > Subject: [PATCH v2 2/3] kni: fix kni fifo synchronization
> > > > X-Mailer: git-send-email 2.7.4
> > > >
> > >
> > > + Ferruh Yigit <ferruh.yigit@intel.com>
> > >
> > > >
> > > > With existing code in kni_fifo_put, rx_q values are not being
> > > > updated before updating fifo_write. While reading rx_q in
> > > > kni_net_rx_normal, This is causing the sync issue on other core.
> > > > The same situation happens in kni_fifo_get as well.
> > > >
> > > > So syncing the values by adding C11 atomic memory barriers to make
> > > > sure the values being synced before updating fifo_write and fifo_read.
> > > >
> > > > Fixes: 3fc5ca2 ("kni: initial import")
> > > > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > > > Reviewed-by: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> > > > Reviewed-by: Gavin Hu <Gavin.Hu@arm.com>
> > > > ---
> > > >  .../linuxapp/eal/include/exec-env/rte_kni_common.h |  5 ++++
> > > >  lib/librte_kni/rte_kni_fifo.h                      | 30 +++++++++++++++++++++-
> > > >  2 files changed, 34 insertions(+), 1 deletion(-)
> > > >
> > > > diff --git
> > > > a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
> > > > b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
> > > > index cfa9448..1fd713b 100644
> > > > ---
> > > > a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
> > > > +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.
> > > > +++ h
> > > > @@ -54,8 +54,13 @@ struct rte_kni_request {
> > > >   * Writing should never overwrite the read position
> > > >   */
> > > >  struct rte_kni_fifo {
> > > > +#ifndef RTE_USE_C11_MEM_MODEL
> > > >         volatile unsigned write;     /**< Next position to be written*/
> > > >         volatile unsigned read;      /**< Next position to be read */
> > > > +#else
> > > > +       unsigned write;              /**< Next position to be written*/
> > > > +       unsigned read;               /**< Next position to be read */
> > > > +#endif
> > > >         unsigned len;                /**< Circular buffer length */
> > > >         unsigned elem_size;          /**< Pointer size - for 32/64 bit OS */
> > > >         void *volatile buffer[];     /**< The buffer contains mbuf pointers */
> > > > diff --git a/lib/librte_kni/rte_kni_fifo.h
> > > > b/lib/librte_kni/rte_kni_fifo.h index ac26a8c..f4171a1 100644
> > > > --- a/lib/librte_kni/rte_kni_fifo.h
> > > > +++ b/lib/librte_kni/rte_kni_fifo.h
> > > > @@ -28,8 +28,13 @@ kni_fifo_put(struct rte_kni_fifo *fifo, void
> > > > **data, unsigned num)  {
> > > >         unsigned i = 0;
> > > >         unsigned fifo_write = fifo->write;
> > > > -       unsigned fifo_read = fifo->read;
> > > >         unsigned new_write = fifo_write;
> > > > +#ifdef RTE_USE_C11_MEM_MODEL
> > > > +       unsigned fifo_read = __atomic_load_n(&fifo->read,
> > > > +
> > > > +__ATOMIC_ACQUIRE); #else
> > > > +       unsigned fifo_read = fifo->read; #endif
> > >
> > > Correct.
> >
> > My apologies, did not follow your comment here. Do you want us to correct
> anything here? '#endif' is not appearing on the correct line in the email, but it
> shows up fine on the patch work.
> 
> No. What I meant is, code is correct.
> 
> >
> > >
> > >
> > > >
> > > >         for (i = 0; i < num; i++) {
> > > >                 new_write = (new_write + 1) & (fifo->len - 1); @@
> > > > -39,7 +44,12 @@ kni_fifo_put(struct rte_kni_fifo *fifo, void
> > > > **data,
> > > unsigned num)
> > > >                 fifo->buffer[fifo_write] = data[i];
> > > >                 fifo_write = new_write;
> > > >         }
> > > > +#ifdef RTE_USE_C11_MEM_MODEL
> > > > +       __atomic_store_n(&fifo->write, fifo_write,
> > > > +__ATOMIC_RELEASE); #else
> > > > +       rte_smp_wmb();
> > > >         fifo->write = fifo_write;
> > > > +#endif
> > >
> > > Correct.
> > > >         return i;
> > > >  }
> > > >
> > > > @@ -51,7 +61,12 @@ kni_fifo_get(struct rte_kni_fifo *fifo, void
> > > > **data, unsigned num)  {
> > > >         unsigned i = 0;
> > > >         unsigned new_read = fifo->read;
> > > > +#ifdef RTE_USE_C11_MEM_MODEL
> > > > +       unsigned fifo_write = __atomic_load_n(&fifo->write,
> > > > +__ATOMIC_ACQUIRE); #else
> > > >         unsigned fifo_write = fifo->write;
> > > > +#endif
> > >
> > > Correct.
> > >
> > > > +
> > > >         for (i = 0; i < num; i++) {
> > > >                 if (new_read == fifo_write)
> > > >                         break;
> > > > @@ -59,7 +74,12 @@ kni_fifo_get(struct rte_kni_fifo *fifo, void
> > > > **data,
> > > unsigned num)
> > > >                 data[i] = fifo->buffer[new_read];
> > > >                 new_read = (new_read + 1) & (fifo->len - 1);
> > > >         }
> > > > +#ifdef RTE_USE_C11_MEM_MODEL
> > > > +       __atomic_store_n(&fifo->read, new_read, __ATOMIC_RELEASE);
> > > > +#else
> > > > +       rte_smp_wmb();
> > > >         fifo->read = new_read;
> > > > +#endif
> > >
> > > Correct.
> > >
> > > >         return i;
> > > >  }
> > > >
> > > > @@ -69,5 +89,13 @@ kni_fifo_get(struct rte_kni_fifo *fifo, void
> > > > **data, unsigned num)  static inline uint32_t
> > > > kni_fifo_count(struct rte_kni_fifo *fifo)  {
> > > > +#ifdef RTE_USE_C11_MEM_MODEL
> > > > +       unsigned fifo_write = __atomic_load_n(&fifo->write,
> > > > +                                                 __ATOMIC_ACQUIRE);
> > > > +       unsigned fifo_read = __atomic_load_n(&fifo->read,
> > > > +
> > > > +__ATOMIC_ACQUIRE);
> > >
> > > Isn't too  heavy to have two __ATOMIC_ACQUIREs? a simple
> > > rte_smp_rmb() would be enough here. Right?
> > > or
> > > Do we need __ATOMIC_ACQUIRE for fifo_write case?
> > >
> > We also had some amount of debate internally on this:
> > 1) We do not want to use rte_smp_rmb() as we want to keep the memory
> models separated (for ex: while using C11, use C11 everywhere). It is also not
> sufficient, please see 3) below.
> 
> But Nothing technically wrong in using rte_smp_rmb() here in terms
> functionally and code generated by the compiler.

rte_smp_rmb() generates 'DMB ISHLD'. This works fine, but it is not optimal. 'LDAR' is a better option, which is generated when C11 atomics are used.

> 
> > 2) This API can get called from writer or reader, so both the loads
> > have to be __ATOMIC_ACQUIRE
> > 3) Other option is to use __ATOMIC_RELAXED. That would allow any
> loads/stores around of this API to get reordered, especially since this is an
> inline function. This would put burden on the application to manage the
> ordering depending on its usage. It will also require the application to
> understand the implementation of this API.
> 
> __ATOMIC_RELAXED may be fine too for _count() case as it may not very
> important to get the exact count for the exact very moment, Application can
> retry.
> 
> I am in favor of performance effective implementation.

The requirement on the correctness of the count depends on the usage of this function. I see the following usage:

In the file kni_net.c, function: kni_net_tx:

       if (kni_fifo_free_count(kni->tx_q) == 0 ||                              
                        kni_fifo_count(kni->alloc_q) == 0) {                    
                /**                                                             
                 * If no free entry in tx_q or no entry in alloc_q,             
                 * drops skb and goes out.                                      
                 */                                                             
                goto drop;                                                      
        }

There is no retry here, the packet is dropped.

> 
> >
> > >
> > > Other than that, I prefer to avoid ifdef clutter by introducing two
> > > separate file just like ring C11 implementation.
> > >
> > > I don't have strong opinion on this this part, I let KNI MAINTAINER
> > > to decide on how to accommodate this change.
> >
> > I prefer to change this as well, I am open for suggestions.
> > Introducing two separate files would be too much for this library. A better
> way would be to have something similar to 'smp_store_release' provided by
> the kernel. i.e. create #defines for loads/stores. Hide the clutter behind the
> #defines.
> 
> No Strong opinion on this, leaving to KNI Maintainer.
Will wait on this before re-spinning the patch

> 
> This patch needs to split by two,
> a) Fixes for non C11 implementation(i.e new addition to rte_smp_wmb())
> b) add support for C11 implementation.
Agree

> 
> >
> > >
> > > > +       return (fifo->len + fifo_write - fifo_read) & (fifo->len -
> > > > +1); #else
> > > >         return (fifo->len + fifo->write - fifo->read) & (fifo->len
> > > > - 1);
> > > > +#endif
> > > >  }
> > > > --
> > > > 2.7.4
> > > >
  
Jerin Jacob Sept. 21, 2018, 5:55 a.m. UTC | #5
-----Original Message-----
> Date: Fri, 21 Sep 2018 05:48:44 +0000
> From: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> To: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> CC: "Phil Yang (Arm Technology China)" <Phil.Yang@arm.com>, "dev@dpdk.org"
>  <dev@dpdk.org>, nd <nd@arm.com>, "kkokkilagadda@caviumnetworks.com"
>  <kkokkilagadda@caviumnetworks.com>, "Gavin Hu (Arm Technology China)"
>  <Gavin.Hu@arm.com>, "ferruh.yigit@intel.com" <ferruh.yigit@intel.com>
> Subject: RE: [PATCH v2 2/3] kni: fix kni fifo synchronization
> 
> 
> > > > -----Original Message-----
> > > > > Date: Wed, 19 Sep 2018 21:42:39 +0800
> > > > > From: Phil Yang <phil.yang@arm.com>
> > > > > To: dev@dpdk.org
> > > > > CC: nd@arm.com, jerin.jacob@caviumnetworks.com,
> > > > > kkokkilagadda@caviumnetworks.com, Honnappa.Nagarahalli@arm.com,
> > > > > Gavin.Hu@arm.com
> > > > > Subject: [PATCH v2 2/3] kni: fix kni fifo synchronization
> > > > > X-Mailer: git-send-email 2.7.4
> > > > >
> > > >
> > > > + Ferruh Yigit <ferruh.yigit@intel.com>
> > > >
> > > > >
> > > > > With existing code in kni_fifo_put, rx_q values are not being
> > > > > updated before updating fifo_write. While reading rx_q in
> > > > > kni_net_rx_normal, This is causing the sync issue on other core.
> > > > > The same situation happens in kni_fifo_get as well.
> > > > >
> > > > > So syncing the values by adding C11 atomic memory barriers to make
> > > > > sure the values being synced before updating fifo_write and fifo_read.
> > > > >
> > > > > Fixes: 3fc5ca2 ("kni: initial import")
> > > > > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > > > > Reviewed-by: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> > > > > Reviewed-by: Gavin Hu <Gavin.Hu@arm.com>
> > > > > ---
> > > > >  .../linuxapp/eal/include/exec-env/rte_kni_common.h |  5 ++++
> > > > >  lib/librte_kni/rte_kni_fifo.h                      | 30 +++++++++++++++++++++-
> > > > >  2 files changed, 34 insertions(+), 1 deletion(-)
> > > > >
> > > > > diff --git
> > > > > a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
> > > > > b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
> > > > > index cfa9448..1fd713b 100644
> > > > > ---
> > > > > a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
> > > > > +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.
> > > > > +++ h
> > > > > @@ -54,8 +54,13 @@ struct rte_kni_request {
> > > > >   * Writing should never overwrite the read position
> > > > >   */
> > > > >  struct rte_kni_fifo {
> > > > > +#ifndef RTE_USE_C11_MEM_MODEL
> > > > >         volatile unsigned write;     /**< Next position to be written*/
> > > > >         volatile unsigned read;      /**< Next position to be read */
> > > > > +#else
> > > > > +       unsigned write;              /**< Next position to be written*/
> > > > > +       unsigned read;               /**< Next position to be read */
> > > > > +#endif
> > > > >         unsigned len;                /**< Circular buffer length */
> > > > >         unsigned elem_size;          /**< Pointer size - for 32/64 bit OS */
> > > > >         void *volatile buffer[];     /**< The buffer contains mbuf pointers */
> > > > > diff --git a/lib/librte_kni/rte_kni_fifo.h
> > > > > b/lib/librte_kni/rte_kni_fifo.h index ac26a8c..f4171a1 100644
> > > > > --- a/lib/librte_kni/rte_kni_fifo.h
> > > > > +++ b/lib/librte_kni/rte_kni_fifo.h
> > > > > @@ -28,8 +28,13 @@ kni_fifo_put(struct rte_kni_fifo *fifo, void
> > > > > **data, unsigned num)  {
> > > > >         unsigned i = 0;
> > > > >         unsigned fifo_write = fifo->write;
> > > > > -       unsigned fifo_read = fifo->read;
> > > > >         unsigned new_write = fifo_write;
> > > > > +#ifdef RTE_USE_C11_MEM_MODEL
> > > > > +       unsigned fifo_read = __atomic_load_n(&fifo->read,
> > > > > +
> > > > > +__ATOMIC_ACQUIRE); #else
> > > > > +       unsigned fifo_read = fifo->read; #endif
> > > >
> > > > Correct.
> > >
> > > My apologies, did not follow your comment here. Do you want us to correct
> > anything here? '#endif' is not appearing on the correct line in the email, but it
> > shows up fine on the patch work.
> >
> > No. What I meant is, code is correct.
> >
> > >
> > > >
> > > >
> > > > >
> > > > >         for (i = 0; i < num; i++) {
> > > > >                 new_write = (new_write + 1) & (fifo->len - 1); @@
> > > > > -39,7 +44,12 @@ kni_fifo_put(struct rte_kni_fifo *fifo, void
> > > > > **data,
> > > > unsigned num)
> > > > >                 fifo->buffer[fifo_write] = data[i];
> > > > >                 fifo_write = new_write;
> > > > >         }
> > > > > +#ifdef RTE_USE_C11_MEM_MODEL
> > > > > +       __atomic_store_n(&fifo->write, fifo_write,
> > > > > +__ATOMIC_RELEASE); #else
> > > > > +       rte_smp_wmb();
> > > > >         fifo->write = fifo_write;
> > > > > +#endif
> > > >
> > > > Correct.
> > > > >         return i;
> > > > >  }
> > > > >
> > > > > @@ -51,7 +61,12 @@ kni_fifo_get(struct rte_kni_fifo *fifo, void
> > > > > **data, unsigned num)  {
> > > > >         unsigned i = 0;
> > > > >         unsigned new_read = fifo->read;
> > > > > +#ifdef RTE_USE_C11_MEM_MODEL
> > > > > +       unsigned fifo_write = __atomic_load_n(&fifo->write,
> > > > > +__ATOMIC_ACQUIRE); #else
> > > > >         unsigned fifo_write = fifo->write;
> > > > > +#endif
> > > >
> > > > Correct.
> > > >
> > > > > +
> > > > >         for (i = 0; i < num; i++) {
> > > > >                 if (new_read == fifo_write)
> > > > >                         break;
> > > > > @@ -59,7 +74,12 @@ kni_fifo_get(struct rte_kni_fifo *fifo, void
> > > > > **data,
> > > > unsigned num)
> > > > >                 data[i] = fifo->buffer[new_read];
> > > > >                 new_read = (new_read + 1) & (fifo->len - 1);
> > > > >         }
> > > > > +#ifdef RTE_USE_C11_MEM_MODEL
> > > > > +       __atomic_store_n(&fifo->read, new_read, __ATOMIC_RELEASE);
> > > > > +#else
> > > > > +       rte_smp_wmb();
> > > > >         fifo->read = new_read;
> > > > > +#endif
> > > >
> > > > Correct.
> > > >
> > > > >         return i;
> > > > >  }
> > > > >
> > > > > @@ -69,5 +89,13 @@ kni_fifo_get(struct rte_kni_fifo *fifo, void
> > > > > **data, unsigned num)  static inline uint32_t
> > > > > kni_fifo_count(struct rte_kni_fifo *fifo)  {
> > > > > +#ifdef RTE_USE_C11_MEM_MODEL
> > > > > +       unsigned fifo_write = __atomic_load_n(&fifo->write,
> > > > > +                                                 __ATOMIC_ACQUIRE);
> > > > > +       unsigned fifo_read = __atomic_load_n(&fifo->read,
> > > > > +
> > > > > +__ATOMIC_ACQUIRE);
> > > >
> > > > Isn't too  heavy to have two __ATOMIC_ACQUIREs? a simple
> > > > rte_smp_rmb() would be enough here. Right?
> > > > or
> > > > Do we need __ATOMIC_ACQUIRE for fifo_write case?
> > > >
> > > We also had some amount of debate internally on this:
> > > 1) We do not want to use rte_smp_rmb() as we want to keep the memory
> > models separated (for ex: while using C11, use C11 everywhere). It is also not
> > sufficient, please see 3) below.
> >
> > But Nothing technically wrong in using rte_smp_rmb() here in terms
> > functionally and code generated by the compiler.
> 
> rte_smp_rmb() generates 'DMB ISHLD'. This works fine, but it is not optimal. 'LDAR' is a better option which is generated when C11 atomics are used.

Yes. But which one is optimal: 1 x DMB ISHLD vs 2 x LDAR?

> 
> >
> > > 2) This API can get called from writer or reader, so both the loads
> > > have to be __ATOMIC_ACQUIRE
> > > 3) Other option is to use __ATOMIC_RELAXED. That would allow any
> > loads/stores around of this API to get reordered, especially since this is an
> > inline function. This would put burden on the application to manage the
> > ordering depending on its usage. It will also require the application to
> > understand the implementation of this API.
> >
> > __ATOMIC_RELAXED may be fine too for _count() case as it may not very
> > important to get the exact count for the exact very moment, Application can
> > retry.
> >
> > I am in favor of performance effective implementation.
> 
> The requirement on the correctness of the count depends on the usage of this function. I see the following usage:
> 
> In the file kni_net.c, function: kni_net_tx:
> 
>        if (kni_fifo_free_count(kni->tx_q) == 0 ||
>                         kni_fifo_count(kni->alloc_q) == 0) {
>                 /**
>                  * If no free entry in tx_q or no entry in alloc_q,
>                  * drops skb and goes out.
>                  */
>                 goto drop;
>         }
> 
> There is no retry here, the packet is dropped.

OK. Then pick the implementation which is optimal for this case.
I think rte_smp_rmb() then makes sense here, as:
a) no #ifdef clutter
b) it is optimal compared to 2 x LDAR


> 
> >
> > >
> > > >
> > > > Other than that, I prefer to avoid ifdef clutter by introducing two
> > > > separate file just like ring C11 implementation.
> > > >
> > > > I don't have strong opinion on this this part, I let KNI MAINTAINER
> > > > to decide on how to accommodate this change.
> > >
> > > I prefer to change this as well, I am open for suggestions.
> > > Introducing two separate files would be too much for this library. A better
> > way would be to have something similar to 'smp_store_release' provided by
> > the kernel. i.e. create #defines for loads/stores. Hide the clutter behind the
> > #defines.
> >
> > No Strong opinion on this, leaving to KNI Maintainer.
> Will wait on this before re-spinning the patch
> 
> >
> > This patch needs to split by two,
> > a) Fixes for non C11 implementation(i.e new addition to rte_smp_wmb())
> > b) add support for C11 implementation.
> Agree
> 
> >
> > >
> > > >
> > > > > +       return (fifo->len + fifo_write - fifo_read) & (fifo->len -
> > > > > +1); #else
> > > > >         return (fifo->len + fifo->write - fifo->read) & (fifo->len
> > > > > - 1);
> > > > > +#endif
> > > > >  }
> > > > > --
> > > > > 2.7.4
> > > > >
  
Honnappa Nagarahalli Sept. 21, 2018, 6:37 a.m. UTC | #6
> > > > > >
> > > > > > @@ -69,5 +89,13 @@ kni_fifo_get(struct rte_kni_fifo *fifo,
> > > > > > void **data, unsigned num)  static inline uint32_t
> > > > > > kni_fifo_count(struct rte_kni_fifo *fifo)  {
> > > > > > +#ifdef RTE_USE_C11_MEM_MODEL
> > > > > > +       unsigned fifo_write = __atomic_load_n(&fifo->write,
> > > > > > +                                                 __ATOMIC_ACQUIRE);
> > > > > > +       unsigned fifo_read = __atomic_load_n(&fifo->read,
> > > > > > +
> > > > > > +__ATOMIC_ACQUIRE);
> > > > >
> > > > > Isn't too  heavy to have two __ATOMIC_ACQUIREs? a simple
> > > > > rte_smp_rmb() would be enough here. Right?
> > > > > or
> > > > > Do we need __ATOMIC_ACQUIRE for fifo_write case?
> > > > >
> > > > We also had some amount of debate internally on this:
> > > > 1) We do not want to use rte_smp_rmb() as we want to keep the
> > > > memory
> > > models separated (for ex: while using C11, use C11 everywhere). It
> > > is also not sufficient, please see 3) below.
> > >
> > > But Nothing technically wrong in using rte_smp_rmb() here in terms
> > > functionally and code generated by the compiler.
> >
> > rte_smp_rmb() generates 'DMB ISHLD'. This works fine, but it is not optimal.
> 'LDAR' is a better option which is generated when C11 atomics are used.
> 
> Yes. But which one is optimal 1 x DMB ISHLD vs 2 x LDAR ?

Good point. I am not sure which one is optimal; it needs to be measured. 'DMB ISHLD' orders 'all' earlier loads against 'all' later loads and stores. 'LDAR' orders the 'specific' load against 'all' later loads and stores.

> 
> >
> > >
> > > > 2) This API can get called from writer or reader, so both the
> > > > loads have to be __ATOMIC_ACQUIRE
> > > > 3) Other option is to use __ATOMIC_RELAXED. That would allow any
> > > loads/stores around of this API to get reordered, especially since
> > > this is an inline function. This would put burden on the application
> > > to manage the ordering depending on its usage. It will also require
> > > the application to understand the implementation of this API.
> > >
> > > __ATOMIC_RELAXED may be fine too for _count() case as it may not
> > > very important to get the exact count for the exact very moment,
> > > Application can retry.
> > >
> > > I am in favor of performance effective implementation.
> >
> > The requirement on the correctness of the count depends on the usage of
> this function. I see the following usage:
> >
> > In the file kni_net.c, function: kni_net_tx:
> >
> >        if (kni_fifo_free_count(kni->tx_q) == 0 ||
> >                         kni_fifo_count(kni->alloc_q) == 0) {
> >                 /**
> >                  * If no free entry in tx_q or no entry in alloc_q,
> >                  * drops skb and goes out.
> >                  */
> >                 goto drop;
> >         }
> >
> > There is no retry here, the packet is dropped.
> 
> OK. Then pick an implementation which is an optimal this case.
> I think, then rte_smp_rmb() makes sense here as
> a) no #ifdef clutter
> b) it is optimal compared to 2 x LDAR
> 
As I understand it, one of the principles of using the C11 model is to match the store-releases and load-acquires. IMO, combining the C11 memory model with barrier-based functions makes the code unreadable.
I realized rte_smp_rmb() is required for x86 as well, to prevent compiler reordering. We can add that in the non-C11 case. This way, we will have clean code for both options (similar to rte_ring).
So, if 'RTE_USE_C11_MEM_MODEL' is set to 'n', then 'rte_smp_rmb' would be used.

We can look at handling the #ifdef clutter based on Ferruh's feedback.
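
Keeping the two models separate would then look roughly like this for
_count() (a sketch; on x86, rte_smp_rmb() compiles down to a compiler
barrier, which addresses the reordering concern above):

static inline uint32_t
kni_fifo_count(struct rte_kni_fifo *fifo)
{
#ifdef RTE_USE_C11_MEM_MODEL
	unsigned fifo_write = __atomic_load_n(&fifo->write, __ATOMIC_ACQUIRE);
	unsigned fifo_read = __atomic_load_n(&fifo->read, __ATOMIC_ACQUIRE);
#else
	unsigned fifo_write = fifo->write;
	unsigned fifo_read = fifo->read;
	rte_smp_rmb();
#endif
	return (fifo->len + fifo_write - fifo_read) & (fifo->len - 1);
}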

> 
> >
> > >
> > > >
> > > > >
> > > > > Other than that, I prefer to avoid ifdef clutter by introducing
> > > > > two separate file just like ring C11 implementation.
> > > > >
> > > > > I don't have strong opinion on this this part, I let KNI
> > > > > MAINTAINER to decide on how to accommodate this change.
> > > >
> > > > I prefer to change this as well, I am open for suggestions.
> > > > Introducing two separate files would be too much for this library.
> > > > A better
> > > way would be to have something similar to 'smp_store_release'
> > > provided by the kernel. i.e. create #defines for loads/stores. Hide
> > > the clutter behind the #defines.
> > >
> > > No Strong opinion on this, leaving to KNI Maintainer.
> > Will wait on this before re-spinning the patch
> >
> > >
> > > This patch needs to split by two,
> > > a) Fixes for non C11 implementation(i.e new addition to
> > > rte_smp_wmb())
> > > b) add support for C11 implementation.
> > Agree
> >
> > >
> > > >
> > > > >
> > > > > > +       return (fifo->len + fifo_write - fifo_read) &
> > > > > > +(fifo->len - 1); #else
> > > > > >         return (fifo->len + fifo->write - fifo->read) &
> > > > > > (fifo->len
> > > > > > - 1);
Requires rte_smp_rmb() for x86 to prevent compiler reordering.

> > > > > > +#endif
> > > > > >  }
> > > > > > --
> > > > > > 2.7.4
> > > > > >
  
Phil Yang Sept. 21, 2018, 9 a.m. UTC | #7
+ Ola Liljedahl <Ola.Liljedahl@arm.com>

Thanks,
Phil Yang

> -----Original Message-----
> From: Honnappa Nagarahalli
> Sent: Friday, September 21, 2018 2:37 PM
> To: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> Cc: Phil Yang (Arm Technology China) <Phil.Yang@arm.com>; dev@dpdk.org; nd
> <nd@arm.com>; kkokkilagadda@caviumnetworks.com; Gavin Hu (Arm
> Technology China) <Gavin.Hu@arm.com>; ferruh.yigit@intel.com
> Subject: RE: [PATCH v2 2/3] kni: fix kni fifo synchronization
> 
> > > > > > >
> > > > > > > @@ -69,5 +89,13 @@ kni_fifo_get(struct rte_kni_fifo *fifo,
> > > > > > > void **data, unsigned num)  static inline uint32_t
> > > > > > > kni_fifo_count(struct rte_kni_fifo *fifo)  {
> > > > > > > +#ifdef RTE_USE_C11_MEM_MODEL
> > > > > > > +       unsigned fifo_write = __atomic_load_n(&fifo->write,
> > > > > > > +                                                 __ATOMIC_ACQUIRE);
> > > > > > > +       unsigned fifo_read = __atomic_load_n(&fifo->read,
> > > > > > > +
> > > > > > > +__ATOMIC_ACQUIRE);
> > > > > >
> > > > > > Isn't too  heavy to have two __ATOMIC_ACQUIREs? a simple
> > > > > > rte_smp_rmb() would be enough here. Right?
> > > > > > or
> > > > > > Do we need __ATOMIC_ACQUIRE for fifo_write case?
> > > > > >
> > > > > We also had some amount of debate internally on this:
> > > > > 1) We do not want to use rte_smp_rmb() as we want to keep the
> > > > > memory
> > > > models separated (for ex: while using C11, use C11 everywhere). It
> > > > is also not sufficient, please see 3) below.
> > > >
> > > > But Nothing technically wrong in using rte_smp_rmb() here in terms
> > > > functionally and code generated by the compiler.
> > >
> > > rte_smp_rmb() generates 'DMB ISHLD'. This works fine, but it is not optimal.
> > 'LDAR' is a better option which is generated when C11 atomics are used.
> >
> > Yes. But which one is optimal 1 x DMB ISHLD vs 2 x LDAR ?
> 
> Good point. I am not sure which one is optimal, it needs to be measured. 'DMB
> ISHLD' orders 'all' earlier loads against 'all' later loads and stores. 'LDAR' orders
> the 'specific' load with 'all' later loads and stores.
> 
> >
> > >
> > > >
> > > > > 2) This API can get called from writer or reader, so both the
> > > > > loads have to be __ATOMIC_ACQUIRE
> > > > > 3) Other option is to use __ATOMIC_RELAXED. That would allow any
> > > > loads/stores around of this API to get reordered, especially since
> > > > this is an inline function. This would put burden on the
> > > > application to manage the ordering depending on its usage. It will
> > > > also require the application to understand the implementation of this API.
> > > >
> > > > __ATOMIC_RELAXED may be fine too for _count() case as it may not
> > > > very important to get the exact count for the exact very moment,
> > > > Application can retry.
> > > >
> > > > I am in favor of performance effective implementation.
> > >
> > > The requirement on the correctness of the count depends on the usage
> > > of
> > this function. I see the following usage:
> > >
> > > In the file kni_net.c, function: kni_net_tx:
> > >
> > >        if (kni_fifo_free_count(kni->tx_q) == 0 ||
> > >                         kni_fifo_count(kni->alloc_q) == 0) {
> > >                 /**
> > >                  * If no free entry in tx_q or no entry in alloc_q,
> > >                  * drops skb and goes out.
> > >                  */
> > >                 goto drop;
> > >         }
> > >
> > > There is no retry here, the packet is dropped.
> >
> > OK. Then pick an implementation which is an optimal this case.
> > I think, then rte_smp_rmb() makes sense here as
> > a) no #ifdef clutter
> > b) it is optimal compared to 2 x LDAR
> >
> As I understand, one of the principals of using C11 model is to match the store
> releases and load acquires. IMO, combining C11 memory model with barrier
> based functions makes the code unreadable.
> I realized rte_smp_rmb() is required for x86 as well to prevent compiler
> reordering. We can add that in the non-C11 case. This way, we will have clean
> code for both the options (similar to rte_ring).
> So, if 'RTE_USE_C11_MEM_MODEL' is set to 'n', then the 'rte_smp_rmb' would
> be used.
> 
> We can look at handling the #ifdef clutter based on Ferruh's feedback.
> 
> >
> > >
> > > >
> > > > >
> > > > > >
> > > > > > Other than that, I prefer to avoid ifdef clutter by
> > > > > > introducing two separate file just like ring C11 implementation.
> > > > > >
> > > > > > I don't have strong opinion on this this part, I let KNI
> > > > > > MAINTAINER to decide on how to accommodate this change.
> > > > >
> > > > > I prefer to change this as well, I am open for suggestions.
> > > > > Introducing two separate files would be too much for this library.
> > > > > A better
> > > > way would be to have something similar to 'smp_store_release'
> > > > provided by the kernel. i.e. create #defines for loads/stores.
> > > > Hide the clutter behind the #defines.
> > > >
> > > > No Strong opinion on this, leaving to KNI Maintainer.
> > > Will wait on this before re-spinning the patch
> > >
> > > >
> > > > This patch needs to split by two,
> > > > a) Fixes for non C11 implementation(i.e new addition to
> > > > rte_smp_wmb())
> > > > b) add support for C11 implementation.
> > > Agree
> > >
> > > >
> > > > >
> > > > > >
> > > > > > > +       return (fifo->len + fifo_write - fifo_read) &
> > > > > > > +(fifo->len - 1); #else
> > > > > > >         return (fifo->len + fifo->write - fifo->read) &
> > > > > > > (fifo->len
> > > > > > > - 1);
> Requires rte_smp_rmb() for x86 to prevent compiler reordering.
> 
> > > > > > > +#endif
> > > > > > >  }
> > > > > > > --
> > > > > > > 2.7.4
> > > > > > >
  
Honnappa Nagarahalli Sept. 25, 2018, 4:44 a.m. UTC | #8
Hi Ferruh,
	Just wanted to draw your attention to this conversation. We would appreciate your feedback on handling the #ifdef clutter.

Thank you,
Honnappa

> -----Original Message-----
> From: dev <dev-bounces@dpdk.org> On Behalf Of Honnappa Nagarahalli
> Sent: Friday, September 21, 2018 1:37 AM
> To: Jerin Jacob <jerin.jacob@caviumnetworks.com>
> Cc: Phil Yang (Arm Technology China) <Phil.Yang@arm.com>; dev@dpdk.org;
> nd <nd@arm.com>; kkokkilagadda@caviumnetworks.com; Gavin Hu (Arm
> Technology China) <Gavin.Hu@arm.com>; ferruh.yigit@intel.com
> Subject: Re: [dpdk-dev] [PATCH v2 2/3] kni: fix kni fifo synchronization
> 
> > > > > > >
> > > > > > > @@ -69,5 +89,13 @@ kni_fifo_get(struct rte_kni_fifo *fifo,
> > > > > > > void **data, unsigned num)  static inline uint32_t
> > > > > > > kni_fifo_count(struct rte_kni_fifo *fifo)  {
> > > > > > > +#ifdef RTE_USE_C11_MEM_MODEL
> > > > > > > +       unsigned fifo_write = __atomic_load_n(&fifo->write,
> > > > > > > +                                                 __ATOMIC_ACQUIRE);
> > > > > > > +       unsigned fifo_read = __atomic_load_n(&fifo->read,
> > > > > > > +
> > > > > > > +__ATOMIC_ACQUIRE);
> > > > > >
> > > > > > Isn't too  heavy to have two __ATOMIC_ACQUIREs? a simple
> > > > > > rte_smp_rmb() would be enough here. Right?
> > > > > > or
> > > > > > Do we need __ATOMIC_ACQUIRE for fifo_write case?
> > > > > >
> > > > > We also had some amount of debate internally on this:
> > > > > 1) We do not want to use rte_smp_rmb() as we want to keep the
> > > > > memory
> > > > models separated (for ex: while using C11, use C11 everywhere). It
> > > > is also not sufficient, please see 3) below.
> > > >
> > > > But Nothing technically wrong in using rte_smp_rmb() here in terms
> > > > functionally and code generated by the compiler.
> > >
> > > rte_smp_rmb() generates 'DMB ISHLD'. This works fine, but it is not
> optimal.
> > 'LDAR' is a better option which is generated when C11 atomics are used.
> >
> > Yes. But which one is optimal 1 x DMB ISHLD vs 2 x LDAR ?
> 
> Good point. I am not sure which one is optimal, it needs to be measured.
> 'DMB ISHLD' orders 'all' earlier loads against 'all' later loads and stores.
> 'LDAR' orders the 'specific' load with 'all' later loads and stores.
> 
> >
> > >
> > > >
> > > > > 2) This API can get called from writer or reader, so both the
> > > > > loads have to be __ATOMIC_ACQUIRE
> > > > > 3) Other option is to use __ATOMIC_RELAXED. That would allow any
> > > > loads/stores around of this API to get reordered, especially since
> > > > this is an inline function. This would put burden on the
> > > > application to manage the ordering depending on its usage. It will
> > > > also require the application to understand the implementation of this
> API.
> > > >
> > > > __ATOMIC_RELAXED may be fine too for _count() case as it may not
> > > > very important to get the exact count for the exact very moment,
> > > > Application can retry.
> > > >
> > > > I am in favor of performance effective implementation.
> > >
> > > The requirement on the correctness of the count depends on the usage
> > > of
> > this function. I see the following usage:
> > >
> > > In the file kni_net.c, function: kni_net_tx:
> > >
> > >        if (kni_fifo_free_count(kni->tx_q) == 0 ||
> > >                         kni_fifo_count(kni->alloc_q) == 0) {
> > >                 /**
> > >                  * If no free entry in tx_q or no entry in alloc_q,
> > >                  * drops skb and goes out.
> > >                  */
> > >                 goto drop;
> > >         }
> > >
> > > There is no retry here, the packet is dropped.
> >
> > OK. Then pick an implementation which is an optimal this case.
> > I think, then rte_smp_rmb() makes sense here as
> > a) no #ifdef clutter
> > b) it is optimal compared to 2 x LDAR
> >
> As I understand, one of the principals of using C11 model is to match the store
> releases and load acquires. IMO, combining C11 memory model with barrier
> based functions makes the code unreadable.
> I realized rte_smp_rmb() is required for x86 as well to prevent compiler
> reordering. We can add that in the non-C11 case. This way, we will have clean
> code for both the options (similar to rte_ring).
> So, if 'RTE_USE_C11_MEM_MODEL' is set to 'n', then the 'rte_smp_rmb'
> would be used.
> 
> We can look at handling the #ifdef clutter based on Ferruh's feedback.
> 
> >
> > >
> > > >
> > > > >
> > > > > >
> > > > > > Other than that, I prefer to avoid ifdef clutter by
> > > > > > introducing two separate file just like ring C11 implementation.
> > > > > >
> > > > > > I don't have strong opinion on this this part, I let KNI
> > > > > > MAINTAINER to decide on how to accommodate this change.
> > > > >
> > > > > I prefer to change this as well, I am open for suggestions.
> > > > > Introducing two separate files would be too much for this library.
> > > > > A better
> > > > way would be to have something similar to 'smp_store_release'
> > > > provided by the kernel. i.e. create #defines for loads/stores.
> > > > Hide the clutter behind the #defines.
> > > >
> > > > No Strong opinion on this, leaving to KNI Maintainer.
> > > Will wait on this before re-spinning the patch
> > >
> > > >
> > > > This patch needs to be split in two:
> > > > a) fixes for the non-C11 implementation (i.e. the new addition of
> > > > rte_smp_wmb())
> > > > b) add support for the C11 implementation.
> > > Agree
> > >
> > > >
> > > > >
> > > > > >
> > > > > > > +       return (fifo->len + fifo_write - fifo_read) & (fifo->len - 1);
> > > > > > > +#else
> > > > > > >         return (fifo->len + fifo->write - fifo->read) & (fifo->len - 1);
> Requires rte_smp_rmb() for x86 to prevent compiler reordering.
> 
> > > > > > > +#endif
> > > > > > >  }
> > > > > > > --
> > > > > > > 2.7.4
> > > > > > >
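
For illustration, the two kni_fifo_count() variants debated above would look
roughly as follows. This is only a sketch, assuming the DPDK definitions of
struct rte_kni_fifo and rte_smp_rmb(); the arm64 instruction mappings in the
comments reflect typical GCC code generation, and which variant is faster
would, as noted above, need to be measured.

/* Variant A: plain loads followed by one read barrier. On arm64 the
 * barrier is a single 'dmb ishld', ordering all earlier loads against
 * all later loads and stores. */
static inline uint32_t
kni_fifo_count_rmb(struct rte_kni_fifo *fifo)
{
	unsigned fifo_write = fifo->write;
	unsigned fifo_read = fifo->read;
	rte_smp_rmb();	/* one barrier covers both index loads */
	return (fifo->len + fifo_write - fifo_read) & (fifo->len - 1);
}

/* Variant B: per-load acquire semantics. On arm64 each acquire load
 * becomes an 'ldar', ordering only that load against later loads and
 * stores. */
static inline uint32_t
kni_fifo_count_acquire(struct rte_kni_fifo *fifo)
{
	unsigned fifo_write = __atomic_load_n(&fifo->write, __ATOMIC_ACQUIRE);
	unsigned fifo_read = __atomic_load_n(&fifo->read, __ATOMIC_ACQUIRE);
	return (fifo->len + fifo_write - fifo_read) & (fifo->len - 1);
}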
  
Ferruh Yigit Sept. 26, 2018, 11:42 a.m. UTC | #9
On 9/21/2018 7:37 AM, Honnappa Nagarahalli wrote:
>>>>>>>
>>>>>>> @@ -69,5 +89,13 @@ kni_fifo_get(struct rte_kni_fifo *fifo, void **data, unsigned num)
>>>>>>>  static inline uint32_t
>>>>>>>  kni_fifo_count(struct rte_kni_fifo *fifo)
>>>>>>>  {
>>>>>>> +#ifdef RTE_USE_C11_MEM_MODEL
>>>>>>> +       unsigned fifo_write = __atomic_load_n(&fifo->write,
>>>>>>> +                                                 __ATOMIC_ACQUIRE);
>>>>>>> +       unsigned fifo_read = __atomic_load_n(&fifo->read,
>>>>>>> +                                                __ATOMIC_ACQUIRE);
>>>>>>
>>>>>> Isn't it too heavy to have two __ATOMIC_ACQUIREs? A simple
>>>>>> rte_smp_rmb() would be enough here, right?
>>>>>> Or do we need __ATOMIC_ACQUIRE for the fifo_write case?
>>>>>>
>>>>> We also had some amount of debate internally on this:
>>>>> 1) We do not want to use rte_smp_rmb() as we want to keep the memory
>>>>> models separated (for ex: while using C11, use C11 everywhere). It
>>>>> is also not sufficient, please see 3) below.
>>>>
>>>> But there is nothing technically wrong with using rte_smp_rmb() here in
>>>> terms of functionality and the code generated by the compiler.
>>>
>>> rte_smp_rmb() generates 'DMB ISHLD'. This works fine, but it is not optimal.
>>> 'LDAR' is a better option which is generated when C11 atomics are used.
>>
>> Yes. But which one is optimal: 1 x DMB ISHLD vs 2 x LDAR?
> 
> Good point. I am not sure which one is optimal, it needs to be measured. 'DMB ISHLD' orders 'all' earlier loads against 'all' later loads and stores. 'LDAR' orders the 'specific' load with 'all' later loads and stores.
> 
>>
>>>
>>>>
>>>>> 2) This API can get called from writer or reader, so both the
>>>>> loads have to be __ATOMIC_ACQUIRE
>>>>> 3) The other option is to use __ATOMIC_RELAXED. That would allow any
>>>>> loads/stores around this API to get reordered, especially since
>>>>> this is an inline function. This would put the burden on the application
>>>>> to manage the ordering depending on its usage. It will also require
>>>>> the application to understand the implementation of this API.
>>>>
>>>> __ATOMIC_RELAXED may be fine too for the _count() case, as it may not
>>>> be very important to get the exact count at that very moment;
>>>> the application can retry.
>>>>
>>>> I am in favor of performance effective implementation.
>>>
>>> The requirement on the correctness of the count depends on the usage of
>>> this function. I see the following usage:
>>>
>>> In the file kni_net.c, function: kni_net_tx:
>>>
>>>        if (kni_fifo_free_count(kni->tx_q) == 0 ||
>>>                         kni_fifo_count(kni->alloc_q) == 0) {
>>>                 /**
>>>                  * If no free entry in tx_q or no entry in alloc_q,
>>>                  * drops skb and goes out.
>>>                  */
>>>                 goto drop;
>>>         }
>>>
>>> There is no retry here, the packet is dropped.
>>
>> OK. Then pick an implementation which is optimal for this case.
>> I think, then rte_smp_rmb() makes sense here as
>> a) no #ifdef clutter
>> b) it is optimal compared to 2 x LDAR
>>
> As I understand, one of the principles of using the C11 model is to match the store releases and load acquires. IMO, combining the C11 memory model with barrier-based functions makes the code unreadable.
> I realized rte_smp_rmb() is required for x86 as well to prevent compiler reordering. We can add that in the non-C11 case. This way, we will have clean code for both options (similar to rte_ring).
> So, if 'RTE_USE_C11_MEM_MODEL' is set to 'n', then the 'rte_smp_rmb' would be used.
> 
> We can look at handling the #ifdef clutter based on Ferruh's feedback.

Hi Honnappa, Jerin,

Sorry for the delay, I missed that this was waiting on my input.

+1 to remove the #ifdef, but I don't think a separate file is required for this,
especially when it would be a duplication of the same implementation, with
nothing arch-specific in it.
+1 to Honnappa's suggestion to hide the ifdefs behind APIs; plus, those APIs can
be reused later...

And +1 to splitting into two patches, one to fix the current code and one for
C11 atomic implementation support.

I have some basic questions on the patch; I will send them in a different thread.

Thanks,
ferruh

> 
>>
>>>
>>>>
>>>>>
>>>>>>
>>>>>> Other than that, I prefer to avoid ifdef clutter by introducing
>>>>>> two separate files, just like the ring C11 implementation.
>>>>>>
>>>>>> I don't have a strong opinion on this part; I'll let the KNI
>>>>>> MAINTAINER decide how to accommodate this change.
>>>>>
>>>>> I prefer to change this as well; I am open to suggestions.
>>>>> Introducing two separate files would be too much for this library.
>>>>> A better way would be to have something similar to 'smp_store_release'
>>>>> provided by the kernel, i.e. create #defines for loads/stores. Hide
>>>>> the clutter behind the #defines.
>>>>
>>>> No Strong opinion on this, leaving to KNI Maintainer.
>>> Will wait on this before re-spinning the patch
>>>
>>>>
>>>> This patch needs to be split in two:
>>>> a) fixes for the non-C11 implementation (i.e. the new addition of
>>>> rte_smp_wmb())
>>>> b) add support for the C11 implementation.
>>> Agree
>>>
>>>>
>>>>>
>>>>>>
>>>>>>> +       return (fifo->len + fifo_write - fifo_read) & (fifo->len - 1);
>>>>>>> +#else
>>>>>>>         return (fifo->len + fifo->write - fifo->read) & (fifo->len - 1);
> Requires rte_smp_rmb() for x86 to prevent compiler reordering.
> 
>>>>>>> +#endif
>>>>>>>  }
>>>>>>> --
>>>>>>> 2.7.4
>>>>>>>
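
To illustrate Honnappa's 'smp_store_release'-style suggestion that Ferruh
+1'd above, the #ifdef could be confined to two helpers along these lines.
The names are hypothetical and this is only a sketch under the assumptions
of this thread, not the patch that was eventually submitted:

/* Hypothetical helpers hiding RTE_USE_C11_MEM_MODEL in one place. The
 * non-C11 variants keep the existing behaviour: a plain access plus the
 * explicit barrier discussed in this thread (GNU C statement expression
 * used for the load helper). */
#ifdef RTE_USE_C11_MEM_MODEL
#define KNI_LOAD_ACQUIRE(p)	__atomic_load_n((p), __ATOMIC_ACQUIRE)
#define KNI_STORE_RELEASE(p, v)	__atomic_store_n((p), (v), __ATOMIC_RELEASE)
#else
#define KNI_LOAD_ACQUIRE(p) \
	__extension__ ({ typeof(*(p)) __val = *(p); rte_smp_rmb(); __val; })
#define KNI_STORE_RELEASE(p, v) \
	do { rte_smp_wmb(); *(p) = (v); } while (0)
#endif

With such helpers, kni_fifo_put() would end with
KNI_STORE_RELEASE(&fifo->write, fifo_write); and kni_fifo_get() would start
with unsigned fifo_write = KNI_LOAD_ACQUIRE(&fifo->write);, leaving no
#ifdef in the functions themselves.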
  
Ferruh Yigit Sept. 26, 2018, 11:45 a.m. UTC | #10
On 9/19/2018 2:42 PM, dev-bounces@dpdk.org wrote:
> With existing code in kni_fifo_put, rx_q values are not being updated
> before updating fifo_write. While reading rx_q in kni_net_rx_normal,
> This is causing the sync issue on other core. The same situation happens
> in kni_fifo_get as well.
> 
> So syncing the values by adding C11 atomic memory barriers to make sure
> the values being synced before updating fifo_write and fifo_read.
> 
> Fixes: 3fc5ca2 ("kni: initial import")
> Signed-off-by: Phil Yang <phil.yang@arm.com>
> Reviewed-by: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> Reviewed-by: Gavin Hu <Gavin.Hu@arm.com>
> ---
>  .../linuxapp/eal/include/exec-env/rte_kni_common.h |  5 ++++
>  lib/librte_kni/rte_kni_fifo.h                      | 30 +++++++++++++++++++++-
>  2 files changed, 34 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
> index cfa9448..1fd713b 100644
> --- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
> +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
> @@ -54,8 +54,13 @@ struct rte_kni_request {
>   * Writing should never overwrite the read position
>   */
>  struct rte_kni_fifo {
> +#ifndef RTE_USE_C11_MEM_MODEL
>  	volatile unsigned write;     /**< Next position to be written*/
>  	volatile unsigned read;      /**< Next position to be read */
> +#else
> +	unsigned write;              /**< Next position to be written*/
> +	unsigned read;               /**< Next position to be read */
> +#endif
>  	unsigned len;                /**< Circular buffer length */
>  	unsigned elem_size;          /**< Pointer size - for 32/64 bit OS */
>  	void *volatile buffer[];     /**< The buffer contains mbuf pointers */
> diff --git a/lib/librte_kni/rte_kni_fifo.h b/lib/librte_kni/rte_kni_fifo.h
> index ac26a8c..f4171a1 100644
> --- a/lib/librte_kni/rte_kni_fifo.h
> +++ b/lib/librte_kni/rte_kni_fifo.h
> @@ -28,8 +28,13 @@ kni_fifo_put(struct rte_kni_fifo *fifo, void **data, unsigned num)
>  {
>  	unsigned i = 0;
>  	unsigned fifo_write = fifo->write;
> -	unsigned fifo_read = fifo->read;
>  	unsigned new_write = fifo_write;
> +#ifdef RTE_USE_C11_MEM_MODEL
> +	unsigned fifo_read = __atomic_load_n(&fifo->read,
> +						 __ATOMIC_ACQUIRE);
> +#else
> +	unsigned fifo_read = fifo->read;
> +#endif

Why is an atomic load preferred over "volatile"? Won't both end up accessing
memory? Is the atomic load faster?

>  
>  	for (i = 0; i < num; i++) {
>  		new_write = (new_write + 1) & (fifo->len - 1);
> @@ -39,7 +44,12 @@ kni_fifo_put(struct rte_kni_fifo *fifo, void **data, unsigned num)
>  		fifo->buffer[fifo_write] = data[i];
>  		fifo_write = new_write;
>  	}
> +#ifdef RTE_USE_C11_MEM_MODEL
> +	__atomic_store_n(&fifo->write, fifo_write, __ATOMIC_RELEASE);
> +#else
> +	rte_smp_wmb();
>  	fifo->write = fifo_write;
> +#endif

How does the atomic store guarantee that "fifo->write = fifo_write;" will wait
for "fifo->buffer[fifo_write] = data[i];"? Does the atomic store also behave as
a write memory barrier?
  
Phil Yang Sept. 27, 2018, 9:06 a.m. UTC | #11
Thanks for your comments.

I'll update it in the next version.

> -----Original Message-----
> From: Ferruh Yigit <ferruh.yigit@intel.com>
> Sent: Wednesday, September 26, 2018 7:43 PM
> To: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>; Jerin Jacob
> <jerin.jacob@caviumnetworks.com>
> Cc: Phil Yang (Arm Technology China) <Phil.Yang@arm.com>; dev@dpdk.org; nd
> <nd@arm.com>; kkokkilagadda@caviumnetworks.com; Gavin Hu (Arm
> Technology China) <Gavin.Hu@arm.com>
> Subject: Re: [PATCH v2 2/3] kni: fix kni fifo synchronization
> 
> On 9/21/2018 7:37 AM, Honnappa Nagarahalli wrote:
> >>>>>>>
> >>>>>>> @@ -69,5 +89,13 @@ kni_fifo_get(struct rte_kni_fifo *fifo, void **data, unsigned num)
> >>>>>>>  static inline uint32_t
> >>>>>>>  kni_fifo_count(struct rte_kni_fifo *fifo)
> >>>>>>>  {
> >>>>>>> +#ifdef RTE_USE_C11_MEM_MODEL
> >>>>>>> +       unsigned fifo_write = __atomic_load_n(&fifo->write,
> >>>>>>> +                                                 __ATOMIC_ACQUIRE);
> >>>>>>> +       unsigned fifo_read = __atomic_load_n(&fifo->read,
> >>>>>>> +                                                __ATOMIC_ACQUIRE);
> >>>>>>
> >>>>>> Isn't it too heavy to have two __ATOMIC_ACQUIREs? A simple
> >>>>>> rte_smp_rmb() would be enough here, right?
> >>>>>> Or do we need __ATOMIC_ACQUIRE for the fifo_write case?
> >>>>>>
> >>>>> We also had some amount of debate internally on this:
> >>>>> 1) We do not want to use rte_smp_rmb() as we want to keep the
> >>>>> memory
> >>>> models separated (for ex: while using C11, use C11 everywhere). It
> >>>> is also not sufficient, please see 3) below.
> >>>>
> >>>> But there is nothing technically wrong with using rte_smp_rmb() here in
> >>>> terms of functionality and the code generated by the compiler.
> >>>
> >>> rte_smp_rmb() generates 'DMB ISHLD'. This works fine, but it is not optimal.
> >>> 'LDAR' is a better option which is generated when C11 atomics are used.
> >>
> >> Yes. But which one is optimal: 1 x DMB ISHLD vs 2 x LDAR?
> >
> > Good point. I am not sure which one is optimal, it needs to be measured. 'DMB
> ISHLD' orders 'all' earlier loads against 'all' later loads and stores. 'LDAR' orders
> the 'specific' load with 'all' later loads and stores.
> >
> >>
> >>>
> >>>>
> >>>>> 2) This API can get called from writer or reader, so both the
> >>>>> loads have to be __ATOMIC_ACQUIRE
> >>>>> 3) The other option is to use __ATOMIC_RELAXED. That would allow any
> >>>>> loads/stores around this API to get reordered, especially since
> >>>>> this is an inline function. This would put the burden on the
> >>>>> application to manage the ordering depending on its usage. It will
> >>>>> also require the application to understand the implementation of this API.
> >>>>
> >>>> __ATOMIC_RELAXED may be fine too for the _count() case, as it may not
> >>>> be very important to get the exact count at that very moment;
> >>>> the application can retry.
> >>>>
> >>>> I am in favor of performance effective implementation.
> >>>
> >>> The requirement on the correctness of the count depends on the usage of
> >>> this function. I see the following usage:
> >>>
> >>> In the file kni_net.c, function: kni_net_tx:
> >>>
> >>>        if (kni_fifo_free_count(kni->tx_q) == 0 ||
> >>>                         kni_fifo_count(kni->alloc_q) == 0) {
> >>>                 /**
> >>>                  * If no free entry in tx_q or no entry in alloc_q,
> >>>                  * drops skb and goes out.
> >>>                  */
> >>>                 goto drop;
> >>>         }
> >>>
> >>> There is no retry here, the packet is dropped.
> >>
> >> OK. Then pick an implementation which is optimal for this case.
> >> I think, then rte_smp_rmb() makes sense here as
> >> a) no #ifdef clutter
> >> b) it is optimal compared to 2 x LDAR
> >>
> > As I understand, one of the principles of using the C11 model is to match
> > the store releases and load acquires. IMO, combining the C11 memory model
> > with barrier-based functions makes the code unreadable.
> > I realized rte_smp_rmb() is required for x86 as well to prevent compiler
> > reordering. We can add that in the non-C11 case. This way, we will have
> > clean code for both options (similar to rte_ring).
> > So, if 'RTE_USE_C11_MEM_MODEL' is set to 'n', then the 'rte_smp_rmb' would
> > be used.
> >
> > We can look at handling the #ifdef clutter based on Ferruh's feedback.
> 
> Hi Honnappa, Jerin,
> 
> Sorry for the delay, I missed that this was waiting on my input.
> 
> +1 to remove the #ifdef, but I don't think a separate file is required for
> this, especially when it would be a duplication of the same implementation,
> with nothing arch-specific in it.
> +1 to Honnappa's suggestion to hide the ifdefs behind APIs; plus, those APIs
> can be reused later...
> 
> And +1 to splitting into two patches, one to fix the current code and one for
> C11 atomic implementation support.
> 
> I have some basic questions on the patch; I will send them in a different thread.
> 
> Thanks,
> ferruh
> 
> >
> >>
> >>>
> >>>>
> >>>>>
> >>>>>>
> >>>>>> Other than that, I prefer to avoid ifdef clutter by introducing
> >>>>>> two separate files, just like the ring C11 implementation.
> >>>>>>
> >>>>>> I don't have a strong opinion on this part; I'll let the KNI
> >>>>>> MAINTAINER decide how to accommodate this change.
> >>>>>
> >>>>> I prefer to change this as well; I am open to suggestions.
> >>>>> Introducing two separate files would be too much for this library.
> >>>>> A better way would be to have something similar to 'smp_store_release'
> >>>>> provided by the kernel, i.e. create #defines for loads/stores. Hide
> >>>>> the clutter behind the #defines.
> >>>>
> >>>> No Strong opinion on this, leaving to KNI Maintainer.
> >>> Will wait on this before re-spinning the patch
> >>>
> >>>>
> >>>> This patch needs to be split in two:
> >>>> a) fixes for the non-C11 implementation (i.e. the new addition of
> >>>> rte_smp_wmb())
> >>>> b) add support for the C11 implementation.
> >>> Agree
> >>>
> >>>>
> >>>>>
> >>>>>>
> >>>>>>> +       return (fifo->len + fifo_write - fifo_read) & (fifo->len - 1);
> >>>>>>> +#else
> >>>>>>>         return (fifo->len + fifo->write - fifo->read) & (fifo->len - 1);
> > Requires rte_smp_rmb() for x86 to prevent compiler reordering.
> >
> >>>>>>> +#endif
> >>>>>>>  }
> >>>>>>> --
> >>>>>>> 2.7.4
> >>>>>>>

Thanks
Phil
  
Honnappa Nagarahalli Oct. 1, 2018, 4:52 a.m. UTC | #12
> 
> On 9/19/2018 2:42 PM, dev-bounces@dpdk.org wrote:
> > With existing code in kni_fifo_put, rx_q values are not being updated
> > before updating fifo_write. While reading rx_q in kni_net_rx_normal,
> > This is causing the sync issue on other core. The same situation
> > happens in kni_fifo_get as well.
> >
> > So syncing the values by adding C11 atomic memory barriers to make
> > sure the values being synced before updating fifo_write and fifo_read.
> >
> > Fixes: 3fc5ca2 ("kni: initial import")
> > Signed-off-by: Phil Yang <phil.yang@arm.com>
> > Reviewed-by: Honnappa Nagarahalli <Honnappa.Nagarahalli@arm.com>
> > Reviewed-by: Gavin Hu <Gavin.Hu@arm.com>
> > ---
> >  .../linuxapp/eal/include/exec-env/rte_kni_common.h |  5 ++++
> >  lib/librte_kni/rte_kni_fifo.h                      | 30 +++++++++++++++++++++-
> >  2 files changed, 34 insertions(+), 1 deletion(-)
> >
> > diff --git
> > a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
> > b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
> > index cfa9448..1fd713b 100644
> > --- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
> > +++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
> > @@ -54,8 +54,13 @@ struct rte_kni_request {
> >   * Writing should never overwrite the read position
> >   */
> >  struct rte_kni_fifo {
> > +#ifndef RTE_USE_C11_MEM_MODEL
> >  	volatile unsigned write;     /**< Next position to be written*/
> >  	volatile unsigned read;      /**< Next position to be read */
> > +#else
> > +	unsigned write;              /**< Next position to be written*/
> > +	unsigned read;               /**< Next position to be read */
> > +#endif
> >  	unsigned len;                /**< Circular buffer length */
> >  	unsigned elem_size;          /**< Pointer size - for 32/64 bit OS */
> >  	void *volatile buffer[];     /**< The buffer contains mbuf pointers */
> > diff --git a/lib/librte_kni/rte_kni_fifo.h
> > b/lib/librte_kni/rte_kni_fifo.h index ac26a8c..f4171a1 100644
> > --- a/lib/librte_kni/rte_kni_fifo.h
> > +++ b/lib/librte_kni/rte_kni_fifo.h
> > @@ -28,8 +28,13 @@ kni_fifo_put(struct rte_kni_fifo *fifo, void **data, unsigned num)
> >  {
> >  	unsigned i = 0;
> >  	unsigned fifo_write = fifo->write;
> > -	unsigned fifo_read = fifo->read;
> >  	unsigned new_write = fifo_write;
> > +#ifdef RTE_USE_C11_MEM_MODEL
> > +	unsigned fifo_read = __atomic_load_n(&fifo->read,
> > +						 __ATOMIC_ACQUIRE);
> > +#else
> > +	unsigned fifo_read = fifo->read;
> > +#endif
> 
> Why is an atomic load preferred over "volatile"? Won't both end up accessing
> memory? Is the atomic load faster?
> 
My understanding is that with the introduction of C11 atomics, 'volatile' was recommended to be used for memory-mapped I/O locations only. Hence, we removed the 'volatile' from the variables while using C11 (keeping it does not hurt either). However, this also means that every load and store of the variable has to be done using the C11 atomics, including relaxed loads.

The '__atomic_load_n' above provides the memory ordering which a normal load will not. For relaxed-memory-ordered architectures like Arm, the ordering needs to be enforced explicitly for correct functionality.
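
As a standalone (non-DPDK) illustration of this point: both reads below
access memory, but only the acquire load also constrains ordering. The
variable names here are made up for the example.

#include <stdio.h>

static volatile unsigned vol_idx;	/* 'volatile' only forces a reload */
static unsigned atom_idx;		/* accessed via the C11-style builtin */

int main(void)
{
	/* On arm64 this is a plain 'ldr': the compiler must not cache
	 * it, but the CPU may still reorder it with later accesses. */
	unsigned a = vol_idx;

	/* This is an 'ldar': it is ordered before all later loads and
	 * stores, which is the property the FIFO indices need. */
	unsigned b = __atomic_load_n(&atom_idx, __ATOMIC_ACQUIRE);

	printf("%u %u\n", a, b);
	return 0;
}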

> >
> >  	for (i = 0; i < num; i++) {
> >  		new_write = (new_write + 1) & (fifo->len - 1);
> > @@ -39,7 +44,12 @@ kni_fifo_put(struct rte_kni_fifo *fifo, void **data, unsigned num)
> >  		fifo->buffer[fifo_write] = data[i];
> >  		fifo_write = new_write;
> >  	}
> > +#ifdef RTE_USE_C11_MEM_MODEL
> > +	__atomic_store_n(&fifo->write, fifo_write, __ATOMIC_RELEASE);
> > +#else
> > +	rte_smp_wmb();
> >  	fifo->write = fifo_write;
> > +#endif
> 
> How does the atomic store guarantee that "fifo->write = fifo_write;" will wait
> for "fifo->buffer[fifo_write] = data[i];"? Does the atomic store also behave
> as a write memory barrier?
__atomic_store_n with __ATOMIC_RELEASE will prevent the store to fifo->write from being reordered with any preceding loads or stores. This is called a one-way barrier, providing a load-store and a store-store fence [1].

[1] https://preshing.com/20120913/acquire-and-release-semantics/
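
A toy single-producer/single-consumer version of the put/get pairing makes
this visible. The names and layout below are made up for the sketch and do
not match the real KNI structures:

#define TOY_LEN 8	/* power of two, like the KNI FIFO length */

struct toy_fifo {
	unsigned write;
	unsigned read;
	void *buffer[TOY_LEN];
};

/* Producer: the release store (2) cannot be reordered before the
 * payload store (1), so no separate write barrier is needed. */
static inline int
toy_put(struct toy_fifo *f, void *data)
{
	unsigned w = f->write;	/* sole writer of 'write' */
	unsigned next = (w + 1) & (TOY_LEN - 1);

	if (next == __atomic_load_n(&f->read, __ATOMIC_ACQUIRE))
		return 0;					/* full */
	f->buffer[w] = data;					/* (1) */
	__atomic_store_n(&f->write, next, __ATOMIC_RELEASE);	/* (2) */
	return 1;
}

/* Consumer: the acquire load pairs with the release store; once the
 * updated write index is observed, the payload from (1) is visible.
 * The read index is release-stored so the producer does not recycle a
 * slot before the consumer is done with it. */
static inline void *
toy_get(struct toy_fifo *f)
{
	unsigned w = __atomic_load_n(&f->write, __ATOMIC_ACQUIRE);
	unsigned r = f->read;	/* sole writer of 'read' */
	void *data;

	if (r == w)
		return NULL;	/* empty */
	data = f->buffer[r];
	__atomic_store_n(&f->read, (r + 1) & (TOY_LEN - 1),
			 __ATOMIC_RELEASE);
	return data;
}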
  

Patch

diff --git a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
index cfa9448..1fd713b 100644
--- a/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
+++ b/lib/librte_eal/linuxapp/eal/include/exec-env/rte_kni_common.h
@@ -54,8 +54,13 @@  struct rte_kni_request {
  * Writing should never overwrite the read position
  */
 struct rte_kni_fifo {
+#ifndef RTE_USE_C11_MEM_MODEL
 	volatile unsigned write;     /**< Next position to be written*/
 	volatile unsigned read;      /**< Next position to be read */
+#else
+	unsigned write;              /**< Next position to be written*/
+	unsigned read;               /**< Next position to be read */
+#endif
 	unsigned len;                /**< Circular buffer length */
 	unsigned elem_size;          /**< Pointer size - for 32/64 bit OS */
 	void *volatile buffer[];     /**< The buffer contains mbuf pointers */
diff --git a/lib/librte_kni/rte_kni_fifo.h b/lib/librte_kni/rte_kni_fifo.h
index ac26a8c..f4171a1 100644
--- a/lib/librte_kni/rte_kni_fifo.h
+++ b/lib/librte_kni/rte_kni_fifo.h
@@ -28,8 +28,13 @@  kni_fifo_put(struct rte_kni_fifo *fifo, void **data, unsigned num)
 {
 	unsigned i = 0;
 	unsigned fifo_write = fifo->write;
-	unsigned fifo_read = fifo->read;
 	unsigned new_write = fifo_write;
+#ifdef RTE_USE_C11_MEM_MODEL
+	unsigned fifo_read = __atomic_load_n(&fifo->read,
+						 __ATOMIC_ACQUIRE);
+#else
+	unsigned fifo_read = fifo->read;
+#endif
 
 	for (i = 0; i < num; i++) {
 		new_write = (new_write + 1) & (fifo->len - 1);
@@ -39,7 +44,12 @@  kni_fifo_put(struct rte_kni_fifo *fifo, void **data, unsigned num)
 		fifo->buffer[fifo_write] = data[i];
 		fifo_write = new_write;
 	}
+#ifdef RTE_USE_C11_MEM_MODEL
+	__atomic_store_n(&fifo->write, fifo_write, __ATOMIC_RELEASE);
+#else
+	rte_smp_wmb();
 	fifo->write = fifo_write;
+#endif
 	return i;
 }
 
@@ -51,7 +61,12 @@  kni_fifo_get(struct rte_kni_fifo *fifo, void **data, unsigned num)
 {
 	unsigned i = 0;
 	unsigned new_read = fifo->read;
+#ifdef RTE_USE_C11_MEM_MODEL
+	unsigned fifo_write = __atomic_load_n(&fifo->write, __ATOMIC_ACQUIRE);
+#else
 	unsigned fifo_write = fifo->write;
+#endif
+
 	for (i = 0; i < num; i++) {
 		if (new_read == fifo_write)
 			break;
@@ -59,7 +74,12 @@  kni_fifo_get(struct rte_kni_fifo *fifo, void **data, unsigned num)
 		data[i] = fifo->buffer[new_read];
 		new_read = (new_read + 1) & (fifo->len - 1);
 	}
+#ifdef RTE_USE_C11_MEM_MODEL
+	__atomic_store_n(&fifo->read, new_read, __ATOMIC_RELEASE);
+#else
+	rte_smp_wmb();
 	fifo->read = new_read;
+#endif
 	return i;
 }
 
@@ -69,5 +89,13 @@  kni_fifo_get(struct rte_kni_fifo *fifo, void **data, unsigned num)
 static inline uint32_t
 kni_fifo_count(struct rte_kni_fifo *fifo)
 {
+#ifdef RTE_USE_C11_MEM_MODEL
+	unsigned fifo_write = __atomic_load_n(&fifo->write,
+						  __ATOMIC_ACQUIRE);
+	unsigned fifo_read = __atomic_load_n(&fifo->read,
+						 __ATOMIC_ACQUIRE);
+	return (fifo->len + fifo_write - fifo_read) & (fifo->len - 1);
+#else
 	return (fifo->len + fifo->write - fifo->read) & (fifo->len - 1);
+#endif
 }
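
Relating the fix back to the kni_net_tx() usage quoted earlier in the
thread: both count helpers read indices updated by the peer core, which is
where the acquire ordering matters. A hypothetical simplification of that
drop decision (skb handling and other kernel context omitted), assuming the
kni_fifo_free_count() companion helper from the same header:

/* Sketch only: returns 0 when the quoted code would drop the skb. */
static int
kni_can_tx(struct kni_dev *kni)
{
	/* No free entry in tx_q or no entry in alloc_q means drop. */
	if (kni_fifo_free_count(kni->tx_q) == 0 ||
	    kni_fifo_count(kni->alloc_q) == 0)
		return 0;
	return 1;
}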