[dpdk-dev,v5,06/20] event/sw: add support for event queues

Message ID 1490374395-149320-7-git-send-email-harry.van.haaren@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Jerin Jacob
Checks

Context               Check     Description
ci/checkpatch         warning   coding style issues
ci/Intel-compilation  success   Compilation OK

Commit Message

Van Haaren, Harry March 24, 2017, 4:53 p.m. UTC
  From: Bruce Richardson <bruce.richardson@intel.com>

Add in the data structures for the event queues, and the eventdev
functions to create and destroy those queues.

Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
---
 drivers/event/sw/iq_ring.h  | 176 ++++++++++++++++++++++++++++++++++++++++++++
 drivers/event/sw/sw_evdev.c | 166 +++++++++++++++++++++++++++++++++++++++++
 drivers/event/sw/sw_evdev.h |   5 ++
 3 files changed, 347 insertions(+)
 create mode 100644 drivers/event/sw/iq_ring.h
  

Comments

Jerin Jacob March 27, 2017, 7:45 a.m. UTC | #1
On Fri, Mar 24, 2017 at 04:53:01PM +0000, Harry van Haaren wrote:
> From: Bruce Richardson <bruce.richardson@intel.com>
> 
> Add in the data structures for the event queues, and the eventdev
> functions to create and destroy those queues.
> 
> Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
> Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> ---
>  drivers/event/sw/iq_ring.h  | 176 ++++++++++++++++++++++++++++++++++++++++++++
>  drivers/event/sw/sw_evdev.c | 166 +++++++++++++++++++++++++++++++++++++++++
>  drivers/event/sw/sw_evdev.h |   5 ++
>  3 files changed, 347 insertions(+)
>  create mode 100644 drivers/event/sw/iq_ring.h
> 
> diff --git a/drivers/event/sw/iq_ring.h b/drivers/event/sw/iq_ring.h
> new file mode 100644
> index 0000000..d480d15
> --- /dev/null
> +++ b/drivers/event/sw/iq_ring.h
> @@ -0,0 +1,176 @@
> +/*-
> + *   BSD LICENSE
> + *
> + *   Copyright(c) 2016-2017 Intel Corporation. All rights reserved.
> + *
> + *   Redistribution and use in source and binary forms, with or without
> + *   modification, are permitted provided that the following conditions
> + *   are met:
> + *
> + *     * Redistributions of source code must retain the above copyright
> + *       notice, this list of conditions and the following disclaimer.
> + *     * Redistributions in binary form must reproduce the above copyright
> + *       notice, this list of conditions and the following disclaimer in
> + *       the documentation and/or other materials provided with the
> + *       distribution.
> + *     * Neither the name of Intel Corporation nor the names of its
> + *       contributors may be used to endorse or promote products derived
> + *       from this software without specific prior written permission.
> + *
> + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +
> +/*
> + * Ring structure definitions used for the internal ring buffers of the
> + * SW eventdev implementation. These are designed for single-core use only.
> + */

The plan is to replace this file with the generic rte_ring once Bruce's ring
rework[1] lands in the master branch. Right?

[1] http://dpdk.org/ml/archives/dev/2017-March/061372.html

> +#ifndef _IQ_RING_
> +#define _IQ_RING_
> +
> +#include <stdint.h>
> +
> +#include <rte_common.h>
> +#include <rte_memory.h>

> +++ b/drivers/event/sw/sw_evdev.c
> @@ -38,12 +38,176 @@
>  #include <rte_ring.h>
>  
>  #include "sw_evdev.h"
> +#include "iq_ring.h"
>  
>  #define EVENTDEV_NAME_SW_PMD event_sw
>  #define NUMA_NODE_ARG "numa_node"
>  #define SCHED_QUANTA_ARG "sched_quanta"
>  #define CREDIT_QUANTA_ARG "credit_quanta"
>  
> +static int32_t
> +qid_init(struct sw_evdev *sw, unsigned int idx, int type,
> +		const struct rte_event_queue_conf *queue_conf)
> +{
> +	unsigned int i;
> +	int dev_id = sw->data->dev_id;
> +	int socket_id = sw->data->socket_id;
> +	char buf[IQ_RING_NAMESIZE];
> +	struct sw_qid *qid = &sw->qids[idx];
> +
> +	for (i = 0; i < SW_IQS_MAX; i++) {

Just for my understanding, are 4 (SW_IQS_MAX) IQ rings created to address
different priorities for each enqueue operation? What is the significance of
4 (SW_IQS_MAX) here?

> +		snprintf(buf, sizeof(buf), "q_%u_iq_%d", idx, i);
> +		qid->iq[i] = iq_ring_create(buf, socket_id);
> +		if (!qid->iq[i]) {
> +			SW_LOG_DBG("ring create failed");
> +			goto cleanup;
> +		}
> +	}
> +
> +
> +static int
> +sw_queue_setup(struct rte_eventdev *dev, uint8_t queue_id,
> +		const struct rte_event_queue_conf *conf)
> +{
> +	int type;
> +
> +	switch (conf->event_queue_cfg) {
> +	case RTE_EVENT_QUEUE_CFG_SINGLE_LINK:
> +		type = SW_SCHED_TYPE_DIRECT;
> +		break;

event_queue_cfg is a bitmap. It is valid to have
RTE_EVENT_QUEUE_CFG_SINGLE_LINK | RTE_EVENT_QUEUE_CFG_ATOMIC_ONLY,
i.e. an atomic schedule type queue that has only one port linked to
dequeue the events.
So in the above context, the switch case is not correct, i.e.
it goes to the default condition. Right?
Is this intentional?

If I understand it correctly, based on the use case (queue-based event
pipelining) you have shared in
the documentation patch, RTE_EVENT_QUEUE_CFG_SINGLE_LINK is used for the last
stage (last queue). One option is that if the SW PMD cannot support
RTE_EVENT_QUEUE_CFG_SINGLE_LINK | RTE_EVENT_QUEUE_CFG_ATOMIC_ONLY mode,
then even though the application sets RTE_EVENT_QUEUE_CFG_SINGLE_LINK |
RTE_EVENT_QUEUE_CFG_ATOMIC_ONLY, the driver can ignore
RTE_EVENT_QUEUE_CFG_ATOMIC_ONLY. But I am not sure about the case where the
application sets RTE_EVENT_QUEUE_CFG_SINGLE_LINK in the middle of the pipeline.

Thoughts?

> +	case RTE_EVENT_QUEUE_CFG_ATOMIC_ONLY:
> +		type = RTE_SCHED_TYPE_ATOMIC;
> +		break;
> +	case RTE_EVENT_QUEUE_CFG_ORDERED_ONLY:
> +		type = RTE_SCHED_TYPE_ORDERED;
> +		break;
> +	case RTE_EVENT_QUEUE_CFG_PARALLEL_ONLY:
> +		type = RTE_SCHED_TYPE_PARALLEL;
> +		break;
> +	case RTE_EVENT_QUEUE_CFG_ALL_TYPES:
> +		SW_LOG_ERR("QUEUE_CFG_ALL_TYPES not supported\n");
> +		return -ENOTSUP;
> +	default:
> +		SW_LOG_ERR("Unknown queue type %d requested\n",
> +			   conf->event_queue_cfg);
> +		return -EINVAL;
> +	}
> +
> +	struct sw_evdev *sw = sw_pmd_priv(dev);
> +	return qid_init(sw, queue_id, type, conf);
> +}
  
Bruce Richardson March 27, 2017, 8:47 a.m. UTC | #2
On Mon, Mar 27, 2017 at 01:15:06PM +0530, Jerin Jacob wrote:
> On Fri, Mar 24, 2017 at 04:53:01PM +0000, Harry van Haaren wrote:
> > From: Bruce Richardson <bruce.richardson@intel.com>
> > 
> > Add in the data structures for the event queues, and the eventdev
> > functions to create and destroy those queues.
> > 
> > Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
> > Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> > ---
> >  drivers/event/sw/iq_ring.h  | 176 ++++++++++++++++++++++++++++++++++++++++++++
> >  drivers/event/sw/sw_evdev.c | 166 +++++++++++++++++++++++++++++++++++++++++
> >  drivers/event/sw/sw_evdev.h |   5 ++
> >  3 files changed, 347 insertions(+)
> >  create mode 100644 drivers/event/sw/iq_ring.h
> > 
> > diff --git a/drivers/event/sw/iq_ring.h b/drivers/event/sw/iq_ring.h
> > new file mode 100644
> > index 0000000..d480d15
> > --- /dev/null
> > +++ b/drivers/event/sw/iq_ring.h
> > @@ -0,0 +1,176 @@
> > +/*-
> > + *   BSD LICENSE
> > + *
> > + *   Copyright(c) 2016-2017 Intel Corporation. All rights reserved.
> > + *
> > + *   Redistribution and use in source and binary forms, with or without
> > + *   modification, are permitted provided that the following conditions
> > + *   are met:
> > + *
> > + *     * Redistributions of source code must retain the above copyright
> > + *       notice, this list of conditions and the following disclaimer.
> > + *     * Redistributions in binary form must reproduce the above copyright
> > + *       notice, this list of conditions and the following disclaimer in
> > + *       the documentation and/or other materials provided with the
> > + *       distribution.
> > + *     * Neither the name of Intel Corporation nor the names of its
> > + *       contributors may be used to endorse or promote products derived
> > + *       from this software without specific prior written permission.
> > + *
> > + *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> > + *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> > + *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> > + *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> > + *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> > + *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> > + *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> > + *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> > + *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> > + *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> > + *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> > + */
> > +
> > +/*
> > + * Ring structure definitions used for the internal ring buffers of the
> > + * SW eventdev implementation. These are designed for single-core use only.
> > + */
> 
> The plan is to replace this file with the generic rte_ring once Bruce's ring
> rework[1] lands in the master branch. Right?
> 
> [1] http://dpdk.org/ml/archives/dev/2017-March/061372.html
> 
Yes, we hope to be able to leverage that rework in the future.

/Bruce
  
Van Haaren, Harry March 27, 2017, 3:17 p.m. UTC | #3
> From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> Sent: Monday, March 27, 2017 8:45 AM
> To: Van Haaren, Harry <harry.van.haaren@intel.com>
> Cc: dev@dpdk.org; Richardson, Bruce <bruce.richardson@intel.com>
> Subject: Re: [PATCH v5 06/20] event/sw: add support for event queues
> 
> On Fri, Mar 24, 2017 at 04:53:01PM +0000, Harry van Haaren wrote:
> > From: Bruce Richardson <bruce.richardson@intel.com>
> >
> > Add in the data structures for the event queues, and the eventdev
> > functions to create and destroy those queues.
> >
> > Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
> > Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> > ---

<snip>

> > +static int32_t
> > +qid_init(struct sw_evdev *sw, unsigned int idx, int type,
> > +		const struct rte_event_queue_conf *queue_conf)
> > +{
> > +	unsigned int i;
> > +	int dev_id = sw->data->dev_id;
> > +	int socket_id = sw->data->socket_id;
> > +	char buf[IQ_RING_NAMESIZE];
> > +	struct sw_qid *qid = &sw->qids[idx];
> > +
> > +	for (i = 0; i < SW_IQS_MAX; i++) {
> 
> Just for my understanding, are 4 (SW_IQS_MAX) IQ rings created to address
> different priorities for each enqueue operation? What is the significance of
> 4 (SW_IQS_MAX) here?

Yes each IQ represents a priority level. There is a compile-time define (SW_IQS_MAX) which allows setting the number of internal-queues at each queue stage. The default number of priorities is currently 4.


> > +static int
> > +sw_queue_setup(struct rte_eventdev *dev, uint8_t queue_id,
> > +		const struct rte_event_queue_conf *conf)
> > +{
> > +	int type;
> > +
> > +	switch (conf->event_queue_cfg) {
> > +	case RTE_EVENT_QUEUE_CFG_SINGLE_LINK:
> > +		type = SW_SCHED_TYPE_DIRECT;
> > +		break;
> 
> event_queue_cfg is a bitmap. It is valid to have
> RTE_EVENT_QUEUE_CFG_SINGLE_LINK | RTE_EVENT_QUEUE_CFG_ATOMIC_ONLY,
> i.e. an atomic schedule type queue that has only one port linked to
> dequeue the events.
> So in the above context, the switch case is not correct, i.e.
> it goes to the default condition. Right?
> Is this intentional?
> 
> If I understand it correctly, based on the use case (queue-based event
> pipelining) you have shared in
> the documentation patch, RTE_EVENT_QUEUE_CFG_SINGLE_LINK is used for the last
> stage (last queue). One option is that if the SW PMD cannot support
> RTE_EVENT_QUEUE_CFG_SINGLE_LINK | RTE_EVENT_QUEUE_CFG_ATOMIC_ONLY mode,
> then even though the application sets RTE_EVENT_QUEUE_CFG_SINGLE_LINK |
> RTE_EVENT_QUEUE_CFG_ATOMIC_ONLY, the driver can ignore
> RTE_EVENT_QUEUE_CFG_ATOMIC_ONLY. But I am not sure about the case where the
> application sets RTE_EVENT_QUEUE_CFG_SINGLE_LINK in the middle of the pipeline.
> 
> Thoughts?


I don't like the idea of the SW PMD ignoring flags for queues - the PMD has no idea if the queue is the final or middle of the pipeline, as it's the application's usage which defines that.


Does anybody have a need for a queue to be both Atomic *and* Single-link?  I understand the current API doesn't prohibit it, but I don't see the actual use-case in which that may be useful. Atomic implies load-balancing is occurring, single link implies there is only one consuming core. Those seem like opposites to me?

Unless anybody sees value in queues having both, I suggest we update the documentation to specify that a queue is either load balanced, or single-link, and that setting both flags will result in -ENOTSUP being returned. (This check can be added to the EventDev layer if consistent for all PMDs.)
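
A minimal sketch of what that check could look like at the eventdev layer
(assuming SINGLE_LINK is a separate bit in the event_queue_cfg bitmap;
validate_queue_cfg is illustrative, not an existing API):

#include <errno.h>
#include <rte_eventdev.h>

static int
validate_queue_cfg(uint32_t event_queue_cfg)
{
	/* a queue is either load-balanced or single-link, never both */
	uint32_t other = event_queue_cfg & ~RTE_EVENT_QUEUE_CFG_SINGLE_LINK;

	if ((event_queue_cfg & RTE_EVENT_QUEUE_CFG_SINGLE_LINK) && other)
		return -ENOTSUP;
	return 0;
}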


Counter-thoughts?


> > +	case RTE_EVENT_QUEUE_CFG_ATOMIC_ONLY:
> > +		type = RTE_SCHED_TYPE_ATOMIC;
> > +		break;
> > +	case RTE_EVENT_QUEUE_CFG_ORDERED_ONLY:
> > +		type = RTE_SCHED_TYPE_ORDERED;
> > +		break;
> > +	case RTE_EVENT_QUEUE_CFG_PARALLEL_ONLY:
> > +		type = RTE_SCHED_TYPE_PARALLEL;
> > +		break;
> > +	case RTE_EVENT_QUEUE_CFG_ALL_TYPES:
> > +		SW_LOG_ERR("QUEUE_CFG_ALL_TYPES not supported\n");
> > +		return -ENOTSUP;
> > +	default:
> > +		SW_LOG_ERR("Unknown queue type %d requested\n",
> > +			   conf->event_queue_cfg);
> > +		return -EINVAL;
> > +	}
> > +
> > +	struct sw_evdev *sw = sw_pmd_priv(dev);
> > +	return qid_init(sw, queue_id, type, conf);
> > +}
  
Jerin Jacob March 28, 2017, 10:43 a.m. UTC | #4
On Mon, Mar 27, 2017 at 03:17:48PM +0000, Van Haaren, Harry wrote:
> > From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> > Sent: Monday, March 27, 2017 8:45 AM
> > To: Van Haaren, Harry <harry.van.haaren@intel.com>
> > Cc: dev@dpdk.org; Richardson, Bruce <bruce.richardson@intel.com>
> > Subject: Re: [PATCH v5 06/20] event/sw: add support for event queues
> > 
> > On Fri, Mar 24, 2017 at 04:53:01PM +0000, Harry van Haaren wrote:
> > > From: Bruce Richardson <bruce.richardson@intel.com>
> > >
> > > Add in the data structures for the event queues, and the eventdev
> > > functions to create and destroy those queues.
> > >
> > > Signed-off-by: Bruce Richardson <bruce.richardson@intel.com>
> > > Signed-off-by: Harry van Haaren <harry.van.haaren@intel.com>
> > > ---
> 
> <snip>
> 
> > > +static int32_t
> > > +qid_init(struct sw_evdev *sw, unsigned int idx, int type,
> > > +		const struct rte_event_queue_conf *queue_conf)
> > > +{
> > > +	unsigned int i;
> > > +	int dev_id = sw->data->dev_id;
> > > +	int socket_id = sw->data->socket_id;
> > > +	char buf[IQ_RING_NAMESIZE];
> > > +	struct sw_qid *qid = &sw->qids[idx];
> > > +
> > > +	for (i = 0; i < SW_IQS_MAX; i++) {
> > 
> > Just for my understanding, are 4 (SW_IQS_MAX) IQ rings created to address
> > different priorities for each enqueue operation? What is the significance of
> > 4 (SW_IQS_MAX) here?
> 
> Yes each IQ represents a priority level. There is a compile-time define (SW_IQS_MAX) which allows setting the number of internal-queues at each queue stage. The default number of priorities is currently 4.

OK. The reason why I asked is because, if I understood it correctly,
PRIO_TO_IQ is not normalizing correctly if SW_IQS_MAX == 4.

I thought the following mapping would be the correct normalization if
SW_IQS_MAX == 4.

What do you think?

priority   -> iq
  0 -  63  -> 0
 64 - 127  -> 1
128 - 191  -> 2
192 - 255  -> 3
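
For illustration, a sketch of that normalization when SW_IQS_MAX == 4
(assuming the even split shown in the table, 64 priority values per IQ):

#define PRIO_TO_IQ(prio) ((prio) >> 6) /* 256 / SW_IQS_MAX == 64 per IQ */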

Snippet from header file:
uint8_t priority;
/**< Event priority relative to other events in the
 * event queue. The requested priority should in the
 * range of  [RTE_EVENT_DEV_PRIORITY_HIGHEST,
 * RTE_EVENT_DEV_PRIORITY_LOWEST].
 * The implementation shall normalize the requested
 * priority to supported priority value.
 * Valid when the device has
 * RTE_EVENT_DEV_CAP_EVENT_QOS capability.
 */

> 
> 
> > > +static int
> > > +sw_queue_setup(struct rte_eventdev *dev, uint8_t queue_id,
> > > +		const struct rte_event_queue_conf *conf)
> > > +{
> > > +	int type;
> > > +
> > > +	switch (conf->event_queue_cfg) {
> > > +	case RTE_EVENT_QUEUE_CFG_SINGLE_LINK:
> > > +		type = SW_SCHED_TYPE_DIRECT;
> > > +		break;
> > 
> > event_queue_cfg is a bitmap. It is valid to have
> > RTE_EVENT_QUEUE_CFG_SINGLE_LINK | RTE_EVENT_QUEUE_CFG_ATOMIC_ONLY,
> > i.e. an atomic schedule type queue that has only one port linked to
> > dequeue the events.
> > So in the above context, the switch case is not correct, i.e.
> > it goes to the default condition. Right?
> > Is this intentional?
> > 
> > If I understand it correctly, based on the use case (queue-based event
> > pipelining) you have shared in
> > the documentation patch, RTE_EVENT_QUEUE_CFG_SINGLE_LINK is used for the last
> > stage (last queue). One option is that if the SW PMD cannot support
> > RTE_EVENT_QUEUE_CFG_SINGLE_LINK | RTE_EVENT_QUEUE_CFG_ATOMIC_ONLY mode,
> > then even though the application sets RTE_EVENT_QUEUE_CFG_SINGLE_LINK |
> > RTE_EVENT_QUEUE_CFG_ATOMIC_ONLY, the driver can ignore
> > RTE_EVENT_QUEUE_CFG_ATOMIC_ONLY. But I am not sure about the case where the
> > application sets RTE_EVENT_QUEUE_CFG_SINGLE_LINK in the middle of the pipeline.
> > 
> > Thoughts?
> 
> 
> I don't like the idea of the SW PMD ignoring flags for queues - the PMD has no idea if the queue is the final or middle of the pipeline, as it's the application's usage which defines that.
> 
> 
> Does anybody have a need for a queue to be both Atomic *and* Single-link?  I understand the current API doesn't prohibit it, but I don't see the actual use-case in which that may be useful. Atomic implies load-balancing is occurring, single link implies there is only one consuming core. Those seem like opposites to me?
> 
> Unless anybody sees value in queues having both, I suggest we update the documentation to specify that a queue is either load balanced, or single-link, and that setting both flags will result in -ENOTSUP being returned. (This check can be added to the EventDev layer if consistent for all PMDs.)

If I understand it correctly (based on the previous discussions),
HW implementations (Cavium or NXP) do not
need to use the RTE_EVENT_QUEUE_CFG_* flags for the operations (the sched type
will be derived from event.sched_type on enqueue). So that means we are
free to tailor the header file based on the SW PMD requirement here.
But semantically it has to be in line with the rest of the header file. We can
work together to make it happen.

A few questions for everyone's benefit:

1) Does RTE_EVENT_QUEUE_CFG_SINGLE_LINK have any meaning other than an
event queue linked to only a single port? Based on the discussions, it was
added to the header file so that the SW PMD can know upfront that only a
single port will be linked to the given event queue. It was added as an
optimization for the SW PMD. Does it have any functional expectation?


2) Based on the following topology given in the documentation patch for
queue-based event pipelining,

  rx_port        w1_port
	 \     /         \
	  qid0 - w2_port - qid1
	       \         /     \
		 w3_port        tx_port

a) I understand that rx_port is feeding events to qid0.
b) But do you see any issue with the following model? IMO, it scales well
linearly based on the number of cores available to work (since it is ATOMIC to
ATOMIC). Nothing is wrong with
qid1 just connecting to tx_port; I am just trying to understand the rationale
behind it.

  rx_port        w1_port         w1_port
	 \     /         \     /
	  qid0 - w2_port - qid1- w2_port
	       \         /     \
		 w3_port         w3_port

3)
> Does anybody have a need for a queue to be both Atomic *and* Single-link?  I understand the current API doesn't prohibit it, but I don't see the actual use-case in which that may be useful. Atomic implies load-balancing is occurring, single link implies there is only one consuming core. Those seem like opposites to me?

I can think about the following use case:

topology:

  rx_port        w1_port
	 \     /         \
	  qid0 - w2_port - qid1
	       \         /     \
		 w3_port        tx_port

Use case:

Queue-based event pipelining:
ORDERED (Stage 1) to ATOMIC (Stage 2) pipeline:
- For ingress order maintenance
- For executing Stage 1 in parallel for better scaling
i.e. a fat flow can be sprayed over N cores while maintaining the ingress
order when it is sent out on the wire (after consuming from tx_port).

I am not sure how the SW PMD works in the use case of ingress order maintenance.

But the HW and the header file expect this form:
Snippet from header file:
--
 * The source flow ordering from an event queue is maintained when events are
 * enqueued to their destination queue within the same ordered flow context.
 *
 * Events from the source queue appear in their original order when dequeued
 * from a destination queue.
--
Here qid0 is the source queue with ORDERED sched_type and qid1 is the
destination queue with ATOMIC sched_type. qid1 can be linked to only one
port (tx_port).

Are we on the same page? If not, let me know the differences; we will try to
accommodate the same in the header file.

> 

> 
> 
> Counter-thoughts?


> 
> 
> > > +	case RTE_EVENT_QUEUE_CFG_ATOMIC_ONLY:
> > > +		type = RTE_SCHED_TYPE_ATOMIC;
> > > +		break;
> > > +	case RTE_EVENT_QUEUE_CFG_ORDERED_ONLY:
> > > +		type = RTE_SCHED_TYPE_ORDERED;
> > > +		break;
> > > +	case RTE_EVENT_QUEUE_CFG_PARALLEL_ONLY:
> > > +		type = RTE_SCHED_TYPE_PARALLEL;
> > > +		break;
> > > +	case RTE_EVENT_QUEUE_CFG_ALL_TYPES:
> > > +		SW_LOG_ERR("QUEUE_CFG_ALL_TYPES not supported\n");
> > > +		return -ENOTSUP;
> > > +	default:
> > > +		SW_LOG_ERR("Unknown queue type %d requested\n",
> > > +			   conf->event_queue_cfg);
> > > +		return -EINVAL;
> > > +	}
> > > +
> > > +	struct sw_evdev *sw = sw_pmd_priv(dev);
> > > +	return qid_init(sw, queue_id, type, conf);
> > > +}
  
Van Haaren, Harry March 28, 2017, 12:42 p.m. UTC | #5
> From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> Sent: Tuesday, March 28, 2017 11:43 AM
> To: Van Haaren, Harry <harry.van.haaren@intel.com>
> Cc: dev@dpdk.org; Richardson, Bruce <bruce.richardson@intel.com>
> Subject: Re: [PATCH v5 06/20] event/sw: add support for event queues
> 
> On Mon, Mar 27, 2017 at 03:17:48PM +0000, Van Haaren, Harry wrote:
> > > From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> > > Sent: Monday, March 27, 2017 8:45 AM
> > > To: Van Haaren, Harry <harry.van.haaren@intel.com>
> > > Cc: dev@dpdk.org; Richardson, Bruce <bruce.richardson@intel.com>
> > > Subject: Re: [PATCH v5 06/20] event/sw: add support for event queues

<snip code + details>

> > > Just for my understanding, are 4 (SW_IQS_MAX) IQ rings created to address
> > > different priorities for each enqueue operation? What is the significance of
> > > 4 (SW_IQS_MAX) here?
> >
> > Yes each IQ represents a priority level. There is a compile-time define (SW_IQS_MAX) which
> allows setting the number of internal-queues at each queue stage. The default number of
> priorities is currently 4.
> 
> OK. The reason why I asked is because, if I understood it correctly,
> PRIO_TO_IQ is not normalizing correctly if SW_IQS_MAX == 4.
> 
> I thought the following mapping would be the correct normalization if
> SW_IQS_MAX == 4.
> 
> What do you think?

<snip code suggestion + api header>

Good catch - agreed, will fix.


> > > > +static int
> > > > +sw_queue_setup(struct rte_eventdev *dev, uint8_t queue_id,
> > > > +		const struct rte_event_queue_conf *conf)
> > > > +{
> > > > +	int type;
> > > > +
> > > > +	switch (conf->event_queue_cfg) {
> > > > +	case RTE_EVENT_QUEUE_CFG_SINGLE_LINK:
> > > > +		type = SW_SCHED_TYPE_DIRECT;
> > > > +		break;
> > >
> > > event_queue_cfg is a bitmap. It is valid to have
> > > RTE_EVENT_QUEUE_CFG_SINGLE_LINK | RTE_EVENT_QUEUE_CFG_ATOMIC_ONLY,
> > > i.e. an atomic schedule type queue that has only one port linked to
> > > dequeue the events.
> > > So in the above context, the switch case is not correct, i.e.
> > > it goes to the default condition. Right?
> > > Is this intentional?
> > >
> > > If I understand it correctly, based on the use case (queue-based event
> > > pipelining) you have shared in
> > > the documentation patch, RTE_EVENT_QUEUE_CFG_SINGLE_LINK is used for the last
> > > stage (last queue). One option is that if the SW PMD cannot support
> > > RTE_EVENT_QUEUE_CFG_SINGLE_LINK | RTE_EVENT_QUEUE_CFG_ATOMIC_ONLY mode,
> > > then even though the application sets RTE_EVENT_QUEUE_CFG_SINGLE_LINK |
> > > RTE_EVENT_QUEUE_CFG_ATOMIC_ONLY, the driver can ignore
> > > RTE_EVENT_QUEUE_CFG_ATOMIC_ONLY. But I am not sure about the case where the
> > > application sets RTE_EVENT_QUEUE_CFG_SINGLE_LINK in the middle of the pipeline.
> > >
> > > Thoughts?
> >
> >
> > I don't like the idea of the SW PMD ignoring flags for queues - the PMD has no idea if the
> queue is the final or middle of the pipeline, as it's the application's usage which defines that.
> >
> >
> > Does anybody have a need for a queue to be both Atomic *and* Single-link?  I understand the
> current API doesn't prohibit it, but I don't see the actual use-case in which that may be
> useful. Atomic implies load-balancing is occurring, single link implies there is only one
> consuming core. Those seem like opposites to me?
> >
> > Unless anybody sees value in queues having both, I suggest we update the documentation to
> specify that a queue is either load balanced, or single-link, and that setting both flags will
> result in -ENOTSUP being returned. (This check can be added to the EventDev layer if consistent for
> all PMDs.)
> 
> If I understand it correctly (based on the previous discussions),
> HW implementations (Cavium or NXP) do not
> need to use the RTE_EVENT_QUEUE_CFG_* flags for the operations (the sched type
> will be derived from event.sched_type on enqueue). So that means we are
> free to tailor the header file based on the SW PMD requirement here.
> But semantically it has to be in line with the rest of the header file. We can
> work together to make it happen.

OK :)


> A few questions for everyone's benefit:
> 
> 1) Does RTE_EVENT_QUEUE_CFG_SINGLE_LINK have any meaning other than an
> event queue linked to only a single port? Based on the discussions, it was
> added to the header file so that the SW PMD can know upfront that only a
> single port will be linked to the given event queue. It was added as an
> optimization for the SW PMD. Does it have any functional expectation?

In the context of the SW PMD, SINGLE_LINK means that a specific queue and port have a unique relationship in that there is only one connection. This allows bypassing of the Atomic, Ordering and Load-Balancing code. The result is a good performance increase, particularly if the worker port dequeue depth is large, as then large bursts of packets can be dequeued with little overhead.

As a result, (ATOMIC | SINGLE_LINK) is not a supported combination for the sw pmd queue types.
To be more precise, a SINGLE_LINK is its own queue type, and cannot be OR-ed with any other type.


> 2) Based on the following topology given in the documentation patch for
> queue-based event pipelining,
> 
>   rx_port    w1_port
> 	 \     /         \
> 	  qid0 - w2_port - qid1
> 	       \         /     \
> 		    w3_port        tx_port
> 
> a) I understand that rx_port is feeding events to qid0.
> b) But do you see any issue with the following model? IMO, it scales well
> linearly based on the number of cores available to work (since it is ATOMIC to
> ATOMIC). Nothing is wrong with
> qid1 just connecting to tx_port; I am just trying to understand the rationale
> behind it.
> 
>   rx_port   w1_port         w1_port
> 	 \     /         \     /
> 	  qid0 - w2_port - qid1- w2_port
> 	       \         /     \
> 		   w3_port         w3_port


This is also a valid model from the SW eventdev. 
The value of using a SINGLE_LINK at the end of a pipeline is:
A) it can TX all traffic on a single core (using a single queue)
B) re-ordering of traffic from the previous stage is possible

To illustrate (B), here is a very simple pipeline:

 RX port -> QID #1 (Ordered) -> workers(eg 4 ports) -> QID # 2 (SINGLE_LINK to tx) -> TX port

Here, QID #1 is allowed to send the packets out of order to the 4 worker ports - because they are later passed back to the eventdev for re-ordering before they get to the SINGLE_LINK stage, and then TX in the correct order.
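
As a rough sketch, configuring that simple pipeline through the public API
could look like the following (the queue ids, the reorder-window depth and
the helper name are illustrative):

#include <rte_eventdev.h>

static void
setup_pipeline_queues(uint8_t dev_id)
{
	struct rte_event_queue_conf qconf = {
		.event_queue_cfg = RTE_EVENT_QUEUE_CFG_ORDERED_ONLY,
		.nb_atomic_order_sequences = 1024, /* reorder window */
		.priority = RTE_EVENT_DEV_PRIORITY_NORMAL,
	};

	rte_event_queue_setup(dev_id, 0, &qconf);  /* QID #1: Ordered */

	qconf.event_queue_cfg = RTE_EVENT_QUEUE_CFG_SINGLE_LINK;
	rte_event_queue_setup(dev_id, 1, &qconf);  /* QID #2: single-link to TX */
}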


> 3)
> > Does anybody have a need for a queue to be both Atomic *and* Single-link?  I understand the
> current API doesn't prohibit it, but I don't see the actual use-case in which that may be
> useful. Atomic implies load-balancing is occurring, single link implies there is only one
> consuming core. Those seem like opposites to me?
> 
> I can think about the following use case:
> 
> topology:
> 
>   rx_port    w1_port
> 	 \     /         \
> 	  qid0 - w2_port - qid1
> 	       \         /     \
> 		    w3_port        tx_port
> 
> Use case:
> 
> Queue-based event pipelining:
> ORDERED (Stage 1) to ATOMIC (Stage 2) pipeline:
> - For ingress order maintenance
> - For executing Stage 1 in parallel for better scaling
> i.e. a fat flow can be sprayed over N cores while maintaining the ingress
> order when it is sent out on the wire (after consuming from tx_port).
> 
> I am not sure how the SW PMD works in the use case of ingress order maintenance.

I think my illustration of (B) above is the same use-case as you have here. Instead of using an ATOMIC stage2, the SW PMD benefits from using the SINGLE_LINK port/queue, and the SINGLE_LINK queue ensures ingress order is also egress order to the TX port.


> But the HW and the header file expect this form:
> Snippet from header file:
> --
>  * The source flow ordering from an event queue is maintained when events are
>  * enqueued to their destination queue within the same ordered flow context.
>  *
>  * Events from the source queue appear in their original order when dequeued
>  * from a destination queue.
> --
> Here qid0 is the source queue with ORDERED sched_type and qid1 is the
> destination queue with ATOMIC sched_type. qid1 can be linked to only one
> port (tx_port).
> 
> Are we on the same page? If not, let me know the differences; we will try to
> accommodate the same in the header file.

Yes I think we are saying the same thing, using slightly different words.

To summarize:
- SW PMD sees SINGLE_LINK as its own queue type, and does not support load-balanced (Atomic, Ordered, Parallel) queue functionality.
- SW PMD would use a SINGLE_LINK queue/port for the final stage of a pipeline
   A) to allow re-ordering to happen if required
   B) to merge traffic from multiple ports into a single stream for TX

A possible solution:
1) The application creates a SINGLE_LINK for the purpose of ensuring re-ordering is taking place as expected, and linking only one port for TX.
2) SW PMDs can create a SINGLE_LINK queue type, and benefit from the optimization
3) HW PMDs can ignore the "SINGLE_LINK" aspect and use an ATOMIC instead (as per your example in 3) above)

The application doesn't have to change anything, and just configures its pipeline. The PMD is able to optimize if it makes sense (SW) or just use another queue type to provide the same functionality to the application (HW).

Thoughts? -Harry
  
Jerin Jacob March 28, 2017, 5:36 p.m. UTC | #6
On Tue, Mar 28, 2017 at 12:42:27PM +0000, Van Haaren, Harry wrote:
> > From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> > Sent: Tuesday, March 28, 2017 11:43 AM
> > To: Van Haaren, Harry <harry.van.haaren@intel.com>
> > Cc: dev@dpdk.org; Richardson, Bruce <bruce.richardson@intel.com>
> > Subject: Re: [PATCH v5 06/20] event/sw: add support for event queues
> > 
> > On Mon, Mar 27, 2017 at 03:17:48PM +0000, Van Haaren, Harry wrote:
> > > > From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> > > > Sent: Monday, March 27, 2017 8:45 AM
> > > > To: Van Haaren, Harry <harry.van.haaren@intel.com>
> > > > Cc: dev@dpdk.org; Richardson, Bruce <bruce.richardson@intel.com>
> > > > Subject: Re: [PATCH v5 06/20] event/sw: add support for event queues
> 
> <snip code + details>
> 
> > > > Just for my understanding, are 4 (SW_IQS_MAX) IQ rings created to address
> > > > different priorities for each enqueue operation? What is the significance of
> > > > 4 (SW_IQS_MAX) here?
> > >
> > > Yes each IQ represents a priority level. There is a compile-time define (SW_IQS_MAX) which
> > allows setting the number of internal-queues at each queue stage. The default number of
> > priorities is currently 4.
> > 
> > OK. The reason why I asked is because, if I understood it correctly,
> > PRIO_TO_IQ is not normalizing correctly if SW_IQS_MAX == 4.
> > 
> > I thought the following mapping would be the correct normalization if
> > SW_IQS_MAX == 4.
> > 
> > What do you think?
> 
> <snip code suggestion + api header>
> 
> Good catch - agreed, will fix.
> 
> 
> > > > > +static int
> > > > > +sw_queue_setup(struct rte_eventdev *dev, uint8_t queue_id,
> > > > > +		const struct rte_event_queue_conf *conf)
> > > > > +{
> > > > > +	int type;
> > > > > +
> > > > > +	switch (conf->event_queue_cfg) {
> > > > > +	case RTE_EVENT_QUEUE_CFG_SINGLE_LINK:
> > > > > +		type = SW_SCHED_TYPE_DIRECT;
> > > > > +		break;
> > > >
> > > > event_queue_cfg is a bitmap. It is valid to have
> > > > RTE_EVENT_QUEUE_CFG_SINGLE_LINK | RTE_EVENT_QUEUE_CFG_ATOMIC_ONLY,
> > > > i.e. an atomic schedule type queue that has only one port linked to
> > > > dequeue the events.
> > > > So in the above context, the switch case is not correct, i.e.
> > > > it goes to the default condition. Right?
> > > > Is this intentional?
> > > >
> > > > If I understand it correctly, based on the use case (queue-based event
> > > > pipelining) you have shared in
> > > > the documentation patch, RTE_EVENT_QUEUE_CFG_SINGLE_LINK is used for the last
> > > > stage (last queue). One option is that if the SW PMD cannot support
> > > > RTE_EVENT_QUEUE_CFG_SINGLE_LINK | RTE_EVENT_QUEUE_CFG_ATOMIC_ONLY mode,
> > > > then even though the application sets RTE_EVENT_QUEUE_CFG_SINGLE_LINK |
> > > > RTE_EVENT_QUEUE_CFG_ATOMIC_ONLY, the driver can ignore
> > > > RTE_EVENT_QUEUE_CFG_ATOMIC_ONLY. But I am not sure about the case where the
> > > > application sets RTE_EVENT_QUEUE_CFG_SINGLE_LINK in the middle of the pipeline.
> > > >
> > > > Thoughts?
> > >
> > >
> > > I don't like the idea of the SW PMD ignoring flags for queues - the PMD has no idea if the
> > queue is the final or middle of the pipeline, as it's the application's usage which defines that.
> > >
> > >
> > > Does anybody have a need for a queue to be both Atomic *and* Single-link?  I understand the
> > current API doesn't prohibit it, but I don't see the actual use-case in which that may be
> > useful. Atomic implies load-balancing is occurring, single link implies there is only one
> > consuming core. Those seem like opposites to me?
> > >
> > > Unless anybody sees value in queues having both, I suggest we update the documentation to
> > specify that a queue is either load balanced, or single-link, and that setting both flags will
> > result in -ENOTSUP being returned. (This check can be added to the EventDev layer if consistent for
> > all PMDs.)
> > 
> > If I understand it correctly (based on the previous discussions),
> > HW implementations (Cavium or NXP) do not
> > need to use the RTE_EVENT_QUEUE_CFG_* flags for the operations (the sched type
> > will be derived from event.sched_type on enqueue). So that means we are
> > free to tailor the header file based on the SW PMD requirement here.
> > But semantically it has to be in line with the rest of the header file. We can
> > work together to make it happen.
> 
> OK :)
> 
> 
> > A few questions for everyone's benefit:
> > 
> > 1) Does RTE_EVENT_QUEUE_CFG_SINGLE_LINK have any meaning other than an
> > event queue linked to only a single port? Based on the discussions, it was
> > added to the header file so that the SW PMD can know upfront that only a
> > single port will be linked to the given event queue. It was added as an
> > optimization for the SW PMD. Does it have any functional expectation?
> 
> In the context of the SW PMD, SINGLE_LINK means that a specific queue and port have a unique relationship in that there is only one connection. This allows bypassing of the Atomic, Ordering and Load-Balancing code. The result is a good performance increase, particularly if the worker port dequeue depth is large, as then large bursts of packets can be dequeued with little overhead.
> 
> As a result, (ATOMIC | SINGLE_LINK) is not a supported combination for the sw pmd queue types.
> To be more precise, a SINGLE_LINK is its own queue type, and cannot be OR-ed with any other type.
> 
> 
> > 2) Based on the following topology given in the documentation patch for
> > queue-based event pipelining,
> > 
> >   rx_port    w1_port
> > 	 \     /         \
> > 	  qid0 - w2_port - qid1
> > 	       \         /     \
> > 		    w3_port        tx_port
> > 
> > a) I understand that rx_port is feeding events to qid0.
> > b) But do you see any issue with the following model? IMO, it scales well
> > linearly based on the number of cores available to work (since it is ATOMIC to
> > ATOMIC). Nothing is wrong with
> > qid1 just connecting to tx_port; I am just trying to understand the rationale
> > behind it.
> > 
> >   rx_port   w1_port         w1_port
> > 	 \     /         \     /
> > 	  qid0 - w2_port - qid1- w2_port
> > 	       \         /     \
> > 		   w3_port         w3_port
> 
> 
> This is also a valid model from the SW eventdev. 

OK. If I understand it correctly, in the above topology, even though you
make qid1 ATOMIC, the SW PMD will not maintain ingress order when events come
out of qid1 on different workers. A SINGLE_LINK queue with one port attached
is required at the end of the pipeline, or wherever ordering has to be
maintained. Is my understanding correct?


> The value of using a SINGLE_LINK at the end of a pipeline is:
> A) it can TX all traffic on a single core (using a single queue)
> B) re-ordering of traffic from the previous stage is possible
> 
> To illustrate (B), here is a very simple pipeline:
> 
>  RX port -> QID #1 (Ordered) -> workers(eg 4 ports) -> QID # 2 (SINGLE_LINK to tx) -> TX port
> 
> Here, QID #1 is allowed to send the packets out of order to the 4 worker ports - because they are later passed back to the eventdev for re-ordering before they get to the SINGLE_LINK stage, and then TX in the correct order.
> 
> 
> > 3)
> > > Does anybody have a need for a queue to be both Atomic *and* Single-link?  I understand the
> > current API doesn't prohibit it, but I don't see the actual use-case in which that may be
> > useful. Atomic implies load-balancing is occurring, single link implies there is only one
> > consuming core. Those seem like opposites to me?
> > 
> > I can think about the following use case:
> > 
> > topology:
> > 
> >   rx_port    w1_port
> > 	 \     /         \
> > 	  qid0 - w2_port - qid1
> > 	       \         /     \
> > 		    w3_port        tx_port
> > 
> > Use case:
> > 
> > Queue-based event pipelining:
> > ORDERED (Stage 1) to ATOMIC (Stage 2) pipeline:
> > - For ingress order maintenance
> > - For executing Stage 1 in parallel for better scaling
> > i.e. a fat flow can be sprayed over N cores while maintaining the ingress
> > order when it is sent out on the wire (after consuming from tx_port).
> > 
> > I am not sure how the SW PMD works in the use case of ingress order maintenance.
> 
> I think my illustration of (B) above is the same use-case as you have here. Instead of using an ATOMIC stage2, the SW PMD benefits from using the SINGLE_LINK port/queue, and the SINGLE_LINK queue ensures ingress order is also egress order to the TX port.
> 
> 
> > But the HW and the header file expect this form:
> > Snippet from header file:
> > --
> >  * The source flow ordering from an event queue is maintained when events are
> >  * enqueued to their destination queue within the same ordered flow context.
> >  *
> >  * Events from the source queue appear in their original order when dequeued
> >  * from a destination queue.
> > --
> > Here qid0 is the source queue with ORDERED sched_type and qid1 is the
> > destination queue with ATOMIC sched_type. qid1 can be linked to only one
> > port (tx_port).
> > 
> > Are we on the same page? If not, let me know the differences; we will try to
> > accommodate the same in the header file.
> 
> Yes I think we are saying the same thing, using slightly different words.
> 
> To summarize:
> - SW PMD sees SINGLE_LINK as its own queue type, and does not support load-balanced (Atomic, Ordered, Parallel) queue functionality.
> - SW PMD would use a SINGLE_LINK queue/port for the final stage of a pipeline
>    A) to allow re-ordering to happen if required
>    B) to merge traffic from multiple ports into a single stream for TX
> 
> A possible solution:
> 1) The application creates a SINGLE_LINK for the purpose of ensuring re-ordering is taking place as expected, and linking only one port for TX.

The only issue is that in the low-end cores case it won't scale. The TX core will become a
bottleneck, and we need to have different pipelines based on the amount of traffic (40G or 10G)
a core can handle.

> 2) SW PMDs can create a SINGLE_LINK queue type, and benefit from the optimization

Yes.

> 3) HW PMDs can ignore the "SINGLE_LINK" aspect and use an ATOMIC instead (as per your example in 3) above)

But the topology will be fixed for both HW and SW. An extra port and an
extra core need to be wasted on the ordering business in the HW case. Right?

I think we can roll out something based on capability.

> 
> The application doesn't have to change anything, and just configures its pipeline. The PMD is able to optimize if it makes sense (SW) or just use another queue type to provide the same functionality to the application (HW).
> 
> Thoughts? -Harry
  
Van Haaren, Harry March 29, 2017, 8:28 a.m. UTC | #7
> From: Jerin Jacob [mailto:jerin.jacob@caviumnetworks.com]
> Sent: Tuesday, March 28, 2017 6:36 PM
> To: Van Haaren, Harry <harry.van.haaren@intel.com>
> Cc: dev@dpdk.org; Richardson, Bruce <bruce.richardson@intel.com>
> Subject: Re: [PATCH v5 06/20] event/sw: add support for event queues
> 

<snip IQ priority question>


> > > A few questions for everyone's benefit:
> > >
> > > 1) Does RTE_EVENT_QUEUE_CFG_SINGLE_LINK have any meaning other than an
> > > event queue linked to only a single port? Based on the discussions, it was
> > > added to the header file so that the SW PMD can know upfront that only a
> > > single port will be linked to the given event queue. It was added as an
> > > optimization for the SW PMD. Does it have any functional expectation?
> >
> > In the context of the SW PMD, SINGLE_LINK means that a specific queue and port have a unique
> relationship in that there is only one connection. This allows bypassing of the Atomic, Ordering and
> Load-Balancing code. The result is a good performance increase, particularly if the worker port
> dequeue depth is large, as then large bursts of packets can be dequeued with little overhead.
> >
> > As a result, (ATOMIC | SINGLE_LINK) is not a supported combination for the sw pmd queue
> types.
> > To be more precise, a SINGLE_LINK is its own queue type, and cannot be OR-ed with any other
> type.
> >
> >
> > > 2) Based on the following topology given in the documentation patch for
> > > queue-based event pipelining,
> > >
> > >   rx_port    w1_port
> > > 	 \     /         \
> > > 	  qid0 - w2_port - qid1
> > > 	       \         /     \
> > > 		    w3_port        tx_port
> > >
> > > a) I understand that rx_port is feeding events to qid0.
> > > b) But do you see any issue with the following model? IMO, it scales well
> > > linearly based on the number of cores available to work (since it is ATOMIC to
> > > ATOMIC). Nothing is wrong with
> > > qid1 just connecting to tx_port; I am just trying to understand the rationale
> > > behind it.
> > >
> > >   rx_port   w1_port         w1_port
> > > 	 \     /         \     /
> > > 	  qid0 - w2_port - qid1- w2_port
> > > 	       \         /     \
> > > 		   w3_port         w3_port
> >
> >
> > This is also a valid model from the SW eventdev.
> 
> OK. If I understand it correctly, in the above topology, even though you
> make qid1 ATOMIC, the SW PMD will not maintain ingress order when events come
> out of qid1 on different workers.


If qid0 is ORDERED, and qid1 is Atomic, then the following happens:
- after qid0, the packets are sprayed across cores,
- they are returned out of order by worker cores,
- *at the start* of qid1, packets are re-ordered back into ingress order (maintaining 100% of ordering),
- on dequeue from qid1, the atomic flow distribution will keep order per flow.


> A SINGLE_LINK queue with one port attached
> is required at the end of the pipeline, or wherever ordering has to be
> maintained. Is my understanding correct?


Not quite, the SINGLE_LINK is not required at the end - we just see it as useful for common use cases.
If not useful, there is no reason (due to SW PMD) for an application to create this SINGLE_LINK to finish the pipeline.
If you have three cores that wish to TX, the above pipeline is 100% valid in the SW PMD case.


> > The value of using a SINGLE_LINK at the end of a pipeline is:
> > A) it can TX all traffic on a single core (using a single queue)
> > B) re-ordering of traffic from the previous stage is possible
> >
> > To illustrate (B), here is a very simple pipeline:
> >
> >  RX port -> QID #1 (Ordered) -> workers(eg 4 ports) -> QID # 2 (SINGLE_LINK to tx) -> TX port
> >
> > Here, QID #1 is allowed to send the packets out of order to the 4 worker ports - because they
> are later passed back to the eventdev for re-ordering before they get to the SINGLE_LINK stage,
> and then TX in the correct order.
> >
> >
> > > 3)
> > > > Does anybody have a need for a queue to be both Atomic *and* Single-link?  I understand
> the
> > > current API doesn't prohibit it, but I don't see the actual use-case in which that may be
> > > useful. Atomic implies load-balancing is occurring, single link implies there is only one
> > > consuming core. Those seem like opposites to me?
> > >
> > > I can think about the following use case:
> > >
> > > topology:
> > >
> > >   rx_port    w1_port
> > > 	 \     /         \
> > > 	  qid0 - w2_port - qid1
> > > 	       \         /     \
> > > 		    w3_port        tx_port
> > >
> > > Use case:
> > >
> > > Queue-based event pipelining:
> > > ORDERED (Stage 1) to ATOMIC (Stage 2) pipeline:
> > > - For ingress order maintenance
> > > - For executing Stage 1 in parallel for better scaling
> > > i.e. a fat flow can be sprayed over N cores while maintaining the ingress
> > > order when it is sent out on the wire (after consuming from tx_port).
> > >
> > > I am not sure how the SW PMD works in the use case of ingress order maintenance.
> >
> > I think my illustration of (B) above is the same use-case as you have here. Instead of using
> an ATOMIC stage2, the SW PMD benefits from using the SINGLE_LINK port/queue, and the
> SINGLE_LINK queue ensures ingress order is also egress order to the TX port.
> >
> >
> > > But the HW and the header file expect this form:
> > > Snippet from header file:
> > > --
> > >  * The source flow ordering from an event queue is maintained when events are
> > >  * enqueued to their destination queue within the same ordered flow context.
> > >  *
> > >  * Events from the source queue appear in their original order when dequeued
> > >  * from a destination queue.
> > > --
> > > Here qid0 is the source queue with ORDERED sched_type and qid1 is the
> > > destination queue with ATOMIC sched_type. qid1 can be linked to only one
> > > port (tx_port).
> > >
> > > Are we on the same page? If not, let me know the differences; we will try to
> > > accommodate the same in the header file.
> >
> > Yes I think we are saying the same thing, using slightly different words.
> >
> > To summarize:
> > - SW PMD sees SINGLE_LINK as its own queue type, and does not support load-balanced (Atomic,
> Ordered, Parallel) queue functionality.
> > - SW PMD would use a SINGLE_LINK queue/port for the final stage of a pipeline
> >    A) to allow re-ordering to happen if required
> >    B) to merge traffic from multiple ports into a single stream for TX
> >
> > A possible solution:
> > 1) The application creates a SINGLE_LINK for the purpose of ensuring re-ordering is taking
> place as expected, and linking only one port for TX.
> 
> The only issue is that in the low-end cores case it won't scale. The TX core will become a
> bottleneck, and we need to have different pipelines based on the amount of traffic (40G or 10G)
> a core can handle.


See above - the SINGLE_LINK isn't required to maintain ordering. Using multiple TX cores is also valid in SW PMD.


> > 2) SW PMDs can create a SINGLE_LINK queue type, and benefit from the optimization
> 
> Yes.
> 
> > 3) HW PMDs can ignore the "SINGLE_LINK" aspect and use an ATOMIC instead (as per your
> example in 3) above)
> 
> But the topology will be fixed for both HW and SW. An extra port and an
> extra core need to be wasted on the ordering business in the HW case. Right?


Nope, no wasting cores, see above :) The SINGLE_LINK is just an easy way to "fan in" traffic from lots of cores to one core (in a performant way in SW) to allow a single core to do TX. A typical use-case might be putting RX and TX on the same core - TX is just a dequeue from a port with a SINGLE_LINK queue, and an enqueue to the NIC.
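
A hedged sketch of that TX half (tx_core_loop, the port ids and the burst
size are illustrative, not from the patch):

#include <rte_eventdev.h>
#include <rte_ethdev.h>

static void
tx_core_loop(uint8_t dev_id, uint8_t tx_port_id, uint16_t eth_port)
{
	struct rte_event ev[32];
	struct rte_mbuf *mbufs[32];

	while (1) {
		/* events from the SINGLE_LINK queue arrive already in order */
		uint16_t n = rte_event_dequeue_burst(dev_id, tx_port_id,
						     ev, 32, 0);
		for (uint16_t i = 0; i < n; i++)
			mbufs[i] = ev[i].mbuf;
		rte_eth_tx_burst(eth_port, 0, mbufs, n);
	}
}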


Summary from the SW PMD point of view:
- SINGLE_LINK is its own queue type
- SINGLE_LINK queue can NOT schedule according to (Atomic, Ordered or Parallel) rules

Is that acceptable from an API and HW point of view? 

If so, I will send a new patch for the API to specify more clearly what SINGLE_LINK is.
If not, I'm open to using a capability flag to solve the problem but my understanding right now is that there is no need.



> I think we can roll out something based on capability.

Yes, if required that would be a good solution.


> > The application doesn't have to change anything, and just configures its pipeline. The PMD is
> able to optimize if it makes sense (SW) or just use another queue type to provide the same
> functionality to the application (HW).
> >
> > Thoughts? -Harry
  

Patch

diff --git a/drivers/event/sw/iq_ring.h b/drivers/event/sw/iq_ring.h
new file mode 100644
index 0000000..d480d15
--- /dev/null
+++ b/drivers/event/sw/iq_ring.h
@@ -0,0 +1,176 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2016-2017 Intel Corporation. All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Ring structure definitions used for the internal ring buffers of the
+ * SW eventdev implementation. These are designed for single-core use only.
+ */
+#ifndef _IQ_RING_
+#define _IQ_RING_
+
+#include <stdint.h>
+
+#include <rte_common.h>
+#include <rte_memory.h>
+#include <rte_malloc.h>
+#include <rte_eventdev.h>
+
+#define IQ_RING_NAMESIZE 12
+#define QID_IQ_DEPTH 512
+#define QID_IQ_MASK (uint16_t)(QID_IQ_DEPTH - 1)
+
+struct iq_ring {
+	char name[IQ_RING_NAMESIZE] __rte_cache_aligned;
+	uint16_t write_idx;
+	uint16_t read_idx;
+
+	struct rte_event ring[QID_IQ_DEPTH];
+};
+
+#ifndef force_inline
+#define force_inline inline __attribute__((always_inline))
+#endif
+
+static inline struct iq_ring *
+iq_ring_create(const char *name, unsigned int socket_id)
+{
+	struct iq_ring *retval;
+
+	retval = rte_malloc_socket(NULL, sizeof(*retval), 0, socket_id);
+	if (retval == NULL)
+		goto end;
+
+	snprintf(retval->name, sizeof(retval->name), "%s", name);
+	retval->write_idx = retval->read_idx = 0;
+end:
+	return retval;
+}
+
+static inline void
+iq_ring_destroy(struct iq_ring *r)
+{
+	rte_free(r);
+}
+
+static force_inline uint16_t
+iq_ring_count(const struct iq_ring *r)
+{
+	return r->write_idx - r->read_idx;
+}
+
+static force_inline uint16_t
+iq_ring_free_count(const struct iq_ring *r)
+{
+	return QID_IQ_MASK - iq_ring_count(r);
+}
+
+static force_inline uint16_t
+iq_ring_enqueue_burst(struct iq_ring *r, struct rte_event *qes, uint16_t nb_qes)
+{
+	const uint16_t read = r->read_idx;
+	uint16_t write = r->write_idx;
+	const uint16_t space = read + QID_IQ_MASK - write;
+	uint16_t i;
+
+	if (space < nb_qes)
+		nb_qes = space;
+
+	for (i = 0; i < nb_qes; i++, write++)
+		r->ring[write & QID_IQ_MASK] = qes[i];
+
+	r->write_idx = write;
+
+	return nb_qes;
+}
+
+static force_inline uint16_t
+iq_ring_dequeue_burst(struct iq_ring *r, struct rte_event *qes, uint16_t nb_qes)
+{
+	uint16_t read = r->read_idx;
+	const uint16_t write = r->write_idx;
+	const uint16_t items = write - read;
+	uint16_t i;
+
+	for (i = 0; i < nb_qes; i++, read++)
+		qes[i] = r->ring[read & QID_IQ_MASK];
+
+	if (items < nb_qes)
+		nb_qes = items;
+
+	r->read_idx += nb_qes;
+
+	return nb_qes;
+}
+
+/* assumes there is space, from a previous dequeue_burst */
+static force_inline uint16_t
+iq_ring_put_back(struct iq_ring *r, struct rte_event *qes, uint16_t nb_qes)
+{
+	uint16_t i, read = r->read_idx;
+
+	for (i = nb_qes; i-- > 0; )
+		r->ring[--read & QID_IQ_MASK] = qes[i];
+
+	r->read_idx = read;
+	return nb_qes;
+}
+
+static force_inline const struct rte_event *
+iq_ring_peek(const struct iq_ring *r)
+{
+	return &r->ring[r->read_idx & QID_IQ_MASK];
+}
+
+static force_inline void
+iq_ring_pop(struct iq_ring *r)
+{
+	r->read_idx++;
+}
+
+static force_inline int
+iq_ring_enqueue(struct iq_ring *r, const struct rte_event *qe)
+{
+	const uint16_t read = r->read_idx;
+	const uint16_t write = r->write_idx;
+	const uint16_t space = read + QID_IQ_MASK - write;
+
+	if (space == 0)
+		return -1;
+
+	r->ring[write & QID_IQ_MASK] = *qe;
+
+	r->write_idx = write + 1;
+
+	return 0;
+}
+
+#endif
diff --git a/drivers/event/sw/sw_evdev.c b/drivers/event/sw/sw_evdev.c
index d1fa3a7..eaf8e77 100644
--- a/drivers/event/sw/sw_evdev.c
+++ b/drivers/event/sw/sw_evdev.c
@@ -38,12 +38,176 @@ 
 #include <rte_ring.h>
 
 #include "sw_evdev.h"
+#include "iq_ring.h"
 
 #define EVENTDEV_NAME_SW_PMD event_sw
 #define NUMA_NODE_ARG "numa_node"
 #define SCHED_QUANTA_ARG "sched_quanta"
 #define CREDIT_QUANTA_ARG "credit_quanta"
 
+static int32_t
+qid_init(struct sw_evdev *sw, unsigned int idx, int type,
+		const struct rte_event_queue_conf *queue_conf)
+{
+	unsigned int i;
+	int dev_id = sw->data->dev_id;
+	int socket_id = sw->data->socket_id;
+	char buf[IQ_RING_NAMESIZE];
+	struct sw_qid *qid = &sw->qids[idx];
+
+	for (i = 0; i < SW_IQS_MAX; i++) {
+		snprintf(buf, sizeof(buf), "q_%u_iq_%d", idx, i);
+		qid->iq[i] = iq_ring_create(buf, socket_id);
+		if (!qid->iq[i]) {
+			SW_LOG_DBG("ring create failed");
+			goto cleanup;
+		}
+	}
+
+	/* Initialize the FID structures to no pinning (-1), and zero packets */
+	const struct sw_fid_t fid = {.cq = -1, .pcount = 0};
+	for (i = 0; i < RTE_DIM(qid->fids); i++)
+		qid->fids[i] = fid;
+
+	qid->id = idx;
+	qid->type = type;
+	qid->priority = queue_conf->priority;
+
+	if (qid->type == RTE_SCHED_TYPE_ORDERED) {
+		char ring_name[RTE_RING_NAMESIZE];
+		uint32_t window_size;
+
+		/* rte_ring and window_size_mask require window_size to
+		 * be a power-of-2.
+		 */
+		window_size = rte_align32pow2(
+				queue_conf->nb_atomic_order_sequences);
+
+		qid->window_size = window_size - 1;
+
+		if (!window_size) {
+			SW_LOG_DBG(
+				"invalid reorder_window_size for ordered queue\n"
+				);
+			goto cleanup;
+		}
+
+		snprintf(buf, sizeof(buf), "sw%d_iq_%d_rob", dev_id, i);
+		qid->reorder_buffer = rte_zmalloc_socket(buf,
+				window_size * sizeof(qid->reorder_buffer[0]),
+				0, socket_id);
+		if (!qid->reorder_buffer) {
+			SW_LOG_DBG("reorder_buffer malloc failed\n");
+			goto cleanup;
+		}
+
+		memset(&qid->reorder_buffer[0],
+		       0,
+		       window_size * sizeof(qid->reorder_buffer[0]));
+
+		snprintf(ring_name, sizeof(ring_name), "sw%d_q%d_freelist",
+				dev_id, idx);
+
+		/* lookup the ring, and if it already exists, free it */
+		struct rte_ring *cleanup = rte_ring_lookup(ring_name);
+		if (cleanup)
+			rte_ring_free(cleanup);
+
+		qid->reorder_buffer_freelist = rte_ring_create(ring_name,
+				window_size,
+				socket_id,
+				RING_F_SP_ENQ | RING_F_SC_DEQ);
+		if (!qid->reorder_buffer_freelist) {
+			SW_LOG_DBG("freelist ring create failed");
+			goto cleanup;
+		}
+
+		/* Populate the freelist with reorder buffer entries. Enqueue
+		 * 'window_size - 1' entries because the rte_ring holds only
+		 * that many.
+		 */
+		for (i = 0; i < window_size - 1; i++) {
+			if (rte_ring_sp_enqueue(qid->reorder_buffer_freelist,
+						&qid->reorder_buffer[i]) < 0)
+				goto cleanup;
+		}
+
+		qid->reorder_buffer_index = 0;
+		qid->cq_next_tx = 0;
+	}
+
+	qid->initialized = 1;
+
+	return 0;
+
+cleanup:
+	for (i = 0; i < SW_IQS_MAX; i++) {
+		if (qid->iq[i])
+			iq_ring_destroy(qid->iq[i]);
+	}
+
+	if (qid->reorder_buffer) {
+		rte_free(qid->reorder_buffer);
+		qid->reorder_buffer = NULL;
+	}
+
+	if (qid->reorder_buffer_freelist) {
+		rte_ring_free(qid->reorder_buffer_freelist);
+		qid->reorder_buffer_freelist = NULL;
+	}
+
+	return -EINVAL;
+}
+
+static int
+sw_queue_setup(struct rte_eventdev *dev, uint8_t queue_id,
+		const struct rte_event_queue_conf *conf)
+{
+	int type;
+
+	switch (conf->event_queue_cfg) {
+	case RTE_EVENT_QUEUE_CFG_SINGLE_LINK:
+		type = SW_SCHED_TYPE_DIRECT;
+		break;
+	case RTE_EVENT_QUEUE_CFG_ATOMIC_ONLY:
+		type = RTE_SCHED_TYPE_ATOMIC;
+		break;
+	case RTE_EVENT_QUEUE_CFG_ORDERED_ONLY:
+		type = RTE_SCHED_TYPE_ORDERED;
+		break;
+	case RTE_EVENT_QUEUE_CFG_PARALLEL_ONLY:
+		type = RTE_SCHED_TYPE_PARALLEL;
+		break;
+	case RTE_EVENT_QUEUE_CFG_ALL_TYPES:
+		SW_LOG_ERR("QUEUE_CFG_ALL_TYPES not supported\n");
+		return -ENOTSUP;
+	default:
+		SW_LOG_ERR("Unknown queue type %d requested\n",
+			   conf->event_queue_cfg);
+		return -EINVAL;
+	}
+
+	struct sw_evdev *sw = sw_pmd_priv(dev);
+	return qid_init(sw, queue_id, type, conf);
+}
+
+static void
+sw_queue_release(struct rte_eventdev *dev, uint8_t id)
+{
+	struct sw_evdev *sw = sw_pmd_priv(dev);
+	struct sw_qid *qid = &sw->qids[id];
+	uint32_t i;
+
+	for (i = 0; i < SW_IQS_MAX; i++)
+		iq_ring_destroy(qid->iq[i]);
+
+	if (qid->type == RTE_SCHED_TYPE_ORDERED) {
+		rte_free(qid->reorder_buffer);
+		rte_ring_free(qid->reorder_buffer_freelist);
+	}
+	memset(qid, 0, sizeof(*qid));
+}
+
 static void
 sw_queue_def_conf(struct rte_eventdev *dev, uint8_t queue_id,
 				 struct rte_event_queue_conf *conf)
@@ -147,6 +311,8 @@  sw_probe(const char *name, const char *params)
 			.dev_infos_get = sw_info_get,
 
 			.queue_def_conf = sw_queue_def_conf,
+			.queue_setup = sw_queue_setup,
+			.queue_release = sw_queue_release,
 			.port_def_conf = sw_port_def_conf,
 	};
 
diff --git a/drivers/event/sw/sw_evdev.h b/drivers/event/sw/sw_evdev.h
index fda57df..ddf0cd2 100644
--- a/drivers/event/sw/sw_evdev.h
+++ b/drivers/event/sw/sw_evdev.h
@@ -52,6 +52,8 @@ 
 #define EVENTDEV_NAME_SW_PMD event_sw
 #define SW_PMD_NAME RTE_STR(event_sw)
 
+#define SW_SCHED_TYPE_DIRECT (RTE_SCHED_TYPE_PARALLEL + 1)
+
 #ifdef RTE_LIBRTE_PMD_EVDEV_SW_DEBUG
 #define SW_LOG_INFO(fmt, args...) \
 	RTE_LOG(INFO, EVENTDEV, "[%s] %s() line %u: " fmt "\n", \
@@ -139,6 +141,9 @@  struct sw_evdev {
 	 */
 	uint32_t nb_events_limit;
 
+	/* Internal queues - one per logical queue */
+	struct sw_qid qids[RTE_EVENT_MAX_QUEUES_PER_DEV] __rte_cache_aligned;
+
 	int32_t sched_quanta;
 
 	uint32_t credit_update_quanta;